import os
import copy

from bashlex import yacc, tokenizer, state, ast, subst, flags, errors, heredoc


# NOTE: the docstring of every p_* function below is the grammar rule that the
# yacc module reads, not documentation; one alternative per line. The
# hard-coded reduce numbers in the table fix-up further down presumably index
# productions in definition order, so do not reorder, add or remove p_*
# functions (or alternatives) without revisiting those numbers.


def _partsspan(parts):
    '''return (start of the first part, end of the last part)'''
    return parts[0].pos[0], parts[-1].pos[1]

# names picked up by yacc.yacc() from this module
tokens = [e.name for e in tokenizer.tokentype]
precedence = (
    ('left', 'AMPERSAND', 'SEMICOLON', 'NEWLINE', 'EOF'),
    ('left', 'AND_AND', 'OR_OR'),
    ('right', 'BAR', 'BAR_AND')
)

def handleNotImplemented(p, type):
    '''raise NotImplementedError for a construct the parser recognizes but
    doesn't build nodes for. type is a human readable name of the construct
    (the parameter name shadows the builtin; kept for compatibility)'''
    if len(p) == 2:
        raise NotImplementedError('type = {%s}, token = {%s}' % (type, p[1]))
    else:
        raise NotImplementedError('type = {%s}, token = {%s}, parts = {%s}' % (type, p[1], p[2]))

def handleAssert(p, test):
    '''raise AssertionError mentioning the first symbol of p if test is
    false. unlike a plain assert this is not stripped under -O'''
    if not test:
        raise AssertionError('token = {%s}' % p[1])

# top level rule: a single complete command (or an empty line / EOF)
def p_inputunit(p):
    '''inputunit : simple_list simple_list_terminator
                 | NEWLINE
                 | error NEWLINE
                 | EOF'''
    # XXX
    if p.lexer._parserstate & flags.parser.CMDSUBST:
        p.lexer._parserstate.add(flags.parser.EOFTOKEN)

    if isinstance(p[1], ast.node):
        p[0] = p[1]
        # accept right here in case the input contains more lines that are
        # not part of the current command
        p.accept()

# builds a python list of expanded word nodes
def p_word_list(p):
    '''word_list : WORD
                 | word_list WORD'''
    parserobj = p.context
    if len(p) == 2:
        p[0] = [_expandword(parserobj, p.slice[1])]
    else:
        p[0] = p[1]
        p[0].append(_expandword(parserobj, p.slice[2]))

# heredoc redirections. the delimiter word is kept as is (not expanded) and
# the redirect node is pushed on the redirstack with heredoc=None
def p_redirection_heredoc(p):
    '''redirection : LESS_LESS WORD
                   | NUMBER LESS_LESS WORD
                   | REDIR_WORD LESS_LESS WORD
                   | LESS_LESS_MINUS WORD
                   | NUMBER LESS_LESS_MINUS WORD
                   | REDIR_WORD LESS_LESS_MINUS WORD'''
    parserobj = p.context
    assert isinstance(parserobj, _parser)

    last = len(p) - 1
    output = ast.node(kind='word', word=p[last], parts=[],
                      pos=p.lexspan(last))
    if len(p) == 3:
        p[0] = ast.node(kind='redirect', input=None, type=p[1], heredoc=None,
                        output=output, pos=(p.lexpos(1), p.endlexpos(2)))
    else:
        p[0] = ast.node(kind='redirect', input=p[1], type=p[2], heredoc=None,
                        output=output, pos=(p.lexpos(1), p.endlexpos(3)))

    # the flag is False for << and True for <<-
    # NOTE(review): presumably tells the heredoc reader to strip leading
    # tabs; confirm against heredoc.gatherheredocuments
    isdashvariant = p.slice[last - 1].ttype != tokenizer.tokentype.LESS_LESS
    parserobj.redirstack.append((p[0], isdashvariant))

# all other redirections. the target is expanded when it is a WORD, otherwise
# (NUMBER, DASH) the raw symbol value is used as the output
def p_redirection(p):
    '''redirection : GREATER WORD
                   | LESS WORD
                   | NUMBER GREATER WORD
                   | NUMBER LESS WORD
                   | REDIR_WORD GREATER WORD
                   | REDIR_WORD LESS WORD
                   | GREATER_GREATER WORD
                   | NUMBER GREATER_GREATER WORD
                   | REDIR_WORD GREATER_GREATER WORD
                   | GREATER_BAR WORD
                   | NUMBER GREATER_BAR WORD
                   | REDIR_WORD GREATER_BAR WORD
                   | LESS_GREATER WORD
                   | NUMBER LESS_GREATER WORD
                   | REDIR_WORD LESS_GREATER WORD
                   | LESS_LESS_LESS WORD
                   | NUMBER LESS_LESS_LESS WORD
                   | REDIR_WORD LESS_LESS_LESS WORD
                   | LESS_AND NUMBER
                   | NUMBER LESS_AND NUMBER
                   | REDIR_WORD LESS_AND NUMBER
                   | GREATER_AND NUMBER
                   | NUMBER GREATER_AND NUMBER
                   | REDIR_WORD GREATER_AND NUMBER
                   | LESS_AND WORD
                   | NUMBER LESS_AND WORD
                   | REDIR_WORD LESS_AND WORD
                   | GREATER_AND WORD
                   | NUMBER GREATER_AND WORD
                   | REDIR_WORD GREATER_AND WORD
                   | GREATER_AND DASH
                   | NUMBER GREATER_AND DASH
                   | REDIR_WORD GREATER_AND DASH
                   | LESS_AND DASH
                   | NUMBER LESS_AND DASH
                   | REDIR_WORD LESS_AND DASH
                   | AND_GREATER WORD
                   | AND_GREATER_GREATER WORD'''
    parserobj = p.context
    if len(p) == 3:
        output = p[2]
        if p.slice[2].ttype == tokenizer.tokentype.WORD:
            output = _expandword(parserobj, p.slice[2])
        p[0] = ast.node(kind='redirect', input=None, type=p[1], heredoc=None,
                        output=output, pos=(p.lexpos(1), p.endlexpos(2)))
    else:
        output = p[3]
        if p.slice[3].ttype == tokenizer.tokentype.WORD:
            output = _expandword(parserobj, p.slice[3])
        p[0] = ast.node(kind='redirect', input=p[1], type=p[2], heredoc=None,
                        output=output, pos=(p.lexpos(1), p.endlexpos(3)))

def _expandword(parser, tokenword):
    '''create a word node for the token tokenword, honoring the expansion
    limit of parser (-1: don't expand at all, 0: expand but drop
    substitution parts, anything else: expand and keep all parts)'''
    if parser._expansionlimit == -1:
        # we enter this branch in the following conditions:
        # - currently parsing a substitution as a result of an expansion
        # - the previous expansion had limit == 0
        #
        # this means that this node is a descendant of a substitution in an
        # unexpanded word and will be filtered in the limit == 0 condition below
        #
        # (the reason we even expand when limit == 0 is to get quote removal)
        # NOTE(review): word is the token object here, not a string as in
        # the branch below; kept as is
        node = ast.node(kind='word', word=tokenword,
                        pos=(tokenword.lexpos, tokenword.endlexpos), parts=[])
        return node
    else:
        quoted = bool(tokenword.flags & flags.word.QUOTED)
        doublequoted = quoted and tokenword.value[0] == '"'

        # TODO set qheredocument
        parts, expandedword = subst._expandwordinternal(parser,
                                                        tokenword, 0,
                                                        doublequoted, 0, 0)

        # limit reached, don't include substitutions (still expanded to get
        # quote removal though)
        if parser._expansionlimit == 0:
            parts = [node for node in parts if 'substitution' not in node.kind]

        node = ast.node(kind='word', word=expandedword,
                        pos=(tokenword.lexpos, tokenword.endlexpos), parts=parts)
        return node

# always produces a single element list so simple_command can extend
def p_simple_command_element(p):
    '''simple_command_element : WORD
                              | ASSIGNMENT_WORD
                              | redirection'''
    if isinstance(p[1], ast.node):
        # a redirection, already a node
        p[0] = [p[1]]
        return

    parserobj = p.context
    p[0] = [_expandword(parserobj, p.slice[1])]

    # change the word node to an assignment if necessary
    if p.slice[1].ttype == tokenizer.tokentype.ASSIGNMENT_WORD:
        p[0][0].kind = 'assignment'

def p_redirection_list(p):
    '''redirection_list : redirection
                        | redirection_list redirection'''
    if len(p) == 2:
        p[0] = [p[1]]
    else:
        p[0] = p[1]
        p[0].append(p[2])

# a flat list of word/assignment/redirect nodes
def p_simple_command(p):
    '''simple_command : simple_command_element
                      | simple_command simple_command_element'''
    p[0] = p[1]
    if len(p) == 3:
        p[0].extend(p[2])

def _attachredirects(p, node, redirects):
    '''append redirects to the compound node and stretch its end position
    to the end of the last redirect'''
    handleAssert(p, node.kind == 'compound')
    node.redirects.extend(redirects)
    handleAssert(p, node.pos[0] < node.redirects[-1].pos[1])
    node.pos = (node.pos[0], node.redirects[-1].pos[1])

# a simple command arrives as a list of parts and is wrapped in a command
# node, everything else is already a node
def p_command(p):
    '''command : simple_command
               | shell_command
               | shell_command redirection_list
               | function_def
               | coproc'''
    if isinstance(p[1], ast.node):
        p[0] = p[1]
        if len(p) == 3:
            _attachredirects(p, p[0], p[2])
    else:
        p[0] = ast.node(kind='command', parts=p[1], pos=_partsspan(p[1]))

def p_shell_command(p):
    '''shell_command : for_command
                     | case_command
                     | WHILE compound_list DO compound_list DONE
                     | UNTIL compound_list DO compound_list DONE
                     | select_command
                     | if_command
                     | subshell
                     | group_command
                     | arith_command
                     | cond_command
                     | arith_for_command'''
    if len(p) == 2:
        p[0] = p[1]
    else:
        # while or until
        handleAssert(p, p[2].kind == 'list')

        parts = _makeparts(p)
        kind = parts[0].word
        assert kind in ('while', 'until')
        p[0] = ast.node(kind='compound',
                        redirects=[],
                        list=[ast.node(kind=kind, parts=parts, pos=_partsspan(parts))],
                        pos=_partsspan(parts))

    handleAssert(p, p[0].kind == 'compound')

def _makeparts(p):
    '''flatten the right hand side symbols of p into a list of nodes: nodes
    are kept, lists are spliced in, WORD tokens are expanded and any other
    token becomes a reservedword node. symbols with no value (e.g.
    newline_list) are skipped'''
    parts = []
    for i in range(1, len(p)):
        if isinstance(p[i], ast.node):
            parts.append(p[i])
        elif isinstance(p[i], list):
            parts.extend(p[i])
        elif isinstance(p.slice[i], tokenizer.token):
            if p.slice[i].ttype == tokenizer.tokentype.WORD:
                parserobj = p.context
                parts.append(_expandword(parserobj, p.slice[i]))
            else:
                parts.append(ast.node(kind='reservedword', word=p[i],
                                      pos=p.lexspan(i)))
        else:
            pass

    return parts

def p_for_command(p):
    '''for_command : FOR WORD newline_list DO compound_list DONE
                   | FOR WORD newline_list LEFT_CURLY compound_list RIGHT_CURLY
                   | FOR WORD SEMICOLON newline_list DO compound_list DONE
                   | FOR WORD SEMICOLON newline_list LEFT_CURLY compound_list RIGHT_CURLY
                   | FOR WORD newline_list IN word_list list_terminator newline_list DO compound_list DONE
                   | FOR WORD newline_list IN word_list list_terminator newline_list LEFT_CURLY compound_list RIGHT_CURLY
                   | FOR WORD newline_list IN list_terminator newline_list DO compound_list DONE
                   | FOR WORD newline_list IN list_terminator newline_list LEFT_CURLY compound_list RIGHT_CURLY'''
    parts = _makeparts(p)
    # find the operatornode that we might have there due to
    # list_terminator/newline_list and convert it to a reservedword so its
    # considered as part of the for loop
    for i, part in enumerate(parts):
        if part.kind == 'operator' and part.op == ';':
            parts[i] = ast.node(kind='reservedword', word=';', pos=part.pos)
            break # there could be only one in there...

    p[0] = ast.node(kind='compound',
                    redirects=[],
                    list=[ast.node(kind='for', parts=parts, pos=_partsspan(parts))],
                    pos=_partsspan(parts))

def p_arith_for_command(p):
    '''arith_for_command : FOR ARITH_FOR_EXPRS list_terminator newline_list DO compound_list DONE
                         | FOR ARITH_FOR_EXPRS list_terminator newline_list LEFT_CURLY compound_list RIGHT_CURLY
                         | FOR ARITH_FOR_EXPRS DO compound_list DONE
                         | FOR ARITH_FOR_EXPRS LEFT_CURLY compound_list RIGHT_CURLY'''
    handleNotImplemented(p, 'arithmetic for')

def p_select_command(p):
    '''select_command : SELECT WORD newline_list DO list DONE
                      | SELECT WORD newline_list LEFT_CURLY list RIGHT_CURLY
                      | SELECT WORD SEMICOLON newline_list DO list DONE
                      | SELECT WORD SEMICOLON newline_list LEFT_CURLY list RIGHT_CURLY
                      | SELECT WORD newline_list IN word_list list_terminator newline_list DO list DONE
                      | SELECT WORD newline_list IN word_list list_terminator newline_list LEFT_CURLY list RIGHT_CURLY'''
    handleNotImplemented(p, 'select command')

def p_case_command(p):
    '''case_command : CASE WORD newline_list IN newline_list ESAC
                    | CASE WORD newline_list IN case_clause_sequence newline_list ESAC
                    | CASE WORD newline_list IN case_clause ESAC'''
    handleNotImplemented(p, 'case command')

# name is the first word node among the parts, body is the last part
def p_function_def(p):
    '''function_def : WORD LEFT_PAREN RIGHT_PAREN newline_list function_body
                    | FUNCTION WORD LEFT_PAREN RIGHT_PAREN newline_list function_body
                    | FUNCTION WORD newline_list function_body'''
    parts = _makeparts(p)
    body = parts[-1]
    name = parts[ast.findfirstkind(parts, 'word')]

    p[0] = ast.node(kind='function', name=name, body=body, parts=parts,
                    pos=_partsspan(parts))

def p_function_body(p):
    '''function_body : shell_command
                     | shell_command redirection_list'''
    handleAssert(p, p[1].kind == 'compound')

    p[0] = p[1]
    if len(p) == 3:
        _attachredirects(p, p[0], p[2])

def _bracketedcompound(p):
    '''build a compound node out of an opening token, a list and a closing
    token, turning the two tokens into reservedword nodes'''
    opener = ast.node(kind='reservedword', word=p[1], pos=p.lexspan(1))
    closer = ast.node(kind='reservedword', word=p[3], pos=p.lexspan(3))
    parts = [opener, p[2], closer]
    return ast.node(kind='compound', list=parts, redirects=[],
                    pos=_partsspan(parts))

def p_subshell(p):
    '''subshell : LEFT_PAREN compound_list RIGHT_PAREN'''
    p[0] = _bracketedcompound(p)

def p_coproc(p):
    '''coproc : COPROC shell_command
              | COPROC shell_command redirection_list
              | COPROC WORD shell_command
              | COPROC WORD shell_command redirection_list
              | COPROC simple_command'''
    handleNotImplemented(p, 'coproc')

def p_if_command(p):
    '''if_command : IF compound_list THEN compound_list FI
                  | IF compound_list THEN compound_list ELSE compound_list FI
                  | IF compound_list THEN compound_list elif_clause FI'''
    # we currently don't distinguish the various lists that make up the
    # command, because it's not needed later on. if there will be a need
    # we can always add different nodes for elif/else.
    parts = _makeparts(p)
    p[0] = ast.node(kind='compound',
                    redirects=[],
                    list=[ast.node(kind='if', parts=parts, pos=_partsspan(parts))],
                    pos=_partsspan(parts))

def p_group_command(p):
    '''group_command : LEFT_CURLY compound_list RIGHT_CURLY'''
    p[0] = _bracketedcompound(p)

def p_arith_command(p):
    '''arith_command : ARITH_CMD'''
    handleNotImplemented(p, 'arithmetic command')

def p_cond_command(p):
    '''cond_command : COND_START COND_CMD COND_END'''
    handleNotImplemented(p, 'cond command')

# returns a list of nodes; anything that isn't a node (including the list of
# a nested elif_clause) is wrapped in a reservedword node
def p_elif_clause(p):
    '''elif_clause : ELIF compound_list THEN compound_list
                   | ELIF compound_list THEN compound_list ELSE compound_list
                   | ELIF compound_list THEN compound_list elif_clause'''
    parts = []
    for i in range(1, len(p)):
        if isinstance(p[i], ast.node):
            parts.append(p[i])
        else:
            parts.append(ast.node(kind='reservedword', word=p[i], pos=p.lexspan(i)))
    p[0] = parts

def p_case_clause(p):
    '''case_clause : pattern_list
                   | case_clause_sequence pattern_list'''
    handleNotImplemented(p, 'case clause')

def p_pattern_list(p):
    '''pattern_list : newline_list pattern RIGHT_PAREN compound_list
                    | newline_list pattern RIGHT_PAREN newline_list
                    | newline_list LEFT_PAREN pattern RIGHT_PAREN compound_list
                    | newline_list LEFT_PAREN pattern RIGHT_PAREN newline_list'''
    handleNotImplemented(p, 'pattern list')

def p_case_clause_sequence(p):
    '''case_clause_sequence : pattern_list SEMI_SEMI
                            | case_clause_sequence pattern_list SEMI_SEMI
                            | pattern_list SEMI_AND
                            | case_clause_sequence pattern_list SEMI_AND
                            | pattern_list SEMI_SEMI_AND
                            | case_clause_sequence pattern_list SEMI_SEMI_AND'''
    handleNotImplemented(p, 'case clause')

def p_pattern(p):
    '''pattern : WORD
               | pattern BAR WORD'''
    handleNotImplemented(p, 'pattern')

def p_list(p):
    '''list : newline_list list0'''
    p[0] = p[2]

# list1 yields a python list of parts; only wrap it in a list node when
# there's more than one part
def p_compound_list(p):
    '''compound_list : list
                     | newline_list list1'''
    if len(p) == 2:
        p[0] = p[1]
    else:
        parts = p[2]
        if len(parts) > 1:
            p[0] = ast.node(kind='list', parts=parts, pos=_partsspan(parts))
        else:
            p[0] = parts[0]

# a single part terminated by a newline is returned bare, anything else
# becomes a list node ending with the terminating operator
def p_list0(p):
    '''list0 : list1 NEWLINE newline_list
             | list1 AMPERSAND newline_list
             | list1 SEMICOLON newline_list'''
    parts = p[1]
    if len(parts) > 1 or p.slice[2].ttype != tokenizer.tokentype.NEWLINE:
        parts.append(ast.node(kind='operator', op=p[2], pos=p.lexspan(2)))
        p[0] = ast.node(kind='list', parts=parts, pos=_partsspan(parts))
    else:
        p[0] = parts[0]

# python list of nodes interleaved with operator nodes
def p_list1(p):
    '''list1 : list1 AND_AND newline_list list1
             | list1 OR_OR newline_list list1
             | list1 AMPERSAND newline_list list1
             | list1 SEMICOLON newline_list list1
             | list1 NEWLINE newline_list list1
             | pipeline_command'''
    if len(p) == 2:
        p[0] = [p[1]]
    else:
        p[0] = p[1]
        # XXX newline
        p[0].append(ast.node(kind='operator', op=p[2], pos=p.lexspan(2)))
        p[0].extend(p[len(p) - 1])

def p_simple_list_terminator(p):
    '''simple_list_terminator : NEWLINE
                              | EOF'''
    pass

# only ';' produces a node, NEWLINE and EOF leave the value unset
def p_list_terminator(p):
    '''list_terminator : NEWLINE
                       | SEMICOLON
                       | EOF'''
    if p[1] == ';':
        p[0] = ast.node(kind='operator', op=';', pos=p.lexspan(1))

def p_newline_list(p):
    '''newline_list : empty
                    | newline_list NEWLINE'''
    pass

def p_simple_list(p):
    '''simple_list : simple_list1
                   | simple_list1 AMPERSAND
                   | simple_list1 SEMICOLON'''
    tok = p.lexer
    heredoc.gatherheredocuments(tok)

    if len(p) == 3 or len(p[1]) > 1:
        parts = p[1]
        if len(p) == 3:
            parts.append(ast.node(kind='operator', op=p[2], pos=p.lexspan(2)))
        p[0] = ast.node(kind='list', parts=parts, pos=_partsspan(parts))
    else:
        assert len(p[1]) == 1
        p[0] = p[1][0]

    if (len(p) == 2 and p.lexer._parserstate & flags.parser.CMDSUBST and
            p.lexer._current_token.nopos() == p.lexer._shell_eof_token):
        # accept the input
        p.accept()

# python list of nodes interleaved with operator nodes
def p_simple_list1(p):
    '''simple_list1 : simple_list1 AND_AND newline_list simple_list1
                    | simple_list1 OR_OR newline_list simple_list1
                    | simple_list1 AMPERSAND simple_list1
                    | simple_list1 SEMICOLON simple_list1
                    | pipeline_command'''
    if len(p) == 2:
        p[0] = [p[1]]
    else:
        p[0] = p[1]
        p[0].append(ast.node(kind='operator', op=p[2], pos=p.lexspan(2)))
        p[0].extend(p[len(p) - 1])

# a pipeline with a single command is returned bare. timespec currently
# raises in p_timespec, so the two symbol alternatives handled here start
# with BANG
def p_pipeline_command(p):
    '''pipeline_command : pipeline
                        | BANG pipeline_command
                        | timespec pipeline_command
                        | timespec list_terminator
                        | BANG list_terminator'''
    if len(p) == 2:
        if len(p[1]) == 1:
            p[0] = p[1][0]
        else:
            p[0] = ast.node(kind='pipeline', parts=p[1],
                            pos=(p[1][0].pos[0], p[1][-1].pos[1]))
    else:
        # XXX timespec
        node = ast.node(kind='reservedword', word='!', pos=p.lexspan(1))
        if p[2].kind == 'pipeline':
            p[0] = p[2]
            p[0].parts.insert(0, node)
            p[0].pos = (p[0].parts[0].pos[0], p[0].parts[-1].pos[1])
        else:
            p[0] = ast.node(kind='pipeline', parts=[node, p[2]],
                            pos=(node.pos[0], p[2].pos[1]))

# python list of commands interleaved with pipe nodes
def p_pipeline(p):
    '''pipeline : pipeline BAR newline_list pipeline
                | pipeline BAR_AND newline_list pipeline
                | command'''
    if len(p) == 2:
        p[0] = [p[1]]
    else:
        p[0] = p[1]
        p[0].append(ast.node(kind='pipe', pipe=p[2], pos=p.lexspan(2)))
        p[0].extend(p[len(p) - 1])

def p_timespec(p):
    '''timespec : TIME
                | TIME TIMEOPT
                | TIME TIMEOPT TIMEIGN'''
    handleNotImplemented(p, 'time command')

def p_empty(p):
    '''empty :'''
    pass

# called with the offending token; always raises errors.ParsingError
def p_error(p):
    assert isinstance(p, tokenizer.token)

    if p.ttype == tokenizer.tokentype.EOF:
        raise errors.ParsingError('unexpected EOF',
                                  p.lexer.source,
                                  len(p.lexer.source))
    else:
        raise errors.ParsingError('unexpected token %r' % p.value,
                                  p.lexer.source, p.lexpos)

yaccparser = yacc.yacc(outputdir=os.path.dirname(__file__),
                       debug=False)

# some hack to fix yacc's reduction on command substitutions:
# which state to fix is derived from static transition tables
# as states are changeable among python versions and architectures
# the only state that is considered fixed is the initial state: 0
def get_correction_states():
    '''return the states reached from the initial state after
    (simple_list simple_list_terminator) and (simple_list NEWLINE)'''
    afterlist = yaccparser.goto[0]['simple_list'] #~10
    state2 = yaccparser.action[afterlist]['NEWLINE'] #63
    state1 = yaccparser.goto[afterlist]['simple_list_terminator'] #~10
    return state1, state2

def get_correction_rightparen_states():
    '''return the states reached from the initial state after
    pipeline_command, simple_list1 and (simple_list1 SEMICOLON simple_list1)'''
    state1 = yaccparser.goto[0]['pipeline_command']
    state2 = yaccparser.goto[0]['simple_list1'] #11
    state_temp = yaccparser.action[state2]['SEMICOLON'] #65
    state3 = yaccparser.goto[state_temp]['simple_list1']
    return state1, state2, state3

# the looked up states don't depend on the entries written below, so look
# them up once rather than per token.
# NOTE(review): the negative values look like reduce actions by production
# number; confirm against the yacc module's table format
states = get_correction_states()
for tt in tokenizer.tokentype:
    yaccparser.action[states[0]][tt.name] = -1
    yaccparser.action[states[1]][tt.name] = -141

states = get_correction_rightparen_states()
yaccparser.action[states[0]]['RIGHT_PAREN'] = -155
yaccparser.action[states[1]]['RIGHT_PAREN'] = -148
yaccparser.action[states[2]]['RIGHT_PAREN'] = -154

def parsesingle(s, strictmode=True, expansionlimit=None, convertpos=False):
    '''like parse, but only consumes a single top level node, e.g. parsing
    'a\\nb' will only return a node for 'a', leaving b unparsed'''
    p = _parser(s, strictmode=strictmode, expansionlimit=expansionlimit)
    tree = p.parse()
    if convertpos:
        ast.posconverter(s).visit(tree)
    return tree

def _realend(node):
    '''return the end position of node, taking into account a heredoc
    that might end after the node itself'''
    ef = _endfinder()
    ef.visit(node)
    return max(node.pos[1], ef.end)

def parse(s, strictmode=True, expansionlimit=None, convertpos=False):
    '''parse the input string, returning a list of nodes

    top level node kinds are:

    - command - a simple command
    - pipeline - a series of simple commands
    - list - a series of one or more pipelines
    - compound - contains constructs for { list; }, (list), if, for..

    leafs are word nodes (which in turn can also contain any of the
    aforementioned nodes due to command substitutions).

    when strictmode is set to False, we will:

    - skip reading a heredoc if we're at the end of the input

    expansionlimit is used to limit the amount of recursive parsing done due to
    command substitutions found during word expansion. it applies to every
    top level node in the input.

    when convertpos is set, ast.posconverter is run over every returned tree.
    '''
    p = _parser(s, strictmode=strictmode, expansionlimit=expansionlimit)
    parts = [p.parse()]

    # find the 'real' end incase we have a heredoc in there
    index = _realend(parts[-1]) + 1
    while index < len(s):
        # each remaining top level command gets its own parser over the rest
        # of the input; pass the same expansionlimit so the limit isn't
        # silently dropped after the first command
        part = _parser(s[index:], strictmode=strictmode,
                       expansionlimit=expansionlimit).parse()

        if not isinstance(part, ast.node):
            break

        # positions are relative to the slice, make them absolute
        ast.posshifter(index).visit(part)
        parts.append(part)
        index = _realend(parts[-1]) + 1

    if convertpos:
        for tree in parts:
            ast.posconverter(s).visit(tree)

    return parts

def split(s):
    '''a utility function that mimics shlex.split but handles more
    complex shell constructs such as command substitutions inside words

    >>> list(split('a b"c"\\'d\\''))
    ['a', 'bcd']
    >>> list(split('a "b $(c)" $(d) \\'$(e)\\''))
    ['a', 'b $(c)', '$(d)', '$(e)']
    >>> list(split('a b\\n'))
    ['a', 'b', '\\n']
    '''
    p = _parser(s)
    for t in p.tok:
        if t.ttype == tokenizer.tokentype.WORD:
            quoted = bool(t.flags & flags.word.QUOTED)
            doublequoted = quoted and t.value[0] == '"'
            parts, expandedword = subst._expandwordinternal(p, t, 0,
                                                            doublequoted, 0, 0)
            yield expandedword
        else:
            # anything that isn't a word is yielded verbatim from the input
            yield s[t.lexpos:t.endlexpos]

class _parser(object):
    '''
    this class is mainly used to provide context to the productions
    when we're in the middle of parsing. as a hack, we shove it into the
    YaccProduction context attribute to make it accessible.
    '''
    def __init__(self, s, strictmode=True, expansionlimit=None, tokenizerargs=None):
        '''s is the input to parse. tokenizerargs is an optional dict of
        extra keyword arguments for tokenizer.tokenizer; a 'parserstate'
        entry, if present, is used as this parser's state'''
        assert expansionlimit is None or isinstance(expansionlimit, int)

        self.s = s
        self._strictmode = strictmode
        self._expansionlimit = expansionlimit

        # work on a copy so popping parserstate doesn't modify the dict
        # that the caller handed us
        tokenizerargs = dict(tokenizerargs) if tokenizerargs is not None else {}
        self.parserstate = tokenizerargs.pop('parserstate', state.parserstate())

        self.tok = tokenizer.tokenizer(s,
                                       parserstate=self.parserstate,
                                       strictmode=strictmode,
                                       **tokenizerargs)

        self.redirstack = self.tok.redirstack

    def parse(self):
        '''run the grammar over the tokenizer and return what the parser
        returns for a single input unit'''
        # yacc.yacc returns a parser object that is not reentrant, it has
        # some mutable state. we make a shallow copy of it so no
        # state spills over to the next call to parse on it
        theparser = copy.copy(yaccparser)
        tree = theparser.parse(lexer=self.tok, context=self)

        return tree

class _endfinder(ast.nodevisitor):
    '''helper class to find the "real" end pos of a node that contains
    a heredoc. this is a hack because heredoc aren't really part of any node
    since they don't always follow the end of a node and might appear on
    a different line'''
    def __init__(self):
        # stays -1 when no heredoc is visited
        self.end = -1
    def visitheredoc(self, node, value):
        self.end = node.pos[1]