"""This module implements an Earley parser.
 | 
						|
 | 
						|
The core Earley algorithm used here is based on Elizabeth Scott's implementation, here:
 | 
						|
    https://www.sciencedirect.com/science/article/pii/S1571066108001497
 | 
						|
 | 
						|
That is probably the best reference for understanding the algorithm here.
 | 
						|
 | 
						|
The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format
 | 
						|
is explained here: https://lark-parser.readthedocs.io/en/latest/_static/sppf/sppf.html
 | 
						|
"""
 | 
						|
 | 
						|
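
# How this module is typically reached (a hedged sketch; the grammar below is
# hypothetical, and the Parser defined here is normally constructed for you by
# lark.Lark, which wires up the LexerConf/ParserConf and term_matcher):
#
#     from lark import Lark
#
#     parser = Lark('start: "a"+', parser='earley')
#     tree = parser.parse("aaa")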
from typing import TYPE_CHECKING, Callable, Optional, List, Any
from collections import deque

from ..lexer import Token
from ..tree import Tree
from ..exceptions import UnexpectedEOF, UnexpectedToken
from ..utils import logger, OrderedSet, dedup_list
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal
from .earley_common import Item
from .earley_forest import ForestSumVisitor, SymbolNode, StableSymbolNode, TokenNode, ForestToParseTree

if TYPE_CHECKING:
    from ..common import LexerConf, ParserConf

class Parser:
    lexer_conf: 'LexerConf'
    parser_conf: 'ParserConf'
    debug: bool

    def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matcher: Callable,
                 resolve_ambiguity: bool=True, debug: bool=False,
                 tree_class: Optional[Callable[[str, List], Any]]=Tree, ordered_sets: bool=True):
        analysis = GrammarAnalyzer(parser_conf)
        self.lexer_conf = lexer_conf
        self.parser_conf = parser_conf
        self.resolve_ambiguity = resolve_ambiguity
        self.debug = debug
        self.Tree = tree_class
        self.Set = OrderedSet if ordered_sets else set
        self.SymbolNode = StableSymbolNode if ordered_sets else SymbolNode

        self.FIRST = analysis.FIRST
        self.NULLABLE = analysis.NULLABLE
        self.callbacks = parser_conf.callbacks
        # TODO add typing info
        self.predictions = {}   # type: ignore[var-annotated]

        ## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than
        #  the slow 'isupper' in is_terminal.
        self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term }
        self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term }

        self.forest_sum_visitor = None
        for rule in parser_conf.rules:
            if rule.origin not in self.predictions:
                self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]

            ## Detect if any rules/terminals have priorities set. If the user specified priority = None, then
            #  the priorities will be stripped from all rules/terminals before they reach us, allowing us to
            #  skip the extra tree walk. We'll also skip this if the user just didn't specify priorities
            #  on any rules/terminals.
            if self.forest_sum_visitor is None and rule.options.priority is not None:
                self.forest_sum_visitor = ForestSumVisitor

        # Check terminals for priorities
        # Ignore terminal priorities if the basic lexer is used
        if self.lexer_conf.lexer_type != 'basic' and self.forest_sum_visitor is None:
            for term in self.lexer_conf.terminals:
                if term.priority:
                    self.forest_sum_visitor = ForestSumVisitor
                    break
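
        # Illustration (hedged; a hypothetical grammar, not part of this module):
        # a rule priority such as 'pair.2: "a" "b"' or a terminal priority such
        # as 'WORD.3: /\w+/' is what sets forest_sum_visitor above, enabling
        # the extra priority-summing walk over the SPPF.
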
        self.term_matcher = term_matcher

    def predict_and_complete(self, i, to_scan, columns, transitives, node_cache):
        """The core Earley Predictor and Completer.

        At each stage of the input, we handle any completed items (things
        that matched on the last cycle) and use those to predict what should
        come next in the input stream. The completions and any predicted
        non-terminals are recursively processed until we reach a set of items
        that can be added to the scan list for the next scanner cycle."""
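
        # Item lifecycle, sketched on a hypothetical rule 'sum -> sum PLUS num'
        # (Item fields are (rule, ptr, start); advance() bumps ptr):
        #
        #     Item(rule, 0, 7)   # predicted:   sum -> . sum PLUS num
        #     Item(rule, 1, 7)   # completer:   sum -> sum . PLUS num   (PLUS is a terminal -> to_scan)
        #     Item(rule, 2, 7)   # scanner:     sum -> sum PLUS . num
        #     Item(rule, 3, 7)   # is_complete: triggers the completer below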
        # Held Completions (H in E. Scott's paper).
        held_completions = {}

        column = columns[i]
        # R (items) = Ei (column.items)
        items = deque(column)
        while items:
            item = items.pop()    # remove an element, A say, from R

            ### The Earley completer
            if item.is_complete:   ### (item.s == string)
                if item.node is None:
                    label = (item.s, item.start, i)
                    item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                    item.node.add_family(item.s, item.rule, item.start, None, None)

                # create_leo_transitives(item.rule.origin, item.start)

                ### The Joop Leo right-recursion completer
                if item.rule.origin in transitives[item.start]:
                    transitive = transitives[item.start][item.s]
                    if transitive.previous in transitives[transitive.column]:
                        root_transitive = transitives[transitive.column][transitive.previous]
                    else:
                        root_transitive = transitive

                    new_item = Item(transitive.rule, transitive.ptr, transitive.start)
                    label = (root_transitive.s, root_transitive.start, i)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                    new_item.node.add_path(root_transitive, item.node)
                    if new_item.expect in self.TERMINALS:
                        # Add (B ::= aC.B, h, y) to Q
                        to_scan.add(new_item)
                    elif new_item not in column:
                        # Add (B ::= aC.B, h, y) to Ei and R
                        column.add(new_item)
                        items.append(new_item)
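
                # Why the Leo step matters, sketched on a hypothetical
                # right-recursive rule 'list -> item list': without the links
                # stored in 'transitives', each completed inner 'list' would
                # re-run the completer across every originating column, making
                # right recursion quadratic. The links collapse that chain into
                # the single add_path() call above.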
                ### The regular Earley completer
                else:
                    # Empty has 0 length. If we complete an empty symbol in a particular
                    # parse step, we need to be able to use that same empty symbol to complete
                    # any predictions that result, that themselves require empty. Avoids
                    # infinite recursion on empty symbols.
                    # held_completions is 'H' in E. Scott's paper.
                    is_empty_item = item.start == i
                    if is_empty_item:
                        held_completions[item.rule.origin] = item.node
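
                    # Intuition (hedged; a hypothetical grammar):
                    #
                    #     start: a a
                    #     a:              // matches the empty string
                    #
                    # Completing the empty 'a' at column i immediately creates
                    # the prediction 'start: a . a', which needs that same
                    # empty completion again in the same column; caching it in
                    # held_completions avoids re-deriving it (and recursing).
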
                    originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
                    for originator in originators:
                        new_item = originator.advance()
                        label = (new_item.s, originator.start, i)
                        new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                        new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
                        if new_item.expect in self.TERMINALS:
                            # Add (B ::= aC.B, h, y) to Q
                            to_scan.add(new_item)
                        elif new_item not in column:
                            # Add (B ::= aC.B, h, y) to Ei and R
                            column.add(new_item)
                            items.append(new_item)

            ### The Earley predictor
            elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
                new_items = []
                for rule in self.predictions[item.expect]:
                    new_item = Item(rule, 0, i)
                    new_items.append(new_item)

                # Process any held completions (H).
                if item.expect in held_completions:
                    new_item = item.advance()
                    label = (new_item.s, item.start, i)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                    new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
                    new_items.append(new_item)

                for new_item in new_items:
                    if new_item.expect in self.TERMINALS:
                        to_scan.add(new_item)
                    elif new_item not in column:
                        column.add(new_item)
                        items.append(new_item)

    def _parse(self, lexer, columns, to_scan, start_symbol=None):

        def is_quasi_complete(item):
            if item.is_complete:
                return True

            quasi = item.advance()
            while not quasi.is_complete:
                if quasi.expect not in self.NULLABLE:
                    return False
                if quasi.rule.origin == start_symbol and quasi.expect == start_symbol:
                    return False
                quasi = quasi.advance()
            return True
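
        # A quasi-complete item, sketched on a hypothetical rule whose tail is
        # nullable: for 'x -> A . b c' with both 'b' and 'c' in NULLABLE, the
        # item can finish without consuming more input, so it can be treated
        # as complete when building Leo transitive items (see the removed
        # helper below).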
        # def create_leo_transitives(origin, start):
        #   ...   # removed at commit 4c1cfb2faf24e8f8bff7112627a00b94d261b420

        def scan(i, token, to_scan):
            """The core Earley Scanner.

            This is a custom implementation of the scanner that uses the
            Lark lexer to match tokens. The scan list is built by the
            Earley predictor, based on the previously completed tokens.
            This ensures that at each phase of the parse we have a custom
            lexer context, allowing for more complex ambiguities."""
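
            # match(expect, token) is the term_matcher callable supplied to
            # __init__. A hedged sketch of the simplest possible matcher (not
            # the actual implementation, which Lark provides):
            #
            #     def match(term, token):
            #         return term.name == token.type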
            next_to_scan = self.Set()
            next_set = self.Set()
            columns.append(next_set)
            transitives.append({})
            node_cache = {}

            for item in self.Set(to_scan):
                if match(item.expect, token):
                    new_item = item.advance()
                    label = (new_item.s, new_item.start, i + 1)
                    # 'terminals' may not contain token.type when using %declare
                    # Additionally, token is not always a Token
                    # For example, it can be a Tree when using TreeMatcher
                    term = terminals.get(token.type) if isinstance(token, Token) else None
                    # Set the priority of the token node to 0 so that the
                    # terminal priorities do not affect the Tree chosen by
                    # ForestSumVisitor after the basic lexer has already
                    # "used up" the terminal priorities
                    token_node = TokenNode(token, term, priority=0)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                    new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token_node)

                    if new_item.expect in self.TERMINALS:
                        # add (B ::= Aai+1.B, h, y) to Q'
                        next_to_scan.add(new_item)
                    else:
                        # add (B ::= Aai+1.B, h, y) to Ei+1
                        next_set.add(new_item)

            if not next_set and not next_to_scan:
                expect = {i.expect.name for i in to_scan}
                raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.s for i in to_scan))

            return next_to_scan, node_cache


        # Define parser functions
        match = self.term_matcher

        terminals = self.lexer_conf.terminals_by_name

        # Transitive items for Joop Leo's right-recursion optimization,
        # one dict per parse step (column).
        transitives = [{}]

        ## The main Earley loop.
        # Run the Prediction/Completion cycle for any Items in the current Earley set.
        # Completions will be added to the SPPF tree, and predictions will be recursively
        # processed down to terminals/empty nodes to be added to the scanner for the next
        # step.
        expects = {i.expect for i in to_scan}
        i = 0
        node_cache = {}
        for token in lexer.lex(expects):
            self.predict_and_complete(i, to_scan, columns, transitives, node_cache)

            to_scan, node_cache = scan(i, token, to_scan)
            i += 1

            expects.clear()
            expects |= {i.expect for i in to_scan}

        self.predict_and_complete(i, to_scan, columns, transitives, node_cache)

        ## Column is now the final column in the parse.
        assert i == len(columns)-1
        return to_scan

    def parse(self, lexer, start):
        assert start, start
        start_symbol = NonTerminal(start)

        columns = [self.Set()]
        to_scan = self.Set()     # The scan buffer. 'Q' in E. Scott's paper.

        ## Predict for the start_symbol.
        # Add predicted items to the first Earley set (for the predictor) if they
        # result in a non-terminal, or the scanner if they result in a terminal.
        for rule in self.predictions[start_symbol]:
            item = Item(rule, 0, 0)
            if item.expect in self.TERMINALS:
                to_scan.add(item)
            else:
                columns[0].add(item)

        to_scan = self._parse(lexer, columns, to_scan, start_symbol)

        # If the parse was successful, the start symbol should have been completed
        # in the last step of the Earley cycle, and will be in this column. Find
        # the item for the start_symbol, which is the root of the SPPF tree.
        solutions = dedup_list(n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0)
        if not solutions:
            expected_terminals = [t.expect.name for t in to_scan]
            raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan))
        if len(solutions) > 1:
            raise RuntimeError('Earley should not generate multiple start symbol items! Please report this bug.')
        solution, = solutions

        if self.debug:
            from .earley_forest import ForestToPyDotVisitor
            try:
                debug_walker = ForestToPyDotVisitor()
            except ImportError:
                logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image")
            else:
                debug_walker.visit(solution, "sppf.png")

        if self.Tree is not None:
            # Perform our SPPF -> AST conversion
            # Disable the ForestToParseTree cache when ambiguity='resolve'
            # to prevent a tree construction bug. See issue #1283
            use_cache = not self.resolve_ambiguity
            transformer = ForestToParseTree(self.Tree, self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor(), self.resolve_ambiguity, use_cache)
            return transformer.transform(solution)
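
        # The two return modes, derived from the branches above (tree_class is
        # the __init__ parameter backing self.Tree):
        #
        #     Parser(..., tree_class=Tree).parse(...)   # -> lark.Tree (AST)
        #     Parser(..., tree_class=None).parse(...)   # -> SPPF root (SymbolNode)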
        # return the root of the SPPF
        return solution