You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
	
	
		
			92 lines
		
	
	
		
			3.0 KiB
		
	
	
	
		
			Python
		
	
			
		
		
	
	
			92 lines
		
	
	
		
			3.0 KiB
		
	
	
	
		
			Python
		
	
"""
    pygments.regexopt
    ~~~~~~~~~~~~~~~~~

    An algorithm that generates optimized regexes for matching long lists of
    literal strings.

    :copyright: Copyright 2006-2025 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""
 | 
						|
 | 
						|
import re
 | 
						|
from re import escape
 | 
						|
from os.path import commonprefix
 | 
						|
from itertools import groupby
 | 
						|
from operator import itemgetter
 | 
						|
 | 
						|
# Matches each character that has a special meaning inside a regex character
# class: '[', '^', backslash, '-' and ']'.  make_charset() uses it to
# backslash-escape those characters.
CS_ESCAPE = re.compile(r'[\[\^\\\-\]]')
# Callable returning item 0 of whatever sequence it is given.
FIRST_ELEMENT = itemgetter(0)
 | 
						|
 | 
						|
 | 
						|
def make_charset(letters):
    """Return a regex character class matching any one of *letters*.

    *letters* is an iterable of strings.  They are concatenated, and every
    character matched by ``CS_ESCAPE`` (the ones that are special inside a
    character class) is prefixed with a backslash before the result is
    wrapped in square brackets.
    """
    escaped = CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters))
    return '[' + escaped + ']'
 | 
						|
 | 
						|
 | 
						|
def regex_opt_inner(strings, open_paren):
    """Return a regex that matches any string in the sorted list of strings.

    *strings* is a sorted list of literal strings; they are regex-escaped
    here.  *open_paren* is the group opener to wrap the result in (for
    example ``'('`` or ``'(?:'``), or ``''`` for no group; a closing ``')'``
    is appended only when *open_paren* is non-empty.

    Returns the pattern as a string; an empty list yields ``''``.
    """
    close_paren = ')' if open_paren else ''
    if not strings:
        return ''
    first = strings[0]
    if len(strings) == 1:
        return open_paren + escape(first) + close_paren
    if not first:
        # The first string is empty, so the remaining alternatives become
        # one optional non-capturing group.
        return (open_paren + regex_opt_inner(strings[1:], '(?:')
                + '?' + close_paren)
    if len(first) == 1:
        # Multiple one-char strings?  Make a charset.
        oneletter = [s for s in strings if len(s) == 1]
        if len(oneletter) > 1:
            rest = [s for s in strings if len(s) != 1]
            if rest:
                # The longer strings go first in the alternation, followed
                # by the charset for the single characters.
                return (open_paren + regex_opt_inner(rest, '') + '|'
                        + make_charset(oneletter) + close_paren)
            return open_paren + make_charset(oneletter) + close_paren
    prefix = commonprefix(strings)
    if prefix:
        # All strings share a prefix: emit it once and recurse on the tails.
        plen = len(prefix)
        return (open_paren + escape(prefix)
                + regex_opt_inner([s[plen:] for s in strings], '(?:')
                + close_paren)
    # Is there a common suffix?  Found as the common prefix of the reversed
    # strings.
    suffix = commonprefix([s[::-1] for s in strings])
    if suffix:
        slen = len(suffix)
        # Stripping the suffix can change the ordering, so re-sort before
        # recursing.
        return (open_paren
                + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:')
                + escape(suffix[::-1]) + close_paren)
    # Last resort: split off the run of strings that start with the same
    # character as the first string and recurse on each part separately.
    return (open_paren
            + '|'.join(regex_opt_inner(list(group), '')
                       for _, group in groupby(strings,
                                               lambda s: s[0] == first[0]))
            + close_paren)
 | 
						|
 | 
						|
 | 
						|
def regex_opt(strings, prefix='', suffix=''):
    """Return a regex pattern string that matches any string in the given list.

    The strings to match must be literal strings, not regexes.  They will be
    regex-escaped.  The input does not need to be sorted; a sorted copy is
    made here.

    *prefix* and *suffix* are pre- and appended to the final regex as-is.

    The result is a plain string, not a compiled pattern object.  The
    alternatives are wrapped in a ``(...)`` group unless the list is empty.
    """
    strings = sorted(strings)
    return prefix + regex_opt_inner(strings, '(') + suffix
 |