import re

from urllib.parse import quote

from bleach import callbacks as linkify_callbacks
from bleach import html5lib_shim


#: List of default callbacks
DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]


TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
       xn xxx ye yt yu za zm zw""".split()

# Make sure that .com doesn't get matched by .co first
TLDS.reverse()


def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
    """Builds the url regex used by linkifier

    If you want a different set of tlds or allowed protocols, pass those in
    and stomp on the existing ``url_re``::

        from bleach import linkifier

        my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)

        linker = LinkifyFilter(url_re=my_url_re)

    """
    return re.compile(
        r"""\(*  # Match any opening parentheses.
        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
        (?:[/?][^\s\{{\}}\|\\\^`<>"]*)?
            # /path/zz (excluding "unsafe" chars from RFC 3986,
            # except for # and ~, which happen in practice)
        """.format(
            "|".join(sorted(protocols)), "|".join(sorted(tlds))
        ),
        re.IGNORECASE | re.VERBOSE | re.UNICODE,
    )


URL_RE = build_url_re()


PROTO_RE = re.compile(r"^[\w-]+:/{0,3}", re.IGNORECASE)
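
# Illustrative sketch (not from the original module): a stricter url regex can
# be built with build_url_re() and handed to Linker/LinkifyFilter, e.g.::
#
#     https_url_re = build_url_re(tlds=["com", "org"], protocols=["https"])
#     linker = Linker(url_re=https_url_re)
#
# PROTO_RE is only used to check whether a matched url already carries a
# protocol; if it doesn't, handle_links() prefixes "http://".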


def build_email_re(tlds=TLDS):
    """Builds the email regex used by linkifier

    If you want a different set of tlds, pass those in and stomp on the existing ``email_re``::

        from bleach import linkifier

        my_email_re = linkifier.build_email_re(my_tlds_list)

        linker = LinkifyFilter(email_re=my_email_re)

    """
    # open and closing braces doubled below for format string
    return re.compile(
        r"""(?<!//)
        (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+
            (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)*  # dot-atom
        |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
            |\\[\001-\011\013\014\016-\177])*"  # quoted-string
        )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0}))  # domain
        """.format(
            "|".join(tlds)
        ),
        re.IGNORECASE | re.MULTILINE | re.VERBOSE,
    )


EMAIL_RE = build_email_re()
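
# Illustrative sketch (not from the original module): email linkification is
# opt-in, so a custom email regex only matters when parse_email is enabled::
#
#     my_email_re = build_email_re(tlds=["com", "net"])
#     linker = Linker(parse_email=True, email_re=my_email_re)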


class Linker:
    """Convert URL-like strings in an HTML fragment to links

    This class converts strings that look like URLs, domain names and email
    addresses in text that may be an HTML fragment to links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify takes a best-effort approach and tries to recover from bad
    situations due to crazy text.

    """

    def __init__(
        self,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
        recognized_tags=html5lib_shim.HTML_TAGS,
    ):
        """Creates a Linker instance

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg set skip_tags: set of tags that you don't want to linkify the
            contents of; for example, you could set this to ``{'pre'}`` to skip
            linkifying contents of ``pre`` tags; ``None`` means you don't
            want linkify to skip any tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg url_re: url matching regex

        :arg email_re: email matching regex

        :arg set recognized_tags: the set of tags that linkify knows about;
            everything else gets escaped

        :returns: linkified text as unicode

        """
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        # Create a parser/tokenizer that allows all HTML tags and escapes
        # anything not in that list.
        self.parser = html5lib_shim.BleachHTMLParser(
            tags=frozenset(recognized_tags),
            strip=False,
            consume_entities=False,
            namespaceHTMLElements=False,
        )
        self.walker = html5lib_shim.getTreeWalker("etree")
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values="always",
            omit_optional_tags=False,
            # We want to leave entities as they are without escaping or
            # resolving or expanding
            resolve_entities=False,
            # linkify does not sanitize
            sanitize=False,
            # linkify preserves attr order
            alphabetical_attributes=False,
        )

    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, str):
            raise TypeError("argument must be of text type")

        if not text:
            return ""

        dom = self.parser.parseFragment(text)
        filtered = LinkifyFilter(
            source=self.walker(dom),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(filtered)
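

# Illustrative usage sketch (not from the original module)::
#
#     linker = Linker(parse_email=True)
#     html = linker.linkify("ping jim@example.com or see example.com")
#
# URLs and email addresses found in character data become ``<a>`` tags whose
# attributes have been run through the configured callbacks.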


class LinkifyFilter(html5lib_shim.Filter):
    """html5lib filter that linkifies text

    This will do the following:

    * convert email addresses into links
    * convert urls into links
    * edit existing links by running them through callbacks--the default is to
      add a ``rel="nofollow"``

    This filter can be used anywhere html5lib filters can be used.

    """

    def __init__(
        self,
        source,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
    ):
        """Creates a LinkifyFilter instance

        :arg source: stream as an html5lib TreeWalker

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg set skip_tags: set of tags that you don't want to linkify the
            contents of; for example, you could set this to ``{'pre'}`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg url_re: url matching regex

        :arg email_re: email matching regex

        """
        super().__init__(source)

        self.callbacks = callbacks or []
        self.skip_tags = skip_tags or {}
        self.parse_email = parse_email

        self.url_re = url_re
        self.email_re = email_re

    def apply_callbacks(self, attrs, is_new):
        """Given an attrs dict and an is_new bool, runs through callbacks

        Callbacks can return an adjusted attrs dict or ``None``. In the case of
        ``None``, we stop going through callbacks and return that and the link
        gets dropped.

        :arg dict attrs: map of ``(namespace, name)`` -> ``value``

        :arg bool is_new: whether or not this link was added by linkify

        :returns: adjusted attrs dict or ``None``

        """
        for cb in self.callbacks:
            attrs = cb(attrs, is_new)
            if attrs is None:
                return None
        return attrs
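
    # Illustrative sketch (not from the original module): a callback gets the
    # attrs dict and is_new, and returns an adjusted dict (or None to drop the
    # link)::
    #
    #     def force_https(attrs, is_new):
    #         href = attrs.get((None, "href"), "")
    #         if href.startswith("http://"):
    #             attrs[(None, "href")] = "https://" + href[len("http://"):]
    #         return attrs
    #
    #     Linker(callbacks=[force_https, linkify_callbacks.nofollow])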

    def extract_character_data(self, token_list):
        """Extracts and squashes character sequences in a token stream"""
        # FIXME(willkg): This is a terrible idea. What it does is drop all the
        # tags from the token list and merge the Characters and SpaceCharacters
        # tokens into a single text.
        #
        # So something like this::
        #
        #     "<span>" "<b>" "some text" "</b>" "</span>"
        #
        # gets converted to "some text".
        #
        # This gets used to figure out the ``_text`` fauxttribute value for
        # linkify callables.
        #
        # I'm not really sure how else to support that ``_text`` fauxttribute and
        # maintain some modicum of backwards compatibility with previous versions
        # of Bleach.

        out = []
        for token in token_list:
            token_type = token["type"]
            if token_type in ["Characters", "SpaceCharacters"]:
                out.append(token["data"])

        return "".join(out)

    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {"type": "Characters", "data": text[end : match.start()]}
                        )

                    # URL-encode the "local-part" according to RFC6068
                    parts = match.group(0).split("@")
                    parts[0] = quote(parts[0])
                    address = "@".join(parts)

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, "href"): "mailto:%s" % address,
                        "_text": match.group(0),
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append(
                            {"type": "Characters", "data": match.group(0)}
                        )

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop("_text", "")
                        new_tokens.extend(
                            [
                                {"type": "StartTag", "name": "a", "data": attrs},
                                {"type": "Characters", "data": str(_text)},
                                {"type": "EndTag", "name": "a"},
                            ]
                        )
                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({"type": "Characters", "data": text[end:]})

                    yield from new_tokens

                    continue

            yield token
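
    # Illustrative sketch (not from the original module): with parse_email
    # enabled, a Characters token containing "mail jim@example.com" is split
    # into roughly::
    #
    #     {"type": "Characters", "data": "mail "}
    #     {"type": "StartTag", "name": "a",
    #      "data": {(None, "href"): "mailto:jim@example.com", ...}}
    #     {"type": "Characters", "data": "jim@example.com"}
    #     {"type": "EndTag", "name": "a"}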

    def strip_non_url_bits(self, fragment):
        """Strips non-url bits from the url

        This accounts for over-eager matching by the regex.

        """
        prefix = suffix = ""

        while fragment:
            # Try removing ( from the beginning and, if it's balanced, from the
            # end, too
            if fragment.startswith("("):
                prefix = prefix + "("
                fragment = fragment[1:]

                if fragment.endswith(")"):
                    suffix = ")" + suffix
                    fragment = fragment[:-1]
                continue

            # Now try extraneous things from the end. For example, sometimes we
            # pick up ) at the end of a url, but the url is in a parenthesized
            # phrase like:
            #
            #     "i looked at the site (at http://example.com)"

            if fragment.endswith(")") and "(" not in fragment:
                fragment = fragment[:-1]
                suffix = ")" + suffix
                continue

            # Handle commas
            if fragment.endswith(","):
                fragment = fragment[:-1]
                suffix = "," + suffix
                continue

            # Handle periods
            if fragment.endswith("."):
                fragment = fragment[:-1]
                suffix = "." + suffix
                continue

            # Nothing matched, so we're done
            break

        return fragment, prefix, suffix
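
    # Illustrative examples (not from the original module)::
    #
    #     strip_non_url_bits("(http://example.com)")  -> ("http://example.com", "(", ")")
    #     strip_non_url_bits("http://example.com),")  -> ("http://example.com", "", "),")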

    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        in_a = False  # happens if parse_email=True and an email address was found
        for token in src_iter:
            if in_a:
                if token["type"] == "EndTag" and token["name"] == "a":
                    in_a = False
                yield token
                continue
            elif token["type"] == "StartTag" and token["name"] == "a":
                in_a = True
                yield token
                continue
            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {"type": "Characters", "data": text[end : match.start()]}
                        )

                    url = match.group(0)
                    prefix = suffix = ""

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = "http://%s" % url

                    attrs = {(None, "href"): href, "_text": url}
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {"type": "Characters", "data": prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append({"type": "Characters", "data": prefix})

                        _text = attrs.pop("_text", "")
                        new_tokens.extend(
                            [
                                {"type": "StartTag", "name": "a", "data": attrs},
                                {"type": "Characters", "data": str(_text)},
                                {"type": "EndTag", "name": "a"},
                            ]
                        )

                        if suffix:
                            new_tokens.append({"type": "Characters", "data": suffix})

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({"type": "Characters", "data": text[end:]})

                    yield from new_tokens

                    continue

            yield token

    def handle_a_tag(self, token_buffer):
        """Handle the "a" tag

        This could adjust the link or drop it altogether depending on what the
        callbacks return.

        This yields the new set of tokens.

        """
        a_token = token_buffer[0]
        if a_token["data"]:
            attrs = a_token["data"]
        else:
            attrs = {}
        text = self.extract_character_data(token_buffer)
        attrs["_text"] = text

        attrs = self.apply_callbacks(attrs, False)

        if attrs is None:
            # We're dropping the "a" tag and everything else and replacing
            # it with character data. So emit that token.
            yield {"type": "Characters", "data": text}

        else:
            new_text = attrs.pop("_text", "")
            a_token["data"] = attrs

            if text == new_text:
                # The callbacks didn't change the text, so we yield the new "a"
                # token, then whatever else was there, then the end "a" token
                yield a_token
                yield from token_buffer[1:]

            else:
                # If the callbacks changed the text, then we're going to drop
                # all the tokens between the start and end "a" tags and replace
                # it with the new text
                yield a_token
                yield {"type": "Characters", "data": str(new_text)}
                yield token_buffer[-1]

    def extract_entities(self, token):
        """Handles Characters tokens with entities

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: generator of tokens

        """
        data = token.get("data", "")

        # If there isn't a & in the data, we can return now
        if "&" not in data:
            yield token
            return

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith("&"):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    if entity == "amp":
                        # LinkifyFilter can't match urls across token boundaries
                        # which is problematic with &amp; since that shows up in
                        # querystrings all the time. This special-cases &amp;
                        # and converts it to a & and sticks it in as a
                        # Characters token. It'll get merged with surrounding
                        # tokens in the BleachSanitizerFilter.__iter__ and
                        # escaped in the serializer.
                        new_tokens.append({"type": "Characters", "data": "&"})
                    else:
                        new_tokens.append({"type": "Entity", "name": entity})

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_tokens.append({"type": "Characters", "data": remainder})
                    continue

            new_tokens.append({"type": "Characters", "data": part})

        yield from new_tokens
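
    # Illustrative sketch (not from the original module): a Characters token
    # with data "a &lt; b &amp; c" comes back as roughly::
    #
    #     {"type": "Characters", "data": "a "}
    #     {"type": "Entity", "name": "lt"}
    #     {"type": "Characters", "data": " b "}
    #     {"type": "Characters", "data": "&"}
    #     {"type": "Characters", "data": " c"}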

    def __iter__(self):
        in_a = False
        in_skip_tag = None

        token_buffer = []

        for token in super().__iter__():
            if in_a:
                # Handle the case where we're in an "a" tag--we want to buffer tokens
                # until we hit an end "a" tag.
                if token["type"] == "EndTag" and token["name"] == "a":
                    # Add the end tag to the token buffer and then handle them
                    # and yield anything returned
                    token_buffer.append(token)
                    yield from self.handle_a_tag(token_buffer)

                    # Clear "a" related state and continue since we've yielded all
                    # the tokens we're going to yield
                    in_a = False
                    token_buffer = []
                else:
                    token_buffer.extend(list(self.extract_entities(token)))
                continue

            if token["type"] in ["StartTag", "EmptyTag"]:
                if token["name"] in self.skip_tags:
                    # Skip tags start a "special mode" where we don't linkify
                    # anything until the end tag.
                    in_skip_tag = token["name"]

                elif token["name"] == "a":
                    # The "a" tag is special--we switch to a slurp mode and
                    # slurp all the tokens until the end "a" tag and then
                    # figure out what to do with them there.
                    in_a = True
                    token_buffer.append(token)

                    # We buffer the start tag, so we don't want to yield it,
                    # yet
                    continue

            elif in_skip_tag and self.skip_tags:
                # NOTE(willkg): We put this clause here since in_a and
                # switching in and out of in_a takes precedence.
                if token["type"] == "EndTag" and token["name"] == in_skip_tag:
                    in_skip_tag = None

            elif not in_a and not in_skip_tag and token["type"] == "Characters":
                new_stream = iter([token])
                if self.parse_email:
                    new_stream = self.handle_email_addresses(new_stream)

                new_stream = self.handle_links(new_stream)

                for new_token in new_stream:
                    yield from self.extract_entities(new_token)

                # We've already yielded this token, so continue
                continue

            yield token
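

# Minimal demo (editor's sketch, not part of the original module); assumes the
# bleach package is importable so the html5lib_shim and callbacks imports above
# resolve.
if __name__ == "__main__":
    linker = Linker(parse_email=True)
    print(linker.linkify("see http://example.com or mail jim@example.com"))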