from itertools import chain
import re
import warnings

from xml.sax.saxutils import unescape

from bleach import html5lib_shim
from bleach import parse_shim


#: Set of allowed tags
ALLOWED_TAGS = frozenset(
    (
        "a",
        "abbr",
        "acronym",
        "b",
        "blockquote",
        "code",
        "em",
        "i",
        "li",
        "ol",
        "strong",
        "ul",
    )
)


#: Map of allowed attributes by tag
ALLOWED_ATTRIBUTES = {
    "a": ["href", "title"],
    "abbr": ["title"],
    "acronym": ["title"],
}

#: List of allowed protocols
ALLOWED_PROTOCOLS = frozenset(("http", "https", "mailto"))

#: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
INVISIBLE_CHARACTERS = "".join(
    [chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))]
)

#: Regexp for characters that are invisible
INVISIBLE_CHARACTERS_RE = re.compile("[" + INVISIBLE_CHARACTERS + "]", re.UNICODE)

#: String to replace invisible characters with. This can be a character, a
#: string, or even a function that takes a Python re matchobj
INVISIBLE_REPLACEMENT_CHAR = "?"
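

# For example, the invisible-character scrubbing behaves like this (a quick
# REPL sketch, not part of the module):
#
#     >>> INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, "a\x00b\x07c")
#     'a?b?c'

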
class NoCssSanitizerWarning(UserWarning):
    pass


class Cleaner:
    """Cleaner for cleaning HTML fragments of malicious content

    This cleaner is a security-focused function whose sole purpose is to remove
    malicious content from a string such that it can be displayed as content in
    a web page.

    To use::

        from bleach.sanitizer import Cleaner

        cleaner = Cleaner()

        for text in all_the_yucky_things:
            sanitized = cleaner.clean(text)

    .. Note::

       This cleaner is not designed for transforming content to be used in
       non-web-page contexts.

    .. Warning::

       This cleaner is not thread-safe--the html parser has internal state.
       Create a separate cleaner per thread!

    """

    def __init__(
        self,
        tags=ALLOWED_TAGS,
        attributes=ALLOWED_ATTRIBUTES,
        protocols=ALLOWED_PROTOCOLS,
        strip=False,
        strip_comments=True,
        filters=None,
        css_sanitizer=None,
    ):
        """Initializes a Cleaner

        :arg set tags: set of allowed tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed content through

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::

               Using filters changes the output of ``bleach.Cleaner.clean``.
               Make sure the way the filters change the output is secure.

        :arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
            sanitizing style attribute values and style text; defaults to None

        """
        self.tags = tags
        self.attributes = attributes
        self.protocols = protocols
        self.strip = strip
        self.strip_comments = strip_comments
        self.filters = filters or []
        self.css_sanitizer = css_sanitizer

        self.parser = html5lib_shim.BleachHTMLParser(
            tags=self.tags,
            strip=self.strip,
            consume_entities=False,
            namespaceHTMLElements=False,
        )
        self.walker = html5lib_shim.getTreeWalker("etree")
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values="always",
            omit_optional_tags=False,
            escape_lt_in_attrs=True,
            # We want to leave entities as they are without escaping or
            # resolving or expanding
            resolve_entities=False,
            # Bleach has its own sanitizer, so don't use the html5lib one
            sanitize=False,
            # clean preserves attr order
            alphabetical_attributes=False,
        )

        if css_sanitizer is None:
            # FIXME(willkg): this doesn't handle when attributes or an
            # attributes value is a callable
            attributes_values = []
            if isinstance(attributes, list):
                attributes_values = attributes

            elif isinstance(attributes, dict):
                attributes_values = []
                for values in attributes.values():
                    if isinstance(values, (list, tuple)):
                        attributes_values.extend(values)

            if "style" in attributes_values:
                warnings.warn(
                    "'style' attribute specified, but css_sanitizer not set.",
                    category=NoCssSanitizerWarning,
                )

    def clean(self, text):
        """Cleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, str):
            message = (
                f"argument cannot be of {text.__class__.__name__!r} type, "
                + "must be of text type"
            )
            raise TypeError(message)

        if not text:
            return ""

        dom = self.parser.parseFragment(text)
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),
            allowed_tags=self.tags,
            attributes=self.attributes,
            strip_disallowed_tags=self.strip,
            strip_html_comments=self.strip_comments,
            css_sanitizer=self.css_sanitizer,
            allowed_protocols=self.protocols,
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            filtered = filter_class(source=filtered)

        return self.serializer.render(filtered)
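

# A minimal usage sketch of Cleaner with non-default arguments (the tag set
# and input are illustrative, not from the module):
#
#     >>> cleaner = Cleaner(tags={"b", "i"}, strip=True)
#     >>> cleaner.clean('<b onclick="evil()">bold</b><script>bad()</script>')
#     '<b>bold</b>bad()'
#
# The disallowed ``onclick`` attribute is dropped, and with ``strip=True``
# the disallowed ``script`` tags are removed rather than escaped.

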
def attribute_filter_factory(attributes):
    """Generates attribute filter function for the given attributes value

    The attributes value can take one of several shapes. This returns a filter
    function appropriate to the attributes value. One nice thing about this is
    that there's less if/then shenanigans in the ``allow_token`` method.

    """
    if callable(attributes):
        return attributes

    if isinstance(attributes, dict):

        def _attr_filter(tag, attr, value):
            if tag in attributes:
                attr_val = attributes[tag]
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                if attr in attr_val:
                    return True

            if "*" in attributes:
                attr_val = attributes["*"]
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                return attr in attr_val

            return False

        return _attr_filter

    if isinstance(attributes, list):

        def _attr_filter(tag, attr, value):
            return attr in attributes

        return _attr_filter

    raise ValueError("attributes needs to be a callable, a list or a dict")
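

# Each accepted shape yields a filter with the same (tag, attr, value) -> bool
# signature. For example (illustrative values):
#
#     >>> f = attribute_filter_factory(["href", "title"])
#     >>> f("a", "href", "https://example.com")
#     True
#     >>> f = attribute_filter_factory({"a": ["href"], "*": ["title"]})
#     >>> f("abbr", "title", "Abbreviation")
#     True
#     >>> f("abbr", "href", "https://example.com")
#     False

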
class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
    """html5lib Filter that sanitizes text

    This filter can be used anywhere html5lib filters can be used.

    """

    def __init__(
        self,
        source,
        allowed_tags=ALLOWED_TAGS,
        attributes=ALLOWED_ATTRIBUTES,
        allowed_protocols=ALLOWED_PROTOCOLS,
        attr_val_is_uri=html5lib_shim.attr_val_is_uri,
        svg_attr_val_allows_ref=html5lib_shim.svg_attr_val_allows_ref,
        svg_allow_local_href=html5lib_shim.svg_allow_local_href,
        strip_disallowed_tags=False,
        strip_html_comments=True,
        css_sanitizer=None,
    ):
        """Creates a BleachSanitizerFilter instance

        :arg source: html5lib TreeWalker stream

        :arg set allowed_tags: set of allowed tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list allowed_protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg attr_val_is_uri: set of attributes that have URI values

        :arg svg_attr_val_allows_ref: set of SVG attributes that can have
            references

        :arg svg_allow_local_href: set of SVG elements that can have local
            hrefs

        :arg bool strip_disallowed_tags: whether or not to strip disallowed
            tags

        :arg bool strip_html_comments: whether or not to strip HTML comments

        :arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
            sanitizing style attribute values and style text; defaults to None

        """
        # NOTE(willkg): This is the superclass of
        # html5lib.filters.sanitizer.Filter. We call this directly skipping the
        # __init__ for html5lib.filters.sanitizer.Filter because that does
        # things we don't need to do and kicks up the deprecation warning for
        # using Sanitizer.
        html5lib_shim.Filter.__init__(self, source)

        self.allowed_tags = frozenset(allowed_tags)
        self.allowed_protocols = frozenset(allowed_protocols)

        self.attr_filter = attribute_filter_factory(attributes)
        self.strip_disallowed_tags = strip_disallowed_tags
        self.strip_html_comments = strip_html_comments

        self.attr_val_is_uri = attr_val_is_uri
        self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
        self.css_sanitizer = css_sanitizer
        self.svg_allow_local_href = svg_allow_local_href

    def sanitize_stream(self, token_iterator):
        for token in token_iterator:
            ret = self.sanitize_token(token)

            if not ret:
                continue

            if isinstance(ret, list):
                yield from ret
            else:
                yield ret

    def merge_characters(self, token_iterator):
        """Merge consecutive Characters tokens in a stream"""
        characters_buffer = []

        for token in token_iterator:
            if characters_buffer:
                if token["type"] == "Characters":
                    characters_buffer.append(token)
                    continue
                else:
                    # Merge all the characters tokens together into one and then
                    # operate on it.
                    new_token = {
                        "data": "".join(
                            [char_token["data"] for char_token in characters_buffer]
                        ),
                        "type": "Characters",
                    }
                    characters_buffer = []
                    yield new_token

            elif token["type"] == "Characters":
                characters_buffer.append(token)
                continue

            yield token

        new_token = {
            "data": "".join([char_token["data"] for char_token in characters_buffer]),
            "type": "Characters",
        }

        yield new_token
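
    # Sketch of what merge_characters does to a token stream (tokens here are
    # hand-written for illustration):
    #
    #     [{"type": "Characters", "data": "a"},
    #      {"type": "Characters", "data": "b"},
    #      {"type": "StartTag", "name": "b", "data": {}}]
    #
    # becomes
    #
    #     [{"type": "Characters", "data": "ab"},
    #      {"type": "StartTag", "name": "b", "data": {}},
    #      {"type": "Characters", "data": ""}]
    #
    # (the trailing empty Characters token comes from the final buffer flush)
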
    def __iter__(self):
        return self.merge_characters(
            self.sanitize_stream(html5lib_shim.Filter.__iter__(self))
        )

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
        ['attribute', 'pairs'], 'tag': callable}.

        Here callable is a function with two arguments of attribute name and
        value. It should return True or False.

        Also gives the option to strip tags instead of encoding.

        :arg dict token: token to sanitize

        :returns: token or list of tokens

        """
        token_type = token["type"]
        if token_type in ["StartTag", "EndTag", "EmptyTag"]:
            if token["name"] in self.allowed_tags:
                return self.allow_token(token)

            elif self.strip_disallowed_tags:
                return None

            else:
                return self.disallowed_token(token)

        elif token_type == "Comment":
            if not self.strip_html_comments:
                # use xml.sax.saxutils to escape &, <, and > in addition to " and '
                token["data"] = html5lib_shim.escape(
                    token["data"], entities={'"': "&quot;", "'": "&#x27;"}
                )
                return token
            else:
                return None

        elif token_type == "Characters":
            return self.sanitize_characters(token)

        else:
            return token
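
    # For example (illustrative token dicts), with default settings a
    # disallowed tag is escaped rather than dropped:
    #
    #     {"type": "StartTag", "name": "script", "data": {}}
    #
    # comes back from disallowed_token as
    #
    #     {"type": "Characters", "data": "<script>"}
    #
    # which the serializer then escapes to ``&lt;script&gt;``.
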
    def sanitize_characters(self, token):
        """Handles Characters tokens

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: a list of tokens

        """
        data = token.get("data", "")

        if not data:
            return token

        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
        token["data"] = data

        # If there isn't a & in the data, we can return now
        if "&" not in data:
            return token

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith("&"):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    if entity == "amp":
                        # LinkifyFilter can't match urls across token boundaries
                        # which is problematic with &amp; since that shows up in
                        # querystrings all the time. This special-cases &amp;
                        # and converts it to a & and sticks it in as a
                        # Characters token. It'll get merged with surrounding
                        # tokens in BleachSanitizerFilter.__iter__ and
                        # escaped in the serializer.
                        new_tokens.append({"type": "Characters", "data": "&"})
                    else:
                        new_tokens.append({"type": "Entity", "name": entity})

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_tokens.append({"type": "Characters", "data": remainder})
                    continue

            new_tokens.append({"type": "Characters", "data": part})

        return new_tokens
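
    # For example (illustrative), a Characters token with data
    # "x &lt; y &amp; z" comes back as a token list roughly like:
    #
    #     [{"type": "Characters", "data": "x "},
    #      {"type": "Entity", "name": "lt"},
    #      {"type": "Characters", "data": " y "},
    #      {"type": "Characters", "data": "&"},
    #      {"type": "Characters", "data": " z"}]
    #
    # so the serializer leaves &lt; alone while &amp; goes through the
    # special-case path above.
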
    def sanitize_uri_value(self, value, allowed_protocols):
        """Checks a uri value to see if it's allowed

        :arg value: the uri value to sanitize
        :arg allowed_protocols: list of allowed protocols

        :returns: allowed value or None

        """
        # NOTE(willkg): This transforms the value into a normalized one that's
        # easier to match and verify, but shouldn't get returned since it's
        # vastly different than the original value.

        # Convert all character entities in the value
        normalized_uri = html5lib_shim.convert_entities(value)

        # Nix backtick, space characters, and control characters
        normalized_uri = re.sub(r"[`\000-\040\177-\240\s]+", "", normalized_uri)

        # Remove REPLACEMENT characters
        normalized_uri = normalized_uri.replace("\ufffd", "")

        # Lowercase it--this breaks the value, but makes it easier to match
        # against
        normalized_uri = normalized_uri.lower()

        try:
            # Drop attributes with uri values that have protocols that aren't
            # allowed
            parsed = parse_shim.urlparse(normalized_uri)
        except ValueError:
            # URI is impossible to parse, therefore it's not allowed
            return None

        if parsed.scheme:
            # If urlparse found a scheme, check that
            if parsed.scheme in allowed_protocols:
                return value

        else:
            # Allow uris that are just an anchor
            if normalized_uri.startswith("#"):
                return value

            # Handle protocols that urlparse doesn't recognize like "myprotocol"
            if (
                ":" in normalized_uri
                and normalized_uri.split(":")[0] in allowed_protocols
            ):
                return value

            # If there's no protocol/scheme specified, then assume it's "http" or
            # "https" and see if that's allowed
            if "http" in allowed_protocols or "https" in allowed_protocols:
                return value

        return None
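
    # A few illustrative outcomes with the default protocol set
    # {"http", "https", "mailto"}:
    #
    #     sanitize_uri_value("https://example.com", ...)  -> value (allowed scheme)
    #     sanitize_uri_value("#fragment", ...)            -> value (anchor-only)
    #     sanitize_uri_value("/relative/path", ...)       -> value (http/https assumed)
    #     sanitize_uri_value("javascript:alert(1)", ...)  -> None (dropped)
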
    def allow_token(self, token):
        """Handles the case where we're allowing the tag"""
        if "data" in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token["data"].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token["name"], name, val):
                    continue

                # Drop attributes with uri values that use a disallowed protocol
                # Sanitize attributes with uri values
                if namespaced_name in self.attr_val_is_uri:
                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
                    if new_value is None:
                        continue
                    val = new_value

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r"url\s*\(\s*[^#\s][^)]+?\)", " ", unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    else:
                        # Replace the val with the unescaped version because
                        # it's an IRI
                        val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token["name"]) in self.svg_allow_local_href:
                    if namespaced_name in [
                        (None, "href"),
                        (html5lib_shim.namespaces["xlink"], "href"),
                    ]:
                        if re.search(r"^\s*[^#\s]", val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, "style"):
                    if self.css_sanitizer:
                        val = self.css_sanitizer.sanitize_css(val)
                    else:
                        # FIXME(willkg): if style is allowed, but no
                        # css_sanitizer was set up, then this is probably a
                        # mistake and we should raise an error here
                        #
                        # For now, we're going to set the value to "" because
                        # there was no sanitizer set
                        val = ""

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token["data"] = attrs

        return token
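
    # Note the style fallback above: if "style" is allowed but no
    # css_sanitizer is configured, e.g. (illustrative):
    #
    #     Cleaner(attributes={"a": ["style"]}).clean('<a style="color:red">x</a>')
    #
    # the style value is blanked rather than passed through, yielding
    # '<a style="">x</a>' (plus a NoCssSanitizerWarning at construction time).
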
    def disallowed_token(self, token):
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = f"</{token['name']}>"

        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # If we end up with a namespace, but no name, switch them so we
                # have a valid name to use.
                if ns and not name:
                    ns, name = name, ns

                # Figure out namespaced name if the namespace is appropriate
                # and exists; if the ns isn't in prefixes, then drop it.
                if ns is None or ns not in html5lib_shim.prefixes:
                    namespaced_name = name
                else:
                    namespaced_name = f"{html5lib_shim.prefixes[ns]}:{name}"

                # NOTE(willkg): HTMLSerializer escapes attribute values
                # already, so if we do it here (like HTMLSerializer does),
                # then we end up double-escaping.
                attrs.append(f' {namespaced_name}="{v}"')
            token["data"] = f"<{token['name']}{''.join(attrs)}>"

        else:
            token["data"] = f"<{token['name']}>"

        if token.get("selfClosing"):
            token["data"] = f"{token['data'][:-1]}/>"

        token["type"] = "Characters"

        del token["name"]
        return token
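

# A quick smoke test of the pieces above (added here as a usage sketch; not
# part of the original module):
if __name__ == "__main__":
    cleaner = Cleaner()
    # Disallowed tags are escaped by default (strip=False) ...
    print(cleaner.clean("<script>evil()</script>"))
    # -> &lt;script&gt;evil()&lt;/script&gt;
    # ... while allowed tags pass through with only allowed attributes kept
    print(cleaner.clean('<a href="https://example.com" onclick="x()">hi</a>'))
    # -> <a href="https://example.com">hi</a>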