| 
							
								 | 
							
							# -*- coding: utf-8 -*-
 | 
						
						
						
						
							 | 
							
								 | 
							
							"""Beautiful Soup bonus library: Unicode, Dammit
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							This library converts a bytestream to Unicode through any means
 | 
						
						
						
						
							 | 
							
								 | 
							
							necessary. It is heavily based on code from Mark Pilgrim's `Universal
 | 
						
						
						
						
							 | 
							
								 | 
							
							Feed Parser <https://pypi.org/project/feedparser/>`_, now maintained
 | 
						
						
						
						
							 | 
							
								 | 
							
							by Kurt McKee. It does not rewrite the body of an XML or HTML document
 | 
						
						
						
						
							 | 
							
								 | 
							
							to reflect a new encoding; that's the job of `TreeBuilder`.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							"""
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							# Use of this source code is governed by the MIT license.
 | 
						
						
						
						
							 | 
							
								 | 
							
							__license__ = "MIT"
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							from html.entities import codepoint2name
 | 
						
						
						
						
							 | 
							
								 | 
							
							from collections import defaultdict
 | 
						
						
						
						
							 | 
							
								 | 
							
							import codecs
 | 
						
						
						
						
							 | 
							
								 | 
							
							from html.entities import html5
 | 
						
						
						
						
							 | 
							
								 | 
							
							import re
 | 
						
						
						
						
							 | 
							
								 | 
							
							from logging import Logger, getLogger
 | 
						
						
						
						
							 | 
							
								 | 
							
							from types import ModuleType
 | 
						
						
						
						
							 | 
							
								 | 
							
							from typing import (
 | 
						
						
						
						
							 | 
							
								 | 
							
							    Dict,
 | 
						
						
						
						
							 | 
							
								 | 
							
							    Iterator,
 | 
						
						
						
						
							 | 
							
								 | 
							
							    List,
 | 
						
						
						
						
							 | 
							
								 | 
							
							    Optional,
 | 
						
						
						
						
							 | 
							
								 | 
							
							    Pattern,
 | 
						
						
						
						
							 | 
							
								 | 
							
							    Set,
 | 
						
						
						
						
							 | 
							
								 | 
							
							    Tuple,
 | 
						
						
						
						
							 | 
							
								 | 
							
							    Type,
 | 
						
						
						
						
							 | 
							
								 | 
							
							    Union,
 | 
						
						
						
						
							 | 
							
								 | 
							
							    cast,
 | 
						
						
						
						
							 | 
							
								 | 
							
							)
 | 
						
						
						
						
							 | 
							
								 | 
							
							from typing_extensions import Literal
 | 
						
						
						
						
							 | 
							
								 | 
							
							from bs4._typing import (
 | 
						
						
						
						
							 | 
							
								 | 
							
							    _Encoding,
 | 
						
						
						
						
							 | 
							
								 | 
							
							    _Encodings,
 | 
						
						
						
						
							 | 
							
								 | 
							
							)
 | 
						
						
						
						
							 | 
							
								 | 
							
							import warnings
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							# Import a library to autodetect character encodings. We'll support
 | 
						
						
						
						
							 | 
							
								 | 
							
							# any of a number of libraries that all support the same API:
 | 
						
						
						
						
							 | 
							
								 | 
							
							#
 | 
						
						
						
						
							 | 
							
								 | 
							
							# * cchardet
 | 
						
						
						
						
							 | 
							
								 | 
							
							# * chardet
 | 
						
						
						
						
							 | 
							
								 | 
							
							# * charset-normalizer
 | 
						
						
						
						
							 | 
							
								 | 
							
							chardet_module: Optional[ModuleType] = None
 | 
						
						
						
						
							 | 
							
								 | 
							
							try:
 | 
						
						
						
						
							 | 
							
								 | 
							
							    #  PyPI package: cchardet
 | 
						
						
						
						
							 | 
							
								 | 
							
							    import cchardet # type:ignore
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    chardet_module = cchardet
 | 
						
						
						
						
							 | 
							
								 | 
							
							except ImportError:
 | 
						
						
						
						
							 | 
							
								 | 
							
							    try:
 | 
						
						
						
						
							 | 
							
								 | 
							
							        #  Debian package: python-chardet
 | 
						
						
						
						
							 | 
							
								 | 
							
							        #  PyPI package: chardet
 | 
						
						
						
						
							 | 
							
								 | 
							
							        import chardet
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        chardet_module = chardet
 | 
						
						
						
						
							 | 
							
								 | 
							
							    except ImportError:
 | 
						
						
						
						
							 | 
							
								 | 
							
							        try:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # PyPI package: charset-normalizer
 | 
						
						
						
						
							 | 
							
								 | 
							
							            import charset_normalizer # type:ignore
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							            chardet_module = charset_normalizer
 | 
						
						
						
						
							 | 
							
								 | 
							
							        except ImportError:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # No chardet available.
 | 
						
						
						
						
							 | 
							
								 | 
							
							            pass
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							def _chardet_dammit(s: bytes) -> Optional[str]:
 | 
						
						
						
						
							 | 
							
								 | 
							
							    """Try as hard as possible to detect the encoding of a bytestring."""
 | 
						
						
						
						
							 | 
							
								 | 
							
							    if chardet_module is None or isinstance(s, str):
 | 
						
						
						
						
							 | 
							
								 | 
							
							        return None
 | 
						
						
						
						
							 | 
							
								 | 
							
							    module = chardet_module
 | 
						
						
						
						
							 | 
							
								 | 
							
							    return module.detect(s)["encoding"]
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							# Build bytestring and Unicode versions of regular expressions for finding
 | 
						
						
						
						
							 | 
							
								 | 
							
							# a declared encoding inside an XML or HTML document.
 | 
						
						
						
						
							 | 
							
								 | 
							
							xml_encoding: str = "^\\s*<\\?.*encoding=['\"](.*?)['\"].*\\?>"  #: :meta private:
 | 
						
						
						
						
							 | 
							
								 | 
							
							html_meta: str = (
 | 
						
						
						
						
							 | 
							
								 | 
							
							    "<\\s*meta[^>]+charset\\s*=\\s*[\"']?([^>]*?)[ /;'\">]"  #: :meta private:
 | 
						
						
						
						
							 | 
							
								 | 
							
							)
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							# TODO-TYPING: The Pattern type here could use more refinement, but it's tricky.
 | 
						
						
						
						
							 | 
							
								 | 
							
							encoding_res: Dict[Type, Dict[str, Pattern]] = dict()
 | 
						
						
						
						
							 | 
							
								 | 
							
							encoding_res[bytes] = {
 | 
						
						
						
						
							 | 
							
								 | 
							
							    "html": re.compile(html_meta.encode("ascii"), re.I),
 | 
						
						
						
						
							 | 
							
								 | 
							
							    "xml": re.compile(xml_encoding.encode("ascii"), re.I),
 | 
						
						
						
						
							 | 
							
								 | 
							
							}
 | 
						
						
						
						
							 | 
							
								 | 
							
							encoding_res[str] = {
 | 
						
						
						
						
							 | 
							
								 | 
							
							    "html": re.compile(html_meta, re.I),
 | 
						
						
						
						
							 | 
							
								 | 
							
							    "xml": re.compile(xml_encoding, re.I),
 | 
						
						
						
						
							 | 
							
								 | 
							
							}
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							class EntitySubstitution(object):
 | 
						
						
						
						
							 | 
							
								 | 
							
							    """The ability to substitute XML or HTML entities for certain characters."""
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
    # The four attributes below are only declared here; their values are
    # assigned by _populate_class_variables().

    #: A map of named HTML entities to the corresponding Unicode string.
    #:
    #: :meta hide-value:
    HTML_ENTITY_TO_CHARACTER: Dict[str, str]

    #: A map of Unicode strings to the corresponding named HTML entities;
    #: the inverse of HTML_ENTITY_TO_CHARACTER. Each Unicode string maps
    #: to a single entity name, even when several names exist for it.
    #:
    #: :meta hide-value:
    CHARACTER_TO_HTML_ENTITY: Dict[str, str]

    #: A regular expression that matches any character (or, in rare
    #: cases, pair of characters) that can be replaced with a named
    #: HTML entity.
    #:
    #: :meta hide-value:
    CHARACTER_TO_HTML_ENTITY_RE: Pattern[str]

    #: A very similar regular expression to
    #: CHARACTER_TO_HTML_ENTITY_RE, but which also matches unescaped
    #: ampersands. This is used by the 'html' formatter to provide
    #: backwards-compatibility, even though the HTML5 spec allows most
    #: ampersands to go unescaped.
    #:
    #: :meta hide-value:
    CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: Pattern[str]
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    @classmethod
 | 
						
						
						
						
							 | 
							
								 | 
							
							    def _populate_class_variables(cls) -> None:
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """Initialize variables used by this class to manage the plethora of
 | 
						
						
						
						
							 | 
							
								 | 
							
							        HTML5 named entities.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        This function sets the following class variables:
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        CHARACTER_TO_HTML_ENTITY - A mapping of Unicode strings like "⦨" to
 | 
						
						
						
						
							 | 
							
								 | 
							
							        entity names like "angmsdaa". When a single Unicode string has
 | 
						
						
						
						
							 | 
							
								 | 
							
							        multiple entity names, we try to choose the most commonly-used
 | 
						
						
						
						
							 | 
							
								 | 
							
							        name.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        HTML_ENTITY_TO_CHARACTER: A mapping of entity names like "angmsdaa" to
 | 
						
						
						
						
							 | 
							
								 | 
							
							        Unicode strings like "⦨".
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        CHARACTER_TO_HTML_ENTITY_RE: A regular expression matching (almost) any
 | 
						
						
						
						
							 | 
							
								 | 
							
							        Unicode string that corresponds to an HTML5 named entity.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: A very similar
 | 
						
						
						
						
							 | 
							
								 | 
							
							        regular expression to CHARACTER_TO_HTML_ENTITY_RE, but which
 | 
						
						
						
						
							 | 
							
								 | 
							
							        also matches unescaped ampersands. This is used by the 'html'
 | 
						
						
						
						
							 | 
							
								 | 
							
							        formatted to provide backwards-compatibility, even though the HTML5
 | 
						
						
						
						
							 | 
							
								 | 
							
							        spec allows most ampersands to go unescaped.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """
 | 
						
						
						
						
							 | 
							
								 | 
							
							        unicode_to_name = {}
 | 
						
						
						
						
							 | 
							
								 | 
							
							        name_to_unicode = {}
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        short_entities = set()
 | 
						
						
						
						
							 | 
							
								 | 
							
							        long_entities_by_first_character = defaultdict(set)
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        for name_with_semicolon, character in sorted(html5.items()):
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # "It is intentional, for legacy compatibility, that many
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # code points have multiple character reference names. For
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # example, some appear both with and without the trailing
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # semicolon, or with different capitalizations."
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
 | 
						
						
						
						
							 | 
							
								 | 
							
							            #
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # The parsers are in charge of handling (or not) character
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # references with no trailing semicolon, so we remove the
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # semicolon whenever it appears.
 | 
						
						
						
						
							 | 
							
								 | 
							
							            if name_with_semicolon.endswith(";"):
 | 
						
						
						
						
							 | 
							
								 | 
							
							                name = name_with_semicolon[:-1]
 | 
						
						
						
						
							 | 
							
								 | 
							
							            else:
 | 
						
						
						
						
							 | 
							
								 | 
							
							                name = name_with_semicolon
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # When parsing HTML, we want to recognize any known named
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # entity and convert it to a sequence of Unicode
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # characters.
 | 
						
						
						
						
							 | 
							
								 | 
							
							            if name not in name_to_unicode:
 | 
						
						
						
						
							 | 
							
								 | 
							
							                name_to_unicode[name] = character
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # When _generating_ HTML, we want to recognize special
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # character sequences that _could_ be converted to named
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # entities.
 | 
						
						
						
						
							 | 
							
								 | 
							
							            unicode_to_name[character] = name
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # We also need to build a regular expression that lets us
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # _find_ those characters in output strings so we can
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # replace them.
 | 
						
						
						
						
							 | 
							
								 | 
							
							            #
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # This is tricky, for two reasons.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							            if len(character) == 1 and ord(character) < 128 and character not in "<>":
 | 
						
						
						
						
							 | 
							
								 | 
							
							                # First, it would be annoying to turn single ASCII
 | 
						
						
						
						
							 | 
							
								 | 
							
							                # characters like | into named entities like
 | 
						
						
						
						
							 | 
							
								 | 
							
							                # |. The exceptions are <>, which we _must_
 | 
						
						
						
						
							 | 
							
								 | 
							
							                # turn into named entities to produce valid HTML.
 | 
						
						
						
						
							 | 
							
								 | 
							
							                continue
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							            if len(character) > 1 and all(ord(x) < 128 for x in character):
 | 
						
						
						
						
							 | 
							
								 | 
							
							                # We also do not want to turn _combinations_ of ASCII
 | 
						
						
						
						
							 | 
							
								 | 
							
							                # characters like 'fj' into named entities like 'fj',
 | 
						
						
						
						
							 | 
							
								 | 
							
							                # though that's more debateable.
 | 
						
						
						
						
							 | 
							
								 | 
							
							                continue
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # Second, some named entities have a Unicode value that's
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # a subset of the Unicode value for some _other_ named
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # entity.  As an example, \u2267' is ≧,
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # but '\u2267\u0338' is ≧̸. Our regular
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # expression needs to match the first two characters of
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # "\u2267\u0338foo", but only the first character of
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # "\u2267foo".
 | 
						
						
						
						
							 | 
							
								 | 
							
							            #
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # In this step, we build two sets of characters that
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # _eventually_ need to go into the regular expression. But
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # we won't know exactly what the regular expression needs
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # to look like until we've gone through the entire list of
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # named entities.
 | 
						
						
						
						
							 | 
							
								 | 
							
							            if len(character) == 1 and character != "&":
 | 
						
						
						
						
							 | 
							
								 | 
							
							                short_entities.add(character)
 | 
						
						
						
						
							 | 
							
								 | 
							
							            else:
 | 
						
						
						
						
							 | 
							
								 | 
							
							                long_entities_by_first_character[character[0]].add(character)
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # Now that we've been through the entire list of entities, we
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # can create a regular expression that matches any of them.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        particles = set()
 | 
						
						
						
						
							 | 
							
								 | 
							
							        for short in short_entities:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            long_versions = long_entities_by_first_character[short]
 | 
						
						
						
						
							 | 
							
								 | 
							
							            if not long_versions:
 | 
						
						
						
						
							 | 
							
								 | 
							
							                particles.add(short)
 | 
						
						
						
						
							 | 
							
								 | 
							
							            else:
 | 
						
						
						
						
							 | 
							
								 | 
							
							                ignore = "".join([x[1] for x in long_versions])
 | 
						
						
						
						
							 | 
							
								 | 
							
							                # This finds, e.g. \u2267 but only if it is _not_
 | 
						
						
						
						
							 | 
							
								 | 
							
							                # followed by \u0338.
 | 
						
						
						
						
							 | 
							
								 | 
							
							                particles.add("%s(?![%s])" % (short, ignore))
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        for long_entities in list(long_entities_by_first_character.values()):
 | 
						
						
						
						
							 | 
							
								 | 
							
							            for long_entity in long_entities:
 | 
						
						
						
						
							 | 
							
								 | 
							
							                particles.add(long_entity)
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        re_definition = "(%s)" % "|".join(particles)
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        particles.add("&")
 | 
						
						
						
						
							 | 
							
								 | 
							
							        re_definition_with_ampersand = "(%s)" % "|".join(particles)
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # If an entity shows up in both html5 and codepoint2name, it's
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # likely that HTML5 gives it several different names, such as
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # 'rsquo' and 'rsquor'. When converting Unicode characters to
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # named entities, the codepoint2name name should take
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # precedence where possible, since that's the more easily
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # recognizable one.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        for codepoint, name in list(codepoint2name.items()):
 | 
						
						
						
						
							 | 
							
								 | 
							
							            character = chr(codepoint)
 | 
						
						
						
						
							 | 
							
								 | 
							
							            unicode_to_name[character] = name
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        cls.CHARACTER_TO_HTML_ENTITY = unicode_to_name
 | 
						
						
						
						
							 | 
							
								 | 
							
							        cls.HTML_ENTITY_TO_CHARACTER = name_to_unicode
 | 
						
						
						
						
							 | 
							
								 | 
							
							        cls.CHARACTER_TO_HTML_ENTITY_RE = re.compile(re_definition)
 | 
						
						
						
						
							 | 
							
								 | 
							
							        cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE = re.compile(
 | 
						
						
						
						
							 | 
							
								 | 
							
							            re_definition_with_ampersand
 | 
						
						
						
						
							 | 
							
								 | 
							
							        )
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
    #: A map of Unicode strings to the corresponding named XML entities.
    #:
    #: :meta hide-value:
    CHARACTER_TO_XML_ENTITY: Dict[str, str] = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Matches any named or numeric HTML entity. Group 1 captures what
    # lies between the "&" and the ";": a decimal reference ("#" plus
    # digits), a hexadecimal reference ("#x" plus hex digits) or a run of
    # word characters. Compiled case-insensitively.
    ANY_ENTITY_RE: Pattern[str] = re.compile("&(#\\d+|#x[0-9a-fA-F]+|\\w+);", re.I)

    #: A regular expression matching an angle bracket or an ampersand that
    #: is not part of an XML or HTML entity.
    #:
    #: :meta hide-value:
    # The negative lookahead rejects an ampersand that is immediately
    # followed by a decimal reference, a hexadecimal reference or a run
    # of word characters, each ending in a semicolon.
    BARE_AMPERSAND_OR_BRACKET: Pattern[str] = re.compile(
        "([<>]|" "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" ")"
    )

    #: A regular expression matching an angle bracket or an ampersand.
    #:
    #: :meta hide-value:
    AMPERSAND_OR_BRACKET: Pattern[str] = re.compile("([<>&])")
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    @classmethod
 | 
						
						
						
						
							 | 
							
								 | 
							
							    def _substitute_html_entity(cls, matchobj: re.Match) -> str:
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """Used with a regular expression to substitute the
 | 
						
						
						
						
							 | 
							
								 | 
							
							        appropriate HTML entity for a special character string."""
 | 
						
						
						
						
							 | 
							
								 | 
							
							        original_entity = matchobj.group(0)
 | 
						
						
						
						
							 | 
							
								 | 
							
							        entity = cls.CHARACTER_TO_HTML_ENTITY.get(original_entity)
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if entity is None:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            return "&%s;" % original_entity
 | 
						
						
						
						
							 | 
							
								 | 
							
							        return "&%s;" % entity
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    @classmethod
 | 
						
						
						
						
							 | 
							
								 | 
							
							    def _substitute_xml_entity(cls, matchobj: re.Match) -> str:
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """Used with a regular expression to substitute the
 | 
						
						
						
						
							 | 
							
								 | 
							
							        appropriate XML entity for a special character string."""
 | 
						
						
						
						
							 | 
							
								 | 
							
							        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
 | 
						
						
						
						
							 | 
							
								 | 
							
							        return "&%s;" % entity
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    @classmethod
 | 
						
						
						
						
							 | 
							
								 | 
							
							    def _escape_entity_name(cls, matchobj: re.Match) -> str:
 | 
						
						
						
						
							 | 
							
								 | 
							
							        return "&%s;" % matchobj.group(1)
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    @classmethod
 | 
						
						
						
						
							 | 
							
								 | 
							
							    def _escape_unrecognized_entity_name(cls, matchobj: re.Match) -> str:
 | 
						
						
						
						
							 | 
							
								 | 
							
							        possible_entity = matchobj.group(1)
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if possible_entity in cls.HTML_ENTITY_TO_CHARACTER:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            return "&%s;" % possible_entity
 | 
						
						
						
						
							 | 
							
								 | 
							
							        return "&%s;" % possible_entity
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    @classmethod
 | 
						
						
						
						
							 | 
							
								 | 
							
							    def quoted_attribute_value(cls, value: str) -> str:
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """Make a value into a quoted XML attribute, possibly escaping it.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							         Most strings will be quoted using double quotes.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							          Bob's Bar -> "Bob's Bar"
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							         If a string contains double quotes, it will be quoted using
 | 
						
						
						
						
							 | 
							
								 | 
							
							         single quotes.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							          Welcome to "my bar" -> 'Welcome to "my bar"'
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							         If a string contains both single and double quotes, the
 | 
						
						
						
						
							 | 
							
								 | 
							
							         double quotes will be escaped, and the string will be quoted
 | 
						
						
						
						
							 | 
							
								 | 
							
							         using double quotes.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							          Welcome to "Bob's Bar" -> Welcome to "Bob's bar"
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :param value: The XML attribute value to quote
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :return: The quoted value
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """
 | 
						
						
						
						
							 | 
							
								 | 
							
							        quote_with = '"'
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if '"' in value:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            if "'" in value:
 | 
						
						
						
						
							 | 
							
								 | 
							
							                # The string contains both single and double
 | 
						
						
						
						
							 | 
							
								 | 
							
							                # quotes.  Turn the double quotes into
 | 
						
						
						
						
							 | 
							
								 | 
							
							                # entities. We quote the double quotes rather than
 | 
						
						
						
						
							 | 
							
								 | 
							
							                # the single quotes because the entity name is
 | 
						
						
						
						
							 | 
							
								 | 
							
							                # """ whether this is HTML or XML.  If we
 | 
						
						
						
						
							 | 
							
								 | 
							
							                # quoted the single quotes, we'd have to decide
 | 
						
						
						
						
							 | 
							
								 | 
							
							                # between ' and &squot;.
 | 
						
						
						
						
							 | 
							
								 | 
							
							                replace_with = """
 | 
						
						
						
						
							 | 
							
								 | 
							
							                value = value.replace('"', replace_with)
 | 
						
						
						
						
							 | 
							
								 | 
							
							            else:
 | 
						
						
						
						
							 | 
							
								 | 
							
							                # There are double quotes but no single quotes.
 | 
						
						
						
						
							 | 
							
								 | 
							
							                # We can use single quotes to quote the attribute.
 | 
						
						
						
						
							 | 
							
								 | 
							
							                quote_with = "'"
 | 
						
						
						
						
							 | 
							
								 | 
							
							        return quote_with + value + quote_with
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    @classmethod
 | 
						
						
						
						
							 | 
							
								 | 
							
							    def substitute_xml(cls, value: str, make_quoted_attribute: bool = False) -> str:
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """Replace special XML characters with named XML entities.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        The less-than sign will become <, the greater-than sign
 | 
						
						
						
						
							 | 
							
								 | 
							
							        will become >, and any ampersands will become &. If you
 | 
						
						
						
						
							 | 
							
								 | 
							
							        want ampersands that seem to be part of an entity definition
 | 
						
						
						
						
							 | 
							
								 | 
							
							        to be left alone, use `substitute_xml_containing_entities`
 | 
						
						
						
						
							 | 
							
								 | 
							
							        instead.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :param value: A string to be substituted.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :param make_quoted_attribute: If True, then the string will be
 | 
						
						
						
						
							 | 
							
								 | 
							
							         quoted, as befits an attribute value.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :return: A version of ``value`` with special characters replaced
 | 
						
						
						
						
							 | 
							
								 | 
							
							         with named entities.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # Escape angle brackets and ampersands.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        value = cls.AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if make_quoted_attribute:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            value = cls.quoted_attribute_value(value)
 | 
						
						
						
						
							 | 
							
								 | 
							
							        return value
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    @classmethod
 | 
						
						
						
						
							 | 
							
								 | 
							
							    def substitute_xml_containing_entities(
 | 
						
						
						
						
							 | 
							
								 | 
							
							        cls, value: str, make_quoted_attribute: bool = False
 | 
						
						
						
						
							 | 
							
								 | 
							
							    ) -> str:
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """Substitute XML entities for special XML characters.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :param value: A string to be substituted. The less-than sign will
 | 
						
						
						
						
							 | 
							
								 | 
							
							          become <, the greater-than sign will become >, and any
 | 
						
						
						
						
							 | 
							
								 | 
							
							          ampersands that are not part of an entity defition will
 | 
						
						
						
						
							 | 
							
								 | 
							
							          become &.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :param make_quoted_attribute: If True, then the string will be
 | 
						
						
						
						
							 | 
							
								 | 
							
							         quoted, as befits an attribute value.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # Escape angle brackets, and ampersands that aren't part of
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # entities.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if make_quoted_attribute:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            value = cls.quoted_attribute_value(value)
 | 
						
						
						
						
							 | 
							
								 | 
							
							        return value
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    @classmethod
 | 
						
						
						
						
							 | 
							
								 | 
							
							    def substitute_html(cls, s: str) -> str:
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """Replace certain Unicode characters with named HTML entities.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        This differs from ``data.encode(encoding, 'xmlcharrefreplace')``
 | 
						
						
						
						
							 | 
							
								 | 
							
							        in that the goal is to make the result more readable (to those
 | 
						
						
						
						
							 | 
							
								 | 
							
							        with ASCII displays) rather than to recover from
 | 
						
						
						
						
							 | 
							
								 | 
							
							        errors. There's absolutely nothing wrong with a UTF-8 string
 | 
						
						
						
						
							 | 
							
								 | 
							
							        containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that
 | 
						
						
						
						
							 | 
							
								 | 
							
							        character with "é" will make it more readable to some
 | 
						
						
						
						
							 | 
							
								 | 
							
							        people.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :param s: The string to be modified.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :return: The string with some Unicode characters replaced with
 | 
						
						
						
						
							 | 
							
								 | 
							
							           HTML entities.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # Convert any appropriate characters to HTML entities.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        return cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE.sub(
 | 
						
						
						
						
							 | 
							
								 | 
							
							            cls._substitute_html_entity, s
 | 
						
						
						
						
							 | 
							
								 | 
							
							        )
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    @classmethod
 | 
						
						
						
						
							 | 
							
								 | 
							
							    def substitute_html5(cls, s: str) -> str:
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """Replace certain Unicode characters with named HTML entities
 | 
						
						
						
						
							 | 
							
								 | 
							
							        using HTML5 rules.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        Specifically, this method is much less aggressive about
 | 
						
						
						
						
							 | 
							
								 | 
							
							        escaping ampersands than substitute_html. Only ambiguous
 | 
						
						
						
						
							 | 
							
								 | 
							
							        ampersands are escaped, per the HTML5 standard:
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        "An ambiguous ampersand is a U+0026 AMPERSAND character (&)
 | 
						
						
						
						
							 | 
							
								 | 
							
							        that is followed by one or more ASCII alphanumerics, followed
 | 
						
						
						
						
							 | 
							
								 | 
							
							        by a U+003B SEMICOLON character (;), where these characters do
 | 
						
						
						
						
							 | 
							
								 | 
							
							        not match any of the names given in the named character
 | 
						
						
						
						
							 | 
							
								 | 
							
							        references section."
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        Unlike substitute_html5_raw, this method assumes HTML entities
 | 
						
						
						
						
							 | 
							
								 | 
							
							        were converted to Unicode characters on the way in, as
 | 
						
						
						
						
							 | 
							
								 | 
							
							        Beautiful Soup does. By the time Beautiful Soup does its work,
 | 
						
						
						
						
							 | 
							
								 | 
							
							        the only ambiguous ampersands that need to be escaped are the
 | 
						
						
						
						
							 | 
							
								 | 
							
							        ones that were escaped in the original markup when mentioning
 | 
						
						
						
						
							 | 
							
								 | 
							
							        HTML entities.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :param s: The string to be modified.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :return: The string with some Unicode characters replaced with
 | 
						
						
						
						
							 | 
							
								 | 
							
							           HTML entities.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # First, escape any HTML entities found in the markup.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        s = cls.ANY_ENTITY_RE.sub(cls._escape_entity_name, s)
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # Next, convert any appropriate characters to unescaped HTML entities.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        return s
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    @classmethod
 | 
						
						
						
						
							 | 
							
								 | 
							
							    def substitute_html5_raw(cls, s: str) -> str:
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """Replace certain Unicode characters with named HTML entities
 | 
						
						
						
						
							 | 
							
								 | 
							
							        using HTML5 rules.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        substitute_html5_raw is similar to substitute_html5 but it is
 | 
						
						
						
						
							 | 
							
								 | 
							
							        designed for standalone use (whereas substitute_html5 is
 | 
						
						
						
						
							 | 
							
								 | 
							
							        designed for use with Beautiful Soup).
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :param s: The string to be modified.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :return: The string with some Unicode characters replaced with
 | 
						
						
						
						
							 | 
							
								 | 
							
							           HTML entities.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # First, escape the ampersand for anything that looks like an
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # entity but isn't in the list of recognized entities. All other
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # ampersands can be left alone.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        s = cls.ANY_ENTITY_RE.sub(cls._escape_unrecognized_entity_name, s)
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # Then, convert a range of Unicode characters to unescaped
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # HTML entities.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        return s
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							# Run the class-level setup hook once, at module import time.
							# NOTE(review): presumably this fills in the HTML entity lookup tables and
							# compiled patterns that the substitute_html* methods read (they are not
							# assigned anywhere in the visible class body) -- confirm against
							# _populate_class_variables.
							EntitySubstitution._populate_class_variables()
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							class EncodingDetector:
 | 
						
						
						
						
							 | 
							
								 | 
							
							    """This class is capable of guessing a number of possible encodings
 | 
						
						
						
						
							 | 
							
								 | 
							
							    for a bytestring.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    Order of precedence:
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    1. Encodings you specifically tell EncodingDetector to try first
 | 
						
						
						
						
							 | 
							
								 | 
							
							       (the ``known_definite_encodings`` argument to the constructor).
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    2. An encoding determined by sniffing the document's byte-order mark.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    3. Encodings you specifically tell EncodingDetector to try if
 | 
						
						
						
						
							 | 
							
								 | 
							
							       byte-order mark sniffing fails (the ``user_encodings`` argument to the
 | 
						
						
						
						
							 | 
							
								 | 
							
							       constructor).
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    4. An encoding declared within the bytestring itself, either in an
 | 
						
						
						
						
							 | 
							
								 | 
							
							       XML declaration (if the bytestring is to be interpreted as an XML
 | 
						
						
						
						
							 | 
							
								 | 
							
							       document), or in a <meta> tag (if the bytestring is to be
 | 
						
						
						
						
							 | 
							
								 | 
							
							       interpreted as an HTML document.)
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    5. An encoding detected through textual analysis by chardet,
 | 
						
						
						
						
							 | 
							
								 | 
							
							       cchardet, or a similar external library.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    6. UTF-8.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    7. Windows-1252.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    :param markup: Some markup in an unknown encoding.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    :param known_definite_encodings: When determining the encoding
 | 
						
						
						
						
							 | 
							
								 | 
							
							        of ``markup``, these encodings will be tried first, in
 | 
						
						
						
						
							 | 
							
								 | 
							
							        order. In HTML terms, this corresponds to the "known
 | 
						
						
						
						
							 | 
							
								 | 
							
							        definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    :param user_encodings: These encodings will be tried after the
 | 
						
						
						
						
							 | 
							
								 | 
							
							        ``known_definite_encodings`` have been tried and failed, and
 | 
						
						
						
						
							 | 
							
								 | 
							
							        after an attempt to sniff the encoding by looking at a
 | 
						
						
						
						
							 | 
							
								 | 
							
							        byte order mark has failed. In HTML terms, this
 | 
						
						
						
						
							 | 
							
								 | 
							
							        corresponds to the step "user has explicitly instructed
 | 
						
						
						
						
							 | 
							
								 | 
							
							        the user agent to override the document's character
 | 
						
						
						
						
							 | 
							
								 | 
							
							        encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    :param override_encodings: A **deprecated** alias for
 | 
						
						
						
						
							 | 
							
								 | 
							
							        ``known_definite_encodings``. Any encodings here will be tried
 | 
						
						
						
						
							 | 
							
								 | 
							
							        immediately after the encodings in
 | 
						
						
						
						
							 | 
							
								 | 
							
							        ``known_definite_encodings``.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    :param is_html: If True, this markup is considered to be
 | 
						
						
						
						
							 | 
							
								 | 
							
							        HTML. Otherwise it's assumed to be XML.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    :param exclude_encodings: These encodings will not be tried,
 | 
						
						
						
						
							 | 
							
								 | 
							
							        even if they otherwise would be.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    """
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    def __init__(
 | 
						
						
						
						
							 | 
							
								 | 
							
							        self,
 | 
						
						
						
						
							 | 
							
								 | 
							
							        markup: bytes,
 | 
						
						
						
						
							 | 
							
								 | 
							
							        known_definite_encodings: Optional[_Encodings] = None,
 | 
						
						
						
						
							 | 
							
								 | 
							
							        is_html: Optional[bool] = False,
 | 
						
						
						
						
							 | 
							
								 | 
							
							        exclude_encodings: Optional[_Encodings] = None,
 | 
						
						
						
						
							 | 
							
								 | 
							
							        user_encodings: Optional[_Encodings] = None,
 | 
						
						
						
						
							 | 
							
								 | 
							
							        override_encodings: Optional[_Encodings] = None,
 | 
						
						
						
						
							 | 
							
								 | 
							
							    ):
 | 
						
						
						
						
							 | 
							
								 | 
							
							        self.known_definite_encodings = list(known_definite_encodings or [])
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if override_encodings:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            warnings.warn(
 | 
						
						
						
						
							 | 
							
								 | 
							
							                "The 'override_encodings' argument was deprecated in 4.10.0. Use 'known_definite_encodings' instead.",
 | 
						
						
						
						
							 | 
							
								 | 
							
							                DeprecationWarning,
 | 
						
						
						
						
							 | 
							
								 | 
							
							                stacklevel=3,
 | 
						
						
						
						
							 | 
							
								 | 
							
							            )
 | 
						
						
						
						
							 | 
							
								 | 
							
							            self.known_definite_encodings += override_encodings
 | 
						
						
						
						
							 | 
							
								 | 
							
							        self.user_encodings = user_encodings or []
 | 
						
						
						
						
							 | 
							
								 | 
							
							        exclude_encodings = exclude_encodings or []
 | 
						
						
						
						
							 | 
							
								 | 
							
							        self.exclude_encodings = set([x.lower() for x in exclude_encodings])
 | 
						
						
						
						
							 | 
							
								 | 
							
							        self.chardet_encoding = None
 | 
						
						
						
						
							 | 
							
								 | 
							
							        self.is_html = False if is_html is None else is_html
 | 
						
						
						
						
							 | 
							
								 | 
							
							        self.declared_encoding: Optional[str] = None
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # First order of business: strip a byte-order mark.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    # Instance attributes; all of them are assigned in __init__.
							    # Encodings to try first, in order (includes any deprecated
							    # override_encodings).
							    known_definite_encodings: _Encodings
							    # Encodings tried after the known definite encodings and the
							    # byte-order-mark encoding.
							    user_encodings: _Encodings
							    # Encodings that are never tried; __init__ stores them lowercased
							    # in a set.
							    exclude_encodings: _Encodings
							    # Result of _chardet_dammit(); starts as None and is filled in
							    # lazily by the `encodings` property.
							    chardet_encoding: Optional[_Encoding]
							    # Whether the markup is treated as HTML rather than XML.
							    is_html: bool
							    # Result of find_declared_encoding(); starts as None and is filled
							    # in lazily by the `encodings` property.
							    declared_encoding: Optional[_Encoding]
							    # The markup as returned by strip_byte_order_mark().
							    markup: bytes
							    # Encoding implied by the byte-order mark, or None if none was found.
							    sniffed_encoding: Optional[_Encoding]
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    def _usable(self, encoding: Optional[_Encoding], tried: Set[_Encoding]) -> bool:
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """Should we even bother to try this encoding?
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :param encoding: Name of an encoding.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :param tried: Encodings that have already been tried. This
 | 
						
						
						
						
							 | 
							
								 | 
							
							            will be modified as a side effect.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if encoding is None:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            return False
 | 
						
						
						
						
							 | 
							
								 | 
							
							        encoding = encoding.lower()
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if encoding in self.exclude_encodings:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            return False
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if encoding not in tried:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            tried.add(encoding)
 | 
						
						
						
						
							 | 
							
								 | 
							
							            return True
 | 
						
						
						
						
							 | 
							
								 | 
							
							        return False
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    @property
 | 
						
						
						
						
							 | 
							
								 | 
							
							    def encodings(self) -> Iterator[_Encoding]:
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """Yield a number of encodings that might work for this markup.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :yield: A sequence of strings. Each is the name of an encoding
 | 
						
						
						
						
							 | 
							
								 | 
							
							           that *might* work to convert a bytestring into Unicode.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """
 | 
						
						
						
						
							 | 
							
								 | 
							
							        tried: Set[_Encoding] = set()
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # First, try the known definite encodings
 | 
						
						
						
						
							 | 
							
								 | 
							
							        for e in self.known_definite_encodings:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            if self._usable(e, tried):
 | 
						
						
						
						
							 | 
							
								 | 
							
							                yield e
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # Did the document originally start with a byte-order mark
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # that indicated its encoding?
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if self.sniffed_encoding is not None and self._usable(
 | 
						
						
						
						
							 | 
							
								 | 
							
							            self.sniffed_encoding, tried
 | 
						
						
						
						
							 | 
							
								 | 
							
							        ):
 | 
						
						
						
						
							 | 
							
								 | 
							
							            yield self.sniffed_encoding
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # Sniffing the byte-order mark did nothing; try the user
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # encodings.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        for e in self.user_encodings:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            if self._usable(e, tried):
 | 
						
						
						
						
							 | 
							
								 | 
							
							                yield e
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # Look within the document for an XML or HTML encoding
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # declaration.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if self.declared_encoding is None:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            self.declared_encoding = self.find_declared_encoding(
 | 
						
						
						
						
							 | 
							
								 | 
							
							                self.markup, self.is_html
 | 
						
						
						
						
							 | 
							
								 | 
							
							            )
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if self.declared_encoding is not None and self._usable(
 | 
						
						
						
						
							 | 
							
								 | 
							
							            self.declared_encoding, tried
 | 
						
						
						
						
							 | 
							
								 | 
							
							        ):
 | 
						
						
						
						
							 | 
							
								 | 
							
							            yield self.declared_encoding
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # Use third-party character set detection to guess at the
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # encoding.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if self.chardet_encoding is None:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            self.chardet_encoding = _chardet_dammit(self.markup)
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if self.chardet_encoding is not None and self._usable(
 | 
						
						
						
						
							 | 
							
								 | 
							
							            self.chardet_encoding, tried
 | 
						
						
						
						
							 | 
							
								 | 
							
							        ):
 | 
						
						
						
						
							 | 
							
								 | 
							
							            yield self.chardet_encoding
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # As a last-ditch effort, try utf-8 and windows-1252.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        for e in ("utf-8", "windows-1252"):
 | 
						
						
						
						
							 | 
							
								 | 
							
							            if self._usable(e, tried):
 | 
						
						
						
						
							 | 
							
								 | 
							
							                yield e
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    @classmethod
 | 
						
						
						
						
							 | 
							
								 | 
							
							    def strip_byte_order_mark(cls, data: bytes) -> Tuple[bytes, Optional[_Encoding]]:
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """If a byte-order mark is present, strip it and return the encoding it implies.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :param data: A bytestring that may or may not begin with a
 | 
						
						
						
						
							 | 
							
								 | 
							
							           byte-order mark.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :return: A 2-tuple (data stripped of byte-order mark, encoding implied by byte-order mark)
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """
 | 
						
						
						
						
							 | 
							
								 | 
							
							        encoding = None
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if isinstance(data, str):
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # Unicode data cannot have a byte-order mark.
 | 
						
						
						
						
							 | 
							
								 | 
							
							            return data, encoding
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if (
 | 
						
						
						
						
							 | 
							
								 | 
							
							            (len(data) >= 4)
 | 
						
						
						
						
							 | 
							
								 | 
							
							            and (data[:2] == b"\xfe\xff")
 | 
						
						
						
						
							 | 
							
								 | 
							
							            and (data[2:4] != b"\x00\x00")
 | 
						
						
						
						
							 | 
							
								 | 
							
							        ):
 | 
						
						
						
						
							 | 
							
								 | 
							
							            encoding = "utf-16be"
 | 
						
						
						
						
							 | 
							
								 | 
							
							            data = data[2:]
 | 
						
						
						
						
							 | 
							
								 | 
							
							        elif (
 | 
						
						
						
						
							 | 
							
								 | 
							
							            (len(data) >= 4)
 | 
						
						
						
						
							 | 
							
								 | 
							
							            and (data[:2] == b"\xff\xfe")
 | 
						
						
						
						
							 | 
							
								 | 
							
							            and (data[2:4] != b"\x00\x00")
 | 
						
						
						
						
							 | 
							
								 | 
							
							        ):
 | 
						
						
						
						
							 | 
							
								 | 
							
							            encoding = "utf-16le"
 | 
						
						
						
						
							 | 
							
								 | 
							
							            data = data[2:]
 | 
						
						
						
						
							 | 
							
								 | 
							
							        elif data[:3] == b"\xef\xbb\xbf":
 | 
						
						
						
						
							 | 
							
								 | 
							
							            encoding = "utf-8"
 | 
						
						
						
						
							 | 
							
								 | 
							
							            data = data[3:]
 | 
						
						
						
						
							 | 
							
								 | 
							
							        elif data[:4] == b"\x00\x00\xfe\xff":
 | 
						
						
						
						
							 | 
							
								 | 
							
							            encoding = "utf-32be"
 | 
						
						
						
						
							 | 
							
								 | 
							
							            data = data[4:]
 | 
						
						
						
						
							 | 
							
								 | 
							
							        elif data[:4] == b"\xff\xfe\x00\x00":
 | 
						
						
						
						
							 | 
							
								 | 
							
							            encoding = "utf-32le"
 | 
						
						
						
						
							 | 
							
								 | 
							
							            data = data[4:]
 | 
						
						
						
						
							 | 
							
								 | 
							
							        return data, encoding
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    @classmethod
							    def find_declared_encoding(
							        cls,
							        markup: Union[bytes, str],
							        is_html: bool = False,
							        search_entire_document: bool = False,
							    ) -> Optional[_Encoding]:
							        """Look for an encoding declared within the text of a document.

							        An XML encoding is declared at the beginning of the document.

							        An HTML encoding is declared in a <meta> tag, hopefully near the
							        beginning of the document.

							        :param markup: Some markup.
							        :param is_html: If True, this markup is considered to be HTML. Otherwise
							            it's assumed to be XML.
							        :param search_entire_document: Since an encoding is supposed
							            to be declared near the beginning of the document, most of
							            the time it's only necessary to search a few kilobytes of
							            data.  Set this to True to force this method to search the
							            entire document.
							        :return: The declared encoding in lowercase, or None if no
							            declaration is found.
							        """
							        # Decide how far into the document each pattern may look.
							        if search_entire_document:
							            xml_limit = html_limit = len(markup)
							        else:
							            xml_limit = 1024
							            html_limit = max(2048, int(len(markup) * 0.05))

							        # Pick the patterns that match the type of the markup.
							        patterns = encoding_res[bytes if isinstance(markup, bytes) else str]
							        xml_pattern = patterns["xml"]
							        html_pattern = patterns["html"]

							        # The XML pattern is always tried first; the HTML pattern is
							        # only a fallback, and only for HTML documents.
							        match = xml_pattern.search(markup, endpos=xml_limit)
							        if match is None and is_html:
							            match = html_pattern.search(markup, endpos=html_limit)
							        if match is None:
							            return None

							        found = match.groups()[0]
							        if not found:
							            return None
							        if isinstance(found, bytes):
							            found = found.decode("ascii", "replace")
							        return found.lower()
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							class UnicodeDammit:
 | 
						
						
						
						
							 | 
							
								 | 
							
							    """A class for detecting the encoding of a bytestring containing an
 | 
						
						
						
						
							 | 
							
								 | 
							
							    HTML or XML document, and decoding it to Unicode. If the source
 | 
						
						
						
						
							 | 
							
								 | 
							
							    encoding is windows-1252, `UnicodeDammit` can also replace
 | 
						
						
						
						
							 | 
							
								 | 
							
							    Microsoft smart quotes with their HTML or XML equivalents.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    :param markup: HTML or XML markup in an unknown encoding.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    :param known_definite_encodings: When determining the encoding
 | 
						
						
						
						
							 | 
							
								 | 
							
							        of ``markup``, these encodings will be tried first, in
 | 
						
						
						
						
							 | 
							
								 | 
							
							        order. In HTML terms, this corresponds to the "known
 | 
						
						
						
						
							 | 
							
								 | 
							
							        definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    :param user_encodings: These encodings will be tried after the
 | 
						
						
						
						
							 | 
							
								 | 
							
							        ``known_definite_encodings`` have been tried and failed, and
 | 
						
						
						
						
							 | 
							
								 | 
							
							        after an attempt to sniff the encoding by looking at a
 | 
						
						
						
						
							 | 
							
								 | 
							
							        byte order mark has failed. In HTML terms, this
 | 
						
						
						
						
							 | 
							
								 | 
							
							        corresponds to the step "user has explicitly instructed
 | 
						
						
						
						
							 | 
							
								 | 
							
							        the user agent to override the document's character
 | 
						
						
						
						
							 | 
							
								 | 
							
							        encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    :param override_encodings: A **deprecated** alias for
 | 
						
						
						
						
							 | 
							
								 | 
							
							        ``known_definite_encodings``. Any encodings here will be tried
 | 
						
						
						
						
							 | 
							
								 | 
							
							        immediately after the encodings in
 | 
						
						
						
						
							 | 
							
								 | 
							
							        ``known_definite_encodings``.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    :param smart_quotes_to: By default, Microsoft smart quotes will,
 | 
						
						
						
						
							 | 
							
								 | 
							
							       like all other characters, be converted to Unicode
 | 
						
						
						
						
							 | 
							
								 | 
							
							       characters. Setting this to ``ascii`` will convert them to ASCII
 | 
						
						
						
						
							 | 
							
								 | 
							
							       quotes instead.  Setting it to ``xml`` will convert them to XML
 | 
						
						
						
						
							 | 
							
								 | 
							
							       entity references, and setting it to ``html`` will convert them
 | 
						
						
						
						
							 | 
							
								 | 
							
							       to HTML entity references.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    :param is_html: If True, ``markup`` is treated as an HTML
 | 
						
						
						
						
							 | 
							
								 | 
							
							       document. Otherwise it's treated as an XML document.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    :param exclude_encodings: These encodings will not be considered,
 | 
						
						
						
						
							 | 
							
								 | 
							
							       even if the sniffing code thinks they might make sense.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    """
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    def __init__(
 | 
						
						
						
						
							 | 
							
								 | 
							
							        self,
 | 
						
						
						
						
							 | 
							
								 | 
							
							        markup: bytes,
 | 
						
						
						
						
							 | 
							
								 | 
							
							        known_definite_encodings: Optional[_Encodings] = [],
 | 
						
						
						
						
							 | 
							
								 | 
							
							        smart_quotes_to: Optional[Literal["ascii", "xml", "html"]] = None,
 | 
						
						
						
						
							 | 
							
								 | 
							
							        is_html: bool = False,
 | 
						
						
						
						
							 | 
							
								 | 
							
							        exclude_encodings: Optional[_Encodings] = [],
 | 
						
						
						
						
							 | 
							
								 | 
							
							        user_encodings: Optional[_Encodings] = None,
 | 
						
						
						
						
							 | 
							
								 | 
							
							        override_encodings: Optional[_Encodings] = None,
 | 
						
						
						
						
							 | 
							
								 | 
							
							    ):
 | 
						
						
						
						
							 | 
							
								 | 
							
							        self.smart_quotes_to = smart_quotes_to
 | 
						
						
						
						
							 | 
							
								 | 
							
							        self.tried_encodings = []
 | 
						
						
						
						
							 | 
							
								 | 
							
							        self.contains_replacement_characters = False
 | 
						
						
						
						
							 | 
							
								 | 
							
							        self.is_html = is_html
 | 
						
						
						
						
							 | 
							
								 | 
							
							        self.log = getLogger(__name__)
 | 
						
						
						
						
							 | 
							
								 | 
							
							        self.detector = EncodingDetector(
 | 
						
						
						
						
							 | 
							
								 | 
							
							            markup,
 | 
						
						
						
						
							 | 
							
								 | 
							
							            known_definite_encodings,
 | 
						
						
						
						
							 | 
							
								 | 
							
							            is_html,
 | 
						
						
						
						
							 | 
							
								 | 
							
							            exclude_encodings,
 | 
						
						
						
						
							 | 
							
								 | 
							
							            user_encodings,
 | 
						
						
						
						
							 | 
							
								 | 
							
							            override_encodings,
 | 
						
						
						
						
							 | 
							
								 | 
							
							        )
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # Short-circuit if the data is in Unicode to begin with.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if isinstance(markup, str):
 | 
						
						
						
						
							 | 
							
								 | 
							
							            self.markup = markup.encode("utf8")
 | 
						
						
						
						
							 | 
							
								 | 
							
							            self.unicode_markup = markup
 | 
						
						
						
						
							 | 
							
								 | 
							
							            self.original_encoding = None
 | 
						
						
						
						
							 | 
							
								 | 
							
							            return
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # The encoding detector may have stripped a byte-order mark.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # Use the stripped markup from this point on.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        self.markup = self.detector.markup
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        u = None
 | 
						
						
						
						
							 | 
							
								 | 
							
							        for encoding in self.detector.encodings:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            markup = self.detector.markup
 | 
						
						
						
						
							 | 
							
								 | 
							
							            u = self._convert_from(encoding)
 | 
						
						
						
						
							 | 
							
								 | 
							
							            if u is not None:
 | 
						
						
						
						
							 | 
							
								 | 
							
							                break
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if not u:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # None of the encodings worked. As an absolute last resort,
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # try them again with character replacement.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							            for encoding in self.detector.encodings:
 | 
						
						
						
						
							 | 
							
								 | 
							
							                if encoding != "ascii":
 | 
						
						
						
						
							 | 
							
								 | 
							
							                    u = self._convert_from(encoding, "replace")
 | 
						
						
						
						
							 | 
							
								 | 
							
							                if u is not None:
 | 
						
						
						
						
							 | 
							
								 | 
							
							                    self.log.warning(
 | 
						
						
						
						
							 | 
							
								 | 
							
							                        "Some characters could not be decoded, and were "
 | 
						
						
						
						
							 | 
							
								 | 
							
							                        "replaced with REPLACEMENT CHARACTER."
 | 
						
						
						
						
							 | 
							
								 | 
							
							                    )
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							                    self.contains_replacement_characters = True
 | 
						
						
						
						
							 | 
							
								 | 
							
							                    break
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # If none of that worked, we could at this point force it to
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # ASCII, but that would destroy so much data that I think
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # giving up is better.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        #
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # Note that this is extremely unlikely, probably impossible,
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # because the "replace" strategy is so powerful. Even running
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # the Python binary through Unicode, Dammit gives you Unicode,
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # albeit Unicode riddled with REPLACEMENT CHARACTER.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if u is None:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            self.original_encoding = None
 | 
						
						
						
						
							 | 
							
								 | 
							
							            self.unicode_markup = None
 | 
						
						
						
						
							 | 
							
								 | 
							
							        else:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            self.unicode_markup = u
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
    #: The original markup, before it was converted to Unicode.
    #: This is not necessarily the same as what was passed in to the
    #: constructor, since any byte-order mark will be stripped.
    markup: bytes

    #: The Unicode version of the markup, following conversion. This
    #: is set to None if there was simply no way to convert the
    #: bytestring to Unicode (as with binary data).
    unicode_markup: Optional[str]

    #: This is True if `UnicodeDammit.unicode_markup` contains
    #: U+FFFD REPLACEMENT_CHARACTER characters which were not present
    #: in `UnicodeDammit.markup`. These mark byte sequences that
    #: could not be decoded and were only converted by falling back
    #: to the "replace" error handling strategy.
    contains_replacement_characters: bool

    #: Unicode, Dammit's best guess as to the original character
    #: encoding of `UnicodeDammit.markup`. This is None if the markup
    #: could not be converted to Unicode at all.
    original_encoding: Optional[_Encoding]

    #: The strategy used to handle Microsoft smart quotes. None
    #: leaves the bytes alone; "ascii" substitutes ASCII
    #: approximations; "xml" substitutes XML numeric character
    #: references; any other value substitutes HTML named entities.
    smart_quotes_to: Optional[str]

    #: The (encoding, error handling strategy) 2-tuples that were used to
    #: try and convert the markup to Unicode. The encoding in each
    #: tuple is the codec name returned by `find_codec`, not
    #: necessarily the name that was originally proposed.
    tried_encodings: List[Tuple[_Encoding, str]]

    # Used to emit a warning when characters had to be replaced
    # during conversion.
    log: Logger  #: :meta private:
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    def _sub_ms_char(self, match: re.Match) -> bytes:
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """Changes a MS smart quote character to an XML or HTML
 | 
						
						
						
						
							 | 
							
								 | 
							
							        entity, or an ASCII character.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        TODO: Since this is only used to convert smart quotes, it
 | 
						
						
						
						
							 | 
							
								 | 
							
							        could be simplified, and MS_CHARS_TO_ASCII made much less
 | 
						
						
						
						
							 | 
							
								 | 
							
							        parochial.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """
 | 
						
						
						
						
							 | 
							
								 | 
							
							        orig: bytes = match.group(1)
 | 
						
						
						
						
							 | 
							
								 | 
							
							        sub: bytes
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if self.smart_quotes_to == "ascii":
 | 
						
						
						
						
							 | 
							
								 | 
							
							            if orig in self.MS_CHARS_TO_ASCII:
 | 
						
						
						
						
							 | 
							
								 | 
							
							                sub = self.MS_CHARS_TO_ASCII[orig].encode()
 | 
						
						
						
						
							 | 
							
								 | 
							
							            else:
 | 
						
						
						
						
							 | 
							
								 | 
							
							                # Shouldn't happen; substitute the character
 | 
						
						
						
						
							 | 
							
								 | 
							
							                # with itself.
 | 
						
						
						
						
							 | 
							
								 | 
							
							                sub = orig
 | 
						
						
						
						
							 | 
							
								 | 
							
							        else:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            if orig in self.MS_CHARS:
 | 
						
						
						
						
							 | 
							
								 | 
							
							                substitutions = self.MS_CHARS[orig]
 | 
						
						
						
						
							 | 
							
								 | 
							
							                if type(substitutions) is tuple:
 | 
						
						
						
						
							 | 
							
								 | 
							
							                    if self.smart_quotes_to == "xml":
 | 
						
						
						
						
							 | 
							
								 | 
							
							                        sub = b"&#x" + substitutions[1].encode() + b";"
 | 
						
						
						
						
							 | 
							
								 | 
							
							                    else:
 | 
						
						
						
						
							 | 
							
								 | 
							
							                        sub = b"&" + substitutions[0].encode() + b";"
 | 
						
						
						
						
							 | 
							
								 | 
							
							                else:
 | 
						
						
						
						
							 | 
							
								 | 
							
							                    substitutions = cast(str, substitutions)
 | 
						
						
						
						
							 | 
							
								 | 
							
							                    sub = substitutions.encode()
 | 
						
						
						
						
							 | 
							
								 | 
							
							            else:
 | 
						
						
						
						
							 | 
							
								 | 
							
							                # Shouldn't happen; substitute the character
 | 
						
						
						
						
							 | 
							
								 | 
							
							                # for itself.
 | 
						
						
						
						
							 | 
							
								 | 
							
							                sub = orig
 | 
						
						
						
						
							 | 
							
								 | 
							
							        return sub
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
    #: This dictionary maps commonly seen values for "charset" in HTML
    #: meta tags to the corresponding Python codec names. It only covers
    #: values that aren't in Python's aliases and can't be determined
    #: by the heuristics in `find_codec`.
    #:
    #: `find_codec` consults this mapping before trying any other
    #: variation of the charset name.
    #:
    #: :meta hide-value:
    CHARSET_ALIASES: Dict[str, _Encoding] = {
        "macintosh": "mac-roman",
        "x-sjis": "shift-jis",
    }
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
    #: A list of encodings that tend to contain Microsoft smart quotes.
    #:
    #: `_convert_from` only rewrites smart quotes when the codec name
    #: returned by `find_codec` (which is always lowercase) appears in
    #: this list, so entries here must be lowercase as well.
    #:
    #: :meta hide-value:
    ENCODINGS_WITH_SMART_QUOTES: _Encodings = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
    ]
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    def _convert_from(
 | 
						
						
						
						
							 | 
							
								 | 
							
							        self, proposed: _Encoding, errors: str = "strict"
 | 
						
						
						
						
							 | 
							
								 | 
							
							    ) -> Optional[str]:
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """Attempt to convert the markup to the proposed encoding.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :param proposed: The name of a character encoding.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :param errors: An error handling strategy, used when calling `str`.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :return: The converted markup, or `None` if the proposed
 | 
						
						
						
						
							 | 
							
								 | 
							
							           encoding/error handling strategy didn't work.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """
 | 
						
						
						
						
							 | 
							
								 | 
							
							        lookup_result = self.find_codec(proposed)
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if lookup_result is None or (lookup_result, errors) in self.tried_encodings:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            return None
 | 
						
						
						
						
							 | 
							
								 | 
							
							        proposed = lookup_result
 | 
						
						
						
						
							 | 
							
								 | 
							
							        self.tried_encodings.append((proposed, errors))
 | 
						
						
						
						
							 | 
							
								 | 
							
							        markup = self.markup
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # Convert smart quotes to HTML if coming from an encoding
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # that might have them.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if (
 | 
						
						
						
						
							 | 
							
								 | 
							
							            self.smart_quotes_to is not None
 | 
						
						
						
						
							 | 
							
								 | 
							
							            and proposed in self.ENCODINGS_WITH_SMART_QUOTES
 | 
						
						
						
						
							 | 
							
								 | 
							
							        ):
 | 
						
						
						
						
							 | 
							
								 | 
							
							            smart_quotes_re = b"([\x80-\x9f])"
 | 
						
						
						
						
							 | 
							
								 | 
							
							            smart_quotes_compiled = re.compile(smart_quotes_re)
 | 
						
						
						
						
							 | 
							
								 | 
							
							            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        try:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # print("Trying to convert document to %s (errors=%s)" % (
 | 
						
						
						
						
							 | 
							
								 | 
							
							            #    proposed, errors))
 | 
						
						
						
						
							 | 
							
								 | 
							
							            u = self._to_unicode(markup, proposed, errors)
 | 
						
						
						
						
							 | 
							
								 | 
							
							            self.unicode_markup = u
 | 
						
						
						
						
							 | 
							
								 | 
							
							            self.original_encoding = proposed
 | 
						
						
						
						
							 | 
							
								 | 
							
							        except Exception:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # print("That didn't work!")
 | 
						
						
						
						
							 | 
							
								 | 
							
							            # print(e)
 | 
						
						
						
						
							 | 
							
								 | 
							
							            return None
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # print("Correct encoding: %s" % proposed)
 | 
						
						
						
						
							 | 
							
								 | 
							
							        return self.unicode_markup
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    def _to_unicode(
 | 
						
						
						
						
							 | 
							
								 | 
							
							        self, data: bytes, encoding: _Encoding, errors: str = "strict"
 | 
						
						
						
						
							 | 
							
								 | 
							
							    ) -> str:
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """Given a bytestring and its encoding, decodes the string into Unicode.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :param encoding: The name of an encoding.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :param errors: An error handling strategy, used when calling `str`.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """
 | 
						
						
						
						
							 | 
							
								 | 
							
							        return str(data, encoding, errors)
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    @property
 | 
						
						
						
						
							 | 
							
								 | 
							
							    def declared_html_encoding(self) -> Optional[_Encoding]:
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """If the markup is an HTML document, returns the encoding, if any,
 | 
						
						
						
						
							 | 
							
								 | 
							
							        declared *inside* the document.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if not self.is_html:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            return None
 | 
						
						
						
						
							 | 
							
								 | 
							
							        return self.detector.declared_encoding
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    def find_codec(self, charset: _Encoding) -> Optional[str]:
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """Look up the Python codec corresponding to a given character set.
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :param charset: The name of a character set.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        :return: The name of a Python codec.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        """
 | 
						
						
						
						
							 | 
							
								 | 
							
							        value = (
 | 
						
						
						
						
							 | 
							
								 | 
							
							            self._codec(self.CHARSET_ALIASES.get(charset, charset))
 | 
						
						
						
						
							 | 
							
								 | 
							
							            or (charset and self._codec(charset.replace("-", "")))
 | 
						
						
						
						
							 | 
							
								 | 
							
							            or (charset and self._codec(charset.replace("-", "_")))
 | 
						
						
						
						
							 | 
							
								 | 
							
							            or (charset and charset.lower())
 | 
						
						
						
						
							 | 
							
								 | 
							
							            or charset
 | 
						
						
						
						
							 | 
							
								 | 
							
							        )
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if value:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            return value.lower()
 | 
						
						
						
						
							 | 
							
								 | 
							
							        return None
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    def _codec(self, charset: _Encoding) -> Optional[str]:
 | 
						
						
						
						
							 | 
							
								 | 
							
							        if not charset:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            return charset
 | 
						
						
						
						
							 | 
							
								 | 
							
							        codec = None
 | 
						
						
						
						
							 | 
							
								 | 
							
							        try:
 | 
						
						
						
						
							 | 
							
								 | 
							
							            codecs.lookup(charset)
 | 
						
						
						
						
							 | 
							
								 | 
							
							            codec = charset
 | 
						
						
						
						
							 | 
							
								 | 
							
							        except (LookupError, ValueError):
 | 
						
						
						
						
							 | 
							
								 | 
							
							            pass
 | 
						
						
						
						
							 | 
							
								 | 
							
							        return codec
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    #: A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
 | 
						
						
						
						
							 | 
							
								 | 
							
							    #:
 | 
						
						
						
						
							 | 
							
								 | 
							
							    #: :meta hide-value:
 | 
						
						
						
						
							 | 
							
								 | 
							
							    MS_CHARS: Dict[bytes, Union[str, Tuple[str, str]]] = {
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x80": ("euro", "20AC"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x81": " ",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x82": ("sbquo", "201A"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x83": ("fnof", "192"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x84": ("bdquo", "201E"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x85": ("hellip", "2026"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x86": ("dagger", "2020"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x87": ("Dagger", "2021"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x88": ("circ", "2C6"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x89": ("permil", "2030"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x8a": ("Scaron", "160"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x8b": ("lsaquo", "2039"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x8c": ("OElig", "152"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x8d": "?",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x8e": ("#x17D", "17D"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x8f": "?",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x90": "?",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x91": ("lsquo", "2018"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x92": ("rsquo", "2019"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x93": ("ldquo", "201C"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x94": ("rdquo", "201D"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x95": ("bull", "2022"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x96": ("ndash", "2013"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x97": ("mdash", "2014"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x98": ("tilde", "2DC"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x99": ("trade", "2122"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x9a": ("scaron", "161"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x9b": ("rsaquo", "203A"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x9c": ("oelig", "153"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x9d": "?",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x9e": ("#x17E", "17E"),
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x9f": ("Yuml", ""),
 | 
						
						
						
						
							 | 
							
								 | 
							
							    }
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    #: A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
 | 
						
						
						
						
							 | 
							
								 | 
							
							    #: horrors like stripping diacritical marks to turn á into a, but also
 | 
						
						
						
						
							 | 
							
								 | 
							
							    #: contains non-horrors like turning “ into ".
 | 
						
						
						
						
							 | 
							
								 | 
							
							    #:
 | 
						
						
						
						
							 | 
							
								 | 
							
							    #: Seriously, don't use this for anything other than removing smart
 | 
						
						
						
						
							 | 
							
								 | 
							
							    #: quotes.
 | 
						
						
						
						
							 | 
							
								 | 
							
							    #:
 | 
						
						
						
						
							 | 
							
								 | 
							
							    #: :meta private:
 | 
						
						
						
						
							 | 
							
								 | 
							
							    MS_CHARS_TO_ASCII: Dict[bytes, str] = {
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x80": "EUR",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x81": " ",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x82": ",",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x83": "f",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x84": ",,",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x85": "...",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x86": "+",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x87": "++",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x88": "^",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x89": "%",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x8a": "S",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x8b": "<",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x8c": "OE",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x8d": "?",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x8e": "Z",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x8f": "?",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x90": "?",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x91": "'",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x92": "'",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x93": '"',
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x94": '"',
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x95": "*",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x96": "-",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x97": "--",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x98": "~",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x99": "(TM)",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x9a": "s",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x9b": ">",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x9c": "oe",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x9d": "?",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x9e": "z",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\x9f": "Y",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xa0": " ",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xa1": "!",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xa2": "c",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xa3": "GBP",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xa4": "$",  # This approximation is especially parochial--this is the
 | 
						
						
						
						
							 | 
							
								 | 
							
							        # generic currency symbol.
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xa5": "YEN",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xa6": "|",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xa7": "S",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xa8": "..",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xa9": "",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xaa": "(th)",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xab": "<<",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xac": "!",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xad": " ",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xae": "(R)",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xaf": "-",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xb0": "o",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xb1": "+-",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xb2": "2",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xb3": "3",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xb4": "'",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xb5": "u",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xb6": "P",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xb7": "*",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xb8": ",",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xb9": "1",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xba": "(th)",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xbb": ">>",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xbc": "1/4",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xbd": "1/2",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xbe": "3/4",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xbf": "?",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xc0": "A",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xc1": "A",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xc2": "A",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xc3": "A",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xc4": "A",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xc5": "A",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xc6": "AE",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xc7": "C",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xc8": "E",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xc9": "E",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xca": "E",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xcb": "E",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xcc": "I",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xcd": "I",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xce": "I",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xcf": "I",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xd0": "D",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xd1": "N",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xd2": "O",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xd3": "O",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xd4": "O",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xd5": "O",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xd6": "O",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xd7": "*",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xd8": "O",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xd9": "U",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xda": "U",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xdb": "U",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xdc": "U",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xdd": "Y",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xde": "b",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xdf": "B",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xe0": "a",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xe1": "a",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xe2": "a",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xe3": "a",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xe4": "a",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xe5": "a",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xe6": "ae",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xe7": "c",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xe8": "e",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xe9": "e",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xea": "e",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xeb": "e",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xec": "i",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xed": "i",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xee": "i",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xef": "i",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xf0": "o",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xf1": "n",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xf2": "o",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xf3": "o",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xf4": "o",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xf5": "o",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xf6": "o",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xf7": "/",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xf8": "o",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xf9": "u",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xfa": "u",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xfb": "u",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xfc": "u",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xfd": "y",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xfe": "b",
 | 
						
						
						
						
							 | 
							
								 | 
							
							        b"\xff": "y",
 | 
						
						
						
						
							 | 
							
								 | 
							
							    }
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    #: A map used when removing rogue Windows-1252/ISO-8859-1
							    #: characters in otherwise UTF-8 documents. Keys are single
							    #: Windows-1252 byte values; values are the UTF-8 encoding of
							    #: the character that byte stands for.
							    #:
							    #: Note that \\x81, \\x8d, \\x8f, \\x90, and \\x9d are undefined in
							    #: Windows-1252.
							    #:
							    #: :meta hide-value:
							    WINDOWS_1252_TO_UTF8: Dict[int, bytes] = {
							        # 0x80-0x9F: the Windows-1252 "extras", which map to code
							        # points outside Latin-1.
							        0x80: b"\xe2\x82\xac",  # €
							        0x82: b"\xe2\x80\x9a",  # ‚
							        0x83: b"\xc6\x92",  # ƒ
							        0x84: b"\xe2\x80\x9e",  # „
							        0x85: b"\xe2\x80\xa6",  # …
							        0x86: b"\xe2\x80\xa0",  # †
							        0x87: b"\xe2\x80\xa1",  # ‡
							        0x88: b"\xcb\x86",  # ˆ
							        0x89: b"\xe2\x80\xb0",  # ‰
							        0x8A: b"\xc5\xa0",  # Š
							        0x8B: b"\xe2\x80\xb9",  # ‹
							        0x8C: b"\xc5\x92",  # Œ
							        0x8E: b"\xc5\xbd",  # Ž
							        0x91: b"\xe2\x80\x98",  # ‘
							        0x92: b"\xe2\x80\x99",  # ’
							        0x93: b"\xe2\x80\x9c",  # “
							        0x94: b"\xe2\x80\x9d",  # ”
							        0x95: b"\xe2\x80\xa2",  # •
							        0x96: b"\xe2\x80\x93",  # –
							        0x97: b"\xe2\x80\x94",  # —
							        0x98: b"\xcb\x9c",  # ˜
							        0x99: b"\xe2\x84\xa2",  # ™
							        0x9A: b"\xc5\xa1",  # š
							        0x9B: b"\xe2\x80\xba",  # ›
							        0x9C: b"\xc5\x93",  # œ
							        0x9E: b"\xc5\xbe",  # ž
							        0x9F: b"\xc5\xb8",  # Ÿ
							        # 0xA0-0xBF: same code point as the byte value, encoded
							        # in UTF-8 as 0xC2 followed by the byte itself.
							        0xA0: b"\xc2\xa0",  # (non-breaking space)
							        0xA1: b"\xc2\xa1",  # ¡
							        0xA2: b"\xc2\xa2",  # ¢
							        0xA3: b"\xc2\xa3",  # £
							        0xA4: b"\xc2\xa4",  # ¤
							        0xA5: b"\xc2\xa5",  # ¥
							        0xA6: b"\xc2\xa6",  # ¦
							        0xA7: b"\xc2\xa7",  # §
							        0xA8: b"\xc2\xa8",  # ¨
							        0xA9: b"\xc2\xa9",  # ©
							        0xAA: b"\xc2\xaa",  # ª
							        0xAB: b"\xc2\xab",  # «
							        0xAC: b"\xc2\xac",  # ¬
							        0xAD: b"\xc2\xad",  # (soft hyphen)
							        0xAE: b"\xc2\xae",  # ®
							        0xAF: b"\xc2\xaf",  # ¯
							        0xB0: b"\xc2\xb0",  # °
							        0xB1: b"\xc2\xb1",  # ±
							        0xB2: b"\xc2\xb2",  # ²
							        0xB3: b"\xc2\xb3",  # ³
							        0xB4: b"\xc2\xb4",  # ´
							        0xB5: b"\xc2\xb5",  # µ
							        0xB6: b"\xc2\xb6",  # ¶
							        0xB7: b"\xc2\xb7",  # ·
							        0xB8: b"\xc2\xb8",  # ¸
							        0xB9: b"\xc2\xb9",  # ¹
							        0xBA: b"\xc2\xba",  # º
							        0xBB: b"\xc2\xbb",  # »
							        0xBC: b"\xc2\xbc",  # ¼
							        0xBD: b"\xc2\xbd",  # ½
							        0xBE: b"\xc2\xbe",  # ¾
							        0xBF: b"\xc2\xbf",  # ¿
							        # 0xC0-0xFE: same code point as the byte value, encoded
							        # in UTF-8 as 0xC3 followed by (byte - 0x40).
							        0xC0: b"\xc3\x80",  # À
							        0xC1: b"\xc3\x81",  # Á
							        0xC2: b"\xc3\x82",  # Â
							        0xC3: b"\xc3\x83",  # Ã
							        0xC4: b"\xc3\x84",  # Ä
							        0xC5: b"\xc3\x85",  # Å
							        0xC6: b"\xc3\x86",  # Æ
							        0xC7: b"\xc3\x87",  # Ç
							        0xC8: b"\xc3\x88",  # È
							        0xC9: b"\xc3\x89",  # É
							        0xCA: b"\xc3\x8a",  # Ê
							        0xCB: b"\xc3\x8b",  # Ë
							        0xCC: b"\xc3\x8c",  # Ì
							        0xCD: b"\xc3\x8d",  # Í
							        0xCE: b"\xc3\x8e",  # Î
							        0xCF: b"\xc3\x8f",  # Ï
							        0xD0: b"\xc3\x90",  # Ð
							        0xD1: b"\xc3\x91",  # Ñ
							        0xD2: b"\xc3\x92",  # Ò
							        0xD3: b"\xc3\x93",  # Ó
							        0xD4: b"\xc3\x94",  # Ô
							        0xD5: b"\xc3\x95",  # Õ
							        0xD6: b"\xc3\x96",  # Ö
							        0xD7: b"\xc3\x97",  # ×
							        0xD8: b"\xc3\x98",  # Ø
							        0xD9: b"\xc3\x99",  # Ù
							        0xDA: b"\xc3\x9a",  # Ú
							        0xDB: b"\xc3\x9b",  # Û
							        0xDC: b"\xc3\x9c",  # Ü
							        0xDD: b"\xc3\x9d",  # Ý
							        0xDE: b"\xc3\x9e",  # Þ
							        0xDF: b"\xc3\x9f",  # ß
							        0xE0: b"\xc3\xa0",  # à
							        0xE1: b"\xc3\xa1",  # á
							        0xE2: b"\xc3\xa2",  # â
							        0xE3: b"\xc3\xa3",  # ã
							        0xE4: b"\xc3\xa4",  # ä
							        0xE5: b"\xc3\xa5",  # å
							        0xE6: b"\xc3\xa6",  # æ
							        0xE7: b"\xc3\xa7",  # ç
							        0xE8: b"\xc3\xa8",  # è
							        0xE9: b"\xc3\xa9",  # é
							        0xEA: b"\xc3\xaa",  # ê
							        0xEB: b"\xc3\xab",  # ë
							        0xEC: b"\xc3\xac",  # ì
							        0xED: b"\xc3\xad",  # í
							        0xEE: b"\xc3\xae",  # î
							        0xEF: b"\xc3\xaf",  # ï
							        0xF0: b"\xc3\xb0",  # ð
							        0xF1: b"\xc3\xb1",  # ñ
							        0xF2: b"\xc3\xb2",  # ò
							        0xF3: b"\xc3\xb3",  # ó
							        0xF4: b"\xc3\xb4",  # ô
							        0xF5: b"\xc3\xb5",  # õ
							        0xF6: b"\xc3\xb6",  # ö
							        0xF7: b"\xc3\xb7",  # ÷
							        0xF8: b"\xc3\xb8",  # ø
							        0xF9: b"\xc3\xb9",  # ù
							        0xFA: b"\xc3\xba",  # ú
							        0xFB: b"\xc3\xbb",  # û
							        0xFC: b"\xc3\xbc",  # ü
							        0xFD: b"\xc3\xbd",  # ý
							        0xFE: b"\xc3\xbe",  # þ
							        # NOTE(review): 0xFF (ÿ) has no entry in this table --
							        # confirm whether that omission is intentional.
							    }
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    #: :meta private:
							    MULTIBYTE_MARKERS_AND_SIZES: List[Tuple[int, int, int]] = [
							        # Each entry is (lowest lead byte, highest lead byte, number
							        # of bytes in a sequence that starts with such a lead byte).
							        (0xC2, 0xDF, 2),  # lead bytes C2-DF open a 2-byte character
							        (0xE0, 0xEF, 3),  # lead bytes E0-EF open a 3-byte character
							        (0xF0, 0xF4, 4),  # lead bytes F0-F4 open a 4-byte character
							    ]

							    # Lowest lead byte listed in the table above (low end of its
							    # first entry).
							    #: :meta private:
							    FIRST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[0][0]

							    # Highest lead byte listed in the table above (high end of its
							    # last entry).
							    #: :meta private:
							    LAST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
 | 
						
						
						
						
							 | 
							
								 | 
							
							
 | 
						
						
						
						
							 | 
							
								 | 
							
							    @classmethod
							    def detwingle(
							        cls,
							        in_bytes: bytes,
							        main_encoding: _Encoding = "utf8",
							        embedded_encoding: _Encoding = "windows-1252",
							    ) -> bytes:
							        """Fix characters from one encoding embedded in some other encoding.

							        Currently the only situation supported is Windows-1252 (or its
							        subset ISO-8859-1), embedded in UTF-8.

							        The scan is a byte-level heuristic: any byte in the UTF-8
							        lead-byte range is assumed to start a multibyte UTF-8
							        character and the whole sequence is skipped without
							        validating its continuation bytes. Only bytes outside that
							        range that have an entry in `WINDOWS_1252_TO_UTF8` are
							        rewritten.

							        :param in_bytes: A bytestring that you suspect contains
							            characters from multiple encodings. Note that this *must*
							            be a bytestring. If you've already converted the document
							            to Unicode, you're too late.
							        :param main_encoding: The primary encoding of ``in_bytes``.
							            Only ``"utf8"`` and ``"utf-8"`` (case-insensitive) are
							            accepted.
							        :param embedded_encoding: The encoding that was used to embed characters
							            in the main document. ``"windows-1252"`` and
							            ``"iso-8859-1"`` are accepted, case-insensitively and
							            with underscores treated as hyphens.
							        :return: A bytestring similar to ``in_bytes``, in which
							          ``embedded_encoding`` characters have been converted to
							          their ``main_encoding`` equivalents. If nothing needed to
							          be converted, ``in_bytes`` itself is returned.
							        :raises NotImplementedError: If either encoding is not one of
							          the supported values.
							        """
							        # Normalize once so that e.g. "Windows_1252" and "ISO_8859_1"
							        # are recognized. ISO-8859-1 is accepted because the error
							        # message below advertises it as supported.
							        normalized_embedded = embedded_encoding.replace("_", "-").lower()
							        if normalized_embedded not in ("windows-1252", "iso-8859-1"):
							            raise NotImplementedError(
							                "Windows-1252 and ISO-8859-1 are the only currently supported "
							                "embedded encodings."
							            )

							        if main_encoding.lower() not in ("utf8", "utf-8"):
							            raise NotImplementedError(
							                "UTF-8 is the only currently supported main encoding."
							            )

							        byte_chunks = []

							        chunk_start = 0
							        pos = 0
							        total = len(in_bytes)
							        while pos < total:
							            byte = in_bytes[pos]
							            if cls.FIRST_MULTIBYTE_MARKER <= byte <= cls.LAST_MULTIBYTE_MARKER:
							                # This is the start of a UTF-8 multibyte character. Skip
							                # to the end.
							                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
							                    if start <= byte <= end:
							                        pos += size
							                        break
							                else:
							                    # No range claimed this byte. Step over it so the
							                    # scan always makes progress.
							                    pos += 1
							            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
							                # We found a Windows-1252 character!
							                # Save the string up to this point as a chunk.
							                byte_chunks.append(in_bytes[chunk_start:pos])

							                # Now translate the Windows-1252 character into UTF-8
							                # and add it as another chunk.
							                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
							                pos += 1
							                chunk_start = pos
							            else:
							                # Go on to the next character.
							                pos += 1

							        if chunk_start == 0:
							            # chunk_start only moves when a byte is translated, so
							            # the string is unchanged.
							            return in_bytes

							        # Store the final chunk.
							        byte_chunks.append(in_bytes[chunk_start:])
							        return b"".join(byte_chunks)
 |