|
|
# -*- coding: utf-8 -*-
|
|
|
"""Beautiful Soup bonus library: Unicode, Dammit
|
|
|
|
|
|
This library converts a bytestream to Unicode through any means
|
|
|
necessary. It is heavily based on code from Mark Pilgrim's `Universal
|
|
|
Feed Parser <https://pypi.org/project/feedparser/>`_, now maintained
|
|
|
by Kurt McKee. It does not rewrite the body of an XML or HTML document
|
|
|
to reflect a new encoding; that's the job of `TreeBuilder`.
|
|
|
|
|
|
"""
|
|
|
|
|
|
# Use of this source code is governed by the MIT license.
|
|
|
__license__ = "MIT"
|
|
|
|
|
|
from html.entities import codepoint2name
|
|
|
from collections import defaultdict
|
|
|
import codecs
|
|
|
from html.entities import html5
|
|
|
import re
|
|
|
from logging import Logger, getLogger
|
|
|
from types import ModuleType
|
|
|
from typing import (
|
|
|
Dict,
|
|
|
Iterator,
|
|
|
List,
|
|
|
Optional,
|
|
|
Pattern,
|
|
|
Set,
|
|
|
Tuple,
|
|
|
Type,
|
|
|
Union,
|
|
|
cast,
|
|
|
)
|
|
|
from typing_extensions import Literal
|
|
|
from bs4._typing import (
|
|
|
_Encoding,
|
|
|
_Encodings,
|
|
|
)
|
|
|
import warnings
|
|
|
|
|
|
# Import a library to autodetect character encodings. We'll support
|
|
|
# any of a number of libraries that all support the same API:
|
|
|
#
|
|
|
# * cchardet
|
|
|
# * chardet
|
|
|
# * charset-normalizer
|
|
|
chardet_module: Optional[ModuleType] = None

# Probe the supported detection libraries in order of preference. The
# first one that imports cleanly wins; later candidates are only probed
# while nothing has been found yet.
try:
    # PyPI package: cchardet
    import cchardet  # type:ignore
except ImportError:
    pass
else:
    chardet_module = cchardet

if chardet_module is None:
    try:
        # Debian package: python-chardet
        # PyPI package: chardet
        import chardet
    except ImportError:
        pass
    else:
        chardet_module = chardet

if chardet_module is None:
    try:
        # PyPI package: charset-normalizer
        import charset_normalizer  # type:ignore
    except ImportError:
        # No chardet available.
        pass
    else:
        chardet_module = charset_normalizer


def _chardet_dammit(s: bytes) -> Optional[str]:
    """Ask the installed detection library to guess the encoding of ``s``.

    :param s: The bytestring whose encoding should be guessed.
    :return: The value of the ``"encoding"`` key in the detection
        library's result, or None when no detection library could be
        imported or when ``s`` is already a Unicode string.
    """
    if chardet_module is None:
        # Nothing to delegate to.
        return None
    if isinstance(s, str):
        # Already decoded; there is no encoding left to detect.
        return None
    detection = chardet_module.detect(s)
    return detection["encoding"]
|
|
|
|
|
|
|
|
|
# Build bytestring and Unicode versions of regular expressions for finding
|
|
|
# a declared encoding inside an XML or HTML document.
|
|
|
# Pattern source for an encoding declared in an XML declaration at the
# very start of a document; group 1 is the declared encoding.
xml_encoding: str = "^\\s*<\\?.*encoding=['\"](.*?)['\"].*\\?>"  #: :meta private:

# Pattern source for a charset declared in an HTML <meta> tag; group 1
# is the declared charset.
html_meta: str = (
    "<\\s*meta[^>]+charset\\s*=\\s*[\"']?([^>]*?)[ /;'\">]"  #: :meta private:
)

# Compiled, case-insensitive versions of both patterns, keyed first by
# the type of markup being searched (bytes or str) and then by the kind
# of document ("html" or "xml").
# TODO-TYPING: The Pattern type here could use more refinement, but it's tricky.
encoding_res: Dict[Type, Dict[str, Pattern]] = {
    bytes: {
        "html": re.compile(html_meta.encode("ascii"), re.I),
        "xml": re.compile(xml_encoding.encode("ascii"), re.I),
    },
    str: {
        "html": re.compile(html_meta, re.I),
        "xml": re.compile(xml_encoding, re.I),
    },
}
|
|
|
|
|
|
|
|
|
class EntitySubstitution(object):
    """The ability to substitute XML or HTML entities for certain characters."""

    #: A map of named HTML entities to the corresponding Unicode string.
    #:
    #: :meta hide-value:
    HTML_ENTITY_TO_CHARACTER: Dict[str, str]

    #: A map of Unicode strings to the corresponding named HTML entities;
    #: the inverse of HTML_ENTITY_TO_CHARACTER.
    #:
    #: :meta hide-value:
    CHARACTER_TO_HTML_ENTITY: Dict[str, str]

    #: A regular expression that matches any character (or, in rare
    #: cases, pair of characters) that can be replaced with a named
    #: HTML entity.
    #:
    #: :meta hide-value:
    CHARACTER_TO_HTML_ENTITY_RE: Pattern[str]

    #: A very similar regular expression to
    #: CHARACTER_TO_HTML_ENTITY_RE, but which also matches unescaped
    #: ampersands. This is used by the 'html' formatter to provide
    #: backwards-compatibility, even though the HTML5 spec allows most
    #: ampersands to go unescaped.
    #:
    #: :meta hide-value:
    CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: Pattern[str]

    @classmethod
    def _populate_class_variables(cls) -> None:
        """Initialize variables used by this class to manage the plethora of
        HTML5 named entities.

        This function sets the following class variables:

        CHARACTER_TO_HTML_ENTITY - A mapping of Unicode strings like "⦨" to
        entity names like "angmsdaa". When a single Unicode string has
        multiple entity names, we try to choose the most commonly-used
        name.

        HTML_ENTITY_TO_CHARACTER: A mapping of entity names like "angmsdaa" to
        Unicode strings like "⦨".

        CHARACTER_TO_HTML_ENTITY_RE: A regular expression matching (almost) any
        Unicode string that corresponds to an HTML5 named entity.

        CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: A very similar
        regular expression to CHARACTER_TO_HTML_ENTITY_RE, but which
        also matches unescaped ampersands. This is used by the 'html'
        formatter to provide backwards-compatibility, even though the HTML5
        spec allows most ampersands to go unescaped.
        """
        unicode_to_name: Dict[str, str] = {}
        name_to_unicode: Dict[str, str] = {}

        short_entities: Set[str] = set()
        long_entities_by_first_character: Dict[str, Set[str]] = defaultdict(set)

        for name_with_semicolon, character in sorted(html5.items()):
            # "It is intentional, for legacy compatibility, that many
            # code points have multiple character reference names. For
            # example, some appear both with and without the trailing
            # semicolon, or with different capitalizations."
            # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
            #
            # The parsers are in charge of handling (or not) character
            # references with no trailing semicolon, so we remove the
            # semicolon whenever it appears.
            if name_with_semicolon.endswith(";"):
                name = name_with_semicolon[:-1]
            else:
                name = name_with_semicolon

            # When parsing HTML, we want to recognize any known named
            # entity and convert it to a sequence of Unicode
            # characters.
            if name not in name_to_unicode:
                name_to_unicode[name] = character

            # When _generating_ HTML, we want to recognize special
            # character sequences that _could_ be converted to named
            # entities. Later names overwrite earlier ones, so the
            # sort order above decides which name survives here.
            unicode_to_name[character] = name

            # We also need to build a regular expression that lets us
            # _find_ those characters in output strings so we can
            # replace them.
            #
            # This is tricky, for two reasons.

            if len(character) == 1 and ord(character) < 128 and character not in "<>":
                # First, it would be annoying to turn single ASCII
                # characters like | into named entities like
                # &verbar;. The exceptions are <>, which we _must_
                # turn into named entities to produce valid HTML.
                continue

            if len(character) > 1 and all(ord(x) < 128 for x in character):
                # We also do not want to turn _combinations_ of ASCII
                # characters like 'fj' into named entities like '&fjlig;',
                # though that's more debatable.
                continue

            # Second, some named entities have a Unicode value that's
            # a subset of the Unicode value for some _other_ named
            # entity. As an example, '\u2267' is &GreaterFullEqual;,
            # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
            # expression needs to match the first two characters of
            # "\u2267\u0338foo", but only the first character of
            # "\u2267foo".
            #
            # In this step, we build two sets of characters that
            # _eventually_ need to go into the regular expression. But
            # we won't know exactly what the regular expression needs
            # to look like until we've gone through the entire list of
            # named entities.
            if len(character) == 1 and character != "&":
                short_entities.add(character)
            else:
                long_entities_by_first_character[character[0]].add(character)

        # Now that we've been through the entire list of entities, we
        # can create a regular expression that matches any of them.
        particles: Set[str] = set()
        for short in short_entities:
            long_versions = long_entities_by_first_character[short]
            if not long_versions:
                particles.add(short)
            else:
                ignore = "".join([x[1] for x in long_versions])
                # This finds, e.g. \u2267 but only if it is _not_
                # followed by \u0338.
                particles.add("%s(?![%s])" % (short, ignore))

        for long_entities in long_entities_by_first_character.values():
            particles.update(long_entities)

        re_definition = "(%s)" % "|".join(particles)

        # The ampersand is added only after the first definition has
        # been built, so it is matched by the second expression alone.
        particles.add("&")
        re_definition_with_ampersand = "(%s)" % "|".join(particles)

        # If an entity shows up in both html5 and codepoint2name, it's
        # likely that HTML5 gives it several different names, such as
        # 'rsquo' and 'rsquor'. When converting Unicode characters to
        # named entities, the codepoint2name name should take
        # precedence where possible, since that's the more easily
        # recognizable one.
        for codepoint, name in codepoint2name.items():
            unicode_to_name[chr(codepoint)] = name

        cls.CHARACTER_TO_HTML_ENTITY = unicode_to_name
        cls.HTML_ENTITY_TO_CHARACTER = name_to_unicode
        cls.CHARACTER_TO_HTML_ENTITY_RE = re.compile(re_definition)
        cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE = re.compile(
            re_definition_with_ampersand
        )

    #: A map of Unicode strings to the corresponding named XML entities.
    #:
    #: :meta hide-value:
    CHARACTER_TO_XML_ENTITY: Dict[str, str] = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Matches any named or numeric HTML entity.
    ANY_ENTITY_RE = re.compile("&(#\\d+|#x[0-9a-fA-F]+|\\w+);", re.I)

    #: A regular expression matching an angle bracket or an ampersand that
    #: is not part of an XML or HTML entity.
    #:
    #: :meta hide-value:
    BARE_AMPERSAND_OR_BRACKET: Pattern[str] = re.compile(
        "([<>]|" "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" ")"
    )

    #: A regular expression matching an angle bracket or an ampersand.
    #:
    #: :meta hide-value:
    AMPERSAND_OR_BRACKET: Pattern[str] = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj: re.Match) -> str:
        """Used with a regular expression to substitute the
        appropriate HTML entity for a special character string.

        :param matchobj: A match whose full text is the character (or
            pair of characters) to replace.
        :return: The named entity reference for the matched text.
        """
        original_entity = matchobj.group(0)
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(original_entity)
        if entity is None:
            # Defensive fallback: the regular expressions that drive this
            # method are built from characters that were all registered in
            # CHARACTER_TO_HTML_ENTITY, so this is not expected to happen.
            return "&amp;%s;" % original_entity
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj: re.Match) -> str:
        """Used with a regular expression to substitute the
        appropriate XML entity for a special character string.

        :param matchobj: A match whose full text is a key of
            CHARACTER_TO_XML_ENTITY.
        :return: The named XML entity reference for the matched text.
        :raises KeyError: If the matched text has no XML entity.
        """
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def _escape_entity_name(cls, matchobj: re.Match) -> str:
        """Escape the ampersand of anything that looks like an entity.

        :param matchobj: A match from ANY_ENTITY_RE; group 1 is the text
            between the ampersand and the semicolon.
        :return: The same entity text with its leading ampersand
            replaced by ``&amp;``.
        """
        return "&amp;%s;" % matchobj.group(1)

    @classmethod
    def _escape_unrecognized_entity_name(cls, matchobj: re.Match) -> str:
        """Escape the ampersand of an entity-like string only if its name
        is not a recognized HTML entity name.

        :param matchobj: A match from ANY_ENTITY_RE; group 1 is the text
            between the ampersand and the semicolon.
        :return: The entity unchanged if its name is a key of
            HTML_ENTITY_TO_CHARACTER; otherwise the entity with its
            leading ampersand replaced by ``&amp;``.
        """
        possible_entity = matchobj.group(1)
        if possible_entity in cls.HTML_ENTITY_TO_CHARACTER:
            return "&%s;" % possible_entity
        return "&amp;%s;" % possible_entity

    @classmethod
    def quoted_attribute_value(cls, value: str) -> str:
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"

        :param value: The XML attribute value to quote
        :return: The quoted value
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value: str, make_quoted_attribute: bool = False) -> str:
        """Replace special XML characters with named XML entities.

        The less-than sign will become &lt;, the greater-than sign
        will become &gt;, and any ampersands will become &amp;. If you
        want ampersands that seem to be part of an entity definition
        to be left alone, use `substitute_xml_containing_entities`
        instead.

        :param value: A string to be substituted.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: A version of ``value`` with special characters replaced
         with named entities.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value: str, make_quoted_attribute: bool = False
    ) -> str:
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: A version of ``value`` with the substitutions applied.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities.

        This differs from ``data.encode(encoding, 'xmlcharrefreplace')``
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
           HTML entities.
        """
        # Convert any appropriate characters to HTML entities.
        return cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE.sub(
            cls._substitute_html_entity, s
        )

    @classmethod
    def substitute_html5(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities
        using HTML5 rules.

        Specifically, this method is much less aggressive about
        escaping ampersands than substitute_html. Only ambiguous
        ampersands are escaped, per the HTML5 standard:

        "An ambiguous ampersand is a U+0026 AMPERSAND character (&)
        that is followed by one or more ASCII alphanumerics, followed
        by a U+003B SEMICOLON character (;), where these characters do
        not match any of the names given in the named character
        references section."

        Unlike substitute_html5_raw, this method assumes HTML entities
        were converted to Unicode characters on the way in, as
        Beautiful Soup does. By the time Beautiful Soup does its work,
        the only ambiguous ampersands that need to be escaped are the
        ones that were escaped in the original markup when mentioning
        HTML entities.

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
           HTML entities.
        """
        # First, escape any HTML entities found in the markup.
        s = cls.ANY_ENTITY_RE.sub(cls._escape_entity_name, s)

        # Next, convert any appropriate characters to unescaped HTML entities.
        s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)

        return s

    @classmethod
    def substitute_html5_raw(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities
        using HTML5 rules.

        substitute_html5_raw is similar to substitute_html5 but it is
        designed for standalone use (whereas substitute_html5 is
        designed for use with Beautiful Soup).

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
           HTML entities.
        """
        # First, escape the ampersand for anything that looks like an
        # entity but isn't in the list of recognized entities. All other
        # ampersands can be left alone.
        s = cls.ANY_ENTITY_RE.sub(cls._escape_unrecognized_entity_name, s)

        # Then, convert a range of Unicode characters to unescaped
        # HTML entities.
        s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)

        return s


EntitySubstitution._populate_class_variables()
|
|
|
|
|
|
|
|
|
class EncodingDetector:
    """Guesses, in priority order, the encodings a bytestring might be in.

    Candidates are offered in this order:

    1. Encodings you specifically tell EncodingDetector to try first
       (the ``known_definite_encodings`` argument to the constructor).

    2. An encoding determined by sniffing the document's byte-order mark.

    3. Encodings you specifically tell EncodingDetector to try if
       byte-order mark sniffing fails (the ``user_encodings`` argument to the
       constructor).

    4. An encoding declared within the bytestring itself, either in an
       XML declaration (if the bytestring is to be interpreted as an XML
       document), or in a <meta> tag (if the bytestring is to be
       interpreted as an HTML document.)

    5. An encoding detected through textual analysis by chardet,
       cchardet, or a similar external library.

    6. UTF-8.

    7. Windows-1252.

    :param markup: Some markup in an unknown encoding.

    :param known_definite_encodings: When determining the encoding
        of ``markup``, these encodings will be tried first, in
        order. In HTML terms, this corresponds to the "known
        definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.

    :param user_encodings: These encodings will be tried after the
        ``known_definite_encodings`` have been tried and failed, and
        after an attempt to sniff the encoding by looking at a
        byte order mark has failed. In HTML terms, this
        corresponds to the step "user has explicitly instructed
        the user agent to override the document's character
        encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.

    :param override_encodings: A **deprecated** alias for
        ``known_definite_encodings``. Any encodings here will be tried
        immediately after the encodings in
        ``known_definite_encodings``.

    :param is_html: If True, this markup is considered to be
        HTML. Otherwise it's assumed to be XML.

    :param exclude_encodings: These encodings will not be tried,
        even if they otherwise would be.

    """

    known_definite_encodings: _Encodings
    user_encodings: _Encodings
    exclude_encodings: _Encodings
    chardet_encoding: Optional[_Encoding]
    is_html: bool
    declared_encoding: Optional[_Encoding]
    markup: bytes
    sniffed_encoding: Optional[_Encoding]

    def __init__(
        self,
        markup: bytes,
        known_definite_encodings: Optional[_Encodings] = None,
        is_html: Optional[bool] = False,
        exclude_encodings: Optional[_Encodings] = None,
        user_encodings: Optional[_Encodings] = None,
        override_encodings: Optional[_Encodings] = None,
    ):
        # Work on a copy so the caller's list is never extended in place.
        self.known_definite_encodings = list(known_definite_encodings or [])
        if override_encodings:
            warnings.warn(
                "The 'override_encodings' argument was deprecated in 4.10.0. Use 'known_definite_encodings' instead.",
                DeprecationWarning,
                stacklevel=3,
            )
            self.known_definite_encodings += override_encodings
        self.user_encodings = user_encodings or []

        # Exclusions are compared case-insensitively, so store them
        # lowercased.
        self.exclude_encodings = {name.lower() for name in (exclude_encodings or [])}

        # Both of these are filled in lazily by the `encodings` property.
        self.chardet_encoding = None
        self.declared_encoding = None

        self.is_html = is_html if is_html is not None else False

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding: Optional[_Encoding], tried: Set[_Encoding]) -> bool:
        """Decide whether an encoding is worth trying.

        :param encoding: Name of an encoding.
        :param tried: Encodings that have already been tried. This
            will be modified as a side effect: an encoding judged
            usable is recorded here in lowercase.
        :return: True if the encoding is not None, is not excluded and
            has not been tried before.
        """
        if encoding is None:
            return False
        normalized = encoding.lower()
        if normalized in self.exclude_encodings or normalized in tried:
            return False
        tried.add(normalized)
        return True

    @property
    def encodings(self) -> Iterator[_Encoding]:
        """Yield a number of encodings that might work for this markup.

        :yield: A sequence of strings. Each is the name of an encoding
            that *might* work to convert a bytestring into Unicode.
        """
        seen: Set[_Encoding] = set()

        # The known definite encodings come first.
        for candidate in self.known_definite_encodings:
            if self._usable(candidate, seen):
                yield candidate

        # Next, whatever a byte-order mark at the start of the document
        # implied, if there was one.
        bom_encoding = self.sniffed_encoding
        if bom_encoding is not None and self._usable(bom_encoding, seen):
            yield bom_encoding

        # Then the encodings supplied by the user.
        for candidate in self.user_encodings:
            if self._usable(candidate, seen):
                yield candidate

        # Then an encoding declared inside the document itself. The
        # search only happens once; the result is cached on the instance.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html
            )
        declared = self.declared_encoding
        if declared is not None and self._usable(declared, seen):
            yield declared

        # Then a guess from a third-party detection library, also cached.
        if self.chardet_encoding is None:
            self.chardet_encoding = _chardet_dammit(self.markup)
        guessed = self.chardet_encoding
        if guessed is not None and self._usable(guessed, seen):
            yield guessed

        # As a last-ditch effort, try utf-8 and windows-1252.
        for fallback in ("utf-8", "windows-1252"):
            if self._usable(fallback, seen):
                yield fallback

    @classmethod
    def strip_byte_order_mark(cls, data: bytes) -> Tuple[bytes, Optional[_Encoding]]:
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: A bytestring that may or may not begin with a
           byte-order mark.

        :return: A 2-tuple (data stripped of byte-order mark, encoding implied by byte-order mark)
        """
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, None

        # A two-byte UTF-16 mark only counts when at least four bytes are
        # present and the following two bytes are not both zero; a
        # little-endian mark followed by two zero bytes is the UTF-32
        # mark checked further down.
        if len(data) >= 4 and data[2:4] != b"\x00\x00":
            leading_pair = data[:2]
            if leading_pair == b"\xfe\xff":
                return data[2:], "utf-16be"
            if leading_pair == b"\xff\xfe":
                return data[2:], "utf-16le"

        if data[:3] == b"\xef\xbb\xbf":
            return data[3:], "utf-8"

        leading_quad = data[:4]
        if leading_quad == b"\x00\x00\xfe\xff":
            return data[4:], "utf-32be"
        if leading_quad == b"\xff\xfe\x00\x00":
            return data[4:], "utf-32le"

        # No byte-order mark was recognized.
        return data, None

    @classmethod
    def find_declared_encoding(
        cls,
        markup: Union[bytes, str],
        is_html: bool = False,
        search_entire_document: bool = False,
    ) -> Optional[_Encoding]:
        """Given a document, tries to find an encoding declared within the
        text of the document itself.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: Some markup.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed
            to be declared near the beginning of the document, most of
            the time it's only necessary to search a few kilobytes of
            data.  Set this to True to force this method to search the
            entire document.
        :return: The declared encoding, lowercased, if one is found.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            # XML declarations are searched for in the first 1024
            # characters; <meta> tags in the first 2048 characters or
            # the first 5% of the document, whichever is larger.
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        # Pick the pattern set that matches the type of the markup.
        patterns = encoding_res[bytes] if isinstance(markup, bytes) else encoding_res[str]
        xml_pattern = patterns["xml"]
        html_pattern = patterns["html"]

        # The XML declaration is looked for even in HTML documents; the
        # <meta> tag is only consulted for HTML when that search fails.
        found = xml_pattern.search(markup, endpos=xml_endpos)
        if not found and is_html:
            found = html_pattern.search(markup, endpos=html_endpos)
        if found is None:
            return None

        declared = found.groups()[0]
        if not declared:
            return None
        if isinstance(declared, bytes):
            declared = declared.decode("ascii", "replace")
        return declared.lower()
|
|
|
|
|
|
|
|
|
class UnicodeDammit:
|
|
|
"""A class for detecting the encoding of a bytestring containing an
|
|
|
HTML or XML document, and decoding it to Unicode. If the source
|
|
|
encoding is windows-1252, `UnicodeDammit` can also replace
|
|
|
Microsoft smart quotes with their HTML or XML equivalents.
|
|
|
|
|
|
:param markup: HTML or XML markup in an unknown encoding.
|
|
|
|
|
|
:param known_definite_encodings: When determining the encoding
|
|
|
of ``markup``, these encodings will be tried first, in
|
|
|
order. In HTML terms, this corresponds to the "known
|
|
|
definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.
|
|
|
|
|
|
:param user_encodings: These encodings will be tried after the
|
|
|
``known_definite_encodings`` have been tried and failed, and
|
|
|
after an attempt to sniff the encoding by looking at a
|
|
|
byte order mark has failed. In HTML terms, this
|
|
|
corresponds to the step "user has explicitly instructed
|
|
|
the user agent to override the document's character
|
|
|
encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.
|
|
|
|
|
|
:param override_encodings: A **deprecated** alias for
|
|
|
``known_definite_encodings``. Any encodings here will be tried
|
|
|
immediately after the encodings in
|
|
|
``known_definite_encodings``.
|
|
|
|
|
|
:param smart_quotes_to: By default, Microsoft smart quotes will,
|
|
|
like all other characters, be converted to Unicode
|
|
|
characters. Setting this to ``ascii`` will convert them to ASCII
|
|
|
quotes instead. Setting it to ``xml`` will convert them to XML
|
|
|
entity references, and setting it to ``html`` will convert them
|
|
|
to HTML entity references.
|
|
|
|
|
|
:param is_html: If True, ``markup`` is treated as an HTML
|
|
|
document. Otherwise it's treated as an XML document.
|
|
|
|
|
|
:param exclude_encodings: These encodings will not be considered,
|
|
|
even if the sniffing code thinks they might make sense.
|
|
|
|
|
|
"""
|
|
|
|
|
|
def __init__(
    self,
    markup: bytes,
    known_definite_encodings: Optional[_Encodings] = None,
    smart_quotes_to: Optional[Literal["ascii", "xml", "html"]] = None,
    is_html: bool = False,
    exclude_encodings: Optional[_Encodings] = None,
    user_encodings: Optional[_Encodings] = None,
    override_encodings: Optional[_Encodings] = None,
):
    """Detect the encoding of ``markup`` and convert it to Unicode.

    On return, ``unicode_markup`` holds the converted document (or
    None if no conversion produced a result), ``markup`` holds the
    bytestring that was converted, and
    ``contains_replacement_characters`` records whether the
    conversion had to fall back to the "replace" error strategy.

    :param markup: HTML or XML markup in an unknown encoding. A
        Unicode string is also accepted and is used as-is.
    :param known_definite_encodings: Encodings to try first, in order.
    :param smart_quotes_to: The strategy used to handle Microsoft
        smart quotes: ``ascii``, ``xml``, ``html`` or None.
    :param is_html: If True, ``markup`` is treated as an HTML
        document. Otherwise it's treated as an XML document.
    :param exclude_encodings: Encodings that will not be considered.
    :param user_encodings: Encodings to try after the known definite
        encodings and byte-order mark sniffing.
    :param override_encodings: A **deprecated** alias for
        ``known_definite_encodings``.
    """
    self.smart_quotes_to = smart_quotes_to
    self.tried_encodings = []
    self.contains_replacement_characters = False
    self.is_html = is_html
    self.log = getLogger(__name__)
    # None is passed straight through for the encoding lists;
    # EncodingDetector treats None the same as an empty list.
    self.detector = EncodingDetector(
        markup,
        known_definite_encodings,
        is_html,
        exclude_encodings,
        user_encodings,
        override_encodings,
    )

    # Short-circuit if the data is in Unicode to begin with.
    if isinstance(markup, str):
        self.markup = markup.encode("utf8")
        self.unicode_markup = markup
        self.original_encoding = None
        return

    # The encoding detector may have stripped a byte-order mark.
    # Use the stripped markup from this point on.
    self.markup = self.detector.markup

    # Strict pass: stop at the first candidate encoding that yields a
    # result. NOTE(review): _convert_from is presumed to return None
    # when the conversion fails -- confirm against its implementation.
    u = None
    for encoding in self.detector.encodings:
        u = self._convert_from(encoding)
        if u is not None:
            break

    # Only a None result means failure. An empty document converts to
    # the empty string, which is a success and must not trigger the
    # fallback below (nor its warning about replacement characters).
    if u is None:
        # None of the encodings worked. As an absolute last resort,
        # try them again with character replacement.
        for encoding in self.detector.encodings:
            if encoding == "ascii":
                continue
            u = self._convert_from(encoding, "replace")
            if u is not None:
                self.log.warning(
                    "Some characters could not be decoded, and were "
                    "replaced with REPLACEMENT CHARACTER."
                )
                self.contains_replacement_characters = True
                break

    # If none of that worked, we could at this point force it to
    # ASCII, but that would destroy so much data that I think
    # giving up is better.
    #
    # Note that this is extremely unlikely, probably impossible,
    # because the "replace" strategy is so powerful. Even running
    # the Python binary through Unicode, Dammit gives you Unicode,
    # albeit Unicode riddled with REPLACEMENT CHARACTER.
    if u is None:
        self.original_encoding = None
        self.unicode_markup = None
    else:
        self.unicode_markup = u
|
|
|
|
|
|
#: The original markup, before it was converted to Unicode.
|
|
|
#: This is not necessarily the same as what was passed in to the
|
|
|
#: constructor, since any byte-order mark will be stripped.
|
|
|
markup: bytes
|
|
|
|
|
|
#: The Unicode version of the markup, following conversion. This
|
|
|
#: is set to None if there was simply no way to convert the
|
|
|
#: bytestring to Unicode (as with binary data).
|
|
|
unicode_markup: Optional[str]
|
|
|
|
|
|
#: This is True if `UnicodeDammit.unicode_markup` contains
|
|
|
#: U+FFFD REPLACEMENT_CHARACTER characters which were not present
|
|
|
#: in `UnicodeDammit.markup`. These mark character sequences that
|
|
|
#: could not be represented in Unicode.
|
|
|
contains_replacement_characters: bool
|
|
|
|
|
|
#: Unicode, Dammit's best guess as to the original character
|
|
|
#: encoding of `UnicodeDammit.markup`.
|
|
|
original_encoding: Optional[_Encoding]
|
|
|
|
|
|
#: The strategy used to handle Microsoft smart quotes.
|
|
|
smart_quotes_to: Optional[str]
|
|
|
|
|
|
#: The (encoding, error handling strategy) 2-tuples that were used to
|
|
|
#: try and convert the markup to Unicode.
|
|
|
tried_encodings: List[Tuple[_Encoding, str]]
|
|
|
|
|
|
log: Logger #: :meta private:
|
|
|
|
|
|
def _sub_ms_char(self, match: re.Match) -> bytes:
|
|
|
"""Changes a MS smart quote character to an XML or HTML
|
|
|
entity, or an ASCII character.
|
|
|
|
|
|
TODO: Since this is only used to convert smart quotes, it
|
|
|
could be simplified, and MS_CHARS_TO_ASCII made much less
|
|
|
parochial.
|
|
|
"""
|
|
|
orig: bytes = match.group(1)
|
|
|
sub: bytes
|
|
|
if self.smart_quotes_to == "ascii":
|
|
|
if orig in self.MS_CHARS_TO_ASCII:
|
|
|
sub = self.MS_CHARS_TO_ASCII[orig].encode()
|
|
|
else:
|
|
|
# Shouldn't happen; substitute the character
|
|
|
# with itself.
|
|
|
sub = orig
|
|
|
else:
|
|
|
if orig in self.MS_CHARS:
|
|
|
substitutions = self.MS_CHARS[orig]
|
|
|
if type(substitutions) is tuple:
|
|
|
if self.smart_quotes_to == "xml":
|
|
|
sub = b"&#x" + substitutions[1].encode() + b";"
|
|
|
else:
|
|
|
sub = b"&" + substitutions[0].encode() + b";"
|
|
|
else:
|
|
|
substitutions = cast(str, substitutions)
|
|
|
sub = substitutions.encode()
|
|
|
else:
|
|
|
# Shouldn't happen; substitute the character
|
|
|
# for itself.
|
|
|
sub = orig
|
|
|
return sub
|
|
|
|
|
|
#: Maps values commonly seen for "charset" in HTML meta tags to the
#: names of the corresponding Python codecs. Only values that are
#: not among Python's own aliases, and that the heuristics in
#: `find_codec` cannot work out, are listed here.
#:
#: :meta hide-value:
CHARSET_ALIASES: Dict[str, _Encoding] = {
    "macintosh": "mac-roman",
    "x-sjis": "shift-jis",
}
|
|
|
|
|
|
#: The encodings in which Microsoft smart quotes tend to show up.
#: `_convert_from` only rewrites smart quotes when the encoding it
#: is trying appears in this list.
#:
#: :meta hide-value:
ENCODINGS_WITH_SMART_QUOTES: _Encodings = [
    "windows-1252",
    "iso-8859-1",
    "iso-8859-2",
]
|
|
|
|
|
|
def _convert_from(
|
|
|
self, proposed: _Encoding, errors: str = "strict"
|
|
|
) -> Optional[str]:
|
|
|
"""Attempt to convert the markup to the proposed encoding.
|
|
|
|
|
|
:param proposed: The name of a character encoding.
|
|
|
:param errors: An error handling strategy, used when calling `str`.
|
|
|
:return: The converted markup, or `None` if the proposed
|
|
|
encoding/error handling strategy didn't work.
|
|
|
"""
|
|
|
lookup_result = self.find_codec(proposed)
|
|
|
if lookup_result is None or (lookup_result, errors) in self.tried_encodings:
|
|
|
return None
|
|
|
proposed = lookup_result
|
|
|
self.tried_encodings.append((proposed, errors))
|
|
|
markup = self.markup
|
|
|
# Convert smart quotes to HTML if coming from an encoding
|
|
|
# that might have them.
|
|
|
if (
|
|
|
self.smart_quotes_to is not None
|
|
|
and proposed in self.ENCODINGS_WITH_SMART_QUOTES
|
|
|
):
|
|
|
smart_quotes_re = b"([\x80-\x9f])"
|
|
|
smart_quotes_compiled = re.compile(smart_quotes_re)
|
|
|
markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
|
|
|
|
|
|
try:
|
|
|
# print("Trying to convert document to %s (errors=%s)" % (
|
|
|
# proposed, errors))
|
|
|
u = self._to_unicode(markup, proposed, errors)
|
|
|
self.unicode_markup = u
|
|
|
self.original_encoding = proposed
|
|
|
except Exception:
|
|
|
# print("That didn't work!")
|
|
|
# print(e)
|
|
|
return None
|
|
|
# print("Correct encoding: %s" % proposed)
|
|
|
return self.unicode_markup
|
|
|
|
|
|
def _to_unicode(
|
|
|
self, data: bytes, encoding: _Encoding, errors: str = "strict"
|
|
|
) -> str:
|
|
|
"""Given a bytestring and its encoding, decodes the string into Unicode.
|
|
|
|
|
|
:param encoding: The name of an encoding.
|
|
|
:param errors: An error handling strategy, used when calling `str`.
|
|
|
"""
|
|
|
return str(data, encoding, errors)
|
|
|
|
|
|
@property
def declared_html_encoding(self) -> Optional[_Encoding]:
    """The encoding, if any, declared *inside* the document itself.

    Only HTML documents are consulted: when ``self.is_html`` is
    false this is always None. Otherwise the value comes from
    ``self.detector.declared_encoding``.
    """
    return self.detector.declared_encoding if self.is_html else None
|
|
|
|
|
|
def find_codec(self, charset: _Encoding) -> Optional[str]:
    """Find the Python codec that corresponds to a character set name.

    Several spellings are tried in turn: the name itself (or its
    entry in `CHARSET_ALIASES`), the name with hyphens removed, and
    the name with hyphens turned into underscores. If none of them
    is recognized by `_codec`, the name itself is returned anyway.

    :param charset: The name of a character set.
    :return: The lowercased name of a Python codec, or None if
        ``charset`` is empty or None.
    """
    if not charset:
        return None

    spellings = (
        self.CHARSET_ALIASES.get(charset, charset),
        charset.replace("-", ""),
        charset.replace("-", "_"),
    )
    for spelling in spellings:
        recognized = self._codec(spelling)
        if recognized:
            return recognized.lower()

    # Nothing matched; hand the original name back in lowercase.
    return charset.lower()
|
|
|
|
|
|
def _codec(self, charset: _Encoding) -> Optional[str]:
|
|
|
if not charset:
|
|
|
return charset
|
|
|
codec = None
|
|
|
try:
|
|
|
codecs.lookup(charset)
|
|
|
codec = charset
|
|
|
except (LookupError, ValueError):
|
|
|
pass
|
|
|
return codec
|
|
|
|
|
|
#: A mapping of the bytes 0x80-0x9F, as used by Microsoft's
#: Windows-1252 encoding, to HTML entities/XML numeric entities.
#:
#: Each value is either a 2-tuple ``(entity name, hexadecimal code
#: point)`` or a plain string. `_sub_ms_char` turns a tuple into
#: ``&name;`` or ``&#xHEX;`` and uses a plain string as a literal
#: replacement. Where the "entity name" itself starts with ``#x``,
#: the named form is also written as a numeric reference.
#:
#: :meta hide-value:
MS_CHARS: Dict[bytes, Union[str, Tuple[str, str]]] = {
    b"\x80": ("euro", "20AC"),
    b"\x81": " ",
    b"\x82": ("sbquo", "201A"),
    b"\x83": ("fnof", "192"),
    b"\x84": ("bdquo", "201E"),
    b"\x85": ("hellip", "2026"),
    b"\x86": ("dagger", "2020"),
    b"\x87": ("Dagger", "2021"),
    b"\x88": ("circ", "2C6"),
    b"\x89": ("permil", "2030"),
    b"\x8a": ("Scaron", "160"),
    b"\x8b": ("lsaquo", "2039"),
    b"\x8c": ("OElig", "152"),
    b"\x8d": "?",
    b"\x8e": ("#x17D", "17D"),
    b"\x8f": "?",
    b"\x90": "?",
    b"\x91": ("lsquo", "2018"),
    b"\x92": ("rsquo", "2019"),
    b"\x93": ("ldquo", "201C"),
    b"\x94": ("rdquo", "201D"),
    b"\x95": ("bull", "2022"),
    b"\x96": ("ndash", "2013"),
    b"\x97": ("mdash", "2014"),
    b"\x98": ("tilde", "2DC"),
    b"\x99": ("trade", "2122"),
    b"\x9a": ("scaron", "161"),
    b"\x9b": ("rsaquo", "203A"),
    b"\x9c": ("oelig", "153"),
    b"\x9d": "?",
    b"\x9e": ("#x17E", "17E"),
    # U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS. An empty code
    # point here would produce the invalid reference "&#x;".
    b"\x9f": ("Yuml", "178"),
}
|
|
|
|
|
|
#: A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
#: horrors like stripping diacritical marks to turn á into a, but also
#: contains non-horrors like turning “ into ".
#:
#: Every byte from 0x80 to 0xFF has a non-empty ASCII replacement.
#:
#: Seriously, don't use this for anything other than removing smart
#: quotes.
#:
#: :meta private:
MS_CHARS_TO_ASCII: Dict[bytes, str] = {
    b"\x80": "EUR",
    b"\x81": " ",
    b"\x82": ",",
    b"\x83": "f",
    b"\x84": ",,",
    b"\x85": "...",
    b"\x86": "+",
    b"\x87": "++",
    b"\x88": "^",
    b"\x89": "%",
    b"\x8a": "S",
    b"\x8b": "<",
    b"\x8c": "OE",
    b"\x8d": "?",
    b"\x8e": "Z",
    b"\x8f": "?",
    b"\x90": "?",
    b"\x91": "'",
    b"\x92": "'",
    b"\x93": '"',
    b"\x94": '"',
    b"\x95": "*",
    b"\x96": "-",
    b"\x97": "--",
    b"\x98": "~",
    b"\x99": "(TM)",
    b"\x9a": "s",
    b"\x9b": ">",
    b"\x9c": "oe",
    b"\x9d": "?",
    b"\x9e": "z",
    b"\x9f": "Y",
    b"\xa0": " ",
    b"\xa1": "!",
    b"\xa2": "c",
    b"\xa3": "GBP",
    b"\xa4": "$",  # This approximation is especially parochial--this is the
    # generic currency symbol.
    b"\xa5": "YEN",
    b"\xa6": "|",
    b"\xa7": "S",
    b"\xa8": "..",
    # Copyright sign: approximated like (R) and (TM) rather than
    # being silently deleted.
    b"\xa9": "(c)",
    b"\xaa": "(th)",
    b"\xab": "<<",
    b"\xac": "!",
    b"\xad": " ",
    b"\xae": "(R)",
    b"\xaf": "-",
    b"\xb0": "o",
    b"\xb1": "+-",
    b"\xb2": "2",
    b"\xb3": "3",
    b"\xb4": "'",
    b"\xb5": "u",
    b"\xb6": "P",
    b"\xb7": "*",
    b"\xb8": ",",
    b"\xb9": "1",
    b"\xba": "(th)",
    b"\xbb": ">>",
    b"\xbc": "1/4",
    b"\xbd": "1/2",
    b"\xbe": "3/4",
    b"\xbf": "?",
    b"\xc0": "A",
    b"\xc1": "A",
    b"\xc2": "A",
    b"\xc3": "A",
    b"\xc4": "A",
    b"\xc5": "A",
    b"\xc6": "AE",
    b"\xc7": "C",
    b"\xc8": "E",
    b"\xc9": "E",
    b"\xca": "E",
    b"\xcb": "E",
    b"\xcc": "I",
    b"\xcd": "I",
    b"\xce": "I",
    b"\xcf": "I",
    b"\xd0": "D",
    b"\xd1": "N",
    b"\xd2": "O",
    b"\xd3": "O",
    b"\xd4": "O",
    b"\xd5": "O",
    b"\xd6": "O",
    b"\xd7": "*",
    b"\xd8": "O",
    b"\xd9": "U",
    b"\xda": "U",
    b"\xdb": "U",
    b"\xdc": "U",
    b"\xdd": "Y",
    b"\xde": "b",
    b"\xdf": "B",
    b"\xe0": "a",
    b"\xe1": "a",
    b"\xe2": "a",
    b"\xe3": "a",
    b"\xe4": "a",
    b"\xe5": "a",
    b"\xe6": "ae",
    b"\xe7": "c",
    b"\xe8": "e",
    b"\xe9": "e",
    b"\xea": "e",
    b"\xeb": "e",
    b"\xec": "i",
    b"\xed": "i",
    b"\xee": "i",
    b"\xef": "i",
    b"\xf0": "o",
    b"\xf1": "n",
    b"\xf2": "o",
    b"\xf3": "o",
    b"\xf4": "o",
    b"\xf5": "o",
    b"\xf6": "o",
    b"\xf7": "/",
    b"\xf8": "o",
    b"\xf9": "u",
    b"\xfa": "u",
    b"\xfb": "u",
    b"\xfc": "u",
    b"\xfd": "y",
    b"\xfe": "b",
    b"\xff": "y",
}
|
|
|
|
|
|
#: A map used when removing rogue Windows-1252/ISO-8859-1
#: characters in otherwise UTF-8 documents. Each key is a single
#: byte value and each value is the UTF-8 encoding of the character
#: that byte stands for in Windows-1252.
#:
#: Note that \\x81, \\x8d, \\x8f, \\x90, and \\x9d are undefined in
#: Windows-1252.
#:
#: NOTE(review): 0xFF is also absent from this table even though it
#: is defined in Windows-1252 -- confirm whether that is intentional.
#:
#: :meta hide-value:
WINDOWS_1252_TO_UTF8: Dict[int, bytes] = {
    0x80: b"\xe2\x82\xac",  # €
    0x82: b"\xe2\x80\x9a",  # ‚
    0x83: b"\xc6\x92",  # ƒ
    0x84: b"\xe2\x80\x9e",  # „
    0x85: b"\xe2\x80\xa6",  # …
    0x86: b"\xe2\x80\xa0",  # †
    0x87: b"\xe2\x80\xa1",  # ‡
    0x88: b"\xcb\x86",  # ˆ
    0x89: b"\xe2\x80\xb0",  # ‰
    0x8A: b"\xc5\xa0",  # Š
    0x8B: b"\xe2\x80\xb9",  # ‹
    0x8C: b"\xc5\x92",  # Œ
    0x8E: b"\xc5\xbd",  # Ž
    0x91: b"\xe2\x80\x98",  # ‘
    0x92: b"\xe2\x80\x99",  # ’
    0x93: b"\xe2\x80\x9c",  # “
    0x94: b"\xe2\x80\x9d",  # ”
    0x95: b"\xe2\x80\xa2",  # •
    0x96: b"\xe2\x80\x93",  # –
    0x97: b"\xe2\x80\x94",  # —
    0x98: b"\xcb\x9c",  # ˜
    0x99: b"\xe2\x84\xa2",  # ™
    0x9A: b"\xc5\xa1",  # š
    0x9B: b"\xe2\x80\xba",  # ›
    0x9C: b"\xc5\x93",  # œ
    0x9E: b"\xc5\xbe",  # ž
    0x9F: b"\xc5\xb8",  # Ÿ
    0xA0: b"\xc2\xa0",  # (no-break space)
    0xA1: b"\xc2\xa1",  # ¡
    0xA2: b"\xc2\xa2",  # ¢
    0xA3: b"\xc2\xa3",  # £
    0xA4: b"\xc2\xa4",  # ¤
    0xA5: b"\xc2\xa5",  # ¥
    0xA6: b"\xc2\xa6",  # ¦
    0xA7: b"\xc2\xa7",  # §
    0xA8: b"\xc2\xa8",  # ¨
    0xA9: b"\xc2\xa9",  # ©
    0xAA: b"\xc2\xaa",  # ª
    0xAB: b"\xc2\xab",  # «
    0xAC: b"\xc2\xac",  # ¬
    0xAD: b"\xc2\xad",  # (soft hyphen)
    0xAE: b"\xc2\xae",  # ®
    0xAF: b"\xc2\xaf",  # ¯
    0xB0: b"\xc2\xb0",  # °
    0xB1: b"\xc2\xb1",  # ±
    0xB2: b"\xc2\xb2",  # ²
    0xB3: b"\xc2\xb3",  # ³
    0xB4: b"\xc2\xb4",  # ´
    0xB5: b"\xc2\xb5",  # µ
    0xB6: b"\xc2\xb6",  # ¶
    0xB7: b"\xc2\xb7",  # ·
    0xB8: b"\xc2\xb8",  # ¸
    0xB9: b"\xc2\xb9",  # ¹
    0xBA: b"\xc2\xba",  # º
    0xBB: b"\xc2\xbb",  # »
    0xBC: b"\xc2\xbc",  # ¼
    0xBD: b"\xc2\xbd",  # ½
    0xBE: b"\xc2\xbe",  # ¾
    0xBF: b"\xc2\xbf",  # ¿
    0xC0: b"\xc3\x80",  # À
    0xC1: b"\xc3\x81",  # Á
    0xC2: b"\xc3\x82",  # Â
    0xC3: b"\xc3\x83",  # Ã
    0xC4: b"\xc3\x84",  # Ä
    0xC5: b"\xc3\x85",  # Å
    0xC6: b"\xc3\x86",  # Æ
    0xC7: b"\xc3\x87",  # Ç
    0xC8: b"\xc3\x88",  # È
    0xC9: b"\xc3\x89",  # É
    0xCA: b"\xc3\x8a",  # Ê
    0xCB: b"\xc3\x8b",  # Ë
    0xCC: b"\xc3\x8c",  # Ì
    0xCD: b"\xc3\x8d",  # Í
    0xCE: b"\xc3\x8e",  # Î
    0xCF: b"\xc3\x8f",  # Ï
    0xD0: b"\xc3\x90",  # Ð
    0xD1: b"\xc3\x91",  # Ñ
    0xD2: b"\xc3\x92",  # Ò
    0xD3: b"\xc3\x93",  # Ó
    0xD4: b"\xc3\x94",  # Ô
    0xD5: b"\xc3\x95",  # Õ
    0xD6: b"\xc3\x96",  # Ö
    0xD7: b"\xc3\x97",  # ×
    0xD8: b"\xc3\x98",  # Ø
    0xD9: b"\xc3\x99",  # Ù
    0xDA: b"\xc3\x9a",  # Ú
    0xDB: b"\xc3\x9b",  # Û
    0xDC: b"\xc3\x9c",  # Ü
    0xDD: b"\xc3\x9d",  # Ý
    0xDE: b"\xc3\x9e",  # Þ
    0xDF: b"\xc3\x9f",  # ß
    0xE0: b"\xc3\xa0",  # à
    0xE1: b"\xc3\xa1",  # á
    0xE2: b"\xc3\xa2",  # â
    0xE3: b"\xc3\xa3",  # ã
    0xE4: b"\xc3\xa4",  # ä
    0xE5: b"\xc3\xa5",  # å
    0xE6: b"\xc3\xa6",  # æ
    0xE7: b"\xc3\xa7",  # ç
    0xE8: b"\xc3\xa8",  # è
    0xE9: b"\xc3\xa9",  # é
    0xEA: b"\xc3\xaa",  # ê
    0xEB: b"\xc3\xab",  # ë
    0xEC: b"\xc3\xac",  # ì
    0xED: b"\xc3\xad",  # í
    0xEE: b"\xc3\xae",  # î
    0xEF: b"\xc3\xaf",  # ï
    0xF0: b"\xc3\xb0",  # ð
    0xF1: b"\xc3\xb1",  # ñ
    0xF2: b"\xc3\xb2",  # ò
    0xF3: b"\xc3\xb3",  # ó
    0xF4: b"\xc3\xb4",  # ô
    0xF5: b"\xc3\xb5",  # õ
    0xF6: b"\xc3\xb6",  # ö
    0xF7: b"\xc3\xb7",  # ÷
    0xF8: b"\xc3\xb8",  # ø
    0xF9: b"\xc3\xb9",  # ù
    0xFA: b"\xc3\xba",  # ú
    0xFB: b"\xc3\xbb",  # û
    0xFC: b"\xc3\xbc",  # ü
    0xFD: b"\xc3\xbd",  # ý
    0xFE: b"\xc3\xbe",  # þ
}
|
|
|
|
|
|
#: (lowest lead byte, highest lead byte, sequence length in bytes)
#: for each length of UTF-8 multibyte character.
#:
#: :meta private:
MULTIBYTE_MARKERS_AND_SIZES: List[Tuple[int, int, int]] = [
    # A 2-byte character starts with a byte in C2-DF.
    (0xC2, 0xDF, 2),
    # A 3-byte character starts with a byte in E0-EF.
    (0xE0, 0xEF, 3),
    # A 4-byte character starts with a byte in F0-F4.
    (0xF0, 0xF4, 4),
]

#: The lowest byte value that can start a multibyte character.
#:
#: :meta private:
FIRST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[0][0]

#: The highest byte value that can start a multibyte character.
#:
#: :meta private:
LAST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
|
|
|
|
|
|
@classmethod
def detwingle(
    cls,
    in_bytes: bytes,
    main_encoding: _Encoding = "utf8",
    embedded_encoding: _Encoding = "windows-1252",
) -> bytes:
    """Fix characters from one encoding embedded in some other encoding.

    Currently the only situation supported is Windows-1252 (or its
    subset ISO-8859-1), embedded in UTF-8.

    Any byte in the UTF-8 lead-byte range is assumed to start a
    multibyte character of the length given by
    `MULTIBYTE_MARKERS_AND_SIZES`; that many bytes are skipped
    without inspecting them. Every other byte of 0x80 or above that
    has an entry in `WINDOWS_1252_TO_UTF8` is replaced by that entry.

    :param in_bytes: A bytestring that you suspect contains
        characters from multiple encodings. Note that this *must*
        be a bytestring. If you've already converted the document
        to Unicode, you're too late.
    :param main_encoding: The primary encoding of ``in_bytes``.
        The name is matched case-insensitively, and underscores are
        treated as hyphens.
    :param embedded_encoding: The encoding that was used to embed characters
        in the main document. Matched the same way as ``main_encoding``.
    :return: A bytestring similar to ``in_bytes``, in which
        ``embedded_encoding`` characters have been converted to
        their ``main_encoding`` equivalents. If nothing needed
        converting, ``in_bytes`` itself is returned.
    :raises NotImplementedError: If either encoding is not one of
        the supported ones.
    """
    # Normalize both names the same way, so that e.g. "Windows_1252"
    # and "UTF_8" are recognized.
    embedded = embedded_encoding.lower().replace("_", "-")
    if embedded not in ("windows-1252", "iso-8859-1"):
        raise NotImplementedError(
            "Windows-1252 and ISO-8859-1 are the only currently supported "
            "embedded encodings."
        )

    main = main_encoding.lower().replace("_", "-")
    if main not in ("utf8", "utf-8"):
        raise NotImplementedError(
            "UTF-8 is the only currently supported main encoding."
        )

    byte_chunks = []

    chunk_start = 0
    pos = 0
    while pos < len(in_bytes):
        byte = in_bytes[pos]
        if cls.FIRST_MULTIBYTE_MARKER <= byte <= cls.LAST_MULTIBYTE_MARKER:
            # This is the start of a UTF-8 multibyte character. Skip
            # to the end.
            for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                if start <= byte <= end:
                    pos += size
                    break
            else:
                # No range claims this byte (possible only if the
                # marker table has gaps). Step over it so that the
                # loop always makes progress.
                pos += 1
        elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
            # We found a Windows-1252 character!
            # Save the string up to this point as a chunk.
            byte_chunks.append(in_bytes[chunk_start:pos])

            # Now translate the Windows-1252 character into UTF-8
            # and add it as another chunk.
            byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
            pos += 1
            chunk_start = pos
        else:
            # Go on to the next character.
            pos += 1

    if chunk_start == 0:
        # The string is unchanged.
        return in_bytes

    # Store the final chunk.
    byte_chunks.append(in_bytes[chunk_start:])
    return b"".join(byte_chunks)
|