You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
	
	
		
			361 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
			
		
		
	
	
			361 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
from __future__ import annotations
 | 
						|
 | 
						|
from encodings.aliases import aliases
 | 
						|
from hashlib import sha256
 | 
						|
from json import dumps
 | 
						|
from re import sub
 | 
						|
from typing import Any, Iterator, List, Tuple
 | 
						|
 | 
						|
from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
 | 
						|
from .utils import iana_name, is_multi_byte_encoding, unicode_range
 | 
						|
 | 
						|
 | 
						|
class CharsetMatch:
 | 
						|
    def __init__(
 | 
						|
        self,
 | 
						|
        payload: bytes,
 | 
						|
        guessed_encoding: str,
 | 
						|
        mean_mess_ratio: float,
 | 
						|
        has_sig_or_bom: bool,
 | 
						|
        languages: CoherenceMatches,
 | 
						|
        decoded_payload: str | None = None,
 | 
						|
        preemptive_declaration: str | None = None,
 | 
						|
    ):
 | 
						|
        self._payload: bytes = payload
 | 
						|
 | 
						|
        self._encoding: str = guessed_encoding
 | 
						|
        self._mean_mess_ratio: float = mean_mess_ratio
 | 
						|
        self._languages: CoherenceMatches = languages
 | 
						|
        self._has_sig_or_bom: bool = has_sig_or_bom
 | 
						|
        self._unicode_ranges: list[str] | None = None
 | 
						|
 | 
						|
        self._leaves: list[CharsetMatch] = []
 | 
						|
        self._mean_coherence_ratio: float = 0.0
 | 
						|
 | 
						|
        self._output_payload: bytes | None = None
 | 
						|
        self._output_encoding: str | None = None
 | 
						|
 | 
						|
        self._string: str | None = decoded_payload
 | 
						|
 | 
						|
        self._preemptive_declaration: str | None = preemptive_declaration
 | 
						|
 | 
						|
    def __eq__(self, other: object) -> bool:
 | 
						|
        if not isinstance(other, CharsetMatch):
 | 
						|
            if isinstance(other, str):
 | 
						|
                return iana_name(other) == self.encoding
 | 
						|
            return False
 | 
						|
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint
 | 
						|
 | 
						|
    def __lt__(self, other: object) -> bool:
 | 
						|
        """
 | 
						|
        Implemented to make sorted available upon CharsetMatches items.
 | 
						|
        """
 | 
						|
        if not isinstance(other, CharsetMatch):
 | 
						|
            raise ValueError
 | 
						|
 | 
						|
        chaos_difference: float = abs(self.chaos - other.chaos)
 | 
						|
        coherence_difference: float = abs(self.coherence - other.coherence)
 | 
						|
 | 
						|
        # Below 1% difference --> Use Coherence
 | 
						|
        if chaos_difference < 0.01 and coherence_difference > 0.02:
 | 
						|
            return self.coherence > other.coherence
 | 
						|
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
 | 
						|
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
 | 
						|
            # preserve RAM usage!
 | 
						|
            if len(self._payload) >= TOO_BIG_SEQUENCE:
 | 
						|
                return self.chaos < other.chaos
 | 
						|
            return self.multi_byte_usage > other.multi_byte_usage
 | 
						|
 | 
						|
        return self.chaos < other.chaos
 | 
						|
 | 
						|
    @property
 | 
						|
    def multi_byte_usage(self) -> float:
 | 
						|
        return 1.0 - (len(str(self)) / len(self.raw))
 | 
						|
 | 
						|
    def __str__(self) -> str:
 | 
						|
        # Lazy Str Loading
 | 
						|
        if self._string is None:
 | 
						|
            self._string = str(self._payload, self._encoding, "strict")
 | 
						|
        return self._string
 | 
						|
 | 
						|
    def __repr__(self) -> str:
 | 
						|
        return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"
 | 
						|
 | 
						|
    def add_submatch(self, other: CharsetMatch) -> None:
 | 
						|
        if not isinstance(other, CharsetMatch) or other == self:
 | 
						|
            raise ValueError(
 | 
						|
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
 | 
						|
                    other.__class__
 | 
						|
                )
 | 
						|
            )
 | 
						|
 | 
						|
        other._string = None  # Unload RAM usage; dirty trick.
 | 
						|
        self._leaves.append(other)
 | 
						|
 | 
						|
    @property
 | 
						|
    def encoding(self) -> str:
 | 
						|
        return self._encoding
 | 
						|
 | 
						|
    @property
 | 
						|
    def encoding_aliases(self) -> list[str]:
 | 
						|
        """
 | 
						|
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
 | 
						|
        """
 | 
						|
        also_known_as: list[str] = []
 | 
						|
        for u, p in aliases.items():
 | 
						|
            if self.encoding == u:
 | 
						|
                also_known_as.append(p)
 | 
						|
            elif self.encoding == p:
 | 
						|
                also_known_as.append(u)
 | 
						|
        return also_known_as
 | 
						|
 | 
						|
    @property
 | 
						|
    def bom(self) -> bool:
 | 
						|
        return self._has_sig_or_bom
 | 
						|
 | 
						|
    @property
 | 
						|
    def byte_order_mark(self) -> bool:
 | 
						|
        return self._has_sig_or_bom
 | 
						|
 | 
						|
    @property
 | 
						|
    def languages(self) -> list[str]:
 | 
						|
        """
 | 
						|
        Return the complete list of possible languages found in decoded sequence.
 | 
						|
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
 | 
						|
        """
 | 
						|
        return [e[0] for e in self._languages]
 | 
						|
 | 
						|
    @property
 | 
						|
    def language(self) -> str:
 | 
						|
        """
 | 
						|
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
 | 
						|
        "Unknown".
 | 
						|
        """
 | 
						|
        if not self._languages:
 | 
						|
            # Trying to infer the language based on the given encoding
 | 
						|
            # Its either English or we should not pronounce ourselves in certain cases.
 | 
						|
            if "ascii" in self.could_be_from_charset:
 | 
						|
                return "English"
 | 
						|
 | 
						|
            # doing it there to avoid circular import
 | 
						|
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages
 | 
						|
 | 
						|
            languages = (
 | 
						|
                mb_encoding_languages(self.encoding)
 | 
						|
                if is_multi_byte_encoding(self.encoding)
 | 
						|
                else encoding_languages(self.encoding)
 | 
						|
            )
 | 
						|
 | 
						|
            if len(languages) == 0 or "Latin Based" in languages:
 | 
						|
                return "Unknown"
 | 
						|
 | 
						|
            return languages[0]
 | 
						|
 | 
						|
        return self._languages[0][0]
 | 
						|
 | 
						|
    @property
 | 
						|
    def chaos(self) -> float:
 | 
						|
        return self._mean_mess_ratio
 | 
						|
 | 
						|
    @property
 | 
						|
    def coherence(self) -> float:
 | 
						|
        if not self._languages:
 | 
						|
            return 0.0
 | 
						|
        return self._languages[0][1]
 | 
						|
 | 
						|
    @property
 | 
						|
    def percent_chaos(self) -> float:
 | 
						|
        return round(self.chaos * 100, ndigits=3)
 | 
						|
 | 
						|
    @property
 | 
						|
    def percent_coherence(self) -> float:
 | 
						|
        return round(self.coherence * 100, ndigits=3)
 | 
						|
 | 
						|
    @property
 | 
						|
    def raw(self) -> bytes:
 | 
						|
        """
 | 
						|
        Original untouched bytes.
 | 
						|
        """
 | 
						|
        return self._payload
 | 
						|
 | 
						|
    @property
 | 
						|
    def submatch(self) -> list[CharsetMatch]:
 | 
						|
        return self._leaves
 | 
						|
 | 
						|
    @property
 | 
						|
    def has_submatch(self) -> bool:
 | 
						|
        return len(self._leaves) > 0
 | 
						|
 | 
						|
    @property
 | 
						|
    def alphabets(self) -> list[str]:
 | 
						|
        if self._unicode_ranges is not None:
 | 
						|
            return self._unicode_ranges
 | 
						|
        # list detected ranges
 | 
						|
        detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
 | 
						|
        # filter and sort
 | 
						|
        self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
 | 
						|
        return self._unicode_ranges
 | 
						|
 | 
						|
    @property
 | 
						|
    def could_be_from_charset(self) -> list[str]:
 | 
						|
        """
 | 
						|
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
 | 
						|
        encoding.
 | 
						|
        This list does include the encoding available in property 'encoding'.
 | 
						|
        """
 | 
						|
        return [self._encoding] + [m.encoding for m in self._leaves]
 | 
						|
 | 
						|
    def output(self, encoding: str = "utf_8") -> bytes:
 | 
						|
        """
 | 
						|
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
 | 
						|
        Any errors will be simply ignored by the encoder NOT replaced.
 | 
						|
        """
 | 
						|
        if self._output_encoding is None or self._output_encoding != encoding:
 | 
						|
            self._output_encoding = encoding
 | 
						|
            decoded_string = str(self)
 | 
						|
            if (
 | 
						|
                self._preemptive_declaration is not None
 | 
						|
                and self._preemptive_declaration.lower()
 | 
						|
                not in ["utf-8", "utf8", "utf_8"]
 | 
						|
            ):
 | 
						|
                patched_header = sub(
 | 
						|
                    RE_POSSIBLE_ENCODING_INDICATION,
 | 
						|
                    lambda m: m.string[m.span()[0] : m.span()[1]].replace(
 | 
						|
                        m.groups()[0],
 | 
						|
                        iana_name(self._output_encoding).replace("_", "-"),  # type: ignore[arg-type]
 | 
						|
                    ),
 | 
						|
                    decoded_string[:8192],
 | 
						|
                    count=1,
 | 
						|
                )
 | 
						|
 | 
						|
                decoded_string = patched_header + decoded_string[8192:]
 | 
						|
 | 
						|
            self._output_payload = decoded_string.encode(encoding, "replace")
 | 
						|
 | 
						|
        return self._output_payload  # type: ignore
 | 
						|
 | 
						|
    @property
 | 
						|
    def fingerprint(self) -> str:
 | 
						|
        """
 | 
						|
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
 | 
						|
        """
 | 
						|
        return sha256(self.output()).hexdigest()
 | 
						|
 | 
						|
 | 
						|
class CharsetMatches:
 | 
						|
    """
 | 
						|
    Container with every CharsetMatch items ordered by default from most probable to the less one.
 | 
						|
    Act like a list(iterable) but does not implements all related methods.
 | 
						|
    """
 | 
						|
 | 
						|
    def __init__(self, results: list[CharsetMatch] | None = None):
 | 
						|
        self._results: list[CharsetMatch] = sorted(results) if results else []
 | 
						|
 | 
						|
    def __iter__(self) -> Iterator[CharsetMatch]:
 | 
						|
        yield from self._results
 | 
						|
 | 
						|
    def __getitem__(self, item: int | str) -> CharsetMatch:
 | 
						|
        """
 | 
						|
        Retrieve a single item either by its position or encoding name (alias may be used here).
 | 
						|
        Raise KeyError upon invalid index or encoding not present in results.
 | 
						|
        """
 | 
						|
        if isinstance(item, int):
 | 
						|
            return self._results[item]
 | 
						|
        if isinstance(item, str):
 | 
						|
            item = iana_name(item, False)
 | 
						|
            for result in self._results:
 | 
						|
                if item in result.could_be_from_charset:
 | 
						|
                    return result
 | 
						|
        raise KeyError
 | 
						|
 | 
						|
    def __len__(self) -> int:
 | 
						|
        return len(self._results)
 | 
						|
 | 
						|
    def __bool__(self) -> bool:
 | 
						|
        return len(self._results) > 0
 | 
						|
 | 
						|
    def append(self, item: CharsetMatch) -> None:
 | 
						|
        """
 | 
						|
        Insert a single match. Will be inserted accordingly to preserve sort.
 | 
						|
        Can be inserted as a submatch.
 | 
						|
        """
 | 
						|
        if not isinstance(item, CharsetMatch):
 | 
						|
            raise ValueError(
 | 
						|
                "Cannot append instance '{}' to CharsetMatches".format(
 | 
						|
                    str(item.__class__)
 | 
						|
                )
 | 
						|
            )
 | 
						|
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
 | 
						|
        if len(item.raw) < TOO_BIG_SEQUENCE:
 | 
						|
            for match in self._results:
 | 
						|
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
 | 
						|
                    match.add_submatch(item)
 | 
						|
                    return
 | 
						|
        self._results.append(item)
 | 
						|
        self._results = sorted(self._results)
 | 
						|
 | 
						|
    def best(self) -> CharsetMatch | None:
 | 
						|
        """
 | 
						|
        Simply return the first match. Strict equivalent to matches[0].
 | 
						|
        """
 | 
						|
        if not self._results:
 | 
						|
            return None
 | 
						|
        return self._results[0]
 | 
						|
 | 
						|
    def first(self) -> CharsetMatch | None:
 | 
						|
        """
 | 
						|
        Redundant method, call the method best(). Kept for BC reasons.
 | 
						|
        """
 | 
						|
        return self.best()
 | 
						|
 | 
						|
 | 
						|
CoherenceMatch = Tuple[str, float]
 | 
						|
CoherenceMatches = List[CoherenceMatch]
 | 
						|
 | 
						|
 | 
						|
class CliDetectionResult:
 | 
						|
    def __init__(
 | 
						|
        self,
 | 
						|
        path: str,
 | 
						|
        encoding: str | None,
 | 
						|
        encoding_aliases: list[str],
 | 
						|
        alternative_encodings: list[str],
 | 
						|
        language: str,
 | 
						|
        alphabets: list[str],
 | 
						|
        has_sig_or_bom: bool,
 | 
						|
        chaos: float,
 | 
						|
        coherence: float,
 | 
						|
        unicode_path: str | None,
 | 
						|
        is_preferred: bool,
 | 
						|
    ):
 | 
						|
        self.path: str = path
 | 
						|
        self.unicode_path: str | None = unicode_path
 | 
						|
        self.encoding: str | None = encoding
 | 
						|
        self.encoding_aliases: list[str] = encoding_aliases
 | 
						|
        self.alternative_encodings: list[str] = alternative_encodings
 | 
						|
        self.language: str = language
 | 
						|
        self.alphabets: list[str] = alphabets
 | 
						|
        self.has_sig_or_bom: bool = has_sig_or_bom
 | 
						|
        self.chaos: float = chaos
 | 
						|
        self.coherence: float = coherence
 | 
						|
        self.is_preferred: bool = is_preferred
 | 
						|
 | 
						|
    @property
 | 
						|
    def __dict__(self) -> dict[str, Any]:  # type: ignore
 | 
						|
        return {
 | 
						|
            "path": self.path,
 | 
						|
            "encoding": self.encoding,
 | 
						|
            "encoding_aliases": self.encoding_aliases,
 | 
						|
            "alternative_encodings": self.alternative_encodings,
 | 
						|
            "language": self.language,
 | 
						|
            "alphabets": self.alphabets,
 | 
						|
            "has_sig_or_bom": self.has_sig_or_bom,
 | 
						|
            "chaos": self.chaos,
 | 
						|
            "coherence": self.coherence,
 | 
						|
            "unicode_path": self.unicode_path,
 | 
						|
            "is_preferred": self.is_preferred,
 | 
						|
        }
 | 
						|
 | 
						|
    def to_json(self) -> str:
 | 
						|
        return dumps(self.__dict__, ensure_ascii=True, indent=4)
 |