You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
	
	
		
			636 lines
		
	
	
		
			20 KiB
		
	
	
	
		
			Python
		
	
			
		
		
	
	
			636 lines
		
	
	
		
			20 KiB
		
	
	
	
		
			Python
		
	
from __future__ import annotations
 | 
						|
 | 
						|
from functools import lru_cache
 | 
						|
from logging import getLogger
 | 
						|
 | 
						|
from .constant import (
 | 
						|
    COMMON_SAFE_ASCII_CHARACTERS,
 | 
						|
    TRACE,
 | 
						|
    UNICODE_SECONDARY_RANGE_KEYWORD,
 | 
						|
)
 | 
						|
from .utils import (
 | 
						|
    is_accentuated,
 | 
						|
    is_arabic,
 | 
						|
    is_arabic_isolated_form,
 | 
						|
    is_case_variable,
 | 
						|
    is_cjk,
 | 
						|
    is_emoticon,
 | 
						|
    is_hangul,
 | 
						|
    is_hiragana,
 | 
						|
    is_katakana,
 | 
						|
    is_latin,
 | 
						|
    is_punctuation,
 | 
						|
    is_separator,
 | 
						|
    is_symbol,
 | 
						|
    is_thai,
 | 
						|
    is_unprintable,
 | 
						|
    remove_accent,
 | 
						|
    unicode_range,
 | 
						|
    is_cjk_uncommon,
 | 
						|
)
 | 
						|
 | 
						|
 | 
						|
class MessDetectorPlugin:
    """
    Abstract base for every mess detection plugin.

    Concrete detectors MUST subclass it and override eligible(), feed(),
    reset() and the ratio property; each default below only raises
    NotImplementedError.
    """

    def eligible(self, character: str) -> bool:
        """
        Tell whether the given character should be handed to feed().
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        Main routine, executed on one character.
        Put here the logic by which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Bring the plugin back to its initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Chaos ratio derived from what feed() has seen so far.
        Must NOT be lower than 0.; there is no upper bound.
        """
        raise NotImplementedError  # pragma: nocover
 | 
						|
 | 
						|
 | 
						|
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Measure how much of the printable text is punctuation or symbols.

    A symbol weighs twice as much as a punctuation sign, and a character that
    merely repeats the previous one is not counted again.
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        self._last_printable_char: str | None = None
        # Never read inside this class; kept so the attribute stays available.
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        """
        Only printable characters are analysed.
        """
        return character.isprintable()

    def feed(self, character: str) -> None:
        """
        Count the character, and classify it unless it repeats the previous
        character or belongs to COMMON_SAFE_ASCII_CHARACTERS.
        """
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                not character.isdigit()
                and is_symbol(character)
                and not is_emoticon(character)
            ):
                # Symbols are weighted double compared to punctuation.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # Abstract
        """
        Restore every attribute to the value set by __init__.
        """
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # Forget the previous text's last character too; otherwise the first
        # character fed after a reset could be skipped as a "repeat".
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        """
        (punctuation + weighted symbols) / characters, or 0.0 when that share
        is below 0.3 or nothing has been fed yet.
        """
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
 | 
						|
 | 
						|
 | 
						|
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Measure the share of accentuated characters among alphabetic ones.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        """
        Only alphabetic characters are analysed.
        """
        return character.isalpha()

    def feed(self, character: str) -> None:
        """
        Count the character and, when applicable, count it as accentuated.
        """
        self._character_count += 1
        self._accentuated_count += 1 if is_accentuated(character) else 0

    def reset(self) -> None:  # Abstract
        """
        Zero both counters.
        """
        self._accentuated_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        accentuated / alphabetic characters; 0.0 when fewer than 8 characters
        were fed or when the share stays under 0.35.
        """
        seen = self._character_count
        if seen < 8:
            return 0.0

        share: float = self._accentuated_count / seen
        if share >= 0.35:
            return share
        return 0.0
 | 
						|
 | 
						|
 | 
						|
class UnprintablePlugin(MessDetectorPlugin):
    """
    Measure the share of unprintable characters, weighted by a factor of 8.
    """

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        """
        Every character is analysed.
        """
        return True

    def feed(self, character: str) -> None:
        """
        Count the character, and count it as unprintable when
        is_unprintable() says so.
        """
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # Abstract
        """
        Zero both counters.
        """
        self._unprintable_count = 0
        # The total must be cleared as well, otherwise ratios computed after
        # a reset are diluted by the characters of the previous text.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        (unprintable * 8) / characters, or 0.0 when nothing has been fed.
        """
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count
 | 
						|
 | 
						|
 | 
						|
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """
    Track pairs of consecutive accentuated Latin letters.
    """

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: str | None = None

    def eligible(self, character: str) -> bool:
        """
        Only alphabetic Latin characters are analysed.
        """
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        """
        Compare the character with the previously fed one and score the pair
        when both are accentuated.
        """
        self._character_count += 1

        previous = self._last_latin_character
        both_accentuated = (
            previous is not None
            and is_accentuated(character)
            and is_accentuated(previous)
        )

        if both_accentuated:
            # Two upper-case accentuated letters in a row.
            if character.isupper() and previous.isupper():
                self._successive_count += 1
            # Worse when it is the same base letter carrying a different accent.
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1

        self._last_latin_character = character

    def reset(self) -> None:  # Abstract
        """
        Zero the counters and forget the previous character.
        """
        self._last_latin_character = None
        self._character_count = 0
        self._successive_count = 0

    @property
    def ratio(self) -> float:
        """
        (successive count * 2) / characters, or 0.0 when nothing has been fed.
        """
        total = self._character_count
        if total == 0:
            return 0.0

        return (self._successive_count * 2) / total
 | 
						|
 | 
						|
 | 
						|
class SuspiciousRange(MessDetectorPlugin):
    """
    Count adjacent printable characters whose Unicode ranges look suspicious
    next to each other.
    """

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: str | None = None

    def eligible(self, character: str) -> bool:
        """
        Only printable characters are analysed.
        """
        return character.isprintable()

    def feed(self, character: str) -> None:
        """
        Compare the Unicode range of the character with the one of the
        previously kept character.
        """
        self._character_count += 1

        breaks_sequence = (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        )
        if breaks_sequence:
            # Whitespace, punctuation and common safe ASCII interrupt the
            # pairing: the next character starts a fresh sequence.
            self._last_printable_seen = None
            return

        previous = self._last_printable_seen
        if previous is not None and is_suspiciously_successive_range(
            unicode_range(previous), unicode_range(character)
        ):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # Abstract
        """
        Zero the counters and forget the previous character.
        """
        self._last_printable_seen = None
        self._suspicious_successive_range_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        (suspicious pairs * 2) / characters; 0.0 while 13 characters or fewer
        have been fed.
        """
        total = self._character_count
        if total <= 13:
            return 0.0

        return (self._suspicious_successive_range_count * 2) / total
 | 
						|
 | 
						|
 | 
						|
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Accumulate characters into words and measure how many characters belong
    to words judged "bad" (too accentuated, oddly terminated, containing
    symbols, or suspiciously long and foreign).
    """

    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        # State of the word currently being accumulated.
        self._buffer: str = ""
        self._buffer_accent_count: int = 0
        self._buffer_glyph_count: int = 0

    def eligible(self, character: str) -> bool:
        """
        Every character is analysed.
        """
        return True

    def feed(self, character: str) -> None:
        """
        Grow the current word with alphabetic characters (and symbols found
        inside a word); evaluate the word once a whitespace, punctuation or
        separator character is met.
        """
        if character.isalpha():
            self._buffer += character

            accentuated = is_accentuated(character)
            if accentuated:
                self._buffer_accent_count += 1

            # Scripts that are exempt from the "foreign long word" watch.
            glyph = (
                is_cjk(character)
                or is_hangul(character)
                or is_katakana(character)
                or is_hiragana(character)
                or is_thai(character)
            )

            if (
                not self._foreign_long_watch
                and not glyph
                and (accentuated or not is_latin(character))
            ):
                self._foreign_long_watch = True

            if glyph:
                self._buffer_glyph_count += 1

            return

        # Nothing accumulated: a non-alphabetic character has no word to end
        # or to taint.
        if not self._buffer:
            return

        if (
            character.isspace()
            or is_punctuation(character)
            or is_separator(character)
        ):
            self._close_word()
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and not character.isdigit()
            and is_symbol(character)
        ):
            # A symbol in the middle of a word taints the whole word.
            self._is_current_word_bad = True
            self._buffer += character

    def _close_word(self) -> None:
        """
        Evaluate the buffered word, update the counters and clear the buffer.
        """
        buffer_length: int = len(self._buffer)

        self._word_count += 1
        self._character_count += buffer_length

        if buffer_length >= 4:
            if self._buffer_accent_count / buffer_length >= 0.5:
                self._is_current_word_bad = True
            # Word/Buffer ending with an upper case accentuated letter are so rare,
            # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
            elif (
                is_accentuated(self._buffer[-1])
                and self._buffer[-1].isupper()
                and not all(c.isupper() for c in self._buffer)
            ):
                self._foreign_long_count += 1
                self._is_current_word_bad = True
            elif self._buffer_glyph_count == 1:
                self._is_current_word_bad = True
                self._foreign_long_count += 1

        if buffer_length >= 24 and self._foreign_long_watch:
            upper_count = sum(1 for c in self._buffer if c.isupper())
            # Some, but at most 30%, upper-case characters: probably a
            # camelCased identifier rather than garbage.
            probable_camel_cased: bool = (
                upper_count > 0 and upper_count / buffer_length <= 0.3
            )

            if not probable_camel_cased:
                self._foreign_long_count += 1
                self._is_current_word_bad = True

        if self._is_current_word_bad:
            self._bad_word_count += 1
            self._bad_character_count += buffer_length
            self._is_current_word_bad = False

        self._foreign_long_watch = False
        self._buffer = ""
        self._buffer_accent_count = 0
        self._buffer_glyph_count = 0

    def reset(self) -> None:  # Abstract
        """
        Restore every attribute to the value set by __init__.
        """
        self._buffer = ""
        # The per-word counters describe the buffer, so they are cleared
        # together with it.
        self._buffer_accent_count = 0
        self._buffer_glyph_count = 0
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        """
        bad characters / characters of all closed words; 0.0 while 10 words or
        fewer were closed and no foreign-long word was recorded.
        """
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count
 | 
						|
 | 
						|
 | 
						|
class CjkUncommonPlugin(MessDetectorPlugin):
    """
    Spot messy CJK text that most likely carries no meaning.
    """

    def __init__(self) -> None:
        self._uncommon_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        """
        Only CJK characters are analysed.
        """
        return is_cjk(character)

    def feed(self, character: str) -> None:
        """
        Count the character and, when applicable, count it as uncommon.
        """
        self._character_count += 1
        self._uncommon_count += 1 if is_cjk_uncommon(character) else 0

    def reset(self) -> None:  # Abstract
        """
        Zero both counters.
        """
        self._uncommon_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        A tenth of the uncommon share when that share exceeds 0.5, else 0.0;
        always 0.0 when fewer than 8 characters were fed.
        """
        total = self._character_count
        if total < 8:
            return 0.0

        uncommon_share: float = self._uncommon_count / total

        # Widespread uncommon characters are a strong hint of garbage; a
        # smaller share could simply be traditional Chinese, for example.
        if uncommon_share > 0.5:
            return uncommon_share / 10
        return 0.0
 | 
						|
 | 
						|
 | 
						|
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Count back-to-back upper/lower case alternations inside chunks of
    characters that are not purely ASCII.
    """

    def __init__(self) -> None:
        # True right after a single case flip; a second consecutive flip scores.
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        # Score of the chunk in progress / score accumulated over closed chunks.
        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: str | None = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        """
        Every character is analysed.
        """
        return True

    def feed(self, character: str) -> None:
        """
        Either close the chunk in progress (when the character is not a
        case-variable letter and the chunk is non-empty) or extend it.
        """
        concerned = character.isalpha() and is_case_variable(character)

        if concerned is False and self._character_count_since_last_sep > 0:
            keep_chunk_score = (
                self._character_count_since_last_sep <= 64
                and not character.isdigit()
                and not self._current_ascii_only
            )
            if keep_chunk_score:
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            # Start over with an empty chunk; the separator itself is counted.
            self._character_count += 1
            self._character_count_since_last_sep = 0
            self._successive_upper_lower_count = 0
            self._current_ascii_only = True
            self._last_alpha_seen = None
            self._buf = False
            return

        if not character.isascii():
            self._current_ascii_only = False

        previous = self._last_alpha_seen
        if previous is not None:
            case_flipped = (character.isupper() and previous.islower()) or (
                character.islower() and previous.isupper()
            )
            if case_flipped:
                if self._buf:
                    self._successive_upper_lower_count += 2
                self._buf = not self._buf
            else:
                self._buf = False

        self._last_alpha_seen = character
        self._character_count_since_last_sep += 1
        self._character_count += 1

    def reset(self) -> None:  # Abstract
        """
        Restore every attribute to the value set by __init__.
        """
        self._buf = False
        self._current_ascii_only = True
        self._last_alpha_seen = None
        self._successive_upper_lower_count_final = 0
        self._successive_upper_lower_count = 0
        self._character_count_since_last_sep = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        accumulated alternation score / characters, or 0.0 when nothing has
        been fed.
        """
        total = self._character_count
        if total == 0:
            return 0.0

        return self._successive_upper_lower_count_final / total
 | 
						|
 | 
						|
 | 
						|
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """
    Measure the share of isolated-form characters among Arabic ones.
    """

    def __init__(self) -> None:
        self._isolated_form_count: int = 0
        self._character_count: int = 0

    def reset(self) -> None:  # Abstract
        """
        Zero both counters.
        """
        self._isolated_form_count = 0
        self._character_count = 0

    def eligible(self, character: str) -> bool:
        """
        Only Arabic characters are analysed.
        """
        return is_arabic(character)

    def feed(self, character: str) -> None:
        """
        Count the character and, when applicable, count it as an isolated form.
        """
        self._character_count += 1
        self._isolated_form_count += 1 if is_arabic_isolated_form(character) else 0

    @property
    def ratio(self) -> float:
        """
        isolated forms / Arabic characters; 0.0 when fewer than 8 characters
        were fed.
        """
        total = self._character_count
        if total < 8:
            return 0.0

        return self._isolated_form_count / total
 | 
						|
 | 
						|
 | 
						|
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: str | None, unicode_range_b: str | None
) -> bool:
    """
    Tell whether seeing these two Unicode ranges side by side is suspicious.

    An unknown range (None) is always suspicious; otherwise the pair is
    suspicious unless one of the exceptions below applies.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    def in_either(keyword: str) -> bool:
        return keyword in unicode_range_a or keyword in unicode_range_b

    latin_in_a = "Latin" in unicode_range_a
    latin_in_b = "Latin" in unicode_range_b

    if latin_in_a and latin_in_b:
        return False

    if in_either("Emoticons"):
        return False

    # A Latin character may come with a combining diacritical mark
    # (Vietnamese, for instance).
    if (latin_in_a or latin_in_b) and in_either("Combining"):
        return False

    # Ranges sharing a keyword are fine, keywords listed in
    # UNICODE_SECONDARY_RANGE_KEYWORD excepted.
    words_of_b = unicode_range_b.split(" ")
    if any(
        word in words_of_b
        for word in unicode_range_a.split(" ")
        if word not in UNICODE_SECONDARY_RANGE_KEYWORD
    ):
        return False

    # Japanese exception
    kana_ranges = ("Hiragana", "Katakana")
    a_is_kana = unicode_range_a in kana_ranges
    b_is_kana = unicode_range_b in kana_ranges
    cjk_involved = in_either("CJK")
    basic_latin_involved = "Basic Latin" in (unicode_range_a, unicode_range_b)

    if cjk_involved and (a_is_kana or b_is_kana):
        return False
    if a_is_kana and b_is_kana:
        return False

    if in_either("Hangul") and (cjk_involved or basic_latin_involved):
        return False

    # Chinese/Japanese rely on dedicated ranges for punctuation and/or separators.
    if cjk_involved or (a_is_kana and b_is_kana):
        if in_either("Punctuation") or in_either("Forms") or basic_latin_involved:
            return False

    return True
 | 
						|
 | 
						|
 | 
						|
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio for an already decoded sequence.

    Every character (plus one trailing line feed) is offered to one fresh
    instance of each direct MessDetectorPlugin subclass. At regular intervals,
    and on the last character, the detector ratios are summed; the scan stops
    early as soon as that sum reaches maximum_threshold. The last computed sum
    is returned rounded to 3 decimals. With debug enabled, details are logged
    at TRACE level on the "charset_normalizer" logger.
    """

    detectors: list[MessDetectorPlugin] = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]

    # One extra slot for the trailing "\n" appended below.
    length: int = len(decoded_sequence) + 1
    last_index: int = length - 1

    # Despite its name this holds the SUM of the detector ratios.
    mean_mess_ratio: float = 0.0

    # Number of characters between two intermediary evaluations.
    if length < 512:
        intermediary_mean_mess_ratio_calc: int = 32
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

    for index, character in enumerate(decoded_sequence + "\n"):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == last_index:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)

            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in detectors:
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)
 |