You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
	
	
		
			396 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
			
		
		
	
	
			396 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
from __future__ import annotations

import importlib
from codecs import IncrementalDecoder
from collections import Counter, defaultdict
from functools import lru_cache
from typing import Counter as TypeCounter

from .constant import (
    FREQUENCIES,
    KO_NAMES,
    LANGUAGE_SUPPORTED_COUNT,
    TOO_SMALL_SEQUENCE,
    ZH_NAMES,
)
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
    is_accentuated,
    is_latin,
    is_multi_byte_encoding,
    is_unicode_range_secondary,
    unicode_range,
)
 | 
						|
 | 
						|
 | 
						|
def encoding_unicode_range(iana_name: str) -> list[str]:
    """
    Return associated unicode ranges in a single byte code page.

    Probes bytes 0x40-0xFE through the codec's incremental decoder and keeps
    the primary (non-secondary) unicode ranges that account for at least 15%
    of the decodable characters.

    :param iana_name: IANA name of a single-byte encoding.
    :raises OSError: if iana_name designates a multi-byte encoding.
    """
    if is_multi_byte_encoding(iana_name):
        raise OSError("Function not supported on multi-byte code page")

    decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder

    p: IncrementalDecoder = decoder(errors="ignore")
    seen_ranges: dict[str, int] = {}
    character_count: int = 0

    for i in range(0x40, 0xFF):
        chunk: str = p.decode(bytes([i]))

        if chunk:
            character_range: str | None = unicode_range(chunk)

            if character_range is None:
                continue

            if is_unicode_range_secondary(character_range) is False:
                if character_range not in seen_ranges:
                    seen_ranges[character_range] = 0
                seen_ranges[character_range] += 1
                character_count += 1

    # Guard against a ZeroDivisionError when no probed byte decoded to a
    # primary-range character.
    if character_count == 0:
        return []

    return sorted(
        [
            character_range
            for character_range in seen_ranges
            if seen_ranges[character_range] / character_count >= 0.15
        ]
    )
 | 
						|
 | 
						|
 | 
						|
def unicode_range_languages(primary_range: str) -> list[str]:
    """
    Return inferred languages used with a unicode range.
    """
    # A language qualifies as soon as one of its frequent characters
    # belongs to the requested range.
    return [
        language
        for language, characters in FREQUENCIES.items()
        if any(unicode_range(character) == primary_range for character in characters)
    ]
 | 
						|
 | 
						|
 | 
						|
@lru_cache()
def encoding_languages(iana_name: str) -> list[str]:
    """
    Single-byte encoding language association. Some code page are heavily linked to particular language(s).
    This function does the correspondence.
    """
    discovered_ranges: list[str] = encoding_unicode_range(iana_name)

    # The first non-Latin range drives the inference; a purely Latin
    # code page gets the generic marker instead.
    primary_range: str | None = next(
        (candidate for candidate in discovered_ranges if "Latin" not in candidate),
        None,
    )

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)
 | 
						|
 | 
						|
 | 
						|
@lru_cache()
 | 
						|
def mb_encoding_languages(iana_name: str) -> list[str]:
 | 
						|
    """
 | 
						|
    Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
 | 
						|
    This function does the correspondence.
 | 
						|
    """
 | 
						|
    if (
 | 
						|
        iana_name.startswith("shift_")
 | 
						|
        or iana_name.startswith("iso2022_jp")
 | 
						|
        or iana_name.startswith("euc_j")
 | 
						|
        or iana_name == "cp932"
 | 
						|
    ):
 | 
						|
        return ["Japanese"]
 | 
						|
    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
 | 
						|
        return ["Chinese"]
 | 
						|
    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
 | 
						|
        return ["Korean"]
 | 
						|
 | 
						|
    return []
 | 
						|
 | 
						|
 | 
						|
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> tuple[bool, bool]:
    """
    Determine main aspects from a supported language if it contains accents and if is pure Latin.
    """
    characters = FREQUENCIES[language]

    # (has at least one accentuated character, every character is Latin)
    return (
        any(is_accentuated(character) for character in characters),
        all(is_latin(character) is not False for character in characters),
    )
 | 
						|
 | 
						|
 | 
						|
def alphabet_languages(
    characters: list[str], ignore_non_latin: bool = False
) -> list[str]:
    """
    Return associated languages associated to given characters.

    :param characters: Characters observed in the analysed payload.
    :param ignore_non_latin: Skip languages that are not purely Latin based.
    :return: Language names sorted by decreasing match ratio.
    """
    languages: list[tuple[str, float]] = []

    # Build the set once: O(1) membership tests instead of scanning the
    # list for every candidate character of every language.
    source_characters: set[str] = set(characters)

    source_have_accents = any(is_accentuated(character) for character in characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        # An accent-free language cannot explain accented source characters.
        if target_have_accents is False and source_have_accents:
            continue

        character_count: int = len(language_characters)

        character_match_count: int = len(
            [c for c in language_characters if c in source_characters]
        )

        ratio: float = character_match_count / character_count

        if ratio >= 0.2:
            languages.append((language, ratio))

    languages = sorted(languages, key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]
 | 
						|
 | 
						|
 | 
						|
def characters_popularity_compare(
    language: str, ordered_characters: list[str]
) -> float:
    """
    Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
    The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
    Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)

    :raises ValueError: if the language is not a FREQUENCIES entry.
    """
    if language not in FREQUENCIES:
        raise ValueError(f"{language} not available")

    # Hoist the per-language table (and its set form) out of the loop instead
    # of re-doing the FREQUENCIES lookup on every iteration.
    language_characters: list[str] = FREQUENCIES[language]
    language_characters_set = set(language_characters)

    character_approved_count: int = 0

    ordered_characters_count: int = len(ordered_characters)
    target_language_characters_count: int = len(language_characters)

    # An empty sample cannot match anything; also avoids a division by zero
    # at the final ratio computation.
    if ordered_characters_count == 0:
        return 0.0

    large_alphabet: bool = target_language_characters_count > 26

    # Loop-invariant: depends only on the two lengths.
    expected_projection_ratio: float = (
        target_language_characters_count / ordered_characters_count
    )

    for character_rank, character in enumerate(ordered_characters):
        if character not in language_characters_set:
            continue

        character_rank_in_language: int = language_characters.index(character)
        character_rank_projection: int = int(character_rank * expected_projection_ratio)

        # Small alphabets: the observed rank must project close to the
        # language rank, otherwise the character is not approved.
        if (
            large_alphabet is False
            and abs(character_rank_projection - character_rank_in_language) > 4
        ):
            continue

        # Large alphabets: a loose projection window is enough to approve.
        if (
            large_alphabet is True
            and abs(character_rank_projection - character_rank_in_language)
            < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        characters_before_source: list[str] = language_characters[
            0:character_rank_in_language
        ]
        characters_after_source: list[str] = language_characters[
            character_rank_in_language:
        ]
        characters_before: list[str] = ordered_characters[0:character_rank]
        characters_after: list[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        # Otherwise require a decent overlap of the surrounding characters
        # on at least one side.
        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / ordered_characters_count
 | 
						|
 | 
						|
 | 
						|
def alpha_unicode_split(decoded_sequence: str) -> list[str]:
    """
    Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
    Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
    One containing the latin letters and the other hebrew.
    """
    buckets: dict[str, str] = {}

    for symbol in decoded_sequence:
        # Only alphabetic characters take part in the separation.
        if symbol.isalpha() is False:
            continue

        symbol_range: str | None = unicode_range(symbol)

        if symbol_range is None:
            continue

        # Attach the character to an already discovered, compatible layer
        # when possible; otherwise open a new layer keyed by its own range.
        destination: str | None = None

        for known_range in buckets:
            if (
                is_suspiciously_successive_range(known_range, symbol_range)
                is False
            ):
                destination = known_range
                break

        if destination is None:
            destination = symbol_range

        buckets[destination] = buckets.get(destination, "") + symbol.lower()

    return list(buckets.values())
 | 
						|
 | 
						|
 | 
						|
def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
    """
    This function merge results previously given by the function coherence_ratio.
    The return type is the same as coherence_ratio.

    :param results: Per-layer coherence results to merge.
    :return: One (language, mean ratio rounded to 4 digits) entry per
        language, sorted by decreasing ratio.
    """
    # defaultdict spares the explicit "first occurrence" branch of the
    # original hand-rolled grouping.
    per_language_ratios: dict[str, list[float]] = defaultdict(list)

    for result in results:
        for language, ratio in result:
            per_language_ratios[language].append(ratio)

    merge = [
        (language, round(sum(ratios) / len(ratios), 4))
        for language, ratios in per_language_ratios.items()
    ]

    return sorted(merge, key=lambda x: x[1], reverse=True)
 | 
						|
 | 
						|
 | 
						|
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an alternative
    of "English". This function only keeps the best match and remove the em-dash in it.
    """
    grouped: dict[str, list[float]] = {}

    for language, ratio in results:
        # Strip the em-dash so a language and its alternative share a key.
        canonical_name: str = language.replace("—", "")
        grouped.setdefault(canonical_name, []).append(ratio)

    # Without any duplicate there is nothing to collapse.
    if all(len(ratios) <= 1 for ratios in grouped.values()):
        return results

    # Keep the best ratio per canonical language name.
    return [
        (canonical_name, max(ratios)) for canonical_name, ratios in grouped.items()
    ]
 | 
						|
 | 
						|
 | 
						|
@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
    A layer = Character extraction by alphabets/ranges.
    """
    results: list[tuple[str, float]] = []
    ignore_non_latin: bool = False
    sufficient_match_count: int = 0

    lg_inclusion_list: list[str] = (
        lg_inclusion.split(",") if lg_inclusion is not None else []
    )
    if "Latin Based" in lg_inclusion_list:
        ignore_non_latin = True
        lg_inclusion_list.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        occurrences: TypeCounter[str] = Counter(layer)
        most_common = occurrences.most_common()

        character_count: int = sum(count for _, count in most_common)

        # Too few characters in this alphabet to produce a meaningful ratio.
        if character_count <= TOO_SMALL_SEQUENCE:
            continue

        popular_character_ordered: list[str] = [c for c, _ in most_common]

        # Restrict the scan to requested languages when provided; otherwise
        # infer candidates from the observed alphabet.
        candidate_languages = lg_inclusion_list or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        )

        for language in candidate_languages:
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue

            if ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            # Enough highly-confident matches collected; stop this layer.
            if sufficient_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
    )
 |