You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
	
	
		
			357 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Python
		
	
			
		
		
	
	
			357 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Python
		
	
# unicode.py
 | 
						|
 | 
						|
import sys
 | 
						|
from itertools import filterfalse
 | 
						|
from typing import Union
 | 
						|
 | 
						|
 | 
						|
class _lazyclassproperty:
    """
    Descriptor that evaluates a class-level value on first access and then
    serves it from a per-class cache.

    The wrapped function is called with the owning class as its only
    argument; the result is stored in that class's ``_intern`` dict under
    the function's name.
    """

    def __init__(self, fn):
        # keep the wrapped function and mirror its doc/name on the descriptor
        self.fn = fn
        self.__doc__ = fn.__doc__
        self.__name__ = fn.__name__

    def __get__(self, obj, cls):
        owner = type(obj) if cls is None else cls

        # Each class needs a cache dict of its own.  If ``_intern`` is absent,
        # or the one visible on ``owner`` is really the same object as one
        # found on a class later in the MRO (i.e. it was only inherited),
        # start a fresh dict so values computed for a base class are not
        # served to its subclasses.
        needs_own_cache = not hasattr(owner, "_intern")
        if not needs_own_cache:
            for base in owner.__mro__[1:]:
                if owner._intern is getattr(base, "_intern", []):
                    needs_own_cache = True
                    break
        if needs_own_cache:
            owner._intern = {}

        key = self.fn.__name__
        if key not in owner._intern:
            owner._intern[key] = self.fn(owner)
        return owner._intern[key]
 | 
						|
 | 
						|
 | 
						|
# Type alias for the ``_ranges`` attribute of a unicode_set: a list whose
# entries are either a 2-tuple (first, last) of code points, inclusive at both
# ends, or a 1-tuple (x,) standing for the single code point x.
UnicodeRangeList = list[Union[tuple[int, int], tuple[int]]]
 | 
						|
 | 
						|
 | 
						|
class unicode_set:
    """
    A set of Unicode characters, for language-specific strings for
    ``alphas``, ``nums``, ``alphanums``, and ``printables``.
    A unicode_set is defined by a list of ranges in the Unicode character
    set, in a class attribute ``_ranges``. Ranges can be specified using
    2-tuples or a 1-tuple, such as::

        _ranges = [
            (0x0020, 0x007e),
            (0x00a0, 0x00ff),
            (0x0100,),
            ]

    Ranges are left- and right-inclusive. A 1-tuple of (x,) is treated as (x, x).

    A unicode set can also be defined using multiple inheritance of other unicode sets::

        class CJK(Chinese, Japanese, Korean):
            pass

    All of the character-string attributes are computed lazily on first
    access and cached per class.
    """

    _ranges: UnicodeRangeList = []

    @_lazyclassproperty
    def _chars_for_ranges(cls) -> list[str]:
        """
        sorted list of the distinct characters covered by the ``_ranges`` of
        this class and of every class ahead of ``unicode_set`` in its MRO
        """
        # Collect straight into a set: ranges from different base classes may
        # overlap, and a set avoids first materializing every code point in
        # an intermediate list just to de-duplicate it afterwards.
        code_points: set[int] = set()
        for cc in cls.__mro__:  # type: ignore[attr-defined]
            if cc is unicode_set:
                # everything past this point in the MRO defines no ranges
                break
            for rr in getattr(cc, "_ranges", ()):
                # rr[-1] is rr[0] for a 1-tuple, so (x,) behaves as (x, x)
                code_points.update(range(rr[0], rr[-1] + 1))
        return sorted(map(chr, code_points))

    @_lazyclassproperty
    def printables(cls) -> str:
        """all non-whitespace characters in this range"""
        return "".join(filterfalse(str.isspace, cls._chars_for_ranges))

    @_lazyclassproperty
    def alphas(cls) -> str:
        """all alphabetic characters in this range"""
        return "".join(filter(str.isalpha, cls._chars_for_ranges))

    @_lazyclassproperty
    def nums(cls) -> str:
        """all numeric digit characters in this range"""
        return "".join(filter(str.isdigit, cls._chars_for_ranges))

    @_lazyclassproperty
    def alphanums(cls) -> str:
        """all alphanumeric characters in this range (``alphas`` followed by ``nums``)"""
        return cls.alphas + cls.nums

    @_lazyclassproperty
    def identchars(cls) -> str:
        """
        all characters in this range that are valid identifier characters,
        plus the ASCII and Latin-1 letters and underscore '_' (these are
        always included, whether or not they fall inside this range)
        """
        return "".join(
            sorted(
                set(filter(str.isidentifier, cls._chars_for_ranges))
                | set(
                    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº"
                    "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
                    "_"
                )
            )
        )

    @_lazyclassproperty
    def identbodychars(cls) -> str:
        """
        all characters in this range that are valid identifier body characters,
        plus everything in ``identchars``, the digits 0-9, and · (Unicode MIDDLE DOT)
        """
        # A character is a valid body character if it is accepted anywhere
        # after the first position, so test it behind a leading underscore.
        identifier_chars = set(
            c for c in cls._chars_for_ranges if ("_" + c).isidentifier()
        )
        return "".join(
            sorted(identifier_chars | set(cls.identchars) | set("0123456789·"))
        )

    @_lazyclassproperty
    def identifier(cls):
        """
        a pyparsing Word expression for an identifier using this range's definitions for
        identchars and identbodychars
        """
        # imported here rather than at module level, as in the original code
        from pyparsing import Word

        return Word(cls.identchars, cls.identbodychars)
 | 
						|
 | 
						|
 | 
						|
class pyparsing_unicode(unicode_set):
    """
    A namespace class for defining common language unicode_sets.

    The class's own ``_ranges`` runs from U+0020 through ``sys.maxunicode``.
    The nested classes each define the ranges for one script or block, and
    several of them are also bound to aliases, including names written in
    the script itself.
    """

    # fmt: off

    # define ranges in language character sets
    _ranges: UnicodeRangeList = [
        (0x0020, sys.maxunicode),
    ]

    class BasicMultilingualPlane(unicode_set):
        """Unicode set for the Basic Multilingual Plane"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0xFFFF),
        ]

    class Latin1(unicode_set):
        """Unicode set for Latin-1 Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0x007E),
            (0x00A0, 0x00FF),
        ]

    class LatinA(unicode_set):
        """Unicode set for Latin-A Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0100, 0x017F),
        ]

    class LatinB(unicode_set):
        """Unicode set for Latin-B Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0180, 0x024F),
        ]

    class Greek(unicode_set):
        """Unicode set for Greek Unicode Character Ranges"""
        _ranges: UnicodeRangeList = [
            (0x0342, 0x0345),
            (0x0370, 0x0377),
            (0x037A, 0x037F),
            (0x0384, 0x038A),
            (0x038C,),
            (0x038E, 0x03A1),
            (0x03A3, 0x03E1),
            (0x03F0, 0x03FF),
            (0x1D26, 0x1D2A),
            (0x1D5E,),
            (0x1D60,),
            (0x1D66, 0x1D6A),
            (0x1F00, 0x1F15),
            (0x1F18, 0x1F1D),
            (0x1F20, 0x1F45),
            (0x1F48, 0x1F4D),
            (0x1F50, 0x1F57),
            (0x1F59,),
            (0x1F5B,),
            (0x1F5D,),
            (0x1F5F, 0x1F7D),
            (0x1F80, 0x1FB4),
            (0x1FB6, 0x1FC4),
            (0x1FC6, 0x1FD3),
            (0x1FD6, 0x1FDB),
            (0x1FDD, 0x1FEF),
            (0x1FF2, 0x1FF4),
            (0x1FF6, 0x1FFE),
            (0x2129,),
            (0x2719, 0x271A),
            (0xAB65,),
            (0x10140, 0x1018D),
            (0x101A0,),
            (0x1D200, 0x1D245),
            (0x1F7A1, 0x1F7A7),
        ]

    class Cyrillic(unicode_set):
        """Unicode set for Cyrillic Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0400, 0x052F),
            (0x1C80, 0x1C88),
            (0x1D2B,),
            (0x1D78,),
            (0x2DE0, 0x2DFF),
            (0xA640, 0xA672),
            (0xA674, 0xA69F),
            (0xFE2E, 0xFE2F),
        ]

    class Chinese(unicode_set):
        """Unicode set for Chinese Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x2E80, 0x2E99),
            (0x2E9B, 0x2EF3),
            (0x31C0, 0x31E3),
            (0x3400, 0x4DB5),
            (0x4E00, 0x9FEF),
            (0xA700, 0xA707),
            (0xF900, 0xFA6D),
            (0xFA70, 0xFAD9),
            (0x16FE2, 0x16FE3),
            (0x1F210, 0x1F212),
            (0x1F214, 0x1F23B),
            (0x1F240, 0x1F248),
            (0x20000, 0x2A6D6),
            (0x2A700, 0x2B734),
            (0x2B740, 0x2B81D),
            (0x2B820, 0x2CEA1),
            (0x2CEB0, 0x2EBE0),
            (0x2F800, 0x2FA1D),
        ]

    class Japanese(unicode_set):
        """Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges"""

        class Kanji(unicode_set):
            """Unicode set for Kanji Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x4E00, 0x9FBF),
                (0x3000, 0x303F),
            ]

        class Hiragana(unicode_set):
            """Unicode set for Hiragana Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x3041, 0x3096),
                (0x3099, 0x30A0),
                (0x30FC,),
                (0xFF70,),
                (0x1B001,),
                (0x1B150, 0x1B152),
                (0x1F200,),
            ]

        class Katakana(unicode_set):
            """Unicode set for Katakana Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x3099, 0x309C),
                (0x30A0, 0x30FF),
                (0x31F0, 0x31FF),
                (0x32D0, 0x32FE),
                (0xFF65, 0xFF9F),
                (0x1B000,),
                (0x1B164, 0x1B167),
                (0x1F201, 0x1F202),
                (0x1F213,),
            ]

        # aliases for the three nested sets, named in Japanese script
        漢字 = Kanji
        カタカナ = Katakana
        ひらがな = Hiragana

        # Japanese's own ranges are the concatenation of the three nested
        # sets' range lists (a new list; the nested lists are not modified)
        _ranges = (
            Kanji._ranges
            + Hiragana._ranges
            + Katakana._ranges
        )

    class Hangul(unicode_set):
        """Unicode set for Hangul (Korean) Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x1100, 0x11FF),
            (0x302E, 0x302F),
            (0x3131, 0x318E),
            (0x3200, 0x321C),
            (0x3260, 0x327B),
            (0x327E,),
            (0xA960, 0xA97C),
            (0xAC00, 0xD7A3),
            (0xD7B0, 0xD7C6),
            (0xD7CB, 0xD7FB),
            (0xFFA0, 0xFFBE),
            (0xFFC2, 0xFFC7),
            (0xFFCA, 0xFFCF),
            (0xFFD2, 0xFFD7),
            (0xFFDA, 0xFFDC),
        ]

    # Korean is the same class object as Hangul, not a subclass
    Korean = Hangul

    class CJK(Chinese, Japanese, Hangul):
        """Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range"""
        # defines no _ranges of its own; its characters come from the three
        # base classes, gathered by the MRO walk in unicode_set._chars_for_ranges

    class Thai(unicode_set):
        """Unicode set for Thai Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0E01, 0x0E3A),
            (0x0E3F, 0x0E5B)
        ]

    class Arabic(unicode_set):
        """Unicode set for Arabic Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0600, 0x061B),
            (0x061E, 0x06FF),
            (0x0700, 0x077F),
        ]

    class Hebrew(unicode_set):
        """Unicode set for Hebrew Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0591, 0x05C7),
            (0x05D0, 0x05EA),
            (0x05EF, 0x05F4),
            (0xFB1D, 0xFB36),
            (0xFB38, 0xFB3C),
            (0xFB3E,),
            (0xFB40, 0xFB41),
            (0xFB43, 0xFB44),
            (0xFB46, 0xFB4F),
        ]

    class Devanagari(unicode_set):
        """Unicode set for Devanagari Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0900, 0x097F),
            (0xA8E0, 0xA8FF)
        ]

    # short alias for BasicMultilingualPlane
    BMP = BasicMultilingualPlane

    # add language identifiers using language Unicode
    # (each name below is bound to the same class object as the name on its right)
    العربية = Arabic
    中文 = Chinese
    кириллица = Cyrillic
    Ελληνικά = Greek
    עִברִית = Hebrew
    日本語 = Japanese
    한국어 = Korean
    ไทย = Thai
    देवनागरी = Devanagari

    # fmt: on
 |