py-data-analysis/.venv/lib/python3.12/site-packages/numpy/_core/tests/test_stringdtype.py

import copy
import itertools
import os
import pickle
import sys
import tempfile

import pytest

import numpy as np
from numpy._core.tests._natype import get_stringdtype_dtype as get_dtype
from numpy._core.tests._natype import pd_NA
from numpy.dtypes import StringDType
from numpy.testing import IS_PYPY, assert_array_equal


@pytest.fixture
def string_list():
    return ["abc", "def", "ghi" * 10, "A¢☃€ 😊" * 100, "Abc" * 1000, "DEF"]


# second copy for cast tests to do a cartesian product over dtypes
@pytest.fixture(params=[True, False])
def coerce2(request):
    return request.param


@pytest.fixture(
    params=["unset", None, pd_NA, np.nan, float("nan"), "__nan__"],
    ids=["unset", "None", "pandas.NA", "np.nan", "float('nan')", "string nan"],
)
def na_object2(request):
    return request.param


@pytest.fixture()
def dtype2(na_object2, coerce2):
    # explicit is check for pd_NA because != with pd_NA returns pd_NA
    if na_object2 is pd_NA or na_object2 != "unset":
        return StringDType(na_object=na_object2, coerce=coerce2)
    else:
        return StringDType(coerce=coerce2)


def test_dtype_creation():
    hashes = set()
    dt = StringDType()
    assert not hasattr(dt, "na_object") and dt.coerce is True
    hashes.add(hash(dt))

    dt = StringDType(na_object=None)
    assert dt.na_object is None and dt.coerce is True
    hashes.add(hash(dt))

    dt = StringDType(coerce=False)
    assert not hasattr(dt, "na_object") and dt.coerce is False
    hashes.add(hash(dt))

    dt = StringDType(na_object=None, coerce=False)
    assert dt.na_object is None and dt.coerce is False
    hashes.add(hash(dt))

    assert len(hashes) == 4

    dt = np.dtype("T")
    assert dt == StringDType()
    assert dt.kind == "T"
    assert dt.char == "T"

    hashes.add(hash(dt))
    assert len(hashes) == 4


def test_dtype_equality(dtype):
    assert dtype == dtype
    for ch in "SU":
        assert dtype != np.dtype(ch)
        assert dtype != np.dtype(f"{ch}8")


def test_dtype_repr(dtype):
    if not hasattr(dtype, "na_object") and dtype.coerce:
        assert repr(dtype) == "StringDType()"
    elif dtype.coerce:
        assert repr(dtype) == f"StringDType(na_object={dtype.na_object!r})"
    elif not hasattr(dtype, "na_object"):
        assert repr(dtype) == "StringDType(coerce=False)"
    else:
        assert (
            repr(dtype)
            == f"StringDType(na_object={dtype.na_object!r}, coerce=False)"
        )


def test_create_with_na(dtype):
    if not hasattr(dtype, "na_object"):
        pytest.skip("does not have an na object")
    na_val = dtype.na_object
    string_list = ["hello", na_val, "world"]
    arr = np.array(string_list, dtype=dtype)
    assert str(arr) == "[" + " ".join([repr(s) for s in string_list]) + "]"
    assert arr[1] is dtype.na_object


@pytest.mark.parametrize("i", list(range(5)))
def test_set_replace_na(i):
    # Test strings of various lengths can be set to NaN and then replaced.
    s_empty = ""
    s_short = "0123456789"
    s_medium = "abcdefghijklmnopqrstuvwxyz"
    s_long = "-=+" * 100
    strings = [s_medium, s_empty, s_short, s_medium, s_long]
    a = np.array(strings, StringDType(na_object=np.nan))
    for s in [a[i], s_medium + s_short, s_short, s_empty, s_long]:
        a[i] = np.nan
        assert np.isnan(a[i])
        a[i] = s
        assert a[i] == s
        assert_array_equal(a, strings[:i] + [s] + strings[i + 1:])


def test_null_roundtripping():
    data = ["hello\0world", "ABC\0DEF\0\0"]
    arr = np.array(data, dtype="T")
    assert data[0] == arr[0]
    assert data[1] == arr[1]


def test_string_too_large_error():
    arr = np.array(["a", "b", "c"], dtype=StringDType())
    with pytest.raises(OverflowError):
        arr * (sys.maxsize + 1)


@pytest.mark.parametrize(
    "data",
    [
        ["abc", "def", "ghi"],
        ["🤣", "📵", "😰"],
        ["🚜", "🙃", "😾"],
        ["😹", "🚠", "🚌"],
    ],
)
def test_array_creation_utf8(dtype, data):
    arr = np.array(data, dtype=dtype)
    assert str(arr) == "[" + " ".join(["'" + str(d) + "'" for d in data]) + "]"
    assert arr.dtype == dtype


@pytest.mark.parametrize(
    "data",
    [
        [1, 2, 3],
        [b"abc", b"def", b"ghi"],
        [object, object, object],
    ],
)
def test_scalars_string_conversion(data, dtype):
    try:
        str_vals = [str(d.decode('utf-8')) for d in data]
    except AttributeError:
        str_vals = [str(d) for d in data]
    if dtype.coerce:
        assert_array_equal(
            np.array(data, dtype=dtype),
            np.array(str_vals, dtype=dtype),
        )
    else:
        with pytest.raises(ValueError):
            np.array(data, dtype=dtype)


@pytest.mark.parametrize(
    ("strings"),
    [
        ["this", "is", "an", "array"],
        ["€", "", "😊"],
        ["A¢☃€ 😊", " A☃€¢😊", "☃€😊 A¢", "😊☃A¢ €"],
    ],
)
def test_self_casts(dtype, dtype2, strings):
    if hasattr(dtype, "na_object"):
        strings = strings + [dtype.na_object]
    elif hasattr(dtype2, "na_object"):
        strings = strings + [""]
    arr = np.array(strings, dtype=dtype)
    newarr = arr.astype(dtype2)

    if hasattr(dtype, "na_object") and not hasattr(dtype2, "na_object"):
        assert newarr[-1] == str(dtype.na_object)
        with pytest.raises(TypeError):
            arr.astype(dtype2, casting="safe")
    elif hasattr(dtype, "na_object") and hasattr(dtype2, "na_object"):
        assert newarr[-1] is dtype2.na_object
        arr.astype(dtype2, casting="safe")
    elif hasattr(dtype2, "na_object"):
        assert newarr[-1] == ""
        arr.astype(dtype2, casting="safe")
    else:
        arr.astype(dtype2, casting="safe")

    if hasattr(dtype, "na_object") and hasattr(dtype2, "na_object"):
        na1 = dtype.na_object
        na2 = dtype2.na_object
        if (na1 is not na2 and
             # check for pd_NA first because bool(pd_NA) is an error
             ((na1 is pd_NA or na2 is pd_NA) or
              # the second check is a NaN check, spelled this way
              # to avoid errors from math.isnan and np.isnan
              (na1 != na2 and not (na1 != na1 and na2 != na2)))):
            with pytest.raises(TypeError):
                arr[:-1] == newarr[:-1]
            return
    assert_array_equal(arr[:-1], newarr[:-1])


@pytest.mark.parametrize(
    ("strings"),
    [
        ["this", "is", "an", "array"],
        ["€", "", "😊"],
        ["A¢☃€ 😊", " A☃€¢😊", "☃€😊 A¢", "😊☃A¢ €"],
    ],
)
class TestStringLikeCasts:
    def test_unicode_casts(self, dtype, strings):
        arr = np.array(strings, dtype=np.str_).astype(dtype)
        expected = np.array(strings, dtype=dtype)
        assert_array_equal(arr, expected)

        arr_as_U8 = expected.astype("U8")
        assert_array_equal(arr_as_U8, np.array(strings, dtype="U8"))
        assert_array_equal(arr_as_U8.astype(dtype), arr)
        arr_as_U3 = expected.astype("U3")
        assert_array_equal(arr_as_U3, np.array(strings, dtype="U3"))
        assert_array_equal(
            arr_as_U3.astype(dtype),
            np.array([s[:3] for s in strings], dtype=dtype),
        )

    def test_void_casts(self, dtype, strings):
        sarr = np.array(strings, dtype=dtype)
        utf8_bytes = [s.encode("utf-8") for s in strings]
        void_dtype = f"V{max(len(s) for s in utf8_bytes)}"
        varr = np.array(utf8_bytes, dtype=void_dtype)
        assert_array_equal(varr, sarr.astype(void_dtype))
        assert_array_equal(varr.astype(dtype), sarr)

    def test_bytes_casts(self, dtype, strings):
        sarr = np.array(strings, dtype=dtype)
        try:
            utf8_bytes = [s.encode("ascii") for s in strings]
            bytes_dtype = f"S{max(len(s) for s in utf8_bytes)}"
            barr = np.array(utf8_bytes, dtype=bytes_dtype)
            assert_array_equal(barr, sarr.astype(bytes_dtype))
            assert_array_equal(barr.astype(dtype), sarr)
            if dtype.coerce:
                barr = np.array(utf8_bytes, dtype=dtype)
                assert_array_equal(barr, sarr)
                barr = np.array(utf8_bytes, dtype="O")
                assert_array_equal(barr.astype(dtype), sarr)
            else:
                with pytest.raises(ValueError):
                    np.array(utf8_bytes, dtype=dtype)
        except UnicodeEncodeError:
            with pytest.raises(UnicodeEncodeError):
                sarr.astype("S20")


def test_additional_unicode_cast(random_string_list, dtype):
    arr = np.array(random_string_list, dtype=dtype)
    # test that this short-circuits correctly
    assert_array_equal(arr, arr.astype(arr.dtype))
    # tests the casts via the comparison promoter
    assert_array_equal(arr, arr.astype(random_string_list.dtype))


def test_insert_scalar(dtype, string_list):
    """Test that inserting a scalar works."""
    arr = np.array(string_list, dtype=dtype)
    scalar_instance = "what"
    arr[1] = scalar_instance
    assert_array_equal(
        arr,
        np.array(string_list[:1] + ["what"] + string_list[2:], dtype=dtype),
    )


comparison_operators = [
    np.equal,
    np.not_equal,
    np.greater,
    np.greater_equal,
    np.less,
    np.less_equal,
]


@pytest.mark.parametrize("op", comparison_operators)
@pytest.mark.parametrize("o_dtype", [np.str_, object, StringDType()])
def test_comparisons(string_list, dtype, op, o_dtype):
    sarr = np.array(string_list, dtype=dtype)
    oarr = np.array(string_list, dtype=o_dtype)

    # test that comparison operators work
    res = op(sarr, sarr)
    ores = op(oarr, oarr)
    # test that promotion works as well
    orres = op(sarr, oarr)
    olres = op(oarr, sarr)

    assert_array_equal(res, ores)
    assert_array_equal(res, orres)
    assert_array_equal(res, olres)

    # test we get the correct answer for unequal length strings
    sarr2 = np.array([s + "2" for s in string_list], dtype=dtype)
    oarr2 = np.array([s + "2" for s in string_list], dtype=o_dtype)

    res = op(sarr, sarr2)
    ores = op(oarr, oarr2)
    olres = op(oarr, sarr2)
    orres = op(sarr, oarr2)

    assert_array_equal(res, ores)
    assert_array_equal(res, olres)
    assert_array_equal(res, orres)

    res = op(sarr2, sarr)
    ores = op(oarr2, oarr)
    olres = op(oarr2, sarr)
    orres = op(sarr2, oarr)

    assert_array_equal(res, ores)
    assert_array_equal(res, olres)
    assert_array_equal(res, orres)


def test_isnan(dtype, string_list):
    if not hasattr(dtype, "na_object"):
        pytest.skip("no na support")
    sarr = np.array(string_list + [dtype.na_object], dtype=dtype)
    is_nan = isinstance(dtype.na_object, float) and np.isnan(dtype.na_object)
    bool_errors = 0
    try:
        bool(dtype.na_object)
    except TypeError:
        bool_errors = 1
    if is_nan or bool_errors:
        # isnan is only true when na_object is a NaN
        assert_array_equal(
            np.isnan(sarr),
            np.array([0] * len(string_list) + [1], dtype=np.bool),
        )
    else:
        assert not np.any(np.isnan(sarr))


def test_pickle(dtype, string_list):
    arr = np.array(string_list, dtype=dtype)

    with tempfile.NamedTemporaryFile("wb", delete=False) as f:
        pickle.dump([arr, dtype], f)

    with open(f.name, "rb") as f:
        res = pickle.load(f)

    assert_array_equal(res[0], arr)
    assert res[1] == dtype

    os.remove(f.name)


def test_stdlib_copy(dtype, string_list):
    arr = np.array(string_list, dtype=dtype)

    assert_array_equal(copy.copy(arr), arr)
    assert_array_equal(copy.deepcopy(arr), arr)


@pytest.mark.parametrize(
    "strings",
    [
        ["left", "right", "leftovers", "righty", "up", "down"],
        [
            "left" * 10,
            "right" * 10,
            "leftovers" * 10,
            "righty" * 10,
            "up" * 10,
        ],
        ["🤣🤣", "🤣", "📵", "😰"],
        ["🚜", "🙃", "😾"],
        ["😹", "🚠", "🚌"],
        ["A¢☃€ 😊", " A☃€¢😊", "☃€😊 A¢", "😊☃A¢ €"],
    ],
)
def test_sort(dtype, strings):
    """Test that sorting matches python's internal sorting."""

    def test_sort(strings, arr_sorted):
        arr = np.array(strings, dtype=dtype)
        na_object = getattr(arr.dtype, "na_object", "")
        if na_object is None and None in strings:
            with pytest.raises(
                ValueError,
                match="Cannot compare null that is not a nan-like value",
            ):
                np.argsort(arr)
            argsorted = None
        elif na_object is pd_NA or na_object != '':
            argsorted = None
        else:
            argsorted = np.argsort(arr)
        np.random.default_rng().shuffle(arr)
        if na_object is None and None in strings:
            with pytest.raises(
                ValueError,
                match="Cannot compare null that is not a nan-like value",
            ):
                arr.sort()
        else:
            arr.sort()
            assert np.array_equal(arr, arr_sorted, equal_nan=True)
        if argsorted is not None:
            assert np.array_equal(argsorted, np.argsort(strings))

    # make a copy so we don't mutate the lists in the fixture
    strings = strings.copy()
    arr_sorted = np.array(sorted(strings), dtype=dtype)
    test_sort(strings, arr_sorted)

    if not hasattr(dtype, "na_object"):
        return

    # make sure NAs get sorted to the end of the array and string NAs get
    # sorted like normal strings
    strings.insert(0, dtype.na_object)
    strings.insert(2, dtype.na_object)
    # can't use append because doing that with NA converts
    # the result to object dtype
    if not isinstance(dtype.na_object, str):
        arr_sorted = np.array(
            arr_sorted.tolist() + [dtype.na_object, dtype.na_object],
            dtype=dtype,
        )
    else:
        arr_sorted = np.array(sorted(strings), dtype=dtype)

    test_sort(strings, arr_sorted)


@pytest.mark.parametrize(
    "strings",
    [
        ["A¢☃€ 😊", " A☃€¢😊", "☃€😊 A¢", "😊☃A¢ €"],
        ["A¢☃€ 😊", "", " ", " "],
        ["", "a", "😸", "ááðfáíóåéë"],
    ],
)
def test_nonzero(strings, na_object):
    dtype = get_dtype(na_object)
    arr = np.array(strings, dtype=dtype)
    is_nonzero = np.array(
        [i for i, item in enumerate(strings) if len(item) != 0])
    assert_array_equal(arr.nonzero()[0], is_nonzero)

    if na_object is not pd_NA and na_object == 'unset':
        return

    strings_with_na = np.array(strings + [na_object], dtype=dtype)
    is_nan = np.isnan(np.array([dtype.na_object], dtype=dtype))[0]

    if is_nan:
        assert strings_with_na.nonzero()[0][-1] == 4
    else:
        assert strings_with_na.nonzero()[0][-1] == 3

    # check that the casting to bool and nonzero give consistent results
    assert_array_equal(strings_with_na[strings_with_na.nonzero()],
                       strings_with_na[strings_with_na.astype(bool)])


def test_where(string_list, na_object):
    dtype = get_dtype(na_object)
    a = np.array(string_list, dtype=dtype)
    b = a[::-1]
    res = np.where([True, False, True, False, True, False], a, b)
    assert_array_equal(res, [a[0], b[1], a[2], b[3], a[4], b[5]])


def test_fancy_indexing(string_list):
    sarr = np.array(string_list, dtype="T")
    assert_array_equal(sarr, sarr[np.arange(sarr.shape[0])])

    inds = [
        [True, True],
        [0, 1],
        ...,
        np.array([0, 1], dtype='uint8'),
    ]

    lops = [
        ['a' * 25, 'b' * 25],
        ['', ''],
        ['hello', 'world'],
        ['hello', 'world' * 25],
    ]

    # see gh-27003 and gh-27053
    for ind in inds:
        for lop in lops:
            a = np.array(lop, dtype="T")
            assert_array_equal(a[ind], a)
            rop = ['d' * 25, 'e' * 25]
            for b in [rop, np.array(rop, dtype="T")]:
                a[ind] = b
                assert_array_equal(a, b)
                assert a[0] == 'd' * 25

    # see gh-29279
    data = [
        ["AAAAAAAAAAAAAAAAA"],
        ["BBBBBBBBBBBBBBBBBBBBBBBBBBBBB"],
        ["CCCCCCCCCCCCCCCCC"],
        ["DDDDDDDDDDDDDDDDD"],
    ]
    sarr = np.array(data, dtype=np.dtypes.StringDType())
    uarr = np.array(data, dtype="U30")
    for ind in [[0], [1], [2], [3], [[0, 0]], [[1, 1, 3]], [[1, 1]]]:
        assert_array_equal(sarr[ind], uarr[ind])


def test_flatiter_indexing():
    # see gh-29659
    arr = np.array(['hello', 'world'], dtype='T')
    arr.flat[:] = 9223372036854775
    assert_array_equal(arr, np.array([9223372036854775] * 2, dtype='T'))


def test_creation_functions():
    assert_array_equal(np.zeros(3, dtype="T"), ["", "", ""])
    assert_array_equal(np.empty(3, dtype="T"), ["", "", ""])

    assert np.zeros(3, dtype="T")[0] == ""
    assert np.empty(3, dtype="T")[0] == ""


def test_concatenate(string_list):
    sarr = np.array(string_list, dtype="T")
    sarr_cat = np.array(string_list + string_list, dtype="T")

    assert_array_equal(np.concatenate([sarr], axis=0), sarr)


def test_resize_method(string_list):
    sarr = np.array(string_list, dtype="T")
    if IS_PYPY:
        sarr.resize(len(string_list) + 3, refcheck=False)
    else:
        sarr.resize(len(string_list) + 3)
    assert_array_equal(sarr, np.array(string_list + [''] * 3,  dtype="T"))


def test_create_with_copy_none(string_list):
    arr = np.array(string_list, dtype=StringDType())
    # create another stringdtype array with an arena that has a different
    # in-memory layout than the first array
    arr_rev = np.array(string_list[::-1], dtype=StringDType())

    # this should create a copy and the resulting array
    # shouldn't share an allocator or arena with arr_rev, despite
    # explicitly passing arr_rev.dtype
    arr_copy = np.array(arr, copy=None, dtype=arr_rev.dtype)
    np.testing.assert_array_equal(arr, arr_copy)
    assert arr_copy.base is None

    with pytest.raises(ValueError, match="Unable to avoid copy"):
        np.array(arr, copy=False, dtype=arr_rev.dtype)

    # because we're using arr's dtype instance, the view is safe
    arr_view = np.array(arr, copy=None, dtype=arr.dtype)
    np.testing.assert_array_equal(arr, arr)
    np.testing.assert_array_equal(arr_view[::-1], arr_rev)
    assert arr_view is arr


def test_astype_copy_false():
    orig_dt = StringDType()
    arr = np.array(["hello", "world"], dtype=StringDType())
    assert not arr.astype(StringDType(coerce=False), copy=False).dtype.coerce

    assert arr.astype(orig_dt, copy=False).dtype is orig_dt

@pytest.mark.parametrize(
    "strings",
    [
        ["left", "right", "leftovers", "righty", "up", "down"],
        ["🤣🤣", "🤣", "📵", "😰"],
        ["🚜", "🙃", "😾"],
        ["😹", "🚠", "🚌"],
        ["A¢☃€ 😊", " A☃€¢😊", "☃€😊 A¢", "😊☃A¢ €"],
    ],
)
def test_argmax(strings):
    """Test that argmax/argmin matches what python calculates."""
    arr = np.array(strings, dtype="T")
    assert np.argmax(arr) == strings.index(max(strings))
    assert np.argmin(arr) == strings.index(min(strings))


@pytest.mark.parametrize(
    "arrfunc,expected",
    [
        [np.sort, None],
        [np.nonzero, (np.array([], dtype=np.int_),)],
        [np.argmax, 0],
        [np.argmin, 0],
    ],
)
def test_arrfuncs_zeros(arrfunc, expected):
    arr = np.zeros(10, dtype="T")
    result = arrfunc(arr)
    if expected is None:
        expected = arr
    assert_array_equal(result, expected, strict=True)


@pytest.mark.parametrize(
    ("strings", "cast_answer", "any_answer", "all_answer"),
    [
        [["hello", "world"], [True, True], True, True],
        [["", ""], [False, False], False, False],
        [["hello", ""], [True, False], True, False],
        [["", "world"], [False, True], True, False],
    ],
)
def test_cast_to_bool(strings, cast_answer, any_answer, all_answer):
    sarr = np.array(strings, dtype="T")
    assert_array_equal(sarr.astype("bool"), cast_answer)

    assert np.any(sarr) == any_answer
    assert np.all(sarr) == all_answer


@pytest.mark.parametrize(
    ("strings", "cast_answer"),
    [
        [[True, True], ["True", "True"]],
        [[False, False], ["False", "False"]],
        [[True, False], ["True", "False"]],
        [[False, True], ["False", "True"]],
    ],
)
def test_cast_from_bool(strings, cast_answer):
    barr = np.array(strings, dtype=bool)
    assert_array_equal(barr.astype("T"), np.array(cast_answer, dtype="T"))


@pytest.mark.parametrize("bitsize", [8, 16, 32, 64])
@pytest.mark.parametrize("signed", [True, False])
def test_sized_integer_casts(bitsize, signed):
    idtype = f"int{bitsize}"
    if signed:
        inp = [-(2**p - 1) for p in reversed(range(bitsize - 1))]
        inp += [2**p - 1 for p in range(1, bitsize - 1)]
    else:
        idtype = "u" + idtype
        inp = [2**p - 1 for p in range(bitsize)]
    ainp = np.array(inp, dtype=idtype)
    assert_array_equal(ainp, ainp.astype("T").astype(idtype))

    # safe casting works
    ainp.astype("T", casting="safe")

    with pytest.raises(TypeError):
        ainp.astype("T").astype(idtype, casting="safe")

    oob = [str(2**bitsize), str(-(2**bitsize))]
    with pytest.raises(OverflowError):
        np.array(oob, dtype="T").astype(idtype)

    with pytest.raises(ValueError):
        np.array(["1", np.nan, "3"],
                 dtype=StringDType(na_object=np.nan)).astype(idtype)


@pytest.mark.parametrize("typename", ["byte", "short", "int", "longlong"])
@pytest.mark.parametrize("signed", ["", "u"])
def test_unsized_integer_casts(typename, signed):
    idtype = f"{signed}{typename}"

    inp = [1, 2, 3, 4]
    ainp = np.array(inp, dtype=idtype)
    assert_array_equal(ainp, ainp.astype("T").astype(idtype))


@pytest.mark.parametrize(
    "typename",
    [
        pytest.param(
            "longdouble",
            marks=pytest.mark.xfail(
                np.dtypes.LongDoubleDType() != np.dtypes.Float64DType(),
                reason="numpy lacks an ld2a implementation",
                strict=True,
            ),
        ),
        "float64",
        "float32",
        "float16",
    ],
)
def test_float_casts(typename):
    inp = [1.1, 2.8, -3.2, 2.7e4]
    ainp = np.array(inp, dtype=typename)
    assert_array_equal(ainp, ainp.astype("T").astype(typename))

    inp = [0.1]
    sres = np.array(inp, dtype=typename).astype("T")
    res = sres.astype(typename)
    assert_array_equal(np.array(inp, dtype=typename), res)
    assert sres[0] == "0.1"

    if typename == "longdouble":
        # let's not worry about platform-dependent rounding of longdouble
        return

    fi = np.finfo(typename)

    inp = [1e-324, fi.smallest_subnormal, -1e-324, -fi.smallest_subnormal]
    eres = [0, fi.smallest_subnormal, -0, -fi.smallest_subnormal]
    res = np.array(inp, dtype=typename).astype("T").astype(typename)
    assert_array_equal(eres, res)

    inp = [2e308, fi.max, -2e308, fi.min]
    eres = [np.inf, fi.max, -np.inf, fi.min]
    res = np.array(inp, dtype=typename).astype("T").astype(typename)
    assert_array_equal(eres, res)


def test_float_nan_cast_na_object():
    # gh-28157
    dt = np.dtypes.StringDType(na_object=np.nan)
    arr1 = np.full((1,), fill_value=np.nan, dtype=dt)
    arr2 = np.full_like(arr1, fill_value=np.nan)

    assert arr1.item() is np.nan
    assert arr2.item() is np.nan

    inp = [1.2, 2.3, np.nan]
    arr = np.array(inp).astype(dt)
    assert arr[2] is np.nan
    assert arr[0] == '1.2'


@pytest.mark.parametrize(
    "typename",
    [
        "csingle",
        "cdouble",
        pytest.param(
            "clongdouble",
            marks=pytest.mark.xfail(
                np.dtypes.CLongDoubleDType() != np.dtypes.Complex128DType(),
                reason="numpy lacks an ld2a implementation",
                strict=True,
            ),
        ),
    ],
)
def test_cfloat_casts(typename):
    inp = [1.1 + 1.1j, 2.8 + 2.8j, -3.2 - 3.2j, 2.7e4 + 2.7e4j]
    ainp = np.array(inp, dtype=typename)
    assert_array_equal(ainp, ainp.astype("T").astype(typename))

    inp = [0.1 + 0.1j]
    sres = np.array(inp, dtype=typename).astype("T")
    res = sres.astype(typename)
    assert_array_equal(np.array(inp, dtype=typename), res)
    assert sres[0] == "(0.1+0.1j)"


def test_take(string_list):
    sarr = np.array(string_list, dtype="T")
    res = sarr.take(np.arange(len(string_list)))
    assert_array_equal(sarr, res)

    # make sure it also works for out
    out = np.empty(len(string_list), dtype="T")
    out[0] = "hello"
    res = sarr.take(np.arange(len(string_list)), out=out)
    assert res is out
    assert_array_equal(sarr, res)


@pytest.mark.parametrize("use_out", [True, False])
@pytest.mark.parametrize(
    "ufunc_name,func",
    [
        ("min", min),
        ("max", max),
    ],
)
def test_ufuncs_minmax(string_list, ufunc_name, func, use_out):
    """Test that the min/max ufuncs match Python builtin min/max behavior."""
    arr = np.array(string_list, dtype="T")
    uarr = np.array(string_list, dtype=str)
    res = np.array(func(string_list), dtype="T")
    assert_array_equal(getattr(arr, ufunc_name)(), res)

    ufunc = getattr(np, ufunc_name + "imum")

    if use_out:
        res = ufunc(arr, arr, out=arr)
    else:
        res = ufunc(arr, arr)

    assert_array_equal(uarr, res)
    assert_array_equal(getattr(arr, ufunc_name)(), func(string_list))


def test_max_regression():
    arr = np.array(['y', 'y', 'z'], dtype="T")
    assert arr.max() == 'z'


@pytest.mark.parametrize("use_out", [True, False])
@pytest.mark.parametrize(
    "other_strings",
    [
        ["abc", "def" * 500, "ghi" * 16, "🤣" * 100, "📵", "😰"],
        ["🚜", "🙃", "😾", "😹", "🚠", "🚌"],
        ["🥦", "¨", "⨯", "∰ ", "⨌ ", "⎶ "],
    ],
)
def test_ufunc_add(dtype, string_list, other_strings, use_out):
    arr1 = np.array(string_list, dtype=dtype)
    arr2 = np.array(other_strings, dtype=dtype)
    result = np.array([a + b for a, b in zip(arr1, arr2)], dtype=dtype)

    if use_out:
        res = np.add(arr1, arr2, out=arr1)
    else:
        res = np.add(arr1, arr2)

    assert_array_equal(res, result)

    if not hasattr(dtype, "na_object"):
        return

    is_nan = isinstance(dtype.na_object, float) and np.isnan(dtype.na_object)
    is_str = isinstance(dtype.na_object, str)
    bool_errors = 0
    try:
        bool(dtype.na_object)
    except TypeError:
        bool_errors = 1

    arr1 = np.array([dtype.na_object] + string_list, dtype=dtype)
    arr2 = np.array(other_strings + [dtype.na_object], dtype=dtype)

    if is_nan or bool_errors or is_str:
        res = np.add(arr1, arr2)
        assert_array_equal(res[1:-1], arr1[1:-1] + arr2[1:-1])
        if not is_str:
            assert res[0] is dtype.na_object and res[-1] is dtype.na_object
        else:
            assert res[0] == dtype.na_object + arr2[0]
            assert res[-1] == arr1[-1] + dtype.na_object
    else:
        with pytest.raises(ValueError):
            np.add(arr1, arr2)


def test_ufunc_add_reduce(dtype):
    values = ["a", "this is a long string", "c"]
    arr = np.array(values, dtype=dtype)
    out = np.empty((), dtype=dtype)

    expected = np.array("".join(values), dtype=dtype)
    assert_array_equal(np.add.reduce(arr), expected)

    np.add.reduce(arr, out=out)
    assert_array_equal(out, expected)


def test_add_promoter(string_list):
    arr = np.array(string_list, dtype=StringDType())
    lresult = np.array(["hello" + s for s in string_list], dtype=StringDType())
    rresult = np.array([s + "hello" for s in string_list], dtype=StringDType())

    for op in ["hello", np.str_("hello"), np.array(["hello"])]:
        assert_array_equal(op + arr, lresult)
        assert_array_equal(arr + op, rresult)

    # The promoter should be able to handle things if users pass `dtype=`
    res = np.add("hello", string_list, dtype=StringDType)
    assert res.dtype == StringDType()

    # The promoter should not kick in if users override the input,
    # which means arr is cast, this fails because of the unknown length.
    with pytest.raises(TypeError, match="cannot cast dtype"):
        np.add(arr, "add", signature=("U", "U", None), casting="unsafe")

    # But it must simply reject the following:
    with pytest.raises(TypeError, match=".*did not contain a loop"):
        np.add(arr, "add", signature=(None, "U", None))

    with pytest.raises(TypeError, match=".*did not contain a loop"):
        np.add("a", "b", signature=("U", "U", StringDType))


def test_add_no_legacy_promote_with_signature():
    # Possibly misplaced, but useful to test with string DType.  We check that
    # if there is clearly no loop found, a stray `dtype=` doesn't break things
    # Regression test for the bad error in gh-26735
    # (If legacy promotion is gone, this can be deleted...)
    with pytest.raises(TypeError, match=".*did not contain a loop"):
        np.add("3", 6, dtype=StringDType)


def test_add_promoter_reduce():
    # Exact TypeError could change, but ensure StringDtype doesn't match
    with pytest.raises(TypeError, match="the resolved dtypes are not"):
        np.add.reduce(np.array(["a", "b"], dtype="U"))

    # On the other hand, using `dtype=T` in the *ufunc* should work.
    np.add.reduce(np.array(["a", "b"], dtype="U"), dtype=np.dtypes.StringDType)


def test_multiply_reduce():
    # At the time of writing (NumPy 2.0) this is very limited (and rather
    # ridiculous anyway).  But it works and actually makes some sense...
    # (NumPy does not allow non-scalar initial values)
    repeats = np.array([2, 3, 4])
    val = "school-🚌"
    res = np.multiply.reduce(repeats, initial=val, dtype=np.dtypes.StringDType)
    assert res == val * np.prod(repeats)


def test_multiply_two_string_raises():
    arr = np.array(["hello", "world"], dtype="T")
    with pytest.raises(np._core._exceptions._UFuncNoLoopError):
        np.multiply(arr, arr)


@pytest.mark.parametrize("use_out", [True, False])
@pytest.mark.parametrize("other", [2, [2, 1, 3, 4, 1, 3]])
@pytest.mark.parametrize(
    "other_dtype",
    [
        None,
        "int8",
        "int16",
        "int32",
        "int64",
        "uint8",
        "uint16",
        "uint32",
        "uint64",
        "short",
        "int",
        "intp",
        "long",
        "longlong",
        "ushort",
        "uint",
        "uintp",
        "ulong",
        "ulonglong",
    ],
)
def test_ufunc_multiply(dtype, string_list, other, other_dtype, use_out):
    """Test the two-argument ufuncs match python builtin behavior."""
    arr = np.array(string_list, dtype=dtype)
    if other_dtype is not None:
        other_dtype = np.dtype(other_dtype)
    try:
        len(other)
        result = [s * o for s, o in zip(string_list, other)]
        other = np.array(other)
        if other_dtype is not None:
            other = other.astype(other_dtype)
    except TypeError:
        if other_dtype is not None:
            other = other_dtype.type(other)
        result = [s * other for s in string_list]

    if use_out:
        arr_cache = arr.copy()
        lres = np.multiply(arr, other, out=arr)
        assert_array_equal(lres, result)
        arr[:] = arr_cache
        assert lres is arr
        arr *= other
        assert_array_equal(arr, result)
        arr[:] = arr_cache
        rres = np.multiply(other, arr, out=arr)
        assert rres is arr
        assert_array_equal(rres, result)
    else:
        lres = arr * other
        assert_array_equal(lres, result)
        rres = other * arr
        assert_array_equal(rres, result)

    if not hasattr(dtype, "na_object"):
        return

    is_nan = np.isnan(np.array([dtype.na_object], dtype=dtype))[0]
    is_str = isinstance(dtype.na_object, str)
    bool_errors = 0
    try:
        bool(dtype.na_object)
    except TypeError:
        bool_errors = 1

    arr = np.array(string_list + [dtype.na_object], dtype=dtype)

    try:
        len(other)
        other = np.append(other, 3)
        if other_dtype is not None:
            other = other.astype(other_dtype)
    except TypeError:
        pass

    if is_nan or bool_errors or is_str:
        for res in [arr * other, other * arr]:
            assert_array_equal(res[:-1], result)
            if not is_str:
                assert res[-1] is dtype.na_object
            else:
                try:
                    assert res[-1] == dtype.na_object * other[-1]
                except (IndexError, TypeError):
                    assert res[-1] == dtype.na_object * other
    else:
        with pytest.raises(TypeError):
            arr * other
        with pytest.raises(TypeError):
            other * arr


def test_findlike_promoters():
    r = "Wally"
    l = "Where's Wally?"
    s = np.int32(3)
    e = np.int8(13)
    for dtypes in [("T", "U"), ("U", "T")]:
        for function, answer in [
            (np.strings.index, 8),
            (np.strings.endswith, True),
        ]:
            assert answer == function(
                np.array(l, dtype=dtypes[0]), np.array(r, dtype=dtypes[1]), s, e
            )


def test_strip_promoter():
    arg = ["Hello!!!!", "Hello??!!"]
    strip_char = "!"
    answer = ["Hello", "Hello??"]
    for dtypes in [("T", "U"), ("U", "T")]:
        result = np.strings.strip(
            np.array(arg, dtype=dtypes[0]),
            np.array(strip_char, dtype=dtypes[1])
        )
        assert_array_equal(result, answer)
        assert result.dtype.char == "T"


def test_replace_promoter():
    arg = ["Hello, planet!", "planet, Hello!"]
    old = "planet"
    new = "world"
    answer = ["Hello, world!", "world, Hello!"]
    for dtypes in itertools.product("TU", repeat=3):
        if dtypes == ("U", "U", "U"):
            continue
        answer_arr = np.strings.replace(
            np.array(arg, dtype=dtypes[0]),
            np.array(old, dtype=dtypes[1]),
            np.array(new, dtype=dtypes[2]),
        )
        assert_array_equal(answer_arr, answer)
        assert answer_arr.dtype.char == "T"


def test_center_promoter():
    arg = ["Hello", "planet!"]
    fillchar = "/"
    for dtypes in [("T", "U"), ("U", "T")]:
        answer = np.strings.center(
            np.array(arg, dtype=dtypes[0]), 9, np.array(fillchar, dtype=dtypes[1])
        )
        assert_array_equal(answer, ["//Hello//", "/planet!/"])
        assert answer.dtype.char == "T"


DATETIME_INPUT = [
    np.datetime64("1923-04-14T12:43:12"),
    np.datetime64("1994-06-21T14:43:15"),
    np.datetime64("2001-10-15T04:10:32"),
    np.datetime64("NaT"),
    np.datetime64("1995-11-25T16:02:16"),
    np.datetime64("2005-01-04T03:14:12"),
    np.datetime64("2041-12-03T14:05:03"),
]


TIMEDELTA_INPUT = [
    np.timedelta64(12358, "s"),
    np.timedelta64(23, "s"),
    np.timedelta64(74, "s"),
    np.timedelta64("NaT"),
    np.timedelta64(23, "s"),
    np.timedelta64(73, "s"),
    np.timedelta64(7, "s"),
]


@pytest.mark.parametrize(
    "input_data, input_dtype",
    [
        (DATETIME_INPUT, "M8[s]"),
        (TIMEDELTA_INPUT, "m8[s]")
    ]
)
def test_datetime_timedelta_cast(dtype, input_data, input_dtype):

    a = np.array(input_data, dtype=input_dtype)

    has_na = hasattr(dtype, "na_object")
    is_str = isinstance(getattr(dtype, "na_object", None), str)

    if not has_na or is_str:
        a = np.delete(a, 3)

    sa = a.astype(dtype)
    ra = sa.astype(a.dtype)

    if has_na and not is_str:
        assert sa[3] is dtype.na_object
        assert np.isnat(ra[3])

    assert_array_equal(a, ra)

    if has_na and not is_str:
        # don't worry about comparing how NaT is converted
        sa = np.delete(sa, 3)
        a = np.delete(a, 3)

    if input_dtype.startswith("M"):
        assert_array_equal(sa, a.astype("U"))
    else:
        # The timedelta to unicode cast produces strings
        # that aren't round-trippable and we don't want to
        # reproduce that behavior in stringdtype
        assert_array_equal(sa, a.astype("int64").astype("U"))


def test_nat_casts():
    s = 'nat'
    all_nats = itertools.product(*zip(s.upper(), s.lower()))
    all_nats = list(map(''.join, all_nats))
    NaT_dt = np.datetime64('NaT')
    NaT_td = np.timedelta64('NaT')
    for na_object in [np._NoValue, None, np.nan, 'nat', '']:
        # numpy treats empty string and all case combinations of 'nat' as NaT
        dtype = StringDType(na_object=na_object)
        arr = np.array([''] + all_nats, dtype=dtype)
        dt_array = arr.astype('M8[s]')
        td_array = arr.astype('m8[s]')
        assert_array_equal(dt_array, NaT_dt)
        assert_array_equal(td_array, NaT_td)

        if na_object is np._NoValue:
            output_object = 'NaT'
        else:
            output_object = na_object

        for arr in [dt_array, td_array]:
            assert_array_equal(
                arr.astype(dtype),
                np.array([output_object] * arr.size, dtype=dtype))


def test_nat_conversion():
    for nat in [np.datetime64("NaT", "s"), np.timedelta64("NaT", "s")]:
        with pytest.raises(ValueError, match="string coercion is disabled"):
            np.array(["a", nat], dtype=StringDType(coerce=False))


def test_growing_strings(dtype):
    # growing a string leads to a heap allocation, this tests to make sure
    # we do that bookkeeping correctly for all possible starting cases
    data = [
        "hello",  # a short string
        "abcdefghijklmnopqestuvwxyz",  # a medium heap-allocated string
        "hello" * 200,  # a long heap-allocated string
    ]

    arr = np.array(data, dtype=dtype)
    uarr = np.array(data, dtype=str)

    for _ in range(5):
        arr = arr + arr
        uarr = uarr + uarr

    assert_array_equal(arr, uarr)


def test_assign_medium_strings():
    # see gh-29261
    N = 9
    src = np.array(
        (
            ['0' * 256] * 3 + ['0' * 255] + ['0' * 256] + ['0' * 255] +
            ['0' * 256] * 2 + ['0' * 255]
        ), dtype='T')
    dst = np.array(
        (
            ['0' * 255] + ['0' * 256] * 2 + ['0' * 255] + ['0' * 256] +
            ['0' * 255] + [''] * 5
        ), dtype='T')

    dst[1:N + 1] = src
    assert_array_equal(dst[1:N + 1], src)


UFUNC_TEST_DATA = [
    "hello" * 10,
    "Ae¢☃€ 😊" * 20,
    "entry\nwith\nnewlines",
    "entry\twith\ttabs",
]


@pytest.fixture
def string_array(dtype):
    return np.array(UFUNC_TEST_DATA, dtype=dtype)


@pytest.fixture
def unicode_array():
    return np.array(UFUNC_TEST_DATA, dtype=np.str_)


NAN_PRESERVING_FUNCTIONS = [
    "capitalize",
    "expandtabs",
    "lower",
    "lstrip",
    "rstrip",
    "splitlines",
    "strip",
    "swapcase",
    "title",
    "upper",
]

BOOL_OUTPUT_FUNCTIONS = [
    "isalnum",
    "isalpha",
    "isdigit",
    "islower",
    "isspace",
    "istitle",
    "isupper",
    "isnumeric",
    "isdecimal",
]

UNARY_FUNCTIONS = [
    "str_len",
    "capitalize",
    "expandtabs",
    "isalnum",
    "isalpha",
    "isdigit",
    "islower",
    "isspace",
    "istitle",
    "isupper",
    "lower",
    "lstrip",
    "rstrip",
    "splitlines",
    "strip",
    "swapcase",
    "title",
    "upper",
    "isnumeric",
    "isdecimal",
    "isalnum",
    "islower",
    "istitle",
    "isupper",
]

UNIMPLEMENTED_VEC_STRING_FUNCTIONS = [
    "capitalize",
    "expandtabs",
    "lower",
    "splitlines",
    "swapcase",
    "title",
    "upper",
]

ONLY_IN_NP_CHAR = [
    "join",
    "split",
    "rsplit",
    "splitlines"
]


@pytest.mark.parametrize("function_name", UNARY_FUNCTIONS)
def test_unary(string_array, unicode_array, function_name):
    if function_name in ONLY_IN_NP_CHAR:
        func = getattr(np.char, function_name)
    else:
        func = getattr(np.strings, function_name)
    dtype = string_array.dtype
    sres = func(string_array)
    ures = func(unicode_array)
    if sres.dtype == StringDType():
        ures = ures.astype(StringDType())
    assert_array_equal(sres, ures)

    if not hasattr(dtype, "na_object"):
        return

    is_nan = np.isnan(np.array([dtype.na_object], dtype=dtype))[0]
    is_str = isinstance(dtype.na_object, str)
    na_arr = np.insert(string_array, 0, dtype.na_object)

    if function_name in UNIMPLEMENTED_VEC_STRING_FUNCTIONS:
        if not is_str:
            # to avoid these errors we'd need to add NA support to _vec_string
            with pytest.raises((ValueError, TypeError)):
                func(na_arr)
        elif function_name == "splitlines":
            assert func(na_arr)[0] == func(dtype.na_object)[()]
        else:
            assert func(na_arr)[0] == func(dtype.na_object)
        return
    if function_name == "str_len" and not is_str:
        # str_len always errors for any non-string null, even NA ones because
        # it has an integer result
        with pytest.raises(ValueError):
            func(na_arr)
        return
    if function_name in BOOL_OUTPUT_FUNCTIONS:
        if is_nan:
            assert func(na_arr)[0] is np.False_
        elif is_str:
            assert func(na_arr)[0] == func(dtype.na_object)
        else:
            with pytest.raises(ValueError):
                func(na_arr)
        return
    if not (is_nan or is_str):
        with pytest.raises(ValueError):
            func(na_arr)
        return
    res = func(na_arr)
    if is_nan and function_name in NAN_PRESERVING_FUNCTIONS:
        assert res[0] is dtype.na_object
    elif is_str:
        assert res[0] == func(dtype.na_object)


unicode_bug_fail = pytest.mark.xfail(
    reason="unicode output width is buggy", strict=True
)

# None means that the argument is a string array
BINARY_FUNCTIONS = [
    ("add", (None, None)),
    ("multiply", (None, 2)),
    ("mod", ("format: %s", None)),
    ("center", (None, 25)),
    ("count", (None, "A")),
    ("encode", (None, "UTF-8")),
    ("endswith", (None, "lo")),
    ("find", (None, "A")),
    ("index", (None, "e")),
    ("join", ("-", None)),
    ("ljust", (None, 12)),
    ("lstrip", (None, "A")),
    ("partition", (None, "A")),
    ("replace", (None, "A", "B")),
    ("rfind", (None, "A")),
    ("rindex", (None, "e")),
    ("rjust", (None, 12)),
    ("rsplit", (None, "A")),
    ("rstrip", (None, "A")),
    ("rpartition", (None, "A")),
    ("split", (None, "A")),
    ("strip", (None, "A")),
    ("startswith", (None, "A")),
    ("zfill", (None, 12)),
]

PASSES_THROUGH_NAN_NULLS = [
    "add",
    "center",
    "ljust",
    "multiply",
    "replace",
    "rjust",
    "strip",
    "lstrip",
    "rstrip",
    "replace"
    "zfill",
]

NULLS_ARE_FALSEY = [
    "startswith",
    "endswith",
]

NULLS_ALWAYS_ERROR = [
    "count",
    "find",
    "rfind",
]

SUPPORTS_NULLS = (
    PASSES_THROUGH_NAN_NULLS +
    NULLS_ARE_FALSEY +
    NULLS_ALWAYS_ERROR
)


def call_func(func, args, array, sanitize=True):
    if args == (None, None):
        return func(array, array)
    if args[0] is None:
        if sanitize:
            san_args = tuple(
                np.array(arg, dtype=array.dtype) if isinstance(arg, str) else
                arg for arg in args[1:]
            )
        else:
            san_args = args[1:]
        return func(array, *san_args)
    if args[1] is None:
        return func(args[0], array)
    # shouldn't ever happen
    assert 0


@pytest.mark.parametrize("function_name, args", BINARY_FUNCTIONS)
def test_binary(string_array, unicode_array, function_name, args):
    if function_name in ONLY_IN_NP_CHAR:
        func = getattr(np.char, function_name)
    else:
        func = getattr(np.strings, function_name)
    sres = call_func(func, args, string_array)
    ures = call_func(func, args, unicode_array, sanitize=False)
    if not isinstance(sres, tuple) and sres.dtype == StringDType():
        ures = ures.astype(StringDType())
    assert_array_equal(sres, ures)

    dtype = string_array.dtype
    if function_name not in SUPPORTS_NULLS or not hasattr(dtype, "na_object"):
        return

    na_arr = np.insert(string_array, 0, dtype.na_object)
    is_nan = np.isnan(np.array([dtype.na_object], dtype=dtype))[0]
    is_str = isinstance(dtype.na_object, str)
    should_error = not (is_nan or is_str)

    if (
        (function_name in NULLS_ALWAYS_ERROR and not is_str)
        or (function_name in PASSES_THROUGH_NAN_NULLS and should_error)
        or (function_name in NULLS_ARE_FALSEY and should_error)
    ):
        with pytest.raises((ValueError, TypeError)):
            call_func(func, args, na_arr)
        return

    res = call_func(func, args, na_arr)

    if is_str:
        assert res[0] == call_func(func, args, na_arr[:1])
    elif function_name in NULLS_ARE_FALSEY:
        assert res[0] is np.False_
    elif function_name in PASSES_THROUGH_NAN_NULLS:
        assert res[0] is dtype.na_object
    else:
        # shouldn't ever get here
        assert 0


@pytest.mark.parametrize("function, expected", [
    (np.strings.find, [[2, -1], [1, -1]]),
    (np.strings.startswith, [[False, False], [True, False]])])
@pytest.mark.parametrize("start, stop", [
    (1, 4),
    (np.int8(1), np.int8(4)),
    (np.array([1, 1], dtype='u2'), np.array([4, 4], dtype='u2'))])
def test_non_default_start_stop(function, start, stop, expected):
    a = np.array([["--🐍--", "--🦜--"],
                  ["-🐍---", "-🦜---"]], "T")
    indx = function(a, "🐍", start, stop)
    assert_array_equal(indx, expected)


@pytest.mark.parametrize("count", [2, np.int8(2), np.array([2, 2], 'u2')])
def test_replace_non_default_repeat(count):
    a = np.array(["🐍--", "🦜-🦜-"], "T")
    result = np.strings.replace(a, "🦜-", "🦜†", count)
    assert_array_equal(result, np.array(["🐍--", "🦜†🦜†"], "T"))


def test_strip_ljust_rjust_consistency(string_array, unicode_array):
    rjs = np.char.rjust(string_array, 1000)
    rju = np.char.rjust(unicode_array, 1000)

    ljs = np.char.ljust(string_array, 1000)
    lju = np.char.ljust(unicode_array, 1000)

    assert_array_equal(
        np.char.lstrip(rjs),
        np.char.lstrip(rju).astype(StringDType()),
    )

    assert_array_equal(
        np.char.rstrip(ljs),
        np.char.rstrip(lju).astype(StringDType()),
    )

    assert_array_equal(
        np.char.strip(ljs),
        np.char.strip(lju).astype(StringDType()),
    )

    assert_array_equal(
        np.char.strip(rjs),
        np.char.strip(rju).astype(StringDType()),
    )


def test_unset_na_coercion():
    # a dtype instance with an unset na object is compatible
    # with a dtype that has one set

    # this test uses the "add" and "equal" ufunc but all ufuncs that
    # accept more than one string argument and produce a string should
    # behave this way
    # TODO: generalize to more ufuncs
    inp = ["hello", "world"]
    arr = np.array(inp, dtype=StringDType(na_object=None))
    for op_dtype in [None, StringDType(), StringDType(coerce=False),
                     StringDType(na_object=None)]:
        if op_dtype is None:
            op = "2"
        else:
            op = np.array("2", dtype=op_dtype)
        res = arr + op
        assert_array_equal(res, ["hello2", "world2"])

    # dtype instances with distinct explicitly set NA objects are incompatible
    for op_dtype in [StringDType(na_object=pd_NA), StringDType(na_object="")]:
        op = np.array("2", dtype=op_dtype)
        with pytest.raises(TypeError):
            arr + op

    # comparisons only consider the na_object
    for op_dtype in [None, StringDType(), StringDType(coerce=True),
                     StringDType(na_object=None)]:
        if op_dtype is None:
            op = inp
        else:
            op = np.array(inp, dtype=op_dtype)
        assert_array_equal(arr, op)

    for op_dtype in [StringDType(na_object=pd_NA),
                     StringDType(na_object=np.nan)]:
        op = np.array(inp, dtype=op_dtype)
        with pytest.raises(TypeError):
            arr == op


def test_repeat(string_array):
    res = string_array.repeat(1000)
    # Create an empty array with expanded dimension, and fill it.  Then,
    # reshape it to the expected result.
    expected = np.empty_like(string_array, shape=string_array.shape + (1000,))
    expected[...] = string_array[:, np.newaxis]
    expected = expected.reshape(-1)

    assert_array_equal(res, expected, strict=True)


@pytest.mark.parametrize("tile", [1, 6, (2, 5)])
def test_accumulation(string_array, tile):
    """Accumulation is odd for StringDType but tests dtypes with references.
    """
    # Fill with mostly empty strings to not create absurdly big strings
    arr = np.zeros_like(string_array, shape=(100,))
    arr[:len(string_array)] = string_array
    arr[-len(string_array):] = string_array

    # Bloat size a bit (get above thresholds and test >1 ndim).
    arr = np.tile(string_array, tile)

    res = np.add.accumulate(arr, axis=0)
    res_obj = np.add.accumulate(arr.astype(object), axis=0)
    assert_array_equal(res, res_obj.astype(arr.dtype), strict=True)

    if arr.ndim > 1:
        res = np.add.accumulate(arr, axis=-1)
        res_obj = np.add.accumulate(arr.astype(object), axis=-1)

        assert_array_equal(res, res_obj.astype(arr.dtype), strict=True)


class TestImplementation:
    """Check that strings are stored in the arena when possible.

    This tests implementation details, so should be adjusted if
    the implementation changes.
    """

    @classmethod
    def setup_class(self):
        self.MISSING = 0x80
        self.INITIALIZED = 0x40
        self.OUTSIDE_ARENA = 0x20
        self.LONG = 0x10
        self.dtype = StringDType(na_object=np.nan)
        self.sizeofstr = self.dtype.itemsize
        sp = self.dtype.itemsize // 2  # pointer size = sizeof(size_t)
        # Below, size is not strictly correct, since it really uses
        # 7 (or 3) bytes, but good enough for the tests here.
        self.view_dtype = np.dtype([
            ('offset', f'u{sp}'),
            ('size', f'u{sp // 2}'),
            ('xsiz', f'V{sp // 2 - 1}'),
            ('size_and_flags', 'u1'),
        ] if sys.byteorder == 'little' else [
            ('size_and_flags', 'u1'),
            ('xsiz', f'V{sp // 2 - 1}'),
            ('size', f'u{sp // 2}'),
            ('offset', f'u{sp}'),
        ])
        self.s_empty = ""
        self.s_short = "01234"
        self.s_medium = "abcdefghijklmnopqrstuvwxyz"
        self.s_long = "-=+" * 100
        self.a = np.array(
            [self.s_empty, self.s_short, self.s_medium, self.s_long],
            self.dtype)

    def get_view(self, a):
        # Cannot view a StringDType as anything else directly, since
        # it has references. So, we use a stride trick hack.
        from numpy.lib._stride_tricks_impl import DummyArray
        interface = dict(a.__array_interface__)
        interface['descr'] = self.view_dtype.descr
        interface['typestr'] = self.view_dtype.str
        return np.asarray(DummyArray(interface, base=a))

    def get_flags(self, a):
        return self.get_view(a)['size_and_flags'] & 0xf0

    def is_short(self, a):
        return self.get_flags(a) == self.INITIALIZED | self.OUTSIDE_ARENA

    def is_on_heap(self, a):
        return self.get_flags(a) == (self.INITIALIZED
                                     | self.OUTSIDE_ARENA
                                     | self.LONG)

    def is_missing(self, a):
        return self.get_flags(a) & self.MISSING == self.MISSING

    def in_arena(self, a):
        return (self.get_flags(a) & (self.INITIALIZED | self.OUTSIDE_ARENA)
                == self.INITIALIZED)

    def test_setup(self):
        is_short = self.is_short(self.a)
        length = np.strings.str_len(self.a)
        assert_array_equal(is_short, (length > 0) & (length <= 15))
        assert_array_equal(self.in_arena(self.a), [False, False, True, True])
        assert_array_equal(self.is_on_heap(self.a), False)
        assert_array_equal(self.is_missing(self.a), False)
        view = self.get_view(self.a)
        sizes = np.where(is_short, view['size_and_flags'] & 0xf,
                         view['size'])
        assert_array_equal(sizes, np.strings.str_len(self.a))
        assert_array_equal(view['xsiz'][2:],
                           np.void(b'\x00' * (self.sizeofstr // 4 - 1)))
        # Check that the medium string uses only 1 byte for its length
        # in the arena, while the long string takes 8 (or 4).
        offsets = view['offset']
        assert offsets[2] == 1
        assert offsets[3] == 1 + len(self.s_medium) + self.sizeofstr // 2

    def test_empty(self):
        e = np.empty((3,), self.dtype)
        assert_array_equal(self.get_flags(e), 0)
        assert_array_equal(e, "")

    def test_zeros(self):
        z = np.zeros((2,), self.dtype)
        assert_array_equal(self.get_flags(z), 0)
        assert_array_equal(z, "")

    def test_copy(self):
        for c in [self.a.copy(), copy.copy(self.a), copy.deepcopy(self.a)]:
            assert_array_equal(self.get_flags(c), self.get_flags(self.a))
            assert_array_equal(c, self.a)
            offsets = self.get_view(c)['offset']
            assert offsets[2] == 1
            assert offsets[3] == 1 + len(self.s_medium) + self.sizeofstr // 2

    def test_arena_use_with_setting(self):
        c = np.zeros_like(self.a)
        assert_array_equal(self.get_flags(c), 0)
        c[:] = self.a
        assert_array_equal(self.get_flags(c), self.get_flags(self.a))
        assert_array_equal(c, self.a)

    def test_arena_reuse_with_setting(self):
        c = self.a.copy()
        c[:] = self.a
        assert_array_equal(self.get_flags(c), self.get_flags(self.a))
        assert_array_equal(c, self.a)

    def test_arena_reuse_after_missing(self):
        c = self.a.copy()
        c[:] = np.nan
        assert np.all(self.is_missing(c))
        # Replacing with the original strings, the arena should be reused.
        c[:] = self.a
        assert_array_equal(self.get_flags(c), self.get_flags(self.a))
        assert_array_equal(c, self.a)

    def test_arena_reuse_after_empty(self):
        c = self.a.copy()
        c[:] = ""
        assert_array_equal(c, "")
        # Replacing with the original strings, the arena should be reused.
        c[:] = self.a
        assert_array_equal(self.get_flags(c), self.get_flags(self.a))
        assert_array_equal(c, self.a)

    def test_arena_reuse_for_shorter(self):
        c = self.a.copy()
        # A string slightly shorter than the shortest in the arena
        # should be used for all strings in the arena.
        c[:] = self.s_medium[:-1]
        assert_array_equal(c, self.s_medium[:-1])
        # first empty string in original was never initialized, so
        # filling it in now leaves it initialized inside the arena.
        # second string started as a short string so it can never live
        # in the arena.
        in_arena = np.array([True, False, True, True])
        assert_array_equal(self.in_arena(c), in_arena)
        # But when a short string is replaced, it will go on the heap.
        assert_array_equal(self.is_short(c), False)
        assert_array_equal(self.is_on_heap(c), ~in_arena)
        # We can put the originals back, and they'll still fit,
        # and short strings are back as short strings
        c[:] = self.a
        assert_array_equal(c, self.a)
        assert_array_equal(self.in_arena(c), in_arena)
        assert_array_equal(self.is_short(c), self.is_short(self.a))
        assert_array_equal(self.is_on_heap(c), False)

    def test_arena_reuse_if_possible(self):
        c = self.a.copy()
        # A slightly longer string will not fit in the arena for
        # the medium string, but will fit for the longer one.
        c[:] = self.s_medium + "±"
        assert_array_equal(c, self.s_medium + "±")
        in_arena_exp = np.strings.str_len(self.a) >= len(self.s_medium) + 1
        # first entry started uninitialized and empty, so filling it leaves
        # it in the arena
        in_arena_exp[0] = True
        assert not np.all(in_arena_exp == self.in_arena(self.a))
        assert_array_equal(self.in_arena(c), in_arena_exp)
        assert_array_equal(self.is_short(c), False)
        assert_array_equal(self.is_on_heap(c), ~in_arena_exp)
        # And once outside arena, it stays outside, since offset is lost.
        # But short strings are used again.
        c[:] = self.a
        is_short_exp = self.is_short(self.a)
        assert_array_equal(c, self.a)
        assert_array_equal(self.in_arena(c), in_arena_exp)
        assert_array_equal(self.is_short(c), is_short_exp)
        assert_array_equal(self.is_on_heap(c), ~in_arena_exp & ~is_short_exp)

    def test_arena_no_reuse_after_short(self):
        c = self.a.copy()
        # If we replace a string with a short string, it cannot
        # go into the arena after because the offset is lost.
        c[:] = self.s_short
        assert_array_equal(c, self.s_short)
        assert_array_equal(self.in_arena(c), False)
        c[:] = self.a
        assert_array_equal(c, self.a)
        assert_array_equal(self.in_arena(c), False)
        assert_array_equal(self.is_on_heap(c), self.in_arena(self.a))