You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
	
	
		
			332 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Python
		
	
			
		
		
	
	
			332 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Python
		
	
import numpy as np
 | 
						|
import pytest
 | 
						|
 | 
						|
from pandas._libs import groupby as libgroupby
 | 
						|
from pandas._libs.groupby import (
 | 
						|
    group_cumprod,
 | 
						|
    group_cumsum,
 | 
						|
    group_mean,
 | 
						|
    group_sum,
 | 
						|
    group_var,
 | 
						|
)
 | 
						|
 | 
						|
from pandas.core.dtypes.common import ensure_platform_int
 | 
						|
 | 
						|
from pandas import isna
 | 
						|
import pandas._testing as tm
 | 
						|
 | 
						|
 | 
						|
class GroupVarTestMixin:
 | 
						|
    def test_group_var_generic_1d(self):
 | 
						|
        prng = np.random.default_rng(2)
 | 
						|
 | 
						|
        out = (np.nan * np.ones((5, 1))).astype(self.dtype)
 | 
						|
        counts = np.zeros(5, dtype="int64")
 | 
						|
        values = 10 * prng.random((15, 1)).astype(self.dtype)
 | 
						|
        labels = np.tile(np.arange(5), (3,)).astype("intp")
 | 
						|
 | 
						|
        expected_out = (
 | 
						|
            np.squeeze(values).reshape((5, 3), order="F").std(axis=1, ddof=1) ** 2
 | 
						|
        )[:, np.newaxis]
 | 
						|
        expected_counts = counts + 3
 | 
						|
 | 
						|
        self.algo(out, counts, values, labels)
 | 
						|
        assert np.allclose(out, expected_out, self.rtol)
 | 
						|
        tm.assert_numpy_array_equal(counts, expected_counts)
 | 
						|
 | 
						|
    def test_group_var_generic_1d_flat_labels(self):
 | 
						|
        prng = np.random.default_rng(2)
 | 
						|
 | 
						|
        out = (np.nan * np.ones((1, 1))).astype(self.dtype)
 | 
						|
        counts = np.zeros(1, dtype="int64")
 | 
						|
        values = 10 * prng.random((5, 1)).astype(self.dtype)
 | 
						|
        labels = np.zeros(5, dtype="intp")
 | 
						|
 | 
						|
        expected_out = np.array([[values.std(ddof=1) ** 2]])
 | 
						|
        expected_counts = counts + 5
 | 
						|
 | 
						|
        self.algo(out, counts, values, labels)
 | 
						|
 | 
						|
        assert np.allclose(out, expected_out, self.rtol)
 | 
						|
        tm.assert_numpy_array_equal(counts, expected_counts)
 | 
						|
 | 
						|
    def test_group_var_generic_2d_all_finite(self):
 | 
						|
        prng = np.random.default_rng(2)
 | 
						|
 | 
						|
        out = (np.nan * np.ones((5, 2))).astype(self.dtype)
 | 
						|
        counts = np.zeros(5, dtype="int64")
 | 
						|
        values = 10 * prng.random((10, 2)).astype(self.dtype)
 | 
						|
        labels = np.tile(np.arange(5), (2,)).astype("intp")
 | 
						|
 | 
						|
        expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2
 | 
						|
        expected_counts = counts + 2
 | 
						|
 | 
						|
        self.algo(out, counts, values, labels)
 | 
						|
        assert np.allclose(out, expected_out, self.rtol)
 | 
						|
        tm.assert_numpy_array_equal(counts, expected_counts)
 | 
						|
 | 
						|
    def test_group_var_generic_2d_some_nan(self):
 | 
						|
        prng = np.random.default_rng(2)
 | 
						|
 | 
						|
        out = (np.nan * np.ones((5, 2))).astype(self.dtype)
 | 
						|
        counts = np.zeros(5, dtype="int64")
 | 
						|
        values = 10 * prng.random((10, 2)).astype(self.dtype)
 | 
						|
        values[:, 1] = np.nan
 | 
						|
        labels = np.tile(np.arange(5), (2,)).astype("intp")
 | 
						|
 | 
						|
        expected_out = np.vstack(
 | 
						|
            [
 | 
						|
                values[:, 0].reshape(5, 2, order="F").std(ddof=1, axis=1) ** 2,
 | 
						|
                np.nan * np.ones(5),
 | 
						|
            ]
 | 
						|
        ).T.astype(self.dtype)
 | 
						|
        expected_counts = counts + 2
 | 
						|
 | 
						|
        self.algo(out, counts, values, labels)
 | 
						|
        tm.assert_almost_equal(out, expected_out, rtol=0.5e-06)
 | 
						|
        tm.assert_numpy_array_equal(counts, expected_counts)
 | 
						|
 | 
						|
    def test_group_var_constant(self):
 | 
						|
        # Regression test from GH 10448.
 | 
						|
 | 
						|
        out = np.array([[np.nan]], dtype=self.dtype)
 | 
						|
        counts = np.array([0], dtype="int64")
 | 
						|
        values = 0.832845131556193 * np.ones((3, 1), dtype=self.dtype)
 | 
						|
        labels = np.zeros(3, dtype="intp")
 | 
						|
 | 
						|
        self.algo(out, counts, values, labels)
 | 
						|
 | 
						|
        assert counts[0] == 3
 | 
						|
        assert out[0, 0] >= 0
 | 
						|
        tm.assert_almost_equal(out[0, 0], 0.0)
 | 
						|
 | 
						|
 | 
						|
class TestGroupVarFloat64(GroupVarTestMixin):
 | 
						|
    __test__ = True
 | 
						|
 | 
						|
    algo = staticmethod(group_var)
 | 
						|
    dtype = np.float64
 | 
						|
    rtol = 1e-5
 | 
						|
 | 
						|
    def test_group_var_large_inputs(self):
 | 
						|
        prng = np.random.default_rng(2)
 | 
						|
 | 
						|
        out = np.array([[np.nan]], dtype=self.dtype)
 | 
						|
        counts = np.array([0], dtype="int64")
 | 
						|
        values = (prng.random(10**6) + 10**12).astype(self.dtype)
 | 
						|
        values.shape = (10**6, 1)
 | 
						|
        labels = np.zeros(10**6, dtype="intp")
 | 
						|
 | 
						|
        self.algo(out, counts, values, labels)
 | 
						|
 | 
						|
        assert counts[0] == 10**6
 | 
						|
        tm.assert_almost_equal(out[0, 0], 1.0 / 12, rtol=0.5e-3)
 | 
						|
 | 
						|
 | 
						|
class TestGroupVarFloat32(GroupVarTestMixin):
 | 
						|
    __test__ = True
 | 
						|
 | 
						|
    algo = staticmethod(group_var)
 | 
						|
    dtype = np.float32
 | 
						|
    rtol = 1e-2
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize("dtype", ["float32", "float64"])
 | 
						|
def test_group_ohlc(dtype):
 | 
						|
    obj = np.array(np.random.default_rng(2).standard_normal(20), dtype=dtype)
 | 
						|
 | 
						|
    bins = np.array([6, 12, 20])
 | 
						|
    out = np.zeros((3, 4), dtype)
 | 
						|
    counts = np.zeros(len(out), dtype=np.int64)
 | 
						|
    labels = ensure_platform_int(np.repeat(np.arange(3), np.diff(np.r_[0, bins])))
 | 
						|
 | 
						|
    func = libgroupby.group_ohlc
 | 
						|
    func(out, counts, obj[:, None], labels)
 | 
						|
 | 
						|
    def _ohlc(group):
 | 
						|
        if isna(group).all():
 | 
						|
            return np.repeat(np.nan, 4)
 | 
						|
        return [group[0], group.max(), group.min(), group[-1]]
 | 
						|
 | 
						|
    expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), _ohlc(obj[12:])])
 | 
						|
 | 
						|
    tm.assert_almost_equal(out, expected)
 | 
						|
    tm.assert_numpy_array_equal(counts, np.array([6, 6, 8], dtype=np.int64))
 | 
						|
 | 
						|
    obj[:6] = np.nan
 | 
						|
    func(out, counts, obj[:, None], labels)
 | 
						|
    expected[0] = np.nan
 | 
						|
    tm.assert_almost_equal(out, expected)
 | 
						|
 | 
						|
 | 
						|
def _check_cython_group_transform_cumulative(pd_op, np_op, dtype):
 | 
						|
    """
 | 
						|
    Check a group transform that executes a cumulative function.
 | 
						|
 | 
						|
    Parameters
 | 
						|
    ----------
 | 
						|
    pd_op : callable
 | 
						|
        The pandas cumulative function.
 | 
						|
    np_op : callable
 | 
						|
        The analogous one in NumPy.
 | 
						|
    dtype : type
 | 
						|
        The specified dtype of the data.
 | 
						|
    """
 | 
						|
    is_datetimelike = False
 | 
						|
 | 
						|
    data = np.array([[1], [2], [3], [4]], dtype=dtype)
 | 
						|
    answer = np.zeros_like(data)
 | 
						|
 | 
						|
    labels = np.array([0, 0, 0, 0], dtype=np.intp)
 | 
						|
    ngroups = 1
 | 
						|
    pd_op(answer, data, labels, ngroups, is_datetimelike)
 | 
						|
 | 
						|
    tm.assert_numpy_array_equal(np_op(data), answer[:, 0], check_dtype=False)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize("np_dtype", ["int64", "uint64", "float32", "float64"])
 | 
						|
def test_cython_group_transform_cumsum(np_dtype):
 | 
						|
    # see gh-4095
 | 
						|
    dtype = np.dtype(np_dtype).type
 | 
						|
    pd_op, np_op = group_cumsum, np.cumsum
 | 
						|
    _check_cython_group_transform_cumulative(pd_op, np_op, dtype)
 | 
						|
 | 
						|
 | 
						|
def test_cython_group_transform_cumprod():
 | 
						|
    # see gh-4095
 | 
						|
    dtype = np.float64
 | 
						|
    pd_op, np_op = group_cumprod, np.cumprod
 | 
						|
    _check_cython_group_transform_cumulative(pd_op, np_op, dtype)
 | 
						|
 | 
						|
 | 
						|
def test_cython_group_transform_algos():
 | 
						|
    # see gh-4095
 | 
						|
    is_datetimelike = False
 | 
						|
 | 
						|
    # with nans
 | 
						|
    labels = np.array([0, 0, 0, 0, 0], dtype=np.intp)
 | 
						|
    ngroups = 1
 | 
						|
 | 
						|
    data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64")
 | 
						|
    actual = np.zeros_like(data)
 | 
						|
    actual.fill(np.nan)
 | 
						|
    group_cumprod(actual, data, labels, ngroups, is_datetimelike)
 | 
						|
    expected = np.array([1, 2, 6, np.nan, 24], dtype="float64")
 | 
						|
    tm.assert_numpy_array_equal(actual[:, 0], expected)
 | 
						|
 | 
						|
    actual = np.zeros_like(data)
 | 
						|
    actual.fill(np.nan)
 | 
						|
    group_cumsum(actual, data, labels, ngroups, is_datetimelike)
 | 
						|
    expected = np.array([1, 3, 6, np.nan, 10], dtype="float64")
 | 
						|
    tm.assert_numpy_array_equal(actual[:, 0], expected)
 | 
						|
 | 
						|
    # timedelta
 | 
						|
    is_datetimelike = True
 | 
						|
    data = np.array([np.timedelta64(1, "ns")] * 5, dtype="m8[ns]")[:, None]
 | 
						|
    actual = np.zeros_like(data, dtype="int64")
 | 
						|
    group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike)
 | 
						|
    expected = np.array(
 | 
						|
        [
 | 
						|
            np.timedelta64(1, "ns"),
 | 
						|
            np.timedelta64(2, "ns"),
 | 
						|
            np.timedelta64(3, "ns"),
 | 
						|
            np.timedelta64(4, "ns"),
 | 
						|
            np.timedelta64(5, "ns"),
 | 
						|
        ]
 | 
						|
    )
 | 
						|
    tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected)
 | 
						|
 | 
						|
 | 
						|
def test_cython_group_mean_datetimelike():
 | 
						|
    actual = np.zeros(shape=(1, 1), dtype="float64")
 | 
						|
    counts = np.array([0], dtype="int64")
 | 
						|
    data = (
 | 
						|
        np.array(
 | 
						|
            [np.timedelta64(2, "ns"), np.timedelta64(4, "ns"), np.timedelta64("NaT")],
 | 
						|
            dtype="m8[ns]",
 | 
						|
        )[:, None]
 | 
						|
        .view("int64")
 | 
						|
        .astype("float64")
 | 
						|
    )
 | 
						|
    labels = np.zeros(len(data), dtype=np.intp)
 | 
						|
 | 
						|
    group_mean(actual, counts, data, labels, is_datetimelike=True)
 | 
						|
 | 
						|
    tm.assert_numpy_array_equal(actual[:, 0], np.array([3], dtype="float64"))
 | 
						|
 | 
						|
 | 
						|
def test_cython_group_mean_wrong_min_count():
 | 
						|
    actual = np.zeros(shape=(1, 1), dtype="float64")
 | 
						|
    counts = np.zeros(1, dtype="int64")
 | 
						|
    data = np.zeros(1, dtype="float64")[:, None]
 | 
						|
    labels = np.zeros(1, dtype=np.intp)
 | 
						|
 | 
						|
    with pytest.raises(AssertionError, match="min_count"):
 | 
						|
        group_mean(actual, counts, data, labels, is_datetimelike=True, min_count=0)
 | 
						|
 | 
						|
 | 
						|
def test_cython_group_mean_not_datetimelike_but_has_NaT_values():
 | 
						|
    actual = np.zeros(shape=(1, 1), dtype="float64")
 | 
						|
    counts = np.array([0], dtype="int64")
 | 
						|
    data = (
 | 
						|
        np.array(
 | 
						|
            [np.timedelta64("NaT"), np.timedelta64("NaT")],
 | 
						|
            dtype="m8[ns]",
 | 
						|
        )[:, None]
 | 
						|
        .view("int64")
 | 
						|
        .astype("float64")
 | 
						|
    )
 | 
						|
    labels = np.zeros(len(data), dtype=np.intp)
 | 
						|
 | 
						|
    group_mean(actual, counts, data, labels, is_datetimelike=False)
 | 
						|
 | 
						|
    tm.assert_numpy_array_equal(
 | 
						|
        actual[:, 0], np.array(np.divide(np.add(data[0], data[1]), 2), dtype="float64")
 | 
						|
    )
 | 
						|
 | 
						|
 | 
						|
def test_cython_group_mean_Inf_at_begining_and_end():
 | 
						|
    # GH 50367
 | 
						|
    actual = np.array([[np.nan, np.nan], [np.nan, np.nan]], dtype="float64")
 | 
						|
    counts = np.array([0, 0], dtype="int64")
 | 
						|
    data = np.array(
 | 
						|
        [[np.inf, 1.0], [1.0, 2.0], [2.0, 3.0], [3.0, 4.0], [4.0, 5.0], [5, np.inf]],
 | 
						|
        dtype="float64",
 | 
						|
    )
 | 
						|
    labels = np.array([0, 1, 0, 1, 0, 1], dtype=np.intp)
 | 
						|
 | 
						|
    group_mean(actual, counts, data, labels, is_datetimelike=False)
 | 
						|
 | 
						|
    expected = np.array([[np.inf, 3], [3, np.inf]], dtype="float64")
 | 
						|
 | 
						|
    tm.assert_numpy_array_equal(
 | 
						|
        actual,
 | 
						|
        expected,
 | 
						|
    )
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
 | 
						|
    "values, out",
 | 
						|
    [
 | 
						|
        ([[np.inf], [np.inf], [np.inf]], [[np.inf], [np.inf]]),
 | 
						|
        ([[np.inf], [np.inf], [-np.inf]], [[np.inf], [np.nan]]),
 | 
						|
        ([[np.inf], [-np.inf], [np.inf]], [[np.inf], [np.nan]]),
 | 
						|
        ([[np.inf], [-np.inf], [-np.inf]], [[np.inf], [-np.inf]]),
 | 
						|
    ],
 | 
						|
)
 | 
						|
def test_cython_group_sum_Inf_at_begining_and_end(values, out):
 | 
						|
    # GH #53606
 | 
						|
    actual = np.array([[np.nan], [np.nan]], dtype="float64")
 | 
						|
    counts = np.array([0, 0], dtype="int64")
 | 
						|
    data = np.array(values, dtype="float64")
 | 
						|
    labels = np.array([0, 1, 1], dtype=np.intp)
 | 
						|
 | 
						|
    group_sum(actual, counts, data, labels, None, is_datetimelike=False)
 | 
						|
 | 
						|
    expected = np.array(out, dtype="float64")
 | 
						|
 | 
						|
    tm.assert_numpy_array_equal(
 | 
						|
        actual,
 | 
						|
        expected,
 | 
						|
    )
 |