kwarray.util_averages module¶
Currently just defines “stats_dict”, which is a nice way to gather multiple numeric statistics (e.g. max, min, median, mode, arithmetic-mean, geometric-mean, standard-deviation, etc…) about data in an array.
- kwarray.util_averages.stats_dict(inputs, axis=None, nan=False, sum=False, extreme=True, n_extreme=False, median=False, shape=True, size=False, quantile='auto')[source]¶
Describe statistics about an input array
- Parameters:
inputs (ArrayLike) – set of values to get statistics of
axis (int) – if inputs is ndarray then this specifies the axis
nan (bool) – report number of nan items (TODO: rename to skipna)
sum (bool) – report sum of values
extreme (bool) – report min and max values
n_extreme (bool) – report extreme value frequencies
median (bool) – report median
size (bool) – report array size
shape (bool) – report array shape
quantile (str | bool | List[float]) – defaults to ‘auto’. Can also be a list of quantiles to compute. If truthy, computes quantiles.
- Returns:
dictionary of common numpy statistics (min, max, mean, std, nMin, nMax, shape)
- Return type:
- SeeAlso:
scipy.stats.describe()
pandas.DataFrame.describe()
Example
>>> # xdoctest: +IGNORE_WHITESPACE >>> from kwarray.util_averages import * # NOQA >>> axis = 0 >>> rng = np.random.RandomState(0) >>> inputs = rng.rand(10, 2).astype(np.float32) >>> stats = stats_dict(inputs, axis=axis, nan=False, median=True) >>> import ubelt as ub # NOQA >>> result = str(ub.urepr(stats, nl=1, precision=4, with_dtype=True)) >>> print(result) { 'mean': np.array([[0.5206, 0.6425]], dtype=np.float32), 'std': np.array([[0.2854, 0.2517]], dtype=np.float32), 'min': np.array([[0.0202, 0.0871]], dtype=np.float32), 'max': np.array([[0.9637, 0.9256]], dtype=np.float32), 'q_0.25': np.array([0.4271, 0.5329], dtype=np.float64), 'q_0.50': np.array([0.5584, 0.6805], dtype=np.float64), 'q_0.75': np.array([0.7343, 0.8607], dtype=np.float64), 'med': np.array([0.5584, 0.6805], dtype=np.float32), 'shape': (10, 2), }
Example
>>> # xdoctest: +IGNORE_WHITESPACE >>> from kwarray.util_averages import * # NOQA >>> axis = 0 >>> rng = np.random.RandomState(0) >>> inputs = rng.randint(0, 42, size=100).astype(np.float32) >>> inputs[4] = np.nan >>> stats = stats_dict(inputs, axis=axis, nan=True, quantile='auto') >>> import ubelt as ub # NOQA >>> result = str(ub.urepr(stats, nl=0, precision=1, strkeys=True)) >>> print(result)
Example
>>> import kwarray >>> import ubelt as ub >>> rng = kwarray.ensure_rng(0) >>> orig_inputs = rng.rand(1, 1, 2, 3) >>> param_grid = ub.named_product({ >>> #'axis': (None, 0, (0, 1), -1), >>> 'axis': [None], >>> 'percent_nan': [0, 0.5, 1.0], >>> 'nan': [True, False], >>> 'sum': [1], >>> 'extreme': [True], >>> 'n_extreme': [True], >>> 'median': [1], >>> 'size': [1], >>> 'shape': [1], >>> 'quantile': ['auto'], >>> }) >>> for params in param_grid: >>> kwargs = params.copy() >>> percent_nan = kwargs.pop('percent_nan', 0) >>> if percent_nan: >>> inputs = orig_inputs.copy() >>> inputs[rng.rand(*inputs.shape) < percent_nan] = np.nan >>> else: >>> inputs = orig_inputs >>> stats = kwarray.stats_dict(inputs, **kwargs) >>> print('---') >>> print('params = {}'.format(ub.urepr(params, nl=1))) >>> print('stats = {}'.format(ub.urepr(stats, nl=1)))
- kwarray.util_averages._gmean(a, axis=0, dtype=None, clobber=False)[source]¶
Compute the geometric mean along the specified axis.
Modification of the scikit-learn method to be more memory efficient
- Example
>>> rng = np.random.RandomState(0) >>> C, H, W = 8, 32, 32 >>> axis = 0 >>> a = [rng.rand(C, H, W).astype(np.float16), >>> rng.rand(C, H, W).astype(np.float16)]
- exception kwarray.util_averages.NoSupportError[source]¶
Bases:
RuntimeError
- class kwarray.util_averages.RunningStats(nan_policy='omit', check_weights=True, **kwargs)[source]¶
Bases:
NiceRepr
Track mean, std, min, and max values over time with constant memory.
Dynamically records per-element array statistics and can summarize them per-element, across channels, or globally.
Todo
[ ] This may need a few API tweaks and good documentation
Example
>>> import kwarray >>> run = kwarray.RunningStats() >>> ch1 = np.array([[0, 1], [3, 4]]) >>> ch2 = np.zeros((2, 2)) >>> img = np.dstack([ch1, ch2]) >>> run.update(np.dstack([ch1, ch2])) >>> run.update(np.dstack([ch1 + 1, ch2])) >>> run.update(np.dstack([ch1 + 2, ch2])) >>> # No marginalization >>> print('current-ave = ' + ub.urepr(run.summarize(axis=ub.NoParam), nl=2, precision=3)) >>> # Average over channels (keeps spatial dims separate) >>> print('chann-ave(k=1) = ' + ub.urepr(run.summarize(axis=0), nl=2, precision=3)) >>> print('chann-ave(k=0) = ' + ub.urepr(run.summarize(axis=0, keepdims=0), nl=2, precision=3)) >>> # Average over spatial dims (keeps channels separate) >>> print('spatial-ave(k=1) = ' + ub.urepr(run.summarize(axis=(1, 2)), nl=2, precision=3)) >>> print('spatial-ave(k=0) = ' + ub.urepr(run.summarize(axis=(1, 2), keepdims=0), nl=2, precision=3)) >>> # Average over all dims >>> print('alldim-ave(k=1) = ' + ub.urepr(run.summarize(axis=None), nl=2, precision=3)) >>> print('alldim-ave(k=0) = ' + ub.urepr(run.summarize(axis=None, keepdims=0), nl=2, precision=3))
- Parameters:
- nan_policy (str) – indicates how we will handle nan values
if “omit” - set weights of nan items to zero.
if “propogate” - propagate nans.
if “raise” - then raise a ValueError if nans are given.
- check_weights (bool):
if True, we check the weights for zeros (which can also implicitly occur when data has nans). Disabling this check will result in faster computation, but it is your responsibility to ensure all data passed to update is valid.
- property shape¶
- update_many(data, weights=1)[source]¶
Assumes first data axis represents multiple observations
Example
>>> import kwarray >>> rng = kwarray.ensure_rng(0) >>> run = kwarray.RunningStats() >>> data = rng.randn(1, 2, 3) >>> run.update_many(data) >>> print(run.current()) >>> data = rng.randn(2, 2, 3) >>> run.update_many(data) >>> print(run.current()) >>> data = rng.randn(3, 2, 3) >>> run.update_many(data) >>> print(run.current()) >>> run.update_many(1000) >>> print(run.current()) >>> assert np.all(run.current()['n'] == 7)
Example
>>> import kwarray >>> rng = kwarray.ensure_rng(0) >>> run = kwarray.RunningStats() >>> data = rng.randn(1, 2, 3) >>> run.update_many(data.ravel()) >>> print(run.current()) >>> data = rng.randn(2, 2, 3) >>> run.update_many(data.ravel()) >>> print(run.current()) >>> data = rng.randn(3, 2, 3) >>> run.update_many(data.ravel()) >>> print(run.current()) >>> run.update_many(1000) >>> print(run.current()) >>> assert np.all(run.current()['n'] == 37)
- update(data, weights=1)[source]¶
Updates statistics across all data dimensions on a per-element basis
Example
>>> import kwarray >>> data = np.full((7, 5), fill_value=1.3) >>> weights = np.ones((7, 5), dtype=np.float32) >>> run = kwarray.RunningStats() >>> run.update(data, weights=1) >>> run.update(data, weights=weights) >>> rng = np.random >>> weights[rng.rand(*weights.shape) > 0.5] = 0 >>> run.update(data, weights=weights)
Example
>>> import kwarray >>> run = kwarray.RunningStats() >>> data = np.array([[1, np.nan, np.nan], [0, np.nan, 1.]]) >>> run.update(data) >>> print('current = {}'.format(ub.urepr(run.current(), nl=1))) >>> print('summary(axis=None) = {}'.format(ub.urepr(run.summarize(), nl=1))) >>> print('summary(axis=1) = {}'.format(ub.urepr(run.summarize(axis=1), nl=1))) >>> print('summary(axis=0) = {}'.format(ub.urepr(run.summarize(axis=0), nl=1))) >>> data = np.array([[2, 0, 1], [0, 1, np.nan]]) >>> run.update(data) >>> data = np.array([[3, 1, 1], [0, 1, np.nan]]) >>> run.update(data) >>> data = np.array([[4, 1, 1], [0, 1, 1.]]) >>> run.update(data) >>> print('----') >>> print('current = {}'.format(ub.urepr(run.current(), nl=1))) >>> print('summary(axis=None) = {}'.format(ub.urepr(run.summarize(), nl=1))) >>> print('summary(axis=1) = {}'.format(ub.urepr(run.summarize(axis=1), nl=1))) >>> print('summary(axis=0) = {}'.format(ub.urepr(run.summarize(axis=0), nl=1)))
- summarize(axis=None, keepdims=True)[source]¶
Compute summary statistics across one or more dimensions
- Parameters:
axis (int | List[int] | None | NoParamType) – axis or axes to summarize over. If None, all axes are summarized. If ub.NoParam, no axes are summarized and the current result is returned.
keepdims (bool, default=True) – if False removes the dimensions that are summarized over
- Returns:
dictionary containing minimum, maximum, mean, std, etc.
- Return type:
Dict
- Raises:
NoSupportError – if update was never called with valid data
Example
>>> # Test to make sure summarize works across different shapes >>> base = np.array([1, 1, 1, 1, 0, 0, 0, 1]) >>> run0 = RunningStats() >>> for _ in range(3): >>> run0.update(base.reshape(8, 1)) >>> run1 = RunningStats() >>> for _ in range(3): >>> run1.update(base.reshape(4, 2)) >>> run2 = RunningStats() >>> for _ in range(3): >>> run2.update(base.reshape(2, 2, 2)) >>> # >>> # Summarizing over everything should be exactly the same >>> s0N = run0.summarize(axis=None, keepdims=0) >>> s1N = run1.summarize(axis=None, keepdims=0) >>> s2N = run2.summarize(axis=None, keepdims=0) >>> #assert ub.util_indexable.indexable_allclose(s0N, s1N, rel_tol=0.0, abs_tol=0.0) >>> #assert ub.util_indexable.indexable_allclose(s1N, s2N, rel_tol=0.0, abs_tol=0.0) >>> assert s0N['mean'] == 0.625
- kwarray.util_averages._combine_mean_stds(means, stds, nums=None, axis=None, keepdims=False, bessel=True)[source]¶
- Parameters:
means (array) – means[i] is the mean of the ith entry to combine
stds (array) – stds[i] is the std of the ith entry to combine
nums (array | None) – nums[i] is the number of samples in the ith entry to combine. if None, assumes sample sizes are infinite.
axis (int | Tuple[int] | None) – axis to combine the statistics over
keepdims (bool) – if True return arrays with the same number of dimensions they were given in.
bessel (int) – Set to 1 to enable Bessel's correction to unbias the combined std estimate. Only disable if you have the true population means, or you think you know what you are doing.
References
Sympy
>>> # xdoctest: +REQUIRES(env:SHOW_SYMPY) >>> # What about the case where we don't know population size of the >>> # estimates. We could treat it as a fixed number, or perhaps take the >>> # limit as n -> infinity. >>> import sympy >>> import sympy as sym >>> from sympy import symbols, sqrt, limit, IndexedBase, summation >>> from sympy import Indexed, Idx, symbols >>> means = IndexedBase('m') >>> stds = IndexedBase('s') >>> nums = IndexedBase('n') >>> i = symbols('i', cls=Idx) >>> k = symbols('k', cls=Idx) >>> # >>> combo_mean = symbols('C') >>> # >>> bessel = 1 >>> total = summation(nums[i], (i, 1, k)) >>> combo_mean_expr = summation(nums[i] * means[i], (i, 1, k)) / total >>> p1 = summation((nums[i] - bessel) * stds[i], (i, 1, k)) >>> p2 = summation(nums[i] * ((means[i] - combo_mean) ** 2), (i, 1, k)) >>> # >>> combo_std_expr = sqrt((p1 + p2) / (total - bessel)) >>> print('------------------------------------') >>> print('General Combined Mean / Std Formulas') >>> print('C = combined mean') >>> print('S = combined std') >>> print('------------------------------------') >>> print(ub.hzcat(['C = ', sym.pretty(combo_mean_expr, use_unicode=True, use_unicode_sqrt_char=True)])) >>> print(ub.hzcat(['S = ', sym.pretty(combo_std_expr, use_unicode=True, use_unicode_sqrt_char=True)])) >>> print('') >>> print('---------') >>> print('Now assuming all sample sizes are the same constant value N') >>> print('---------') >>> # Now assume all n[i] = N (i.e. 
a constant value) >>> N = symbols('N') >>> combo_mean_const_n_expr = combo_mean_expr.copy().xreplace({nums[i]: N}) >>> combo_std_const_n_expr = combo_std_expr.copy().xreplace({nums[i]: N}) >>> p1_const_n = p1.copy().xreplace({nums[i]: N}) >>> p2_const_n = p2.copy().xreplace({nums[i]: N}) >>> total_const_n = total.copy().xreplace({nums[i]: N}) >>> # >>> print(ub.hzcat(['C = ', sym.pretty(combo_mean_const_n_expr, use_unicode=True, use_unicode_sqrt_char=True)])) >>> print(ub.hzcat(['S = ', sym.pretty(combo_std_const_n_expr, use_unicode=True, use_unicode_sqrt_char=True)])) >>> # >>> print('') >>> print('---------') >>> print('Take the limit as N -> infinity') >>> print('---------') >>> # >>> # Limit doesnt directly but we can break it into parts >>> lim_C = limit(combo_mean_const_n_expr, N, float('inf')) >>> lim_p1 = limit(p1_const_n / (total_const_n - bessel), N, float('inf')) >>> lim_p2 = limit(p2_const_n / (total_const_n - bessel), N, float('inf')) >>> lim_expr = sym.sqrt(lim_p1 + lim_p2) >>> print(ub.hzcat(['lim(C, N->inf) = ', sym.pretty(lim_C)])) >>> print(ub.hzcat(['lim(S, N->inf) = ', sym.pretty(lim_expr)]))
Example
>>> from kwarray.util_averages import * # NOQA >>> from kwarray.util_averages import _combine_mean_stds >>> means = np.array([1.2, 3.2, 4.1]) >>> stds = np.array([4.2, 0.2, 2.1]) >>> nums = np.array([10, 100, 10]) >>> _combine_mean_stds(means, stds, nums) >>> means = np.array([1, 2, 3]) >>> stds = np.array([1, 2, 3]) >>> # >>> nums = np.array([1, 1, 1]) / 3 >>> print(_combine_mean_stds(means, stds, nums, bessel=True), '- .3 B') >>> print(_combine_mean_stds(means, stds, nums, bessel=False), '- .3') >>> nums = np.array([1, 1, 1]) >>> print(_combine_mean_stds(means, stds, nums, bessel=True), '- 1 B') >>> print(_combine_mean_stds(means, stds, nums, bessel=False), '- 1') >>> nums = np.array([10, 10, 10]) >>> print(_combine_mean_stds(means, stds, nums, bessel=True), '- 10 B') >>> print(_combine_mean_stds(means, stds, nums, bessel=False), '- 10') >>> nums = np.array([1000, 1000, 1000]) >>> print(_combine_mean_stds(means, stds, nums, bessel=True), '- 1000 B') >>> print(_combine_mean_stds(means, stds, nums, bessel=False), '- 1000') >>> # >>> nums = None >>> print(_combine_mean_stds(means, stds, nums, bessel=True), '- inf B') >>> print(_combine_mean_stds(means, stds, nums, bessel=False), '- inf')
Example
>>> from kwarray.util_averages import * # NOQA >>> from kwarray.util_averages import _combine_mean_stds >>> means = np.stack([np.array([1.2, 3.2, 4.1])] * 100, axis=0) >>> stds = np.stack([np.array([4.2, 0.2, 2.1])] * 100, axis=0) >>> nums = np.stack([np.array([10, 100, 10])] * 100, axis=0) >>> cm1, cs1, _ = _combine_mean_stds(means, stds, nums, axis=None) >>> print('combo_mean = {}'.format(ub.urepr(cm1, nl=1))) >>> print('combo_std = {}'.format(ub.urepr(cs1, nl=1))) >>> means = np.stack([np.array([1.2, 3.2, 4.1])] * 1, axis=0) >>> stds = np.stack([np.array([4.2, 0.2, 2.1])] * 1, axis=0) >>> nums = np.stack([np.array([10, 100, 10])] * 1, axis=0) >>> cm2, cs2, _ = _combine_mean_stds(means, stds, nums, axis=None) >>> print('combo_mean = {}'.format(ub.urepr(cm2, nl=1))) >>> print('combo_std = {}'.format(ub.urepr(cs2, nl=1))) >>> means = np.stack([np.array([1.2, 3.2, 4.1])] * 5, axis=0) >>> stds = np.stack([np.array([4.2, 0.2, 2.1])] * 5, axis=0) >>> nums = np.stack([np.array([10, 100, 10])] * 5, axis=0) >>> cm3, cs3, combo_num = _combine_mean_stds(means, stds, nums, axis=1) >>> print('combo_mean = {}'.format(ub.urepr(cm3, nl=1))) >>> print('combo_std = {}'.format(ub.urepr(cs3, nl=1))) >>> assert np.allclose(cm1, cm2) and np.allclose(cm2, cm3) >>> assert not np.allclose(cs1, cs2) >>> assert np.allclose(cs2, cs3)
Example
>>> from kwarray.util_averages import * # NOQA >>> from kwarray.util_averages import _combine_mean_stds >>> means = np.random.rand(2, 3, 5, 7) >>> stds = np.random.rand(2, 3, 5, 7) >>> nums = (np.random.rand(2, 3, 5, 7) * 10) + 1 >>> cm, cs, cn = _combine_mean_stds(means, stds, nums, axis=1, keepdims=1) >>> assert cm.shape == cs.shape == cn.shape >>> print(f'cm.shape={cm.shape}') >>> cm, cs, cn = _combine_mean_stds(means, stds, nums, axis=(0, 2), keepdims=1) >>> assert cm.shape == cs.shape == cn.shape >>> print(f'cm.shape={cm.shape}') >>> cm, cs, cn = _combine_mean_stds(means, stds, nums, axis=(1, 3), keepdims=1) >>> assert cm.shape == cs.shape == cn.shape >>> print(f'cm.shape={cm.shape}') >>> cm, cs, cn = _combine_mean_stds(means, stds, nums, axis=None) >>> assert cm.shape == cs.shape == cn.shape >>> print(f'cm.shape={cm.shape}') cm.shape=(2, 1, 5, 7) cm.shape=(1, 3, 1, 7) cm.shape=(2, 1, 5, 1) cm.shape=()