Source code for kwarray.dataframe_light

"""
A faster-than-pandas pandas-like interface to column-major data, in the case
where the data only needs to be accessed by index.

For data where more complex ids are needed you must use pandas.
"""
import ubelt as ub
import numpy as np
import copy

try:
    import pandas as pd
except Exception:
    pd = None


__version__ = '0.0.1'



[docs]
class LocLight(object):
    def __init__(self, parent):
        self.parent = parent

    def __getitem__(self, index):
        return self.parent._getrow(index)




[docs]
class DataFrameLight(ub.NiceRepr):
    r"""
    Implements a subset of the pandas.DataFrame API

    The API is restricted to facilitate speed tradeoffs

    Note:
        Assumes underlying data is Dict[list|ndarray]. If the data is known
        to be a Dict[ndarray] use DataFrameArray instead, which has faster
        implementations for some operations.

    Note:
        pandas.DataFrame is slow. DataFrameLight is faster.
        It is a tad more restrictive though.

    Example:
        >>> self = DataFrameLight({})
        >>> print('self = {!r}'.format(self))
        >>> self = DataFrameLight({'a': [0, 1, 2], 'b': [2, 3, 4]})
        >>> print('self = {!r}'.format(self))
        >>> item = self.iloc[0]
        >>> print('item = {!r}'.format(item))

    Benchmark:
        >>> # BENCHMARK
        >>> # xdoc: +REQUIRES(--bench)
        >>> from kwarray.dataframe_light import *  # NOQA
        >>> import ubelt as ub
        >>> NUM = 1000
        >>> print('NUM = {!r}'.format(NUM))
        >>> # to_dict conversions
        >>> print('==============')
        >>> print('====== to_dict conversions =====')
        >>> _keys = ['list', 'dict', 'series', 'split', 'records', 'index']
        >>> results = []
        >>> df = DataFrameLight._demodata(num=NUM).pandas()
        >>> ti = ub.Timerit(verbose=False, unit='ms')
        >>> for key in _keys:
        >>>     result = ti.reset(key).call(lambda: df.to_dict(orient=key))
        >>>     results.append((result.mean(), result.report()))
        >>> key = 'series+numpy'
        >>> result = ti.reset(key).call(lambda: {k: v.values for k, v in df.to_dict(orient='series').items()})
        >>> results.append((result.mean(), result.report()))
        >>> print('\n'.join([t[1] for t in sorted(results)]))
        >>> print('==============')
        >>> print('====== DFLight Conversions =======')
        >>> ti = ub.Timerit(verbose=True, unit='ms')
        >>> key = 'self.pandas'
        >>> self = DataFrameLight(df)
        >>> ti.reset(key).call(lambda: self.pandas())
        >>> key = 'light-from-pandas'
        >>> ti.reset(key).call(lambda: DataFrameLight(df))
        >>> key = 'light-from-dict'
        >>> ti.reset(key).call(lambda: DataFrameLight(self._data))
        >>> print('==============')
        >>> print('====== BENCHMARK: .LOC[] =======')
        >>> ti = ub.Timerit(num=20, bestof=4, verbose=True, unit='ms')
        >>> df_light = DataFrameLight._demodata(num=NUM)
        >>> # xdoctest: +REQUIRES(module:pandas)
        >>> df_heavy = df_light.pandas()
        >>> series_data = df_heavy.to_dict(orient='series')
        >>> list_data = df_heavy.to_dict(orient='list')
        >>> np_data = {k: v.values for k, v in df_heavy.to_dict(orient='series').items()}
        >>> for timer in ti.reset('DF-heavy.iloc'):
        >>>     with timer:
        >>>         for i in range(NUM):
        >>>             df_heavy.iloc[i]
        >>> for timer in ti.reset('DF-heavy.loc'):
        >>>     with timer:
        >>>         for i in range(NUM):
        >>>             df_heavy.iloc[i]
        >>> for timer in ti.reset('dict[SERIES].loc'):
        >>>     with timer:
        >>>         for i in range(NUM):
        >>>             {key: series_data[key].loc[i] for key in series_data.keys()}
        >>> for timer in ti.reset('dict[SERIES].iloc'):
        >>>     with timer:
        >>>         for i in range(NUM):
        >>>             {key: series_data[key].iloc[i] for key in series_data.keys()}
        >>> for timer in ti.reset('dict[SERIES][]'):
        >>>     with timer:
        >>>         for i in range(NUM):
        >>>             {key: series_data[key][i] for key in series_data.keys()}
        >>> for timer in ti.reset('dict[NDARRAY][]'):
        >>>     with timer:
        >>>         for i in range(NUM):
        >>>             {key: np_data[key][i] for key in np_data.keys()}
        >>> for timer in ti.reset('dict[list][]'):
        >>>     with timer:
        >>>         for i in range(NUM):
        >>>             {key: list_data[key][i] for key in np_data.keys()}
        >>> for timer in ti.reset('DF-Light.iloc/loc'):
        >>>     with timer:
        >>>         for i in range(NUM):
        >>>             df_light.iloc[i]
        >>> for timer in ti.reset('DF-Light._getrow'):
        >>>     with timer:
        >>>         for i in range(NUM):
        >>>             df_light._getrow(i)
        NUM = 1000
        ==============
        ====== to_dict conversions =====
        Timed best=0.022 ms, mean=0.022 ± 0.0 ms for series
        Timed best=0.059 ms, mean=0.059 ± 0.0 ms for series+numpy
        Timed best=0.315 ms, mean=0.315 ± 0.0 ms for list
        Timed best=0.895 ms, mean=0.895 ± 0.0 ms for dict
        Timed best=2.705 ms, mean=2.705 ± 0.0 ms for split
        Timed best=5.474 ms, mean=5.474 ± 0.0 ms for records
        Timed best=7.320 ms, mean=7.320 ± 0.0 ms for index
        ==============
        ====== DFLight Conversions =======
        Timed best=1.798 ms, mean=1.798 ± 0.0 ms for self.pandas
        Timed best=0.064 ms, mean=0.064 ± 0.0 ms for light-from-pandas
        Timed best=0.010 ms, mean=0.010 ± 0.0 ms for light-from-dict
        ==============
        ====== BENCHMARK: .LOC[] =======
        Timed best=101.365 ms, mean=101.564 ± 0.2 ms for DF-heavy.iloc
        Timed best=102.038 ms, mean=102.273 ± 0.2 ms for DF-heavy.loc
        Timed best=29.357 ms, mean=29.449 ± 0.1 ms for dict[SERIES].loc
        Timed best=21.701 ms, mean=22.014 ± 0.3 ms for dict[SERIES].iloc
        Timed best=11.469 ms, mean=11.566 ± 0.1 ms for dict[SERIES][]
        Timed best=0.807 ms, mean=0.826 ± 0.0 ms for dict[NDARRAY][]
        Timed best=0.478 ms, mean=0.492 ± 0.0 ms for dict[list][]
        Timed best=0.969 ms, mean=0.994 ± 0.0 ms for DF-Light.iloc/loc
        Timed best=0.760 ms, mean=0.776 ± 0.0 ms for DF-Light._getrow

    """
    def __init__(self, data=None, columns=None):
        if columns is not None:
            if data is None:
                data = ub.odict(zip(columns, [[] for _ in range(len(columns))]))
            else:
                data = ub.odict(zip(columns, data.T))

        self._raw = data
        self._data = None
        self._localizer = LocLight(self)
        self.__normalize__()

    @property
    def iloc(self):
        return self._localizer

    @property
    def values(self):
        data = self._getcols(self.columns)
        return data

    @property
    def loc(self):
        return self._localizer

    def __eq__(self, other):
        """
        Example:
            >>> # xdoctest: +REQUIRES(module:pandas)
            >>> self = DataFrameLight._demodata(num=7)
            >>> other = self.pandas()
            >>> assert np.all(self == other)
        """
        self_vals = self.values
        if isinstance(other, DataFrameLight):
            other_vals = other.values
        if pd is not None and isinstance(other, pd.DataFrame):
            other_vals = other.reindex(columns=self.columns).values
        else:
            other_vals = other
        return self_vals == other_vals


[docs]
    def to_string(self, *args, **kwargs):
        """
        Returns:
            str:
        """
        return self.pandas().to_string(*args, **kwargs)



[docs]
    def to_dict(self, orient='dict', into=dict):
        """
        Convert the data frame into a dictionary.

        Args:
            orient (str): Currently naitively suports orient in
                {'dict', 'list'}, otherwise we fallback to pandas conversion
                and call its to_dict method.

            into (type): type of dictionary to transform into

        Returns:
           dict

        Example:
            >>> from kwarray.dataframe_light import *  # NOQA
            >>> self = DataFrameLight._demodata(num=7)
            >>> print(self.to_dict(orient='dict'))
            >>> print(self.to_dict(orient='list'))
        """
        if orient == 'dict':
            out = into(self.iterrows())
        elif orient == 'list':
            import kwarray
            out = into((k, kwarray.ArrayAPI.tolist(v))
                        for k, v in self._data.items())
        else:
            out = self.pandas().to_dict(orient=orient, into=into)
        return out



[docs]
    def pandas(self):
        """
        Convert back to pandas if you need the full API

        Returns:
            pd.DataFrame:

        Example:
            >>> # xdoctest: +REQUIRES(module:pandas)
            >>> df_light = DataFrameLight._demodata(num=7)
            >>> df_heavy = df_light.pandas()
            >>> got = DataFrameLight(df_heavy)
            >>> assert got._data == df_light._data
        """
        if pd is None:
            raise Exception('Pandas is not available')
        return pd.DataFrame(self._data)



[docs]
    def _pandas(self):
        """ Deprecated, use self.pandas instead """
        return self.pandas()



[docs]
    @classmethod
    def _demodata(cls, num=7):
        """
        Example:
            >>> self = DataFrameLight._demodata(num=7)
            >>> print('self = {!r}'.format(self))
            >>> other = DataFrameLight._demodata(num=11)
            >>> print('other = {!r}'.format(other))
            >>> both = self.union(other)
            >>> print('both = {!r}'.format(both))
            >>> assert both is not self
            >>> assert other is not self
        """
        demodata = {
            'foo': [0] * num,
            'bar': [i % 3 for i in range(num)],
            'baz': [2.73] * num,
        }
        self = cls(demodata)
        return self


    def __nice__(self):
        return 'keys: {}, len={}'.format(list(self.keys()), len(self))

    def __len__(self):
        if self._data:
            key = next(iter(self.keys()))
            return len(self._data[key])
        else:
            return 0

    def __contains__(self, item):
        return item in self.keys()

    def __normalize__(self):
        """
        Try to convert input data to Dict[List]
        """
        if self._raw is None:
            self._data = {}
        elif isinstance(self._raw, dict):
            self._data = self._raw
            if __debug__:
                lens = []
                for d in self._data.values():
                    if not isinstance(d, (list, np.ndarray)):
                        raise TypeError(type(d))
                    lens.append(len(d))
                assert ub.allsame(lens)
        elif isinstance(self._raw, DataFrameLight):
            self._data = copy.copy(self._raw._data)
        elif pd is not None and isinstance(self._raw, pd.DataFrame):
            self._data = self._raw.to_dict(orient='list')
        else:
            raise TypeError('Unknown _raw type')

    @property
    def columns(self):
        return list(self.keys())


[docs]
    def sort_values(self, key, inplace=False, ascending=True):
        sortx = np.argsort(self._getcol(key))
        if not ascending:
            sortx = sortx[::-1]
        return self.take(sortx, inplace=inplace)



[docs]
    def keys(self):
        if self._data:
            for key in self._data.keys():
                yield key



[docs]
    def _getrow(self, index):
        return {key: self._data[key][index] for key in self._data.keys()}



[docs]
    def _getcol(self, key):
        return self._data[key]



[docs]
    def _getcols(self, keys):
        num = len(self)
        col_data = [self._getcol(key) for key in keys]
        data = np.hstack([np.asarray(d).reshape(num, -1) for d in col_data])
        return data



[docs]
    def get(self, key, default=None):
        """
        Get item for given key. Returns default value if not found.
        """
        return self[key] if key in self else default



[docs]
    def clear(self):
        """
        Removes all rows inplace
        """
        if self._data:
            for key in self._data.keys():
                self._data[key].clear()


    def __getitem__(self, key):
        """
        Note:
            only handles the case where key is a single column name.

        Example:
            >>> df_light = DataFrameLight._demodata(num=7)
            >>> sub1 = df_light['bar']
            >>> # xdoctest: +REQUIRES(module:pandas)
            >>> df_heavy = df_light.pandas()
            >>> sub2 = df_heavy['bar']
            >>> assert np.all(sub1 == sub2)
        """
        return self._getcol(key)

    def __setitem__(self, key, value):
        """
        Note:
            only handles the case where key is a single column name. and value
            is an array of all the values to set.

        Example:
            >>> df_light = DataFrameLight._demodata(num=7)
            >>> value = [2] * len(df_light)
            >>> df_light['bar'] = value
            >>> # xdoctest: +REQUIRES(module:pandas)
            >>> df_heavy = df_light.pandas()
            >>> df_heavy['bar'] = value
            >>> assert np.all(df_light == df_heavy)
        """
        self._data[key] = value


[docs]
    def compress(self, flags, inplace=False):
        """
        NOTE: NOT A PART OF THE PANDAS API
        """
        subset = self if inplace else self.__class__()
        for key in self._data.keys():
            subset._data[key] = list(ub.compress(self._data[key], flags))
        return subset



[docs]
    def take(self, indices, inplace=False):
        """
        Return the elements in the given *positional* indices along an axis.

        Args:
            inplace (bool): NOT PART OF PANDAS API

        Note:
            assumes axis=0

        Example:
            >>> df_light = DataFrameLight._demodata(num=7)
            >>> indices = [0, 2, 3]
            >>> sub1 = df_light.take(indices)
            >>> # xdoctest: +REQUIRES(module:pandas)
            >>> df_heavy = df_light.pandas()
            >>> sub2 = df_heavy.take(indices)
            >>> assert np.all(sub1 == sub2)
        """
        subset = self if inplace else self.__class__()
        if isinstance(indices, slice):
            for key in self._data.keys():
                subset._data[key] = self._data[key][indices]
        else:
            for key in self._data.keys():
                subset._data[key] = list(ub.take(self._data[key], indices))
        return subset



[docs]
    def copy(self):
        other = copy.copy(self)
        other._data = other._data.copy()
        other._localizer = LocLight(other)
        return other



[docs]
    def extend(self, other):
        """
        Extend ``self`` inplace using another dataframe array

        Args:
            other (DataFrameLight | dict[str, Sequence]):
                values to concat to end of this object

        NOTE:
            Not part of the pandas API

        Example:
            >>> self = DataFrameLight(columns=['foo', 'bar'])
            >>> other = {'foo': [0], 'bar': [1]}
            >>> self.extend(other)
            >>> assert len(self) == 1
        """
        try:
            _other_data = other._data
        except AttributeError:
            _other_data = other

        _self_data = self._data
        for key, vals1 in _self_data.items():
            vals2 = _other_data[key]
            try:
                vals1.extend(vals2)
            except AttributeError:
                if isinstance(vals1, np.ndarray):
                    _self_data[key] = np.hstack([vals1, vals2])
                else:
                    raise



[docs]
    def union(self, *others):
        """
        NOTE:
            Note part of the pandas API
        """
        if isinstance(self, DataFrameLight):
            first = self
            rest = others
        else:
            if len(others) == 0:
                return DataFrameLight()
            first = others[0]
            rest = others[1:]

        both = first.copy()
        if not both.keys:
            for other in rest:
                if other.keys:
                    both.keys = copy.copy(other.keys)
                    break

        for other in rest:
            both.extend(other)
        return both



[docs]
    @classmethod
    def concat(cls, others):
        return cls.union(*others)



[docs]
    @classmethod
    def from_pandas(cls, df):
        _raw = {k: v.values for k, v in df.to_dict(orient='series').items()}
        return cls(_raw)



[docs]
    @classmethod
    def from_dict(cls, records):
        record_iter = iter(records)
        columns = {}
        try:
            r = next(record_iter)
            for key, value in r.items():
                columns[key] = [value]
        except StopIteration:
            pass
        else:
            for r in record_iter:
                for key, value in r.items():
                    columns[key].append(value)
        self = cls(columns)
        return self



[docs]
    def reset_index(self, drop=False):
        """ noop for compatability, the light version doesnt store an index """
        return self



[docs]
    def groupby(self, by=None, *args, **kwargs):
        """
        Group rows by the value of a column. Unlike pandas this simply
        returns a zip object. To ensure compatiability call list on the
        result of groupby.

        Args:
            by (str): column name to group by
            *args: if specified, the dataframe is coerced to pandas
            *kwargs: if specified, the dataframe is coerced to pandas

        Example:
            >>> df_light = DataFrameLight._demodata(num=7)
            >>> res1 = list(df_light.groupby('bar'))
            >>> # xdoctest: +REQUIRES(module:pandas)
            >>> df_heavy = df_light.pandas()
            >>> res2 = list(df_heavy.groupby('bar'))
            >>> assert len(res1) == len(res2)
            >>> assert all([np.all(a[1] == b[1]) for a, b in zip(res1, res2)])

        Ignore:
            >>> self = DataFrameLight._demodata(num=1000)
            >>> args = ['cx']
            >>> self['cx'] = (np.random.rand(len(self)) * 10).astype(np.int)
            >>> # As expected, our custom restricted implementation is faster
            >>> # than pandas
            >>> ub.Timerit(100).call(lambda: dict(list(self.pandas().groupby('cx')))).print()
            >>> ub.Timerit(100).call(lambda: dict(self.groupby('cx'))).print()
        """
        if len(args) == 0 and len(kwargs) == 0:
            # In this special case we can be fast
            import kwarray
            unique, groupxs = kwarray.group_indices(self[by])
            groups = [self.take(idxs) for idxs in groupxs]
            return zip(unique, groups)
        else:
            # otherwise we need to use the slow method
            return self.pandas().groupby(by=by)



[docs]
    def rename(self, mapper=None, columns=None, axis=None, inplace=False):
        """
        Rename the columns (index renaming is not supported)

        Example:
            >>> df_light = DataFrameLight._demodata(num=7)
            >>> mapper = {'foo': 'fi'}
            >>> res1 = df_light.rename(columns=mapper)
            >>> res3 = df_light.rename(mapper, axis=1)
            >>> # xdoctest: +REQUIRES(module:pandas)
            >>> df_heavy = df_light.pandas()
            >>> res2 = df_heavy.rename(columns=mapper)
            >>> res4 = df_heavy.rename(mapper, axis=1)
            >>> assert np.all(res1 == res2)
            >>> assert np.all(res3 == res2)
            >>> assert np.all(res3 == res4)
        """
        if columns is not None:
            if axis is not None:
                raise TypeError("Cannot specify both 'axis' and any of 'index' or 'columns'")
        else:
            if axis != 1:
                raise NotImplementedError('only axis=1 is supported')
            columns = mapper

        if not inplace:
            self = self.copy()
        for old, new in columns.items():
            if old in self._data:
                self._data[new] = self._data.pop(old)
        return self



[docs]
    def iterrows(self):
        """
        Iterate over rows as (index, Dict) pairs.

        Yields:
            Tuple[int, Dict]: the index and a dictionary representing a row

        Example:
            >>> from kwarray.dataframe_light import *  # NOQA
            >>> self = DataFrameLight._demodata(num=3)
            >>> print(ub.urepr(list(self.iterrows()), sort=1))
            [
                (0, {'bar': 0, 'baz': 2.73, 'foo': 0}),
                (1, {'bar': 1, 'baz': 2.73, 'foo': 0}),
                (2, {'bar': 2, 'baz': 2.73, 'foo': 0}),
            ]

        Benchmark:
            >>> # xdoc: +REQUIRES(--bench)
            >>> from kwarray.dataframe_light import *  # NOQA
            >>> import ubelt as ub
            >>> df_light = DataFrameLight._demodata(num=1000)
            >>> df_heavy = df_light.pandas()
            >>> ti = ub.Timerit(21, bestof=3, verbose=2, unit='ms')
            >>> ti.reset('light').call(lambda: list(df_light.iterrows()))
            >>> ti.reset('heavy').call(lambda: list(df_heavy.iterrows()))
            >>> # xdoctest: +IGNORE_WANT
            Timed light for: 21 loops, best of 3
                time per loop: best=0.834 ms, mean=0.850 ± 0.0 ms
            Timed heavy for: 21 loops, best of 3
                time per loop: best=45.007 ms, mean=45.633 ± 0.5 ms
        """
        for idx in range(len(self)):
            row = self._getrow(idx)
            yield idx, row





[docs]
class DataFrameArray(DataFrameLight):
    """
    DataFrameLight assumes the backend is a Dict[list]
    DataFrameArray assumes the backend is a Dict[ndarray]

    Take and compress are much faster, but extend and union are slower
    """

    def __normalize__(self):
        """
        Try to convert input data to Dict[ndarray]
        """
        if self._raw is None:
            self._data = {}
        elif isinstance(self._raw, dict):
            self._data = self._raw
            if __debug__:
                lens = []
                for d in self._data.values():
                    if not isinstance(d, (list, np.ndarray)):
                        raise TypeError(type(d))
                    lens.append(len(d))
                assert ub.allsame(lens), (
                    'lens are not all same {} for columns {}'.format(
                        lens,
                        list(self._data.keys()))
                )
        elif isinstance(self._raw, DataFrameLight):
            self._data = copy.copy(self._raw._data)
        elif pd is not None and isinstance(self._raw, pd.DataFrame):
            self._data = {k: v.values for k, v in self._raw.to_dict(orient='series').items()}
        else:
            raise TypeError('Unknown _raw type')
        # self._data = ub.map_vals(np.asarray, self._data)  # does this break anything?


[docs]
    def extend(self, other):
        for key in self._data.keys():
            vals1 = self._data[key]
            vals2 = other._data[key]
            self._data[key] = np.hstack([vals1, vals2])



[docs]
    def compress(self, flags, inplace=False):
        subset = self if inplace else self.__class__()
        for key in self._data.keys():
            subset._data[key] = self._data[key][flags]
        return subset



[docs]
    def take(self, indices, inplace=False):
        subset = self if inplace else self.__class__()
        for key in self._data.keys():
            subset._data[key] = self._data[key][indices]
        return subset



    # def min(self, axis=None):
    #     return self._extreme(func=np.minimum, axis=axis)

    # def max(self, axis=None):
    #     """
    #     Example:
    #         >>> self = DataFrameArray._demodata(num=7)
    #         >>> func = np.maximum
    #     """
    #     return self._extreme(func=np.maximum, axis=axis)

    # def _extreme(self, func, axis=None):
    #     if axis is None:
    #         raise NotImplementedError
    #     if axis == 0:
    #         raise NotImplementedError
    #     elif axis == 1:
    #         newdata = nh.util.iter_reduce_ufunc(func, (self[key] for key in self.keys()))
    #         newobj = self.__class__(newdata, self._keys)
    #     else:
    #         raise NotImplementedError