Source code for kwarray.dataframe_light

"""
A faster-than-pandas pandas-like interface to column-major data, in the case
where the data only needs to be accessed by index.

For data where more complex ids are needed you must use pandas.
"""
import ubelt as ub
import numpy as np
import copy

try:
    import pandas as pd
except Exception:
    pd = None


__version__ = '0.0.1'


[docs] class LocLight(object): def __init__(self, parent): self.parent = parent def __getitem__(self, index): return self.parent._getrow(index)
[docs] class DataFrameLight(ub.NiceRepr): r""" Implements a subset of the pandas.DataFrame API The API is restricted to facilitate speed tradeoffs Note: Assumes underlying data is Dict[list|ndarray]. If the data is known to be a Dict[ndarray] use DataFrameArray instead, which has faster implementations for some operations. Note: pandas.DataFrame is slow. DataFrameLight is faster. It is a tad more restrictive though. Example: >>> self = DataFrameLight({}) >>> print('self = {!r}'.format(self)) >>> self = DataFrameLight({'a': [0, 1, 2], 'b': [2, 3, 4]}) >>> print('self = {!r}'.format(self)) >>> item = self.iloc[0] >>> print('item = {!r}'.format(item)) Benchmark: >>> # BENCHMARK >>> # xdoc: +REQUIRES(--bench) >>> from kwarray.dataframe_light import * # NOQA >>> import ubelt as ub >>> NUM = 1000 >>> print('NUM = {!r}'.format(NUM)) >>> # to_dict conversions >>> print('==============') >>> print('====== to_dict conversions =====') >>> _keys = ['list', 'dict', 'series', 'split', 'records', 'index'] >>> results = [] >>> df = DataFrameLight._demodata(num=NUM).pandas() >>> ti = ub.Timerit(verbose=False, unit='ms') >>> for key in _keys: >>> result = ti.reset(key).call(lambda: df.to_dict(orient=key)) >>> results.append((result.mean(), result.report())) >>> key = 'series+numpy' >>> result = ti.reset(key).call(lambda: {k: v.values for k, v in df.to_dict(orient='series').items()}) >>> results.append((result.mean(), result.report())) >>> print('\n'.join([t[1] for t in sorted(results)])) >>> print('==============') >>> print('====== DFLight Conversions =======') >>> ti = ub.Timerit(verbose=True, unit='ms') >>> key = 'self.pandas' >>> self = DataFrameLight(df) >>> ti.reset(key).call(lambda: self.pandas()) >>> key = 'light-from-pandas' >>> ti.reset(key).call(lambda: DataFrameLight(df)) >>> key = 'light-from-dict' >>> ti.reset(key).call(lambda: DataFrameLight(self._data)) >>> print('==============') >>> print('====== BENCHMARK: .LOC[] =======') >>> ti = ub.Timerit(num=20, bestof=4, verbose=True, unit='ms') >>> df_light = DataFrameLight._demodata(num=NUM) >>> # xdoctest: +REQUIRES(module:pandas) >>> df_heavy = df_light.pandas() >>> series_data = df_heavy.to_dict(orient='series') >>> list_data = df_heavy.to_dict(orient='list') >>> np_data = {k: v.values for k, v in df_heavy.to_dict(orient='series').items()} >>> for timer in ti.reset('DF-heavy.iloc'): >>> with timer: >>> for i in range(NUM): >>> df_heavy.iloc[i] >>> for timer in ti.reset('DF-heavy.loc'): >>> with timer: >>> for i in range(NUM): >>> df_heavy.iloc[i] >>> for timer in ti.reset('dict[SERIES].loc'): >>> with timer: >>> for i in range(NUM): >>> {key: series_data[key].loc[i] for key in series_data.keys()} >>> for timer in ti.reset('dict[SERIES].iloc'): >>> with timer: >>> for i in range(NUM): >>> {key: series_data[key].iloc[i] for key in series_data.keys()} >>> for timer in ti.reset('dict[SERIES][]'): >>> with timer: >>> for i in range(NUM): >>> {key: series_data[key][i] for key in series_data.keys()} >>> for timer in ti.reset('dict[NDARRAY][]'): >>> with timer: >>> for i in range(NUM): >>> {key: np_data[key][i] for key in np_data.keys()} >>> for timer in ti.reset('dict[list][]'): >>> with timer: >>> for i in range(NUM): >>> {key: list_data[key][i] for key in np_data.keys()} >>> for timer in ti.reset('DF-Light.iloc/loc'): >>> with timer: >>> for i in range(NUM): >>> df_light.iloc[i] >>> for timer in ti.reset('DF-Light._getrow'): >>> with timer: >>> for i in range(NUM): >>> df_light._getrow(i) NUM = 1000 ============== ====== to_dict conversions ===== Timed best=0.022 ms, mean=0.022 ± 0.0 ms for series Timed best=0.059 ms, mean=0.059 ± 0.0 ms for series+numpy Timed best=0.315 ms, mean=0.315 ± 0.0 ms for list Timed best=0.895 ms, mean=0.895 ± 0.0 ms for dict Timed best=2.705 ms, mean=2.705 ± 0.0 ms for split Timed best=5.474 ms, mean=5.474 ± 0.0 ms for records Timed best=7.320 ms, mean=7.320 ± 0.0 ms for index ============== ====== DFLight Conversions ======= Timed best=1.798 ms, mean=1.798 ± 0.0 ms for self.pandas Timed best=0.064 ms, mean=0.064 ± 0.0 ms for light-from-pandas Timed best=0.010 ms, mean=0.010 ± 0.0 ms for light-from-dict ============== ====== BENCHMARK: .LOC[] ======= Timed best=101.365 ms, mean=101.564 ± 0.2 ms for DF-heavy.iloc Timed best=102.038 ms, mean=102.273 ± 0.2 ms for DF-heavy.loc Timed best=29.357 ms, mean=29.449 ± 0.1 ms for dict[SERIES].loc Timed best=21.701 ms, mean=22.014 ± 0.3 ms for dict[SERIES].iloc Timed best=11.469 ms, mean=11.566 ± 0.1 ms for dict[SERIES][] Timed best=0.807 ms, mean=0.826 ± 0.0 ms for dict[NDARRAY][] Timed best=0.478 ms, mean=0.492 ± 0.0 ms for dict[list][] Timed best=0.969 ms, mean=0.994 ± 0.0 ms for DF-Light.iloc/loc Timed best=0.760 ms, mean=0.776 ± 0.0 ms for DF-Light._getrow """ def __init__(self, data=None, columns=None): if columns is not None: if data is None: data = ub.odict(zip(columns, [[] for _ in range(len(columns))])) else: data = ub.odict(zip(columns, data.T)) self._raw = data self._data = None self._localizer = LocLight(self) self.__normalize__() @property def iloc(self): return self._localizer @property def values(self): data = self._getcols(self.columns) return data @property def loc(self): return self._localizer def __eq__(self, other): """ Example: >>> # xdoctest: +REQUIRES(module:pandas) >>> self = DataFrameLight._demodata(num=7) >>> other = self.pandas() >>> assert np.all(self == other) """ self_vals = self.values if isinstance(other, DataFrameLight): other_vals = other.values if pd is not None and isinstance(other, pd.DataFrame): other_vals = other.reindex(columns=self.columns).values else: other_vals = other return self_vals == other_vals
[docs] def to_string(self, *args, **kwargs): """ Returns: str: """ return self.pandas().to_string(*args, **kwargs)
[docs] def to_dict(self, orient='dict', into=dict): """ Convert the data frame into a dictionary. Args: orient (str): Currently naitively suports orient in {'dict', 'list'}, otherwise we fallback to pandas conversion and call its to_dict method. into (type): type of dictionary to transform into Returns: dict Example: >>> from kwarray.dataframe_light import * # NOQA >>> self = DataFrameLight._demodata(num=7) >>> print(self.to_dict(orient='dict')) >>> print(self.to_dict(orient='list')) """ if orient == 'dict': out = into(self.iterrows()) elif orient == 'list': import kwarray out = into((k, kwarray.ArrayAPI.tolist(v)) for k, v in self._data.items()) else: out = self.pandas().to_dict(orient=orient, into=into) return out
[docs] def pandas(self): """ Convert back to pandas if you need the full API Returns: pd.DataFrame: Example: >>> # xdoctest: +REQUIRES(module:pandas) >>> df_light = DataFrameLight._demodata(num=7) >>> df_heavy = df_light.pandas() >>> got = DataFrameLight(df_heavy) >>> assert got._data == df_light._data """ if pd is None: raise Exception('Pandas is not available') return pd.DataFrame(self._data)
[docs] def _pandas(self): """ Deprecated, use self.pandas instead """ return self.pandas()
[docs] @classmethod def _demodata(cls, num=7): """ Example: >>> self = DataFrameLight._demodata(num=7) >>> print('self = {!r}'.format(self)) >>> other = DataFrameLight._demodata(num=11) >>> print('other = {!r}'.format(other)) >>> both = self.union(other) >>> print('both = {!r}'.format(both)) >>> assert both is not self >>> assert other is not self """ demodata = { 'foo': [0] * num, 'bar': [i % 3 for i in range(num)], 'baz': [2.73] * num, } self = cls(demodata) return self
def __nice__(self): return 'keys: {}, len={}'.format(list(self.keys()), len(self)) def __len__(self): if self._data: key = next(iter(self.keys())) return len(self._data[key]) else: return 0 def __contains__(self, item): return item in self.keys() def __normalize__(self): """ Try to convert input data to Dict[List] """ if self._raw is None: self._data = {} elif isinstance(self._raw, dict): self._data = self._raw if __debug__: lens = [] for d in self._data.values(): if not isinstance(d, (list, np.ndarray)): raise TypeError(type(d)) lens.append(len(d)) assert ub.allsame(lens) elif isinstance(self._raw, DataFrameLight): self._data = copy.copy(self._raw._data) elif pd is not None and isinstance(self._raw, pd.DataFrame): self._data = self._raw.to_dict(orient='list') else: raise TypeError('Unknown _raw type') @property def columns(self): return list(self.keys())
[docs] def sort_values(self, key, inplace=False, ascending=True): sortx = np.argsort(self._getcol(key)) if not ascending: sortx = sortx[::-1] return self.take(sortx, inplace=inplace)
[docs] def keys(self): if self._data: for key in self._data.keys(): yield key
[docs] def _getrow(self, index): return {key: self._data[key][index] for key in self._data.keys()}
[docs] def _getcol(self, key): return self._data[key]
[docs] def _getcols(self, keys): num = len(self) col_data = [self._getcol(key) for key in keys] data = np.hstack([np.asarray(d).reshape(num, -1) for d in col_data]) return data
[docs] def get(self, key, default=None): """ Get item for given key. Returns default value if not found. """ return self[key] if key in self else default
[docs] def clear(self): """ Removes all rows inplace """ if self._data: for key in self._data.keys(): self._data[key].clear()
def __getitem__(self, key): """ Note: only handles the case where key is a single column name. Example: >>> df_light = DataFrameLight._demodata(num=7) >>> sub1 = df_light['bar'] >>> # xdoctest: +REQUIRES(module:pandas) >>> df_heavy = df_light.pandas() >>> sub2 = df_heavy['bar'] >>> assert np.all(sub1 == sub2) """ return self._getcol(key) def __setitem__(self, key, value): """ Note: only handles the case where key is a single column name. and value is an array of all the values to set. Example: >>> df_light = DataFrameLight._demodata(num=7) >>> value = [2] * len(df_light) >>> df_light['bar'] = value >>> # xdoctest: +REQUIRES(module:pandas) >>> df_heavy = df_light.pandas() >>> df_heavy['bar'] = value >>> assert np.all(df_light == df_heavy) """ self._data[key] = value
[docs] def compress(self, flags, inplace=False): """ NOTE: NOT A PART OF THE PANDAS API """ subset = self if inplace else self.__class__() for key in self._data.keys(): subset._data[key] = list(ub.compress(self._data[key], flags)) return subset
[docs] def take(self, indices, inplace=False): """ Return the elements in the given *positional* indices along an axis. Args: inplace (bool): NOT PART OF PANDAS API Note: assumes axis=0 Example: >>> df_light = DataFrameLight._demodata(num=7) >>> indices = [0, 2, 3] >>> sub1 = df_light.take(indices) >>> # xdoctest: +REQUIRES(module:pandas) >>> df_heavy = df_light.pandas() >>> sub2 = df_heavy.take(indices) >>> assert np.all(sub1 == sub2) """ subset = self if inplace else self.__class__() if isinstance(indices, slice): for key in self._data.keys(): subset._data[key] = self._data[key][indices] else: for key in self._data.keys(): subset._data[key] = list(ub.take(self._data[key], indices)) return subset
[docs] def copy(self): other = copy.copy(self) other._data = other._data.copy() other._localizer = LocLight(other) return other
[docs] def extend(self, other): """ Extend ``self`` inplace using another dataframe array Args: other (DataFrameLight | dict[str, Sequence]): values to concat to end of this object NOTE: Not part of the pandas API Example: >>> self = DataFrameLight(columns=['foo', 'bar']) >>> other = {'foo': [0], 'bar': [1]} >>> self.extend(other) >>> assert len(self) == 1 """ try: _other_data = other._data except AttributeError: _other_data = other _self_data = self._data for key, vals1 in _self_data.items(): vals2 = _other_data[key] try: vals1.extend(vals2) except AttributeError: if isinstance(vals1, np.ndarray): _self_data[key] = np.hstack([vals1, vals2]) else: raise
[docs] def union(self, *others): """ NOTE: Note part of the pandas API """ if isinstance(self, DataFrameLight): first = self rest = others else: if len(others) == 0: return DataFrameLight() first = others[0] rest = others[1:] both = first.copy() if not both.keys: for other in rest: if other.keys: both.keys = copy.copy(other.keys) break for other in rest: both.extend(other) return both
[docs] @classmethod def concat(cls, others): return cls.union(*others)
[docs] @classmethod def from_pandas(cls, df): _raw = {k: v.values for k, v in df.to_dict(orient='series').items()} return cls(_raw)
[docs] @classmethod def from_dict(cls, records): record_iter = iter(records) columns = {} try: r = next(record_iter) for key, value in r.items(): columns[key] = [value] except StopIteration: pass else: for r in record_iter: for key, value in r.items(): columns[key].append(value) self = cls(columns) return self
[docs] def reset_index(self, drop=False): """ noop for compatability, the light version doesnt store an index """ return self
[docs] def groupby(self, by=None, *args, **kwargs): """ Group rows by the value of a column. Unlike pandas this simply returns a zip object. To ensure compatiability call list on the result of groupby. Args: by (str): column name to group by *args: if specified, the dataframe is coerced to pandas *kwargs: if specified, the dataframe is coerced to pandas Example: >>> df_light = DataFrameLight._demodata(num=7) >>> res1 = list(df_light.groupby('bar')) >>> # xdoctest: +REQUIRES(module:pandas) >>> df_heavy = df_light.pandas() >>> res2 = list(df_heavy.groupby('bar')) >>> assert len(res1) == len(res2) >>> assert all([np.all(a[1] == b[1]) for a, b in zip(res1, res2)]) Ignore: >>> self = DataFrameLight._demodata(num=1000) >>> args = ['cx'] >>> self['cx'] = (np.random.rand(len(self)) * 10).astype(np.int) >>> # As expected, our custom restricted implementation is faster >>> # than pandas >>> ub.Timerit(100).call(lambda: dict(list(self.pandas().groupby('cx')))).print() >>> ub.Timerit(100).call(lambda: dict(self.groupby('cx'))).print() """ if len(args) == 0 and len(kwargs) == 0: # In this special case we can be fast import kwarray unique, groupxs = kwarray.group_indices(self[by]) groups = [self.take(idxs) for idxs in groupxs] return zip(unique, groups) else: # otherwise we need to use the slow method return self.pandas().groupby(by=by)
[docs] def rename(self, mapper=None, columns=None, axis=None, inplace=False): """ Rename the columns (index renaming is not supported) Example: >>> df_light = DataFrameLight._demodata(num=7) >>> mapper = {'foo': 'fi'} >>> res1 = df_light.rename(columns=mapper) >>> res3 = df_light.rename(mapper, axis=1) >>> # xdoctest: +REQUIRES(module:pandas) >>> df_heavy = df_light.pandas() >>> res2 = df_heavy.rename(columns=mapper) >>> res4 = df_heavy.rename(mapper, axis=1) >>> assert np.all(res1 == res2) >>> assert np.all(res3 == res2) >>> assert np.all(res3 == res4) """ if columns is not None: if axis is not None: raise TypeError("Cannot specify both 'axis' and any of 'index' or 'columns'") else: if axis != 1: raise NotImplementedError('only axis=1 is supported') columns = mapper if not inplace: self = self.copy() for old, new in columns.items(): if old in self._data: self._data[new] = self._data.pop(old) return self
[docs] def iterrows(self): """ Iterate over rows as (index, Dict) pairs. Yields: Tuple[int, Dict]: the index and a dictionary representing a row Example: >>> from kwarray.dataframe_light import * # NOQA >>> self = DataFrameLight._demodata(num=3) >>> print(ub.urepr(list(self.iterrows()), sort=1)) [ (0, {'bar': 0, 'baz': 2.73, 'foo': 0}), (1, {'bar': 1, 'baz': 2.73, 'foo': 0}), (2, {'bar': 2, 'baz': 2.73, 'foo': 0}), ] Benchmark: >>> # xdoc: +REQUIRES(--bench) >>> from kwarray.dataframe_light import * # NOQA >>> import ubelt as ub >>> df_light = DataFrameLight._demodata(num=1000) >>> df_heavy = df_light.pandas() >>> ti = ub.Timerit(21, bestof=3, verbose=2, unit='ms') >>> ti.reset('light').call(lambda: list(df_light.iterrows())) >>> ti.reset('heavy').call(lambda: list(df_heavy.iterrows())) >>> # xdoctest: +IGNORE_WANT Timed light for: 21 loops, best of 3 time per loop: best=0.834 ms, mean=0.850 ± 0.0 ms Timed heavy for: 21 loops, best of 3 time per loop: best=45.007 ms, mean=45.633 ± 0.5 ms """ for idx in range(len(self)): row = self._getrow(idx) yield idx, row
[docs] class DataFrameArray(DataFrameLight): """ DataFrameLight assumes the backend is a Dict[list] DataFrameArray assumes the backend is a Dict[ndarray] Take and compress are much faster, but extend and union are slower """ def __normalize__(self): """ Try to convert input data to Dict[ndarray] """ if self._raw is None: self._data = {} elif isinstance(self._raw, dict): self._data = self._raw if __debug__: lens = [] for d in self._data.values(): if not isinstance(d, (list, np.ndarray)): raise TypeError(type(d)) lens.append(len(d)) assert ub.allsame(lens), ( 'lens are not all same {} for columns {}'.format( lens, list(self._data.keys())) ) elif isinstance(self._raw, DataFrameLight): self._data = copy.copy(self._raw._data) elif pd is not None and isinstance(self._raw, pd.DataFrame): self._data = {k: v.values for k, v in self._raw.to_dict(orient='series').items()} else: raise TypeError('Unknown _raw type') # self._data = ub.map_vals(np.asarray, self._data) # does this break anything?
[docs] def extend(self, other): for key in self._data.keys(): vals1 = self._data[key] vals2 = other._data[key] self._data[key] = np.hstack([vals1, vals2])
[docs] def compress(self, flags, inplace=False): subset = self if inplace else self.__class__() for key in self._data.keys(): subset._data[key] = self._data[key][flags] return subset
[docs] def take(self, indices, inplace=False): subset = self if inplace else self.__class__() for key in self._data.keys(): subset._data[key] = self._data[key][indices] return subset
# def min(self, axis=None): # return self._extreme(func=np.minimum, axis=axis) # def max(self, axis=None): # """ # Example: # >>> self = DataFrameArray._demodata(num=7) # >>> func = np.maximum # """ # return self._extreme(func=np.maximum, axis=axis) # def _extreme(self, func, axis=None): # if axis is None: # raise NotImplementedError # if axis == 0: # raise NotImplementedError # elif axis == 1: # newdata = nh.util.iter_reduce_ufunc(func, (self[key] for key in self.keys())) # newobj = self.__class__(newdata, self._keys) # else: # raise NotImplementedError