Source code for meerkat.ops.groupby

from __future__ import annotations

from typing import Callable, Dict, List, Sequence, Tuple, Union

import numpy as np

from meerkat.datapanel import DataPanel


[docs]class GroupBy: def __init__( self, data: DataPanel, indices: Dict[Union[str, Tuple[str]], np.ndarray], by: Union[List[str], str], ): self.indices = [indices] if isinstance(indices, str) else indices self.data = data self.by = by
[docs] def mean(self, *args, **kwargs): return self._reduce(lambda x: x.mean(*args, **kwargs))
def _reduce(self, f: Callable): """self.indices are a dictionary of {labels : [indices]}""" group_keys = list(self.indices.keys()) # sorting them so that they appear in a nice order. group_keys.sort() # Means will be a list of dictionaries where each element in the dict groups = [] for label in group_keys: indices_l = self.indices[label] relevant_rows_where_by_is_label = self.data.lz[indices_l] m = f(relevant_rows_where_by_is_label) groups.append(m) from meerkat.datapanel import DataPanel # Create DataPanel as a list of rows. out = DataPanel(groups) # Add the by columns. if len(group_keys) > 0: if len(self.by) > 1: columns = list(zip(*group_keys)) for i, col in enumerate(self.by): out[col] = columns[i] else: col = self.by[0] out[col] = group_keys return out def __getitem__(self, key: Union[str, Sequence[str]]) -> GroupBy: if isinstance(key, str): key = [key] return GroupBy(data=self.data[key], indices=self.indices, by=self.by)
[docs]def groupby( data: DataPanel, by: Union[str, Sequence[str]] = None, ) -> GroupBy: """Perform a groupby operation on a DataPanel or Column (similar to a `DataFrame.groupby` and `Series.groupby` operations in Pandas). TODO (Sam): I put down a very rough scaffolding of how you could setup the class hierarchy for this. It is inspired by the way pandas has things setup: check out https://github.com/pandas-dev/pandas/tree/a8968bfa696d51f73769c54f2630a9530488236a/pandas/core/groupby for some inspiration. I'd recommend starting with small simple datapanels. e.g. a datapanel of all numpy array columns. For example, ``` dp = DataPanel({ 'a': NumpyArrayColumn([1, 2, 2, 1, 3, 2, 3]), 'b': NumpyArrayColumn([1, 2, 3, 4, 5, 6, 7]), 'c': NumpyArrayColumn([1.0, 3.2, 2.1, 4.3, 5.4, 6.5, 7.6]) }) groupby(dp, by="a")["c"].mean() ``` Eventually we'll want to support a bunch of different aggregations, but for the time being let's just focus on mean, sum, and count. Note: we'll also want to implement methods `DataPanel.groupby` or `AbstractColumn.groupby` eventually, but we also want a functional version that could be called like `mk.groupby(dp, by="class")`. I'd suggest putting most of the implementation here, and then making the methods just wrappers. See merge as an example. Args: data (Union[DataPanel, AbstractColumn]): The data to group. by (Union[str, Sequence[str]]): The column(s) to group by. Ignored if ``data`` is a Column. Returns: Union[DataPanelGroupBy, AbstractColumnGroupBy]: A GroupBy object. """ # must pass two arguments (columns - by, by), # by -> is a dictionary, a map, all distinct group_ids to indicies. # pass DataPanelGroupBy() try: if isinstance(by, str): by = [by] return GroupBy( data=data, indices=data[by].to_pandas().groupby(by).indices, by=by ) except Exception as e: # future work needed here. print("dataPanel group by error", e) raise NotImplementedError()