meerkat.columns package#

Submodules#

meerkat.columns.abstract module#

class AbstractColumn(data: Sequence | None = None, collate_fn: Callable | None = None, formatter: Callable | None = None, *args, **kwargs)[source]#

Bases: BlockableMixin, CloneableMixin, CollateMixin, ColumnIOMixin, FunctionInspectorMixin, LambdaMixin, MappableMixin, MaterializationMixin, ProvenanceMixin, ABC

An abstract class for Meerkat columns.

append(column: AbstractColumn) None[source]#
batch(batch_size: int = 1, drop_last_batch: bool = False, collate: bool = True, num_workers: int = 0, materialize: bool = True, *args, **kwargs)[source]#

Batch the column.

Parameters:
  • batch_size – integer batch size

  • drop_last_batch – drop the last batch if it's smaller than batch_size

  • collate – whether to collate the returned batches

Returns:

batches of data

static concat(columns: Sequence[AbstractColumn]) None[source]#
filter(function: Callable, with_indices=False, input_columns: str | List[str] | None = None, is_batched_fn: bool = False, batch_size: int | None = 1, drop_last_batch: bool = False, num_workers: int | None = 0, materialize: bool = True, pbar: bool = False, **kwargs) AbstractColumn | None[source]#

Filter the elements of the column using a function.

classmethod from_data(data: Columnable | AbstractColumn)[source]#

Convert data to a meerkat column using the appropriate Column type.

full_length()[source]#
classmethod get_writer(mmap: bool = False, template: AbstractColumn | None = None)[source]#
head(n: int = 5) AbstractColumn[source]#

Get the first n examples of the column.

is_equal(other: AbstractColumn) bool[source]#

Tests whether two columns are equal.

Parameters:

other (AbstractColumn) – the column to compare against

streamlit()[source]#
tail(n: int = 5) AbstractColumn[source]#

Get the last n examples of the column.

to_pandas() Series[source]#
Columnable#

alias of Union[Sequence, ndarray, Series, Tensor]

property data#

Get the underlying data.

property formatter: Callable#
property is_mmap#
logdir: Path = PosixPath('/home/docs/meerkat')#
property metadata#

meerkat.columns.arrow_column module#

class ArrowArrayColumn(data: Sequence, *args, **kwargs)[source]#

Bases: AbstractColumn

block_class#

alias of ArrowBlock

classmethod concat(columns: Sequence[ArrowArrayColumn])[source]#
is_equal(other: AbstractColumn) bool[source]#

Tests whether two columns are equal.

Parameters:

other (AbstractColumn) – the column to compare against

to_numpy()[source]#
to_pandas()[source]#
to_tensor()[source]#

meerkat.columns.cell_column module#

class CellColumn(cells: Sequence[AbstractCell] | None = None, *args, **kwargs)[source]#

Bases: AbstractColumn

static concat(columns: Sequence[CellColumn])[source]#
classmethod from_cells(cells: Sequence[AbstractCell], *args, **kwargs)[source]#
is_equal(other: AbstractColumn) bool[source]#

Tests whether two columns are equal.

Parameters:

other (AbstractColumn) – the column to compare against

property cells#

meerkat.columns.image_column module#

class ImageColumn(data: Sequence[str] | None = None, transform: callable | None = None, loader: callable | None = None, base_dir: str | None = None, *args, **kwargs)[source]#

Bases: FileColumn

A column where each cell represents an image stored on disk. The underlying data is a PandasSeriesColumn of strings, where each string is the path to an image. The column materializes the images into memory when indexed. If the column is lazily indexed with the lz indexer, the images are not materialized and an ImageCell or an ImageColumn is returned instead.

Parameters:
  • data (Sequence[str]) – A list of filepaths to images.

  • transform (callable) –

    A function that transforms the image (e.g. torchvision.transforms.functional.center_crop).

    Warning

    In order for the column to be serializable, the transform function must be pickleable.

  • loader (callable) –

    A callable with signature def loader(filepath: str) -> PIL.Image:. Defaults to torchvision.datasets.folder.default_loader.

    Warning

    In order for the column to be serializable with write(), the loader function must be pickleable.

  • base_dir (str) – A base directory that the paths in data are relative to. If None, the paths are assumed to be absolute.

classmethod default_loader(*args, **kwargs)[source]#

meerkat.columns.lambda_column module#

class LambdaCell(fn: callable | None = None, data: any | None = None)[source]#

Bases: AbstractCell

get(*args, **kwargs)[source]#

Get the value that this cell exists to provide.

property data: object#

Get the data associated with this cell.

class LambdaColumn(data: DataPanel | AbstractColumn, fn: callable | None = None, output_type: type | None = None, *args, **kwargs)[source]#

Bases: AbstractColumn

static concat(columns: Sequence[LambdaColumn])[source]#
fn(data: object)[source]#

Subclasses like ImageColumn should be able to implement their own version.

is_equal(other: AbstractColumn) bool[source]#

Tests whether two columns are equal.

Parameters:

other (AbstractColumn) – the column to compare against

meerkat.columns.list_column module#

class ListColumn(data: Sequence | None = None, *args, **kwargs)[source]#

Bases: AbstractColumn

batch(batch_size: int = 1, drop_last_batch: bool = False, collate: bool = True, *args, **kwargs)[source]#

Batch the column.

Parameters:
  • batch_size – integer batch size

  • drop_last_batch – drop the last batch if it's smaller than batch_size

  • collate – whether to collate the returned batches

Returns:

batches of data

classmethod concat(columns: Sequence[ListColumn])[source]#
default_formatter()#
classmethod from_list(data: Sequence)[source]#
is_equal(other: AbstractColumn) bool[source]#

Tests whether two columns are equal.

Parameters:

other (AbstractColumn) – the column to compare against

meerkat.columns.numpy_column module#

class NumpyArrayColumn(data: Sequence, *args, **kwargs)[source]#

Bases: AbstractColumn, NDArrayOperatorsMixin

block_class#

alias of NumpyBlock

classmethod concat(columns: Sequence[NumpyArrayColumn])[source]#
classmethod from_array(data: ndarray, *args, **kwargs)[source]#
classmethod from_npy(path, mmap_mode=None, allow_pickle=False, fix_imports=True, encoding='ASCII')[source]#
classmethod get_writer(mmap: bool = False, template: AbstractColumn | None = None)[source]#
is_equal(other: AbstractColumn) bool[source]#

Tests whether two columns are equal.

Parameters:

other (AbstractColumn) – the column to compare against

to_pandas() Series[source]#
to_tensor() Tensor[source]#

Use column.to_tensor() instead of torch.tensor(column), which is very slow.

property is_mmap#
getattr_decorator(fn: Callable)[source]#

meerkat.columns.pandas_column module#

class PandasSeriesColumn(data: Sequence | None = None, collate_fn: Callable | None = None, formatter: Callable | None = None, *args, **kwargs)[source]#

Bases: AbstractColumn, NDArrayOperatorsMixin

block_class#

alias of PandasBlock

cat#

alias of _MeerkatCategoricalAccessor

dt#

alias of _MeerkatCombinedDatetimelikeProperties

str#

alias of _MeerkatStringMethods

classmethod concat(columns: Sequence[PandasSeriesColumn])[source]#
classmethod from_array(data: ndarray, *args, **kwargs)[source]#
is_equal(other: AbstractColumn) bool[source]#

Tests whether two columns are equal.

Parameters:

other (AbstractColumn) – the column to compare against

to_pandas() Series[source]#
to_tensor() Tensor[source]#

Use column.to_tensor() instead of torch.tensor(column), which is very slow.

getattr_decorator(fn: Callable)[source]#

meerkat.columns.spacy_column module#

class SpacyColumn(data: Sequence[spacy_tokens.Doc] = None, *args, **kwargs)[source]#

Bases: ListColumn

classmethod from_docs(data: Sequence[spacy_tokens.Doc], *args, **kwargs)[source]#
classmethod from_texts(texts: Sequence[str], lang: str = 'en_core_web_sm', *args, **kwargs)[source]#
classmethod read(path: str, nlp: spacy.language.Language = None, lang: str = None, *args, **kwargs) SpacyColumn[source]#
write(path: str, **kwargs) None[source]#
property docs#
property tokens#

meerkat.columns.tensor_column module#

class TensorColumn(data: Sequence | None = None, *args, **kwargs)[source]#

Bases: NDArrayOperatorsMixin, AbstractColumn

block_class#

alias of TensorBlock

classmethod concat(columns: Sequence[TensorColumn])[source]#
classmethod from_data(data: Sequence | ndarray | Series | Tensor | AbstractColumn)[source]#

Convert data to a TensorColumn.

classmethod get_writer(mmap: bool = False, template: AbstractColumn | None = None)[source]#
is_equal(other: AbstractColumn) bool[source]#

Tests whether two columns are equal.

Parameters:

other (AbstractColumn) – the column to compare against

to_pandas() Series[source]#
to_tensor() Tensor[source]#
getattr_decorator(fn: Callable)[source]#

meerkat.columns.video_column module#

class VideoColumn(*args, **kwargs)[source]#

Bases: CellColumn

Interface for creating a CellColumn from VideoCell objects.

classmethod from_filepaths(filepaths: Sequence[str] | None = None, time_dim: int | None = 1, transform: Callable | None = None, *args, **kwargs)[source]#

meerkat.columns.volume_column module#

class MedicalVolumeColumn(*args, **kwargs)[source]#

Bases: CellColumn

classmethod from_filepaths(filepaths: Sequence[str] | None = None, loader: callable | None = None, transform: callable | None = None, *args, **kwargs)[source]#