Source code for meerkat.columns.spacy_column

from __future__ import annotations

import abc
import logging
import os
from typing import Sequence, Text

import yaml
from yaml.representer import Representer

from meerkat.columns.list_column import ListColumn
from meerkat.tools.lazy_loader import LazyLoader

# Lazily import spaCy so that merely importing this module does not require
# spaCy to be installed; the real import happens on first attribute access.
spacy = LazyLoader("spacy")
spacy_attrs = LazyLoader("spacy.attrs")
spacy_tokens = LazyLoader("spacy.tokens")

# Teach PyYAML how to serialize ABC metaclasses by name: the column's `write`
# method dumps `type(self)` (a class object) into the metadata YAML.
Representer.add_representer(abc.ABCMeta, Representer.represent_name)

logger = logging.getLogger(__name__)


class SpacyColumn(ListColumn):
    """A list-backed column holding spaCy ``Doc`` objects.

    Stores one processed ``spacy.tokens.Doc`` per row and persists the
    collection to disk via spaCy's ``DocBin`` plus a YAML metadata file.
    """

    def __init__(
        self,
        data: Sequence[spacy_tokens.Doc] = None,
        *args,
        **kwargs,
    ):
        """Create a column from a sequence of spaCy ``Doc`` objects.

        Args:
            data: sequence of processed ``Doc`` objects (one per row).
        """
        super().__init__(*args, data=data, **kwargs)

    @classmethod
    def from_docs(cls, data: Sequence[spacy_tokens.Doc], *args, **kwargs):
        """Alternate constructor: build a column from existing ``Doc`` objects."""
        return cls(*args, data=data, **kwargs)

    @classmethod
    def from_texts(
        cls,
        texts: Sequence[Text],
        lang: str = "en_core_web_sm",
        *args,
        **kwargs,
    ):
        """Build a column by running a spaCy pipeline over raw texts.

        Args:
            texts: raw strings to process.
            lang: name of the spaCy pipeline/model to load (e.g.
                ``"en_core_web_sm"``).
        """
        # Create the pipeline once and apply it to every text.
        nlp = spacy.load(lang)
        return cls(*args, data=[nlp(text) for text in texts], **kwargs)

    @property
    def docs(self):
        """The underlying list of spaCy ``Doc`` objects."""
        return self.data

    @property
    def tokens(self):
        """Per-row token lists: iterating a ``Doc`` yields its ``Token``s."""
        return [list(doc) for doc in self]

    def __getattr__(self, item):
        """Broadcast attribute access over the docs.

        ``column.ents`` returns ``[doc.ents for doc in column]``, etc.
        Raises ``AttributeError`` if the docs do not have the attribute.
        """
        try:
            return [getattr(doc, item) for doc in self]
        except AttributeError:
            raise AttributeError(f"Attribute {item} not found.")

    @classmethod
    def read(
        cls,
        path: str,
        nlp: spacy.language.Language = None,
        lang: str = None,
        *args,
        **kwargs,
    ) -> SpacyColumn:
        """Load a column previously saved with :meth:`write`.

        Exactly one of ``nlp`` (a loaded pipeline) or ``lang`` (a model name
        to load) must be provided: the pipeline's vocab is needed to
        reconstruct the docs from the ``DocBin``.

        Args:
            path: directory containing ``meta.yaml`` and ``data.spacy``.
            nlp: an already-loaded spaCy pipeline.
            lang: name of a spaCy pipeline/model to load instead.
        """
        # NOTE(review): assert is stripped under `python -O`; kept to preserve
        # the AssertionError contract, but a ValueError would be safer.
        assert (nlp is None) != (lang is None)
        if nlp is None:
            nlp = spacy.load(lang)

        # Load the metadata; close the file promptly instead of leaking it.
        with open(os.path.join(path, "meta.yaml")) as f:
            metadata = dict(yaml.load(f, Loader=yaml.FullLoader))
        assert metadata["dtype"] == cls

        # Load the `DocBin` from disk and materialize the docs with the
        # pipeline's vocab.
        docbin = spacy_tokens.DocBin().from_disk(os.path.join(path, "data.spacy"))
        return cls(list(docbin.get_docs(nlp.vocab)))

    def write(self, path: str, **kwargs) -> None:
        """Save the column to ``path`` as ``meta.yaml`` + ``data.spacy``.

        Args:
            path: directory to write into (created if missing).
        """
        # Construct the metadata
        state = self._get_state()
        metadata = {
            "dtype": type(self),
            "len": len(self),
            "state": state,
            **self.metadata,
        }

        # Make directory
        os.makedirs(path, exist_ok=True)

        # Get the paths where metadata and data should be stored
        metadata_path = os.path.join(path, "meta.yaml")
        data_path = os.path.join(path, "data.spacy")

        # Create a `DocBin` to store the docs. "HEAD" is excluded because
        # storing dependency heads alongside other attrs raises in spaCy.
        attrs = [name for name in spacy_attrs.NAMES if name != "HEAD"]
        docbin = spacy_tokens.DocBin(attrs=attrs, store_user_data=True, docs=self.docs)

        # Save all the docs
        docbin.to_disk(data_path)

        # Save the metadata as a yaml; use a context manager so the handle
        # is flushed and closed instead of leaked.
        with open(metadata_path, "w") as f:
            yaml.dump(metadata, f)