# Source code for meerkat.datasets.utils

import os


def download_url(url: str, dataset_dir: str, force: bool = False):
    """Fetch ``url`` through the ``datasets`` download cache for a dataset.

    The cache lives in the ``downloads`` sub-directory of ``dataset_dir``;
    ``dataset_dir`` itself is created first if it does not exist yet.

    Args:
        url: Location of the resource to download.
        dataset_dir: Root directory of the dataset on disk.
        force: Forwarded to ``get_from_cache`` as ``force_download``.

    Returns:
        Whatever ``datasets.utils.file_utils.get_from_cache`` returns
        (presumably the local path of the cached file -- confirm against the
        installed ``datasets`` version).
    """
    os.makedirs(dataset_dir, exist_ok=True)

    # Imported lazily so that `datasets` is only needed when a download is
    # actually requested.
    from datasets.utils.file_utils import get_from_cache

    cache_dir = os.path.join(dataset_dir, "downloads")
    return get_from_cache(url, cache_dir=cache_dir, force_download=force)


def extract(path: str, dst: str, extractor: str = None):
    """Extract the archive at ``path`` into ``dst``.

    Args:
        path: Path of the archive to extract. A ``<path>.lock`` file lock is
            held for the duration of the extraction.
        dst: Destination passed to the extractor's ``extract`` method.
        extractor: Optional extractor to use instead of auto-detection. May be
            an object exposing ``extract(path, dst)``, or a string key of
            ``Extractor.extractors`` when that registry is a dict (presumably
            a format name -- confirm against the installed ``datasets``
            version).

    Returns:
        Whatever the selected extractor's ``extract`` method returns.

    Raises:
        ValueError: If ``extractor`` is a string that cannot be resolved, or
            if no registered extractor reports the file as extractable.
    """
    from datasets.utils.extract import Extractor
    from datasets.utils.filelock import FileLock

    registry = Extractor.extractors

    # A string extractor has no `.extract` method; resolve it by name so the
    # advertised `str` argument works instead of failing with AttributeError.
    if isinstance(extractor, str) and extractor:
        if isinstance(registry, list) or extractor not in registry:
            raise ValueError("Unknown extractor {!r}".format(extractor))
        extractor = registry[extractor]

    # Prevent parallel extractions
    lock_path = path + ".lock"
    with FileLock(lock_path):
        if extractor:
            return extractor.extract(path, dst)

        # Support for older versions of datasets that have list of extractors
        # instead of dict of extractors
        candidates = registry if isinstance(registry, list) else registry.values()

        # Use a dedicated loop variable rather than rebinding the parameter.
        for candidate in candidates:
            if candidate.is_extractable(path):
                return candidate.extract(path, dst)
        raise ValueError("Extraction method not found for {}".format(path))


def download_google_drive(
    url: str = None, id: str = None, dst: str = None, is_folder: bool = False
):
    """Download a file or folder from Google Drive via ``gdown``.

    Args:
        url: URL of the Drive resource. Mutually exclusive with ``id``.
        id: Identifier of the Drive resource. Mutually exclusive with ``url``.
        dst: Output location; created as a directory (including parents) if
            missing and then passed to ``gdown`` as ``output``.
        is_folder: If True, call ``gdown.download_folder``; otherwise call
            ``gdown.download``.

    Raises:
        ValueError: If not exactly one of ``url`` / ``id`` is given, or if
            ``dst`` is None.
        ImportError: If ``gdown`` is not installed.
    """
    # Validate arguments before touching the filesystem: calling os.makedirs
    # first would raise TypeError for dst=None and would leave a stray
    # directory behind when the url/id check fails.
    if (url is None) == (id is None):
        raise ValueError("Exactly one of url or id must be provided.")

    if dst is None:
        raise ValueError("dst must be provided.")

    try:
        import gdown
    except ImportError as err:
        raise ImportError(
            "Google Drive download requires gdown. Install with `pip install gdown`."
        ) from err

    os.makedirs(dst, exist_ok=True)

    if is_folder:
        gdown.download_folder(url=url, id=id, output=dst)
    else:
        gdown.download(url=url, id=id, output=dst)