Source code for meerkat.datasets.utils
import os
def download_url(url: str, dataset_dir: str, force: bool = False):
    """Fetch ``url`` into the ``downloads`` cache folder of a dataset directory.

    Args:
        url: Location of the file to download.
        dataset_dir: Root directory of the dataset. It is created if it does
            not exist yet.
        force: Forwarded to ``get_from_cache`` as ``force_download``.

    Returns:
        Whatever ``get_from_cache`` returns for this URL.
    """
    os.makedirs(dataset_dir, exist_ok=True)

    # Local import: `datasets` is only imported when this function is called.
    from datasets.utils.file_utils import get_from_cache

    downloads_dir = os.path.join(dataset_dir, "downloads")
    return get_from_cache(url, cache_dir=downloads_dir, force_download=force)
def extract(path: str, dst: str, extractor: object = None):
    """Extract the archive at ``path`` into ``dst``.

    A file lock on ``path + ".lock"`` is held for the duration of the call so
    that parallel extractions of the same archive do not run at once.

    Args:
        path: Path of the archive to extract.
        dst: Destination passed to the extractor's ``extract`` method.
        extractor: Optional extractor to use instead of auto-detection. May be
            an object exposing ``extract(path, dst)``, or a string naming an
            entry of ``Extractor.extractors`` when that registry is a dict.
            If falsy, each registered extractor is probed with
            ``is_extractable(path)`` and the first match is used.

    Returns:
        The value returned by the selected extractor's ``extract`` call.

    Raises:
        ValueError: If ``extractor`` is a string that cannot be resolved in
            the registry, or if no registered extractor accepts ``path``.
    """
    from datasets.utils.extract import Extractor
    from datasets.utils.filelock import FileLock

    # Prevent parallel extractions
    with FileLock(path + ".lock"):
        registry = Extractor.extractors

        if extractor:
            if isinstance(extractor, str):
                # A name can only be resolved when the registry is a dict;
                # older versions of datasets expose a plain list instead.
                if isinstance(registry, list) or extractor not in registry:
                    raise ValueError(
                        "Unknown extractor {!r} for {}".format(extractor, path)
                    )
                extractor = registry[extractor]
            return extractor.extract(path, dst)

        # Support for older versions of datasets that have list of extractors
        # instead of dict of extractors
        candidates = registry if isinstance(registry, list) else registry.values()
        for candidate in candidates:
            if candidate.is_extractable(path):
                return candidate.extract(path, dst)
        raise ValueError("Extraction method not found for {}".format(path))
def download_google_drive(
    url: str = None, id: str = None, dst: str = None, is_folder: bool = False
):
    """Download a file or folder from Google Drive into ``dst`` using gdown.

    Args:
        url: Google Drive URL. Exactly one of ``url`` and ``id`` must be given.
        id: Google Drive file or folder id. Exactly one of ``url`` and ``id``
            must be given.
        dst: Output location. It is created as a directory (with parents) if
            it does not exist, then passed to gdown as ``output``.
        is_folder: If True, call ``gdown.download_folder``; otherwise call
            ``gdown.download``.

    Raises:
        ValueError: If both or neither of ``url`` and ``id`` are provided, or
            if ``dst`` is None.
        ImportError: If ``gdown`` is not installed.
    """
    # Validate arguments before touching the filesystem, so that bad input
    # raises the intended ValueError and leaves no stray directory behind.
    if (url is None) == (id is None):
        raise ValueError("Exactly one of url or id must be provided.")
    if dst is None:
        raise ValueError("dst must be provided.")

    try:
        import gdown
    except ImportError as err:
        raise ImportError(
            "Google Drive download requires gdown. Install with `pip install gdown`."
        ) from err

    os.makedirs(dst, exist_ok=True)

    if is_folder:
        gdown.download_folder(url=url, id=id, output=dst)
    else:
        gdown.download(url=url, id=id, output=dst)