Source code for meerkat.contrib.imagenette

import os
import tarfile

import pandas as pd
from torchvision.datasets.utils import download_url

import meerkat as mk

# Location that hosts every Imagenette archive listed below.
_IMAGENETTE_URL_ROOT = "https://s3.amazonaws.com/fast-ai-imageclas"

# Maps each supported Imagenette version name to the URL of its archive.
VERSION_TO_URL = {
    version_name: f"{_IMAGENETTE_URL_ROOT}/{archive_name}"
    for version_name, archive_name in (
        ("full", "imagenette2.tgz"),
        ("320px", "imagenette2-320.tgz"),
        ("160px", "imagenette2-160.tgz"),
    )
}

# One row per Imagenette class: (class ID, human-readable name, integer index).
# NOTE(review): the indices look like positions in the 1000-class ImageNet
# label set -- confirm before relying on that.
_IMAGENETTE_CLASSES = (
    ("n02979186", "cassette player", 482),
    ("n03417042", "garbage truck", 569),
    ("n01440764", "tench", 0),
    ("n02102040", "english springer spaniel", 217),
    ("n03028079", "church", 497),
    ("n03888257", "parachute", 701),
    ("n03394916", "french horn", 566),
    ("n03000684", "chainsaw", 491),
    ("n03445777", "golf ball", 574),
    ("n03425413", "gas pump", 571),
)

# Class ID -> human-readable class name.
ID_TO_WORDS = {class_id: words for class_id, words, _ in _IMAGENETTE_CLASSES}

# Class ID -> integer class index.
ID_TO_IDX = {class_id: index for class_id, _, index in _IMAGENETTE_CLASSES}


def download_imagenette(
    download_dir: str,
    version: str = "160px",
    overwrite: bool = False,
    return_df: bool = False,
):
    """Download Imagenette and build its ``imagenette.csv`` metadata file.

    The archive for ``version`` is downloaded into ``download_dir``, extracted
    there and then deleted. A condensed ``imagenette.csv`` (columns
    ``img_path``, ``label``, ``label_id``, ``label_idx``, ``split``) is derived
    from the ``noisy_imagenette.csv`` file in the extracted directory and
    written next to it. If ``imagenette.csv`` already exists and ``overwrite``
    is ``False``, nothing is downloaded or rebuilt.

    Args:
        download_dir (str): The directory path to save to.
        version (str, optional): Imagenette version. Choices: ``"full"``,
            ``"320px"``, ``"160px"``.
        overwrite (bool, optional): If ``True``, redownload the dataset and
            rebuild ``imagenette.csv``.
        return_df (bool, optional): If ``True``, also return the metadata as a
            ``pd.DataFrame``.

    Returns:
        Union[str, Tuple[pd.DataFrame, str]]: If ``return_df=True``, a tuple
        ``(df, dir_path)`` of the metadata frame and the directory path where
        the data is stored. Otherwise, only the directory path. When the frame
        is freshly built it still carries every column of
        ``noisy_imagenette.csv`` in addition to the five derived ones; when it
        is loaded from an existing ``imagenette.csv`` it has the five derived
        columns only.

    Raises:
        KeyError: If ``version`` is not a key of ``VERSION_TO_URL``.

    References:
        https://github.com/fastai/imagenette
    """
    url = VERSION_TO_URL[version]
    tar_path = os.path.join(download_dir, os.path.basename(url))
    # Assumes the archive unpacks into a top-level directory named after the
    # archive file without its extension (e.g. ``imagenette2-160``).
    dir_path = os.path.splitext(tar_path)[0]
    csv_path = os.path.join(dir_path, "imagenette.csv")

    # Fast path: the metadata file was already built by a previous call.
    if not overwrite and os.path.isfile(csv_path):
        return (pd.read_csv(csv_path), dir_path) if return_df else dir_path

    if overwrite or not os.path.exists(dir_path):
        download_url(url=url, root=download_dir)
        print("Extracting tar archive, this may take a few minutes...")
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(tar_path) as tar:
            if hasattr(tarfile, "data_filter"):
                # Refuse unsafe members (absolute paths, links escaping the
                # destination, device files) where the interpreter supports
                # extraction filters.
                tar.extractall(download_dir, filter="data")
            else:
                tar.extractall(download_dir)
        os.remove(tar_path)
    else:
        print(f"Directory {dir_path} already exists. Skipping download.")

    # build dataframe
    df = pd.read_csv(os.path.join(dir_path, "noisy_imagenette.csv"))
    # NOTE(review): ``noisy_labels_0`` is presumably the noise-free label
    # column -- confirm against the Imagenette README.
    df["label_id"] = df["noisy_labels_0"]
    df["label"] = df["label_id"].replace(ID_TO_WORDS)
    df["label_idx"] = df["label_id"].replace(ID_TO_IDX)
    df["split"] = df["is_valid"].replace({False: "train", True: "valid"})
    df["img_path"] = df["path"]

    # Only the derived columns are persisted.
    df[["img_path", "label", "label_id", "label_idx", "split"]].to_csv(
        csv_path, index=False
    )
    return (df, dir_path) if return_df else dir_path


def build_imagenette_dp(
    dataset_dir: str,
    download: bool = False,
    version: str = "160px",
) -> mk.DataPanel:
    """Build DataPanel for the Imagenette dataset.

    Args:
        dataset_dir (str): The directory path to save to or load from. With
            ``download=True`` the dataset is downloaded into this directory
            and extracted into a version-specific subdirectory. With
            ``download=False`` this may be either that same directory or the
            extracted dataset directory itself (the one that contains
            ``imagenette.csv``).
        download (bool, optional): If ``True``, download the dataset when it
            is not already present. An existing copy is never redownloaded.
        version (str, optional): Imagenette version. Choices: ``"full"``,
            ``"320px"``, ``"160px"``.

    Returns:
        mk.DataPanel: A DataPanel corresponding to the dataset, with an
        ``"img"`` column built from the ``"img_path"`` column.

    Raises:
        ValueError: If ``download=False`` and no ``imagenette.csv`` is found.

    References:
        https://github.com/fastai/imagenette
    """
    if download:
        df, dir_path = download_imagenette(
            dataset_dir, version=version, overwrite=False, return_df=True
        )
    else:
        # Look in ``dataset_dir`` itself first, then in the subdirectory that
        # ``download_imagenette`` extracts this version into, so the same
        # ``dataset_dir`` works for both ``download=True`` and ``False``.
        candidate_dirs = [dataset_dir]
        url = VERSION_TO_URL.get(version)
        if url is not None:
            archive_stem = os.path.splitext(os.path.basename(url))[0]
            candidate_dirs.append(os.path.join(dataset_dir, archive_stem))

        for dir_path in candidate_dirs:
            csv_path = os.path.join(dir_path, "imagenette.csv")
            if os.path.isfile(csv_path):
                break
        else:
            raise ValueError("Imagenette is not downloaded. Pass `download=True`.")
        df = pd.read_csv(csv_path)

    dp = mk.DataPanel.from_pandas(df)
    # ``dir_path`` is the directory holding ``imagenette.csv``; it is passed as
    # the base directory for the entries of ``img_path``.
    dp["img"] = mk.ImageColumn.from_filepaths(dp["img_path"], base_dir=dir_path)
    return dp