import os
import tarfile
import pandas as pd
import meerkat as mk
from ..abstract import DatasetBuilder
from ..info import DatasetInfo
from ..registry import datasets
from ..utils import download_url, extract
# Maps each Imagenette class ID (the ``n########`` strings that appear in the
# label columns of the dataset CSV) to a human-readable class name.
ID_TO_WORDS = {
    "n02979186": "cassette player",
    "n03417042": "garbage truck",
    "n01440764": "tench",
    "n02102040": "english springer spaniel",
    "n03028079": "church",
    "n03888257": "parachute",
    "n03394916": "french horn",
    "n03000684": "chainsaw",
    "n03445777": "golf ball",
    "n03425413": "gas pump",
}
# Maps each Imagenette class ID to an integer class index.
# NOTE(review): the values look like the positions of these classes in the
# 1000-class ImageNet label list -- confirm before relying on that.
ID_TO_IDX = {
    "n02979186": 482,
    "n03417042": 569,
    "n01440764": 0,
    "n02102040": 217,
    "n03028079": 497,
    "n03888257": 701,
    "n03394916": 566,
    "n03000684": 491,
    "n03445777": 574,
    "n03425413": 571,
}
@datasets.register()
class imagenette(DatasetBuilder):
    """Dataset builder for Imagenette, a 10-class subset of ImageNet.

    Three versions are supported (see ``VERSIONS``); each one is distributed
    as its own ``.tgz`` archive and unpacks into its own sub-directory of
    ``self.dataset_dir``.
    """

    # Archive URL for every supported version.
    VERSION_TO_URL = {
        "full": "https://s3.amazonaws.com/fast-ai-imageclas/imagenette2.tgz",
        "320px": "https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-320.tgz",
        "160px": "https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-160.tgz",
    }

    VERSIONS = ["full", "320px", "160px"]

    info = DatasetInfo(
        name="imagenette",
        full_name="ImageNette",
        description=(
            "Imagenette is a subset of 10 easily classified classes from Imagenet "
            "(tench, English springer, cassette player, chain saw, church, "
            "French horn, garbage truck, gas pump, golf ball, parachute)."
        ),
        homepage="https://github.com/fastai/imagenette",
        tags=["image", "classification"],
    )

    @property
    def data_dir(self):
        """Directory under ``self.dataset_dir`` holding the selected version.

        Any version other than ``"full"`` or ``"160px"`` resolves to the
        320px directory.
        """
        if self.version == "full":
            return os.path.join(self.dataset_dir, "imagenette2")
        elif self.version == "160px":
            return os.path.join(self.dataset_dir, "imagenette2-160")
        else:
            return os.path.join(self.dataset_dir, "imagenette2-320")

    def build(self):
        """Build a DataPanel with the label columns plus a lazy ``img`` column.

        Returns:
            mk.DataPanel: The columns produced by ``_build_df`` and an ``img``
            image column whose file paths are resolved against ``data_dir``.
        """
        df = self._build_df()
        dp = mk.DataPanel.from_pandas(df)
        dp["img"] = mk.ImageColumn.from_filepaths(
            dp["img_path"], base_dir=self.data_dir
        )
        return dp

    def download(self):
        """Fetch the archive for ``self.version`` and extract it.

        ``self.download_url`` is not defined in this class (presumably it is
        inherited from ``DatasetBuilder``); its return value is handed to
        ``extract`` together with ``self.dataset_dir``.
        """
        url = self.VERSION_TO_URL[self.version]
        path = self.download_url(url)
        extract(path, self.dataset_dir)

    def _build_df(
        self,
    ):
        """Load ``noisy_imagenette.csv`` from ``data_dir`` and add label columns.

        Returns:
            pd.DataFrame: The CSV contents plus the derived columns
            ``label_id``, ``label``, ``label_idx``, ``split`` and ``img_path``.
        """
        csv_path = os.path.join(self.data_dir, "noisy_imagenette.csv")
        df = pd.read_csv(csv_path)

        # ``noisy_labels_0`` is the column used as the class ID for each row.
        df["label_id"] = df["noisy_labels_0"]
        df["label"] = df["label_id"].replace(ID_TO_WORDS)
        df["label_idx"] = df["label_id"].replace(ID_TO_IDX)
        df["split"] = df["is_valid"].replace({False: "train", True: "valid"})
        df["img_path"] = df.path
        return df
def download_imagenette(
    download_dir, version="160px", overwrite: bool = False, return_df: bool = False
):
    """Download Imagenette dataset.

    The archive for ``version`` is downloaded into ``download_dir``, extracted
    there and then deleted. A summary ``imagenette.csv`` is written inside the
    extracted directory; when that file already exists and ``overwrite`` is
    ``False`` nothing is downloaded or rebuilt.

    Args:
        download_dir (str): The directory path to save to.
        version (str, optional): Imagenette version.
            Choices: ``"full"``, ``"320px"``, ``"160px"``.
        overwrite (bool, optional): If ``True``, redownload the dataset.
        return_df (bool, optional): If ``True``, also return a
            ``pd.DataFrame``.

    Returns:
        Union[str, Tuple[pd.DataFrame, str]]: If ``return_df=True``, returns a
        ``(DataFrame, directory path)`` tuple. Otherwise, returns only the
        directory path where the data is stored. Note that a DataFrame loaded
        from an existing ``imagenette.csv`` contains only the columns
        ``img_path``, ``label``, ``label_id``, ``label_idx`` and ``split``,
        whereas a freshly built one also keeps every column of
        ``noisy_imagenette.csv``.

    Raises:
        KeyError: If ``version`` is not a key of ``imagenette.VERSION_TO_URL``.

    References:
        https://github.com/fastai/imagenette
    """
    url = imagenette.VERSION_TO_URL[version]
    tar_path = os.path.join(download_dir, os.path.basename(url))
    # The archive is named after the directory it unpacks into.
    dir_path = os.path.splitext(tar_path)[0]
    csv_path = os.path.join(dir_path, "imagenette.csv")

    # Fast path: the summary CSV from a previous run is still there.
    if not overwrite and os.path.isfile(csv_path):
        return (pd.read_csv(csv_path), dir_path) if return_df else dir_path

    if overwrite or not os.path.exists(dir_path):
        download_url(
            url=url,
            root=download_dir,
        )
        print("Extracting tar archive, this may take a few minutes...")
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(tar_path) as tar:
            tar.extractall(download_dir)
        os.remove(tar_path)
    else:
        print(f"Directory {dir_path} already exists. Skipping download.")

    # build dataframe
    df = pd.read_csv(os.path.join(dir_path, "noisy_imagenette.csv"))
    df["label_id"] = df["noisy_labels_0"]
    df["label"] = df["label_id"].replace(ID_TO_WORDS)
    df["label_idx"] = df["label_id"].replace(ID_TO_IDX)
    df["split"] = df["is_valid"].replace({False: "train", True: "valid"})
    df["img_path"] = df.path

    # Persist only the derived columns so later calls can take the fast path.
    df[["img_path", "label", "label_id", "label_idx", "split"]].to_csv(
        csv_path, index=False
    )
    return (df, dir_path) if return_df else dir_path
def build_imagenette_dp(
    dataset_dir: str,
    download: bool = False,
    version: str = "160px",
) -> mk.DataPanel:
    """Build DataPanel for the Imagenette dataset.

    Args:
        dataset_dir (str): With ``download=True``, the directory the archive
            is downloaded to and extracted in. With ``download=False``, the
            directory that already contains ``imagenette.csv``; image paths
            are resolved relative to it.
        download (bool, optional): If ``True``, call ``download_imagenette``
            (with ``overwrite=False``) to obtain the data. If ``False``, only
            read an existing ``imagenette.csv``.
        version (str, optional): Imagenette version.
            Choices: ``"full"``, ``"320px"``, ``"160px"``. Only used when
            ``download=True``.

    Returns:
        mk.DataPanel: A DataPanel corresponding to the dataset.

    Raises:
        ValueError: If ``download=False`` and ``imagenette.csv`` is not found
            in ``dataset_dir``.

    References:
        https://github.com/fastai/imagenette
    """
    if download:
        df, dir_path = download_imagenette(
            dataset_dir, version=version, overwrite=False, return_df=True
        )
    else:
        csv_path = os.path.join(dataset_dir, "imagenette.csv")
        if not os.path.isfile(csv_path):
            raise ValueError("Imagenette is not downloaded. Pass `download=True`.")
        df = pd.read_csv(csv_path)
        # Images live next to the CSV that was just read. Without this
        # assignment ``dir_path`` was unbound on this branch and the
        # ``from_filepaths`` call below raised UnboundLocalError.
        dir_path = dataset_dir

    dp = mk.DataPanel.from_pandas(df)
    dp["img"] = mk.ImageColumn.from_filepaths(dp["img_path"], base_dir=dir_path)
    return dp