Source code for meerkat.datasets.imagenet

import os
import subprocess
from typing import Dict

import numpy as np
import pandas as pd

import meerkat as mk

from ..abstract import DatasetBuilder
from ..info import DatasetInfo
from ..registry import datasets
from ..utils import download_url, extract


@datasets.register()
class imagenet(DatasetBuilder):
    """Dataset builder for ImageNet (ILSVRC2012).

    ``download`` fetches and unpacks the Kaggle
    "imagenet-object-localization-challenge" archive into ``self.dataset_dir``;
    ``build`` reads the unpacked files and assembles a single
    ``mk.DataPanel`` covering the train and validation splits.
    """

    VERSIONS = ["ilsvrc2012"]

    info = DatasetInfo(
        name="imagenet",
        full_name="ImageNet",
        # flake8: noqa
        description="ImageNet is an image database organized according to the WordNet hierarchy (currently only the nouns), in which each node of the hierarchy is depicted by hundreds and thousands of images..",
        homepage="https://www.image-net.org/",
        tags=["image", "classification"],
        citation=(
            "@inproceedings{imagenet_cvpr09,"
            "AUTHOR = {Deng, J. and Dong, W. and Socher, R. and Li, L.-J. and Li, K. and Fei-Fei, L.},"
            "TITLE = {{ImageNet: A Large-Scale Hierarchical Image Database}},"
            "BOOKTITLE = {CVPR09},"
            "YEAR = {2009},"
            'BIBSOURCE = "http://www.image-net.org/papers/imagenet_cvpr09.bib"}'
        ),
    )

    def build(self) -> mk.DataPanel:
        """Assemble the ImageNet DataPanel from the files in ``self.dataset_dir``.

        Reads ``ILSVRC/ImageSets/CLS-LOC/train_cls.txt``,
        ``LOC_val_solution.csv`` and ``LOC_synset_mapping.txt``.

        Returns:
            The result of left-merging the train+valid panel with the synset
            mapping on ``"synset"``. Rows carry ``synset``, ``image_id``,
            ``path``, ``split`` (``"train"`` or ``"valid"``) and ``image``,
            plus ``name`` and ``class_idx`` from the mapping.
        """
        # load training data: each row is "<synset>/<image_id> <idx>"; only the
        # relative path (first field) is used.
        paths = pd.read_csv(
            os.path.join(self.dataset_dir, "ILSVRC/ImageSets/CLS-LOC/train_cls.txt"),
            delimiter=" ",
            names=["path", "idx"],
        )["path"]
        train_df = paths.str.extract(r"(?P<synset>.*)/(?P<image_id>.*)")
        train_df["path"] = paths.apply(
            lambda x: os.path.join(
                self.dataset_dir, "ILSVRC/Data/CLS-LOC/train", f"{x}.JPEG"
            )
        )
        train_df["split"] = "train"

        # load validation data
        valid_df = pd.read_csv(
            os.path.join(self.dataset_dir, "LOC_val_solution.csv")
        ).rename(columns={"ImageId": "image_id"})
        # The first space-separated token of PredictionString is taken as the
        # synset; the remainder of the string is discarded below.
        valid_df["synset"] = valid_df["PredictionString"].str.split(" ", expand=True)[0]
        valid_df["path"] = valid_df["image_id"].apply(
            lambda x: os.path.join(
                self.dataset_dir, "ILSVRC/Data/CLS-LOC/val", f"{x}.JPEG"
            )
        )
        valid_df["split"] = "valid"

        dp = mk.DataPanel.from_pandas(
            pd.concat([train_df, valid_df.drop(columns="PredictionString")])
        )
        dp["image"] = mk.ImageColumn.from_filepaths(dp["path"])

        # mapping from synset to english: each line is split once on the first
        # space into (synset, name).
        with open(os.path.join(self.dataset_dir, "LOC_synset_mapping.txt")) as f:
            lines = f.read().splitlines()
        df = (
            pd.Series(lines)
            .str.split(" ", expand=True, n=1)
            .rename(columns={0: "synset", 1: "name"})
        )
        mapping_dp = mk.DataPanel.from_pandas(df)

        # torchvision models use class indices corresponding to the order of the
        # LOC_synset_mapping.txt file, which we confirmed using the mapping provided here
        # https://gist.github.com/yrevar/942d3a0ac09ec9e5eb3a
        mapping_dp["class_idx"] = np.arange(len(mapping_dp))

        dp = dp.merge(mapping_dp, how="left", on="synset")
        return dp

    def download(self) -> None:
        """Download and unpack the Kaggle ImageNet archive into ``self.dataset_dir``.

        Invokes the ``kaggle``, ``unzip`` and ``tar`` executables, which must be
        available on ``PATH``.

        Raises:
            subprocess.CalledProcessError: If any of the three commands exits
                with a non-zero status.
        """
        os.makedirs(self.dataset_dir, exist_ok=True)

        # Every command runs with ``cwd=self.dataset_dir`` instead of
        # ``os.chdir``: the interpreter's own working directory is never
        # touched, so there is nothing to restore if a command fails.
        commands = (
            [
                "kaggle",
                "competitions",
                "download",
                "-c",
                "imagenet-object-localization-challenge",
            ],
            ["unzip", "imagenet-object-localization-challenge.zip"],
            ["tar", "-xzvf", "imagenet_object_localization_patched2019.tar.gz"],
        )
        for args in commands:
            # check=True so that a failed step stops the pipeline here rather
            # than surfacing later as a missing-file error in ``build``.
            subprocess.run(args, cwd=self.dataset_dir, check=True)
def build_imagenet_dps(dataset_dir: str, download: bool = False) -> mk.DataPanel:
    """Build the ImageNet DataPanel from an unpacked copy in ``dataset_dir``.

    Args:
        dataset_dir: Directory holding (or, with ``download=True``, receiving)
            the unpacked ImageNet files: ``ILSVRC/``, ``LOC_val_solution.csv``
            and ``LOC_synset_mapping.txt``.
        download: If True, create ``dataset_dir`` if needed and extract
            ``imagenet_object_localization_patched2019.tar.gz`` inside it
            before building. NOTE: this function does not fetch the archive
            itself; it must already be present in ``dataset_dir`` (e.g.
            obtained with ``kaggle competitions download -c
            imagenet-object-localization-challenge`` and unzipped).

    Returns:
        The result of left-merging the train+valid panel with the synset
        mapping on ``"synset"``. Rows carry ``synset``, ``image_id``,
        ``path``, ``split`` (``"train"`` or ``"valid"``) and ``image``, plus
        ``name`` and ``class_idx`` from the mapping.

    Raises:
        subprocess.CalledProcessError: If ``download`` is True and ``tar``
            exits with a non-zero status.
    """
    if download:
        os.makedirs(dataset_dir, exist_ok=True)
        # Run tar with ``cwd=dataset_dir`` rather than ``os.chdir`` so the
        # interpreter's working directory is never changed (and so cannot be
        # left pointing at dataset_dir if extraction fails).
        subprocess.run(
            ["tar", "-xzvf", "imagenet_object_localization_patched2019.tar.gz"],
            cwd=dataset_dir,
            check=True,
        )

    # load training data: each row is "<synset>/<image_id> <idx>"; only the
    # relative path (first field) is used.
    paths = pd.read_csv(
        os.path.join(dataset_dir, "ILSVRC/ImageSets/CLS-LOC/train_cls.txt"),
        delimiter=" ",
        names=["path", "idx"],
    )["path"]
    train_df = paths.str.extract(r"(?P<synset>.*)/(?P<image_id>.*)")
    train_df["path"] = paths.apply(
        lambda x: os.path.join(dataset_dir, "ILSVRC/Data/CLS-LOC/train", f"{x}.JPEG")
    )
    train_df["split"] = "train"

    # load validation data
    valid_df = pd.read_csv(os.path.join(dataset_dir, "LOC_val_solution.csv")).rename(
        columns={"ImageId": "image_id"}
    )
    # The first space-separated token of PredictionString is taken as the
    # synset; the remainder of the string is discarded below.
    valid_df["synset"] = valid_df["PredictionString"].str.split(" ", expand=True)[0]
    valid_df["path"] = valid_df["image_id"].apply(
        lambda x: os.path.join(dataset_dir, "ILSVRC/Data/CLS-LOC/val", f"{x}.JPEG")
    )
    valid_df["split"] = "valid"

    dp = mk.DataPanel.from_pandas(
        pd.concat([train_df, valid_df.drop(columns="PredictionString")])
    )
    dp["image"] = mk.ImageColumn.from_filepaths(dp["path"])

    # mapping from synset to english: each line is split once on the first
    # space into (synset, name).
    with open(os.path.join(dataset_dir, "LOC_synset_mapping.txt")) as f:
        lines = f.read().splitlines()
    df = (
        pd.Series(lines)
        .str.split(" ", expand=True, n=1)
        .rename(columns={0: "synset", 1: "name"})
    )
    mapping_dp = mk.DataPanel.from_pandas(df)

    # torchvision models use class indices corresponding to the order of the
    # LOC_synset_mapping.txt file, which we confirmed using the mapping provided here
    # https://gist.github.com/yrevar/942d3a0ac09ec9e5eb3a
    mapping_dp["class_idx"] = np.arange(len(mapping_dp))

    dp = dp.merge(mapping_dp, how="left", on="synset")
    return dp