Source code for meerkat.contrib.siim_cxr

import os
import subprocess
from glob import glob

import numpy as np
import pandas as pd
import torchvision.transforms as transforms
from PIL import Image

from meerkat.cells.volume import MedicalVolumeCell

# URL of the radiologist eye-tracking (gaze) JSON file that `download_siim_cxr`
# fetches with curl when its `download_gaze_data` flag is set.
GAZE_DATA_URL = "https://raw.githubusercontent.com/robustness-gym/meerkat/dev/examples/03-med_img/cxr_gaze_data.json"  # noqa: E501


def download_siim_cxr(
    dataset_dir: str,
    kaggle_username: str,
    kaggle_key: str,
    download_gaze_data: bool = True,
    include_mock_reports: bool = True,
):
    """Download the dataset from the SIIM-ACR Pneumothorax Segmentation challenge.

    https://www.kaggle.com/c/siim-acr-pneumothorax-segmentation/data

    The function shells out to the ``kaggle``, ``unzip`` and ``curl``
    executables, so all three must be available on ``PATH``. After the
    download it writes ``siim_cxr.csv`` into ``dataset_dir``: one row per
    labelled image, including the columns ``image_id``, ``encoded_pixels``,
    ``pmx`` (1 if a pneumothorax is annotated, else 0), ``filepath`` and,
    optionally, ``report``.

    Note that ``KAGGLE_USERNAME`` and ``KAGGLE_KEY`` are set in ``os.environ``
    of the current process and are not restored afterwards.

    Args:
        dataset_dir (str): Path to directory where the dataset will be
            downloaded.
        kaggle_username (str): Your kaggle username.
        kaggle_key (str): A kaggle API key. In order to use Kaggle's public
            API, you must first authenticate using an API token. From the
            site header, click on your user profile picture, then on
            "My Account" from the dropdown menu. This will take you to your
            account settings at https://www.kaggle.com/account. Scroll down to
            the section of the page labelled API: to create a new token, click
            on the "Create New API Token" button. This will download a json
            file with a "username" and "key" field. Copy and paste the "key"
            field and pass it in as `kaggle_key`.
            Instructions copied from Kaggle API docs:
            https://www.kaggle.com/docs/api
        download_gaze_data (bool): Download a JSON file
            (``cxr_gaze_data.json``) containing eye-tracking data collected on
            a radiologist interpreting the xray.
        include_mock_reports (bool): Add a ``report`` column holding a
            randomly generated mock radiology report for every row.

    Raises:
        subprocess.CalledProcessError: If ``unzip`` or ``curl`` exits with a
            non-zero status.
        FileNotFoundError: If the segment annotations (``siim/train-rle.csv``)
            are not present in ``dataset_dir`` after the download step.
    """
    # Credentials are handed to the kaggle CLI through the environment.
    os.environ["KAGGLE_USERNAME"] = kaggle_username
    os.environ["KAGGLE_KEY"] = kaggle_key

    # NOTE(review): the exit status of the kaggle download is not enforced, so
    # a copy that was already extracted into `dataset_dir` can still be used;
    # a genuinely missing dataset is reported by the explicit check below.
    subprocess.run(
        [
            "kaggle",
            "datasets",
            "download",
            "-d",
            "seesee/siim-train-test",
            "-p",
            dataset_dir,
        ]
    )

    zip_path = os.path.join(dataset_dir, "siim-train-test.zip")
    if os.path.exists(zip_path):
        # `check=True` stops us from deleting the archive after a failed or
        # partial extraction.
        subprocess.run(
            ["unzip", "-q", zip_path, "-d", dataset_dir],
            check=True,
        )
        os.remove(zip_path)

    # get segment annotations
    rle_path = os.path.join(dataset_dir, "siim", "train-rle.csv")
    if not os.path.exists(rle_path):
        raise FileNotFoundError(
            f"Expected segment annotations at '{rle_path}'. The kaggle download "
            "may have failed; check the credentials and the kaggle CLI output."
        )
    segment_df = pd.read_csv(rle_path)
    # the " EncodedPixels" key deliberately keeps its leading space
    # (presumably matching the raw CSV header - verify against the file)
    segment_df = segment_df.rename(
        columns={"ImageId": "image_id", " EncodedPixels": "encoded_pixels"}
    )

    # there are some images that were segmented by multiple annotators, we'll
    # just take the first
    segment_df = segment_df[~segment_df.image_id.duplicated(keep="first")]

    # get binary labels for pneumothorax, any row with a "-1" for encoded
    # pixels is considered a negative
    segment_df["pmx"] = (segment_df.encoded_pixels != "-1").astype(int)

    # start building up a main dataframe with a few `merge` operations
    # (i.e. join)
    df = segment_df

    # get filepaths for all images in the "dicom-images-train" directory
    filepaths = sorted(
        glob(os.path.join(dataset_dir, "siim", "dicom-images-train/*/*/*.dcm"))
    )
    filepath_df = pd.DataFrame(
        [
            {
                "filepath": filepath,
                "image_id": os.path.splitext(os.path.basename(filepath))[0],
            }
            for filepath in filepaths
        ]
    )

    # important to perform a left join here, because there are some images in
    # the directory without labels in `segment_df` and we only want those with
    # labels
    df = df.merge(filepath_df, how="left", on="image_id")

    if download_gaze_data:
        # `--fail` makes curl exit non-zero on an HTTP error instead of saving
        # the server's error page as if it were the gaze JSON.
        subprocess.run(
            [
                "curl",
                "--fail",
                GAZE_DATA_URL,
                "--output",
                os.path.join(dataset_dir, "cxr_gaze_data.json"),
            ],
            check=True,
        )

    if include_mock_reports:
        df["report"] = (df["pmx"] == 1).apply(_get_mock_report)

    df.to_csv(os.path.join(dataset_dir, "siim_cxr.csv"), index=False)
# Preprocessing constants consumed by `cxr_transform`.
# CXR_MEAN / CXR_STD are passed to `transforms.Normalize`
# (presumably intensity statistics of the chest x-ray images - TODO confirm).
CXR_MEAN = 0.48865
CXR_STD = 0.24621
# Images are resized to CXR_SIZE x CXR_SIZE pixels.
CXR_SIZE = 224
def cxr_transform_pil(volume: MedicalVolumeCell):
    """Convert a medical volume cell into a PIL image.

    Args:
        volume (MedicalVolumeCell): Cell whose ``_volume`` attribute holds the
            pixel data.

    Returns:
        PIL.Image.Image: Image built from the squeezed pixel data cast to
        ``uint8``.
    """
    # NOTE(review): the cast to uint8 applies no intensity rescaling; this
    # assumes the pixel values already fit in 0-255 - confirm against the data.
    pixels = np.uint8(volume._volume.squeeze())
    return Image.fromarray(pixels)
def cxr_transform(volume: MedicalVolumeCell):
    """Turn a medical volume cell into a normalized 3-channel image tensor.

    The volume is converted to a PIL image with `cxr_transform_pil`, resized
    to ``CXR_SIZE`` x ``CXR_SIZE``, converted to a tensor and normalized with
    ``CXR_MEAN`` / ``CXR_STD``.

    Args:
        volume (MedicalVolumeCell): Cell holding the image to transform.

    Returns:
        The normalized tensor repeated three times along its first dimension
        (assumes the image is single-channel, giving a 3-channel result -
        TODO confirm).
    """
    pipeline = transforms.Compose(
        [
            transforms.Resize([CXR_SIZE, CXR_SIZE]),
            transforms.ToTensor(),
            transforms.Normalize(CXR_MEAN, CXR_STD),
        ]
    )
    tensor = pipeline(cxr_transform_pil(volume))
    return tensor.repeat([3, 1, 1])
def _get_mock_report(pmx: bool): state = (np.random.choice(["severe", "moderate"])) if pmx else "no" return np.random.choice( [ ( "Cardiac size cannot be evaluated. Large left pleural effusion is new. " "Small right effusion is new. The upper lungs are clear. Right lower " f" lobe opacities are better seen in prior CT. There is {state} " " pneumothorax. There are mild degenerative changes in the thoracic " "spine." ), ( f"There is {state} pneumothorax. There are mild degenerative changes " "in the thoracic spine. The upper lungs are clear. Right lower lobe " "opacities are better seen in prior CT." "There are mild degenerative changes in the thoracic spine." ), ( "The upper lungs are clear. Right lower lobe opacities are better " f"seen in prior CT. There is {state} pneumothorax. " "There are mild degenerative changes in the thoracic spine." ), ] )