# Source code for meerkat.contrib.visual_genome

import os
from typing import Dict, Mapping

import ujson as json

import meerkat as mk

# TODO (Sabri): Add support for downloading the data.
# For the time being, the relevant files must be downloaded manually into the
# dataset directory. See https://visualgenome.org/VGViz/explore


def build_visual_genome_dps(
    dataset_dir: str, write: bool = False
) -> Dict[str, mk.DataPanel]:
    """Build the Visual Genome DataPanels from the raw JSON files.

    Args:
        dataset_dir: Directory holding ``attributes.json``, ``image_data.json``
            and ``relationships.json``.
        write: If True, also write every DataPanel that was built into
            ``dataset_dir`` (see ``write_visual_genome_dps``).

    Returns:
        A dict with the keys ``"objects"``, ``"attributes"``, ``"images"`` and
        ``"relationships"``, each mapping to the corresponding DataPanel.
    """
    dps = {}

    print("Loading objects and attributes...")
    dps.update(_build_object_dps(dataset_dir))

    print("Loading images...")
    dps.update(_build_image_dp(dataset_dir=dataset_dir))

    print("Loading relationships...")
    dps.update(_build_relationships_dp(dataset_dir=dataset_dir))

    if write:
        write_visual_genome_dps(dps, dataset_dir=dataset_dir)
    return dps


def read_visual_genome_dps(dataset_dir: str) -> Dict[str, mk.DataPanel]:
    """Read the Visual Genome DataPanels stored in ``dataset_dir``.

    Args:
        dataset_dir: Directory containing ``attributes.mk``,
            ``relationships.mk``, ``objects.mk`` and ``images.mk``.

    Returns:
        A dict mapping each of ``"attributes"``, ``"relationships"``,
        ``"objects"`` and ``"images"`` to the DataPanel read from
        ``<dataset_dir>/<key>.mk``.
    """
    keys = ["attributes", "relationships", "objects", "images"]
    return {
        key: mk.DataPanel.read(os.path.join(dataset_dir, f"{key}.mk")) for key in keys
    }


def write_visual_genome_dps(dps: Mapping[str, mk.DataPanel], dataset_dir: str):
    """Write each DataPanel in ``dps`` to ``<dataset_dir>/<key>.mk``.

    Args:
        dps: Mapping from a name (used as the file stem) to the DataPanel to
            write.
        dataset_dir: Directory the ``.mk`` outputs are written into.
    """
    for key, dp in dps.items():
        dp.write(os.path.join(dataset_dir, f"{key}.mk"))


def _build_object_dps(dataset_dir: str):
    """Build the "objects" and "attributes" DataPanels from ``attributes.json``.

    Every object entry of every image becomes one row of the objects table,
    extended with ``image_id``, ``name`` and ``syn_name``. Every attribute of
    an object becomes one ``(object_id, attribute)`` row of the attributes
    table.

    Args:
        dataset_dir: Directory containing ``attributes.json``.

    Returns:
        A dict with the keys ``"objects"`` and ``"attributes"``, each mapping
        to an ``mk.DataPanel``.
    """
    attributes_path = os.path.join(dataset_dir, "attributes.json")
    # Decode explicitly as UTF-8 rather than relying on the platform default
    # encoding, which is not UTF-8 everywhere.
    with open(attributes_path, encoding="utf-8") as f:
        images = json.load(f)

    object_rows = []  # one row per object
    attribute_rows = []  # one row per (object, attribute) pair
    for image in images:
        for obj in image["attributes"]:
            # NOTE: the loaded object dict is mutated in place and reused as
            # the row of the objects table.
            obj["image_id"] = image["image_id"]

            # all names are length 1
            names = obj.pop("names")
            obj["name"] = names[0]

            # objects without an "attributes" entry contribute no rows
            for attribute in obj.pop("attributes", None) or []:
                attribute_rows.append(
                    {"object_id": obj["object_id"], "attribute": attribute}
                )

            # the vast majority of objects (99.7%) have 0 or 1 synonym in their
            # synset, so we only consider the first synonym to keep things simple
            synsets = obj.pop("synsets")
            obj["syn_name"] = synsets[0] if synsets else ""

            object_rows.append(obj)

    return {
        "objects": mk.DataPanel(object_rows),
        "attributes": mk.DataPanel(attribute_rows),
    }


def _build_image_dp(dataset_dir: str):
    """Build the "images" DataPanel from ``image_data.json``.

    The ``coco_id`` and ``flickr_id`` columns are removed, and two columns are
    added: ``local_path`` and ``image``.

    Args:
        dataset_dir: Directory containing ``image_data.json``. It is also used
            as the prefix of every ``local_path``.

    Returns:
        A dict with the single key ``"images"`` mapping to the DataPanel.
    """
    image_data_path = os.path.join(dataset_dir, "image_data.json")
    # Decode explicitly as UTF-8 rather than relying on the platform default
    # encoding, which is not UTF-8 everywhere.
    with open(image_data_path, encoding="utf-8") as f:
        images = json.load(f)
    image_dp = mk.DataPanel(images)
    image_dp.remove_column("coco_id")
    image_dp.remove_column("flickr_id")

    # local_path = dataset_dir + whatever follows the last "rak248" in the URL.
    # NOTE(review): plain string concatenation, no path separator is inserted;
    # presumably the URL remainder starts with "/" -- confirm against the data.
    image_dp["local_path"] = dataset_dir + (image_dp["url"].str.split("rak248")).apply(
        lambda x: x[-1]
    )

    image_dp["image"] = mk.ImageColumn(image_dp["local_path"])

    return {"images": image_dp}


def _build_relationships_dp(dataset_dir: str):
    """Build the "relationships" DataPanel from ``relationships.json``.

    Each relationship of each image becomes one row with the columns
    ``image_id``, ``predicate``, ``subject_object_id``, ``subject_name``,
    ``subject_syn``, ``object_object_id``, ``object_name`` and ``object_syn``.

    Args:
        dataset_dir: Directory containing ``relationships.json``.

    Returns:
        A dict with the single key ``"relationships"`` mapping to the
        DataPanel.
    """

    def _flatten_entity(entity, role):
        """Return the ``<role>_object_id/_name/_syn`` columns for one entity."""
        synsets = entity["synsets"]
        return {
            f"{role}_object_id": entity["object_id"],
            # entries carry either a single "name" or a list of "names"
            f"{role}_name": entity["name"]
            if "name" in entity
            else entity["names"][0],
            # only the first synset is kept; empty string when there is none
            f"{role}_syn": synsets[0] if synsets else "",
        }

    relationships_path = os.path.join(dataset_dir, "relationships.json")
    # Decode explicitly as UTF-8 rather than relying on the platform default
    # encoding, which is not UTF-8 everywhere.
    with open(relationships_path, encoding="utf-8") as f:
        relationships = json.load(f)

    rows = []
    for image in relationships:
        image_id = image["image_id"]
        for r in image["relationships"]:
            # subject columns come before object columns
            rows.append(
                {
                    "image_id": image_id,
                    "predicate": r["predicate"],
                    **_flatten_entity(r["subject"], "subject"),
                    **_flatten_entity(r["object"], "object"),
                }
            )
    return {"relationships": mk.DataPanel(rows)}