Source code for meerkat.datasets.enron

import email
import os
import subprocess

import pandas as pd
from tqdm import tqdm

import meerkat as mk

COLUMNS = [
    "From",
    "To",
    "Message-ID",
    "Subject",
    "X-FileName",
    "X-From",
    "X-To",
    "X-cc",
    "X-bcc",
    "X-Folder",
    "Date",
]


def _parse_email(email_string: str):
    e = email.message_from_string(email_string)
    d = {col.lower(): e.get(col, "") for col in COLUMNS}
    d["body"] = e.get_payload()
    return d


[docs]def build_enron_dp(dataset_dir: str, download: bool = True) -> mk.DataPanel: dp_path = os.path.join(dataset_dir, "enron.mk") if os.path.exists(dp_path): return mk.DataPanel.read(dp_path) downloaded = os.path.exists(os.path.join(dataset_dir, "emails.csv")) if not downloaded and download: print("Downloading data...") curr_dir = os.getcwd() os.makedirs(dataset_dir, exist_ok=True) os.chdir(dataset_dir) subprocess.run( args=["kaggle datasets download -d wcukierski/enron-email-dataset"], shell=True, check=True, ) subprocess.run( args=["unzip enron-email-dataset.zip"], shell=True, check=True, ) os.chdir(curr_dir) # load training data print("Parsing emails...") dp = mk.DataPanel.from_csv(os.path.join(dataset_dir, "emails.csv")) dp = mk.DataPanel([_parse_email(message) for message in tqdm(dp["message"])]) print("Parsing dates...") # need to remove timezone info to save and load with feather # otherwise get UnknownTimeZoneError on read dp["date"] = pd.to_datetime(dp["date"], utc=True) dp.write(dp_path) return dp