"""Summarise the experiments stored in a directory as a LaTeX table and a CSV file.

Note: this script should be moved to dicee/scripts.

Example:
    python dicee/analyse_experiments.py --dir Experiments --features "model" "trainMRR" "testMRR"
"""
import os
import json
import pandas as pd
import argparse
def get_default_arguments(argv=None):
    """Parse the command-line options of this script.

    Parameters
    ----------
    argv : list of str, optional
        Argument strings to parse. When ``None`` (the default), ``argparse``
        falls back to the arguments of the running process, which is the
        behaviour of the original zero-argument call. Passing an explicit
        list makes the function usable programmatically and in tests.

    Returns
    -------
    argparse.Namespace
        Namespace with two attributes: ``dir`` (``str`` or ``None``) and
        ``features`` (list of ``str``, empty when ``--features`` is omitted).
    """
    # add_help=False is kept from the original: no -h/--help option is registered.
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument("--dir", type=str, default=None, help="Path of a directory containing experiments")
    parser.add_argument('--features', nargs='+', default=[])
    # TODO: features/columns for pandas dataframe
    return parser.parse_args(argv)
class Experiment:
    """Column-wise accumulator for experiment records.

    Every call to :meth:`save_experiment` appends exactly one value to each
    of the per-field lists created in ``__init__``, so index ``i`` of every
    list describes the ``i``-th saved record. :meth:`to_df` assembles a
    subset of those lists into a ``pandas.DataFrame``.
    """

    # (attribute prefix, key of the split in a record)
    _SPLITS = (("train", "Train"), ("val", "Val"), ("test", "Test"))
    # (attribute suffix, key of the metric inside a split's dict)
    _METRICS = (("mrr", "MRR"), ("h1", "H@1"), ("h3", "H@3"), ("h10", "H@10"))

    def __init__(self):
        # Configuration fields.
        self.model_name = []
        self.callbacks = []
        self.embedding_dim = []
        self.num_params = []
        self.num_epochs = []
        self.batch_size = []
        self.lr = []
        self.byte_pair_encoding = []
        self.aswa = []
        self.path_dataset_folder = []
        self.full_storage_path = []
        self.pq = []
        # Per-split metrics: one list per (split, metric) pair.
        self.train_mrr = []
        self.train_h1 = []
        self.train_h3 = []
        self.train_h10 = []
        self.val_mrr = []
        self.val_h1 = []
        self.val_h3 = []
        self.val_h10 = []
        self.test_mrr = []
        self.test_h1 = []
        self.test_h3 = []
        self.test_h10 = []
        # Remaining report/configuration fields.
        self.runtime = []
        self.normalization = []
        self.scoring_technique = []

    def save_experiment(self, x):
        """Append one experiment record to the per-field lists.

        Parameters
        ----------
        x : dict
            Record holding the keys read below (``'model'``,
            ``'embedding_dim'``, ``'num_epochs'``, ``'batch_size'``, ``'lr'``,
            ``'byte_pair_encoding'``, ``'adaptive_swa'``, ``'dataset_dir'``,
            ``'p'``, ``'q'``, ``'Runtime'``, ``'NumParam'``,
            ``'normalization'``, ``'scoring_technique'``, ``'callbacks'``).
            ``'full_storage_path'`` and the split entries ``'Train'``,
            ``'Val'`` and ``'Test'`` (dicts with ``'MRR'``, ``'H@1'``,
            ``'H@3'``, ``'H@10'``) are optional; ``None`` is stored for
            whatever is missing.

        Raises
        ------
        KeyError
            If one of the non-optional keys is absent from ``x``.
        """
        self.model_name.append(x['model'])
        self.embedding_dim.append(x['embedding_dim'])
        self.num_epochs.append(x['num_epochs'])
        self.batch_size.append(x['batch_size'])
        self.lr.append(x['lr'])
        self.byte_pair_encoding.append(x["byte_pair_encoding"])
        self.aswa.append(x["adaptive_swa"])
        self.path_dataset_folder.append(x['dataset_dir'])
        self.pq.append((x['p'], x['q']))
        self.runtime.append(x['Runtime'])
        self.num_params.append(x['NumParam'])
        self.normalization.append(x['normalization'])
        self.scoring_technique.append(x['scoring_technique'])
        self.callbacks.append(x['callbacks'])
        # This list is a column of to_df(); it must grow with every record,
        # otherwise the DataFrame constructor rejects the unequal lengths.
        self.full_storage_path.append(x.get('full_storage_path'))
        # A record may lack a split (or a metric); store None in that case so
        # that all metric lists stay aligned with the other lists.
        for attr_prefix, split_key in self._SPLITS:
            split_metrics = x.get(split_key) or {}
            for attr_suffix, metric_key in self._METRICS:
                column = getattr(self, f"{attr_prefix}_{attr_suffix}")
                column.append(split_metrics.get(metric_key))

    def to_df(self) -> pd.DataFrame:
        """Return the saved records as a DataFrame with one row per record.

        Only the fields listed below become columns; ``batch_size``, ``lr``,
        ``pq`` and ``normalization`` are collected but not exported.
        """
        columns = {
            "model": self.model_name,
            "byte_pair_encoding": self.byte_pair_encoding,
            "aswa": self.aswa,
            "Dataset": self.path_dataset_folder,
            "trainMRR": self.train_mrr,
            "trainH1": self.train_h1,
            "trainH3": self.train_h3,
            "trainH10": self.train_h10,
            "num_epochs": self.num_epochs,
            "full_storage_path": self.full_storage_path,
            "valMRR": self.val_mrr,
            "valH1": self.val_h1,
            "valH3": self.val_h3,
            "valH10": self.val_h10,
            "testMRR": self.test_mrr,
            "testH1": self.test_h1,
            "testH3": self.test_h3,
            "testH10": self.test_h10,
            "runtime": self.runtime,
            "params": self.num_params,
            "callbacks": self.callbacks,
            "embeddingdim": self.embedding_dim,
            "scoring_technique": self.scoring_technique,
        }
        return pd.DataFrame(columns)
[docs]
def analyse(args):
# (2) Get all subfolders
sub_folder_str_paths = os.listdir(args.dir)
experiments = []
for path in sub_folder_str_paths:
full_path = args.dir + "/" + path
if os.path.isdir(full_path) is False:
continue
with open(f'{full_path}/configuration.json', 'r') as f:
config = json.load(f)
with open(f'{full_path}/report.json', 'r') as f:
report = json.load(f)
report = {i: report[i] for i in ['Runtime', 'NumParam']}
with open(f'{full_path}/eval_report.json', 'r') as f:
eval_report = json.load(f)
config.update(eval_report)
config.update(report)
if "Train" in config:
for k, v in config["Train"].items():
config[f"train{k}"] = v
if "Val" in config:
for k, v in config["Val"].items():
config[f"val{k}"] = v
if "Test" in config:
for k, v in config["Test"].items():
config[f"test{k}"] = v
del config["Train"]
del config["Val"]
del config["Test"]
experiments.append(config)
df = pd.DataFrame(experiments)
df.sort_values(by=['testMRR'], ascending=False, inplace=True)
pd.set_option("display.precision", 3)
#features=["model","testMRR"]
# print(df.columns)
try:
df_features = df[args.features]
except KeyError:
print(f"--features ({args.features}) is not a subset of {df.columns}")
raise KeyError
print(df_features.to_latex(index=False, float_format="%.3f"))
path_to_save = args.dir + '/summary.csv'
df_features.to_csv(path_or_buf=path_to_save)
print(f"Saved in {path_to_save}")
if __name__ == "__main__":
    # Script entry point: parse the command line, then run the analysis.
    cli_args = get_default_arguments()
    analyse(cli_args)