Coreference search with LSH #153

Open · wants to merge 43 commits into main

Commits (43)

53540b3  add option for whether with_coref() should be used (f-hafner, Nov 25, 2022)
cdd8c7b  time ED for different dataset sizes (f-hafner, Dec 7, 2022)
aa9c741  change coref switch name, more efficiency tests (f-hafner, Dec 14, 2022)
463245d  add coreference indicator to prediction output (f-hafner, Dec 16, 2022)
93fa8f5  efficiency test: also pickle data after mention detection (f-hafner, Dec 20, 2022)
c218ce1  integrate lsh and first run (f-hafner, Jan 6, 2023)
9f5657d  3 options for coreferences (f-hafner, Jan 6, 2023)
7d8a17c  adjust update_efficiency_tests.sh (f-hafner, Jan 6, 2023)
f080855  make printout backwards compatible (f-hafner, Jan 9, 2023)
1a432d9  add basic logging to lsh class (f-hafner, Jan 10, 2023)
ae8e9e1  fix bug for single mention, add logging to efficiency test (f-hafner, Jan 11, 2023)
b8b4ea0  restore run_efficiency_test.sh (f-hafner, Jan 11, 2023)
0cc7598  scale fake data more (f-hafner, Jan 16, 2023)
2ca1bbc  switch to hashing with random projections (f-hafner, Jan 16, 2023)
5fc4357  add some more debugging to lsh (f-hafner, Jan 16, 2023)
e6894d5  speed up get_candidates() (f-hafner, Jan 18, 2023)
6cbb668  use sklearn binarizer for encoding (f-hafner, Jan 18, 2023)
6ac9ff0  test higher precision for lsh (f-hafner, Jan 18, 2023)
f19c904  vectorize banding (f-hafner, Jan 19, 2023)
9f41745  small speed ups for get_candidates_new() (f-hafner, Jan 20, 2023)
7570688  small changes to efficiency tests (f-hafner, Jan 20, 2023)
aa79b24  start tidying lsh (f-hafner, Jan 23, 2023)
50f6bfd  drop most of old code (f-hafner, Jan 23, 2023)
017b03e  lsh class: tidy, add docstrings (f-hafner, Jan 24, 2023)
780cee2  give right name to main class: random projections (f-hafner, Jan 24, 2023)
afa63d9  start tests, fix bug in cols_to_int_multidim (f-hafner, Jan 24, 2023)
136659b  improve docstrings (f-hafner, Jan 24, 2023)
35a0a32  n_bands and band_length as main inputs to class (f-hafner, Jan 24, 2023)
ff6778c  document the lsh class (f-hafner, Jan 24, 2023)
2b6315a  update docstring for with_coref (f-hafner, Jan 24, 2023)
86f89c2  small fixes to lsh and training_datasets (f-hafner, Jan 24, 2023)
7586fe6  tidy efficiency_test (f-hafner, Jan 24, 2023)
d218d54  set lsh params according to validation data (f-hafner, Jan 24, 2023)
94146a7  update docstrings; optimize lsh parameters (f-hafner, Jan 25, 2023)
575cb1d  small changes in lsh.py (f-hafner, Jan 25, 2023)
eb65bee  add __repr__ to lsh (f-hafner, Jan 25, 2023)
6907eca  improve docstrings, reorder imports (f-hafner, Jan 25, 2023)
f39fa94  further tidy efficiency tests (f-hafner, Jan 25, 2023)
20785e4  tidy docstring, add test for short mentions (f-hafner, Jan 25, 2023)
5e19915  some more comments, and reference online sources (f-hafner, Jan 25, 2023)
231ca45  make dirs for output of efficiency test if necessary (f-hafner, Jan 25, 2023)
3f06dac  use logging in with_coref (f-hafner, Jan 25, 2023)
5aa84db  add base_url argument to efficiency tests (f-hafner, Feb 15, 2023)
scripts/efficiency_test.py (167 additions, 15 deletions)
@@ -1,17 +1,107 @@
-import numpy as np
+import argparse
+import cProfile
+import logging
+import numpy as np
+import os
+import pickle
+import pandas as pd
+import pstats
+import requests
 
 from REL.training_datasets import TrainingEvaluationDatasets
 
 np.random.seed(seed=42)
 
-base_url = "/Users/vanhulsm/Desktop/projects/data/"
-wiki_version = "wiki_2014"
-datasets = TrainingEvaluationDatasets(base_url, wiki_version).load()["aida_testB"]
-
-# random_docs = np.random.choice(list(datasets.keys()), 50)
-
-server = True
+def profile_to_df(call):
+    """Helper function to profile a function call and save the timing in a pd df.
+
+    Source: https://stackoverflow.com/questions/44302726/pandas-how-to-store-cprofile-output-in-a-pandas-dataframe
+    """
+    cProfile.run(call, filename="temp.txt")
+    st = pstats.Stats("temp.txt")
+
+    keys_from_k = ['file', 'line', 'fn']
+    keys_from_v = ['cc', 'ncalls', 'tottime', 'cumtime', 'callers']
+    data = {k: [] for k in keys_from_k + keys_from_v}
+
+    s = st.stats
+
+    for k in s.keys():
+        for i, kk in enumerate(keys_from_k):
+            data[kk].append(k[i])
+
+        for i, kk in enumerate(keys_from_v):
+            data[kk].append(s[k][i])
+
+    df = pd.DataFrame(data)
+    os.remove('temp.txt')
+    return df
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--url",
+    dest="base_url",
+    type=str,
+    help="path to input and output data"
+)
+parser.add_argument(
+    '--search_corefs',
+    type=str,
+    choices=['all', 'lsh', 'off'],
+    default='all',
+    help="Setting for search_corefs in Entity Disambiguation."
+)
+parser.add_argument(
+    "--profile",
+    action="store_true",
+    default=False,
+    help="Profile the disambiguation step."
+)
+parser.add_argument(
+    "--scale_mentions",
+    action="store_true",
+    default=False,
+    help="""Stack mentions in each dataset and time the disambiguation step by document.
+    This is to assess the time complexity of the program."""
+)
+parser.add_argument(
+    "--name_dataset",
+    type=str,
+    default="aida_testB",
+    help="Name of the training dataset to be used"
+)
+parser.add_argument(
+    "--n_docs",
+    type=int,
+    default=50,
+    help="Number of documents to be processed."
+)
+logging.basicConfig(level=logging.INFO)  # do not print to file
+
+args = parser.parse_args()
+print(f"args.search_corefs is {args.search_corefs}")
+
+
+# base_url = "/home/flavio/projects/rel20/data"
+wiki_version = "wiki_2019"
+datasets = TrainingEvaluationDatasets(args.base_url, wiki_version, args.search_corefs).load()[args.name_dataset]
+
+# create directories where to save the output from the tests
+dir_efficiency_test = os.path.join(args.base_url, "efficiency_test")
+sub_directories = {
+    "profile": "profile",
+    "predictions": "predictions",
+    "n_mentions_time": "n_mentions_time"
+}
+sub_directories = {k: os.path.join(dir_efficiency_test, v) for k, v in sub_directories.items()}
+
+for d in sub_directories.values():
+    if not os.path.exists(d):
+        os.makedirs(d)
+
+
+server = False
 docs = {}
 for i, doc in enumerate(datasets):
     sentences = []
@@ -20,8 +110,8 @@
             sentences.append(x["sentence"])
     text = ". ".join([x for x in sentences])
 
-    if len(docs) == 50:
-        print("length docs is 50.")
+    if len(docs) == args.n_docs:
+        print(f"length docs is {args.n_docs}.")
         print("====================")
         break

@@ -56,11 +146,11 @@
     from REL.entity_disambiguation import EntityDisambiguation
     from REL.mention_detection import MentionDetection
 
-    base_url = "C:/Users/mickv/desktop/data_back/"
+    # base_url = "C:/Users/mickv/desktop/data_back/"  # why is this defined again here?
 
-    flair.device = torch.device("cuda:0")
+    flair.device = torch.device("cpu")
 
-    mention_detection = MentionDetection(base_url, wiki_version)
+    mention_detection = MentionDetection(args.base_url, wiki_version)
 
     # Alternatively use Flair NER tagger.
     tagger_ner = SequenceTagger.load("ner-fast")
@@ -72,11 +162,73 @@
     # 3. Load model.
     config = {
         "mode": "eval",
-        "model_path": "{}/{}/generated/model".format(base_url, wiki_version),
+        "model_path": "{}/{}/generated/model".format(args.base_url, wiki_version),
     }
-    model = EntityDisambiguation(base_url, wiki_version, config)
+    model = EntityDisambiguation(args.base_url, wiki_version, config, search_corefs=args.search_corefs)
 
     # 4. Entity disambiguation.
     start = time()
     predictions, timing = model.predict(mentions_dataset)
     print("ED took: {}".format(time() - start))
 
+    output = {
+        "mentions": mentions_dataset,
+        "predictions": predictions,
+        "timing": timing
+    }
+
+    iteration_identifier = f"{args.name_dataset}_{args.n_docs}_{args.search_corefs}"
+    filename = os.path.join(sub_directories["predictions"], iteration_identifier)
+
+    with open(f"{filename}.pickle", "wb") as f:
+        pickle.dump(output, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+    # ## 4.b Profile the disambiguation part
+    if args.profile:
+        print("Profiling disambiguation")
+        filename = os.path.join(sub_directories["profile"], iteration_identifier)
+
+        df_stats = profile_to_df(call="model.predict(mentions_dataset)")
+        df_stats.to_csv(f"{filename}.csv", index=False)
+
+    # ## 4.c time disambiguation by document, vary number of mentions
+    if args.scale_mentions:
+        print("Scaling the mentions per document")
+        logging.basicConfig(level=logging.DEBUG)
+        mentions_dataset_scaled = {}
+
+        for k, data in mentions_dataset.items():
+            mentions_dataset_scaled[k] = data  # add the baseline data as in mentions_dataset
+            for f in [5, 50, 100]:
+                d = data * f
+                key = f"{k}_{f}"
+                mentions_dataset_scaled[key] = d
+
+        print("Timing disambiguation per document")
+        timing_by_dataset = {}
+        for name, mentions in mentions_dataset_scaled.items():
+            print(f"predicting for dataset {name}", flush=True)
+            tempdict = {name: mentions}  # format so that model.predict() works
+            start = time()
+            predictions, timing = model.predict(tempdict)
+            t = time() - start
+
+            timing_by_dataset[name] = {
+                "n_mentions": len(mentions),
+                "time": t
+            }
+
+            if args.profile:
+                print("Profiling disambiguation for synthetic data set")
+                df_profile = profile_to_df(call="model.predict(tempdict)")
+                timing_by_dataset[name]['profile'] = df_profile
+
+        # save timing by dataset
+        filename = os.path.join(sub_directories["n_mentions_time"], f"{args.name_dataset}_{args.search_corefs}")
+
+        with open(f"{filename}.pickle", "wb") as f:
+            pickle.dump(timing_by_dataset, f, protocol=pickle.HIGHEST_PROTOCOL)
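
For reference, a sketch of how the pickled timings written above might be read back for analysis. The path fragments mirror what the script writes; base_url is an assumption standing in for whatever was passed as --url:

import os
import pickle

import pandas as pd

base_url = "/path/to/data"  # assumption: the value passed as --url
name_dataset, search_corefs = "aida_testB", "lsh"

path = os.path.join(base_url, "efficiency_test", "n_mentions_time",
                    f"{name_dataset}_{search_corefs}.pickle")
with open(path, "rb") as f:
    timing_by_dataset = pickle.load(f)

# One row per (possibly stacked) document: mention count vs. ED wall-clock time.
df = pd.DataFrame(
    [{"dataset": k, "n_mentions": v["n_mentions"], "time": v["time"]}
     for k, v in timing_by_dataset.items()]
)
print(df.sort_values("n_mentions"))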




scripts/run_efficiency_tests.sh (42 additions, 0 deletions)
@@ -0,0 +1,42 @@

+BASE_URL="$1"
+
+DATASETS=("aida_testB")
+DOCSIZES=(50 500)
+COREF_OPTIONS=("all" "off" "lsh")
+
+echo "${DATASETS[@]}"
+
+echo "--Running efficiency tests by data set, n_docs and coref option--"
+
+# do profiling and checking predictions in one
+for size in "${DOCSIZES[@]}"; do
+    for ds in "${DATASETS[@]}"; do
+        for option in "${COREF_OPTIONS[@]}"; do
+            echo "$ds, $size, $option"
+            python scripts/efficiency_test.py \
+                --url "$BASE_URL" \
+                --profile \
+                --n_docs "$size" \
+                --name_dataset "$ds" \
+                --search_corefs "$option"
+        done
+    done
+done
+
+# echo "--Scaling number of mentions--"
+
+# for ds in "${DATASETS[@]}"; do
+#     echo "$ds"
+#     python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "all"
+#     python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "lsh"
+#     python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "off"
+# done
+
+echo "Done."
src/REL/entity_disambiguation.py (20 additions, 4 deletions)
@@ -32,7 +32,12 @@ class EntityDisambiguation:
     Parent Entity Disambiguation class that directs the various subclasses used
     for the ED step.
     """
-    def __init__(self, base_url, wiki_version, user_config, reset_embeddings=False):
+    def __init__(self, base_url, wiki_version, user_config, reset_embeddings=False, search_corefs="all"):
+        """
+        Argument search_corefs: one of 'all' (default), 'lsh', 'off'.
+        If 'off', no coreference search is done.
+        Otherwise, the value is passed on as the `search_corefs_in` argument of `with_coref`.
+        """
         self.base_url = base_url
         self.wiki_version = wiki_version
         self.embeddings = {}
@@ -53,7 +58,9 @@ def __init__(self, base_url, wiki_version, user_config, reset_embeddings=False):
), "Glove embeddings in wrong folder..? Test embedding not found.."

self.__load_embeddings()
self.coref = TrainingEvaluationDatasets(base_url, wiki_version)
assert search_corefs in ['all', 'lsh', 'off']
self.search_corefs = search_corefs
self.coref = TrainingEvaluationDatasets(base_url, wiki_version, search_corefs)
self.prerank_model = PreRank(self.config).to(self.device)

self.__max_conf = None
@@ -470,7 +477,9 @@ def predict(self, data):
         :return: predictions and time taken for the ED step.
         """
 
-        self.coref.with_coref(data)
+        if self.search_corefs != "off":
+            self.coref.with_coref(data, search_corefs_in=self.search_corefs)
+
         data = self.get_data_items(data, "raw", predict=True)
         predictions, timing = self.__predict(data, include_timing=True, eval_raw=True)
 
@@ -664,7 +673,12 @@ def __predict(self, data, include_timing=False, eval_raw=False):
             ]
             doc_names = [m["doc_name"] for m in batch]
 
-            for dname, entity in zip(doc_names, pred_entities):
+            if self.search_corefs != 'off':
+                coref_indicators = [m['raw']['is_coref'] for m in batch]
+            else:
+                coref_indicators = [None for m in batch]
+
+            for dname, entity, is_coref in zip(doc_names, pred_entities, coref_indicators):
                 if entity[0] != "NIL":
                     predictions[dname].append(
                         {
@@ -673,6 +687,7 @@ def __predict(self, data, include_timing=False, eval_raw=False):
"candidates": entity[2],
"conf_ed": entity[4],
"scores": list([str(x) for x in entity[3]]),
"is_coref": is_coref
}
)

@@ -683,6 +698,7 @@ def __predict(self, data, include_timing=False, eval_raw=False):
"prediction": entity[0],
"candidates": entity[2],
"scores": [],
"is_coref": is_coref
}
)
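
Taken together, the new switch is used from the caller's side as below; a minimal sketch with placeholder paths, not code from this PR:

from REL.entity_disambiguation import EntityDisambiguation

base_url = "/path/to/data"  # placeholder
wiki_version = "wiki_2019"
config = {
    "mode": "eval",
    "model_path": f"{base_url}/{wiki_version}/generated/model",
}

# "all" keeps the exhaustive coreference search, "lsh" restricts it to
# LSH candidates, "off" skips the coreference step entirely.
model = EntityDisambiguation(base_url, wiki_version, config, search_corefs="lsh")
# mentions_dataset: output of MentionDetection.find_mentions, as in the efficiency test
predictions, timing = model.predict(mentions_dataset)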
