From 53540b3184480bc43bd53c6883010f6592362095 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Fri, 25 Nov 2022 13:29:28 +0100 Subject: [PATCH 01/43] add option for whether with_coref() should be used --- scripts/efficiency_test.py | 45 ++++++++++++++++++++++++++------ src/REL/entity_disambiguation.py | 9 ++++--- src/REL/training_datasets.py | 8 +++--- 3 files changed, 48 insertions(+), 14 deletions(-) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index fc66f54..3190e17 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -1,17 +1,27 @@ import numpy as np import requests +import argparse +import pickle from REL.training_datasets import TrainingEvaluationDatasets np.random.seed(seed=42) -base_url = "/Users/vanhulsm/Desktop/projects/data/" -wiki_version = "wiki_2014" -datasets = TrainingEvaluationDatasets(base_url, wiki_version).load()["aida_testB"] +parser = argparse.ArgumentParser() +parser.add_argument("--use_corefs", action="store_true", help="use function with_coref()?", default=False) +args = parser.parse_args() +print(f"args.use_corefs is {args.use_corefs}") + + + +base_url = "/home/flavio/projects/rel20/data" +wiki_version = "wiki_2019" +datasets = TrainingEvaluationDatasets(base_url, wiki_version, args.use_corefs).load()["aida_testB"] + # datasets are loaded here, then processed and stored in docs, which is then used to check the efficiency # random_docs = np.random.choice(list(datasets.keys()), 50) -server = True +server = False docs = {} for i, doc in enumerate(datasets): sentences = [] @@ -56,9 +66,9 @@ from REL.entity_disambiguation import EntityDisambiguation from REL.mention_detection import MentionDetection - base_url = "C:/Users/mickv/desktop/data_back/" + # base_url = "C:/Users/mickv/desktop/data_back/" # why is this defined again here? - flair.device = torch.device("cuda:0") + flair.device = torch.device("cpu") mention_detection = MentionDetection(base_url, wiki_version) @@ -66,7 +76,9 @@ tagger_ner = SequenceTagger.load("ner-fast") start = time() - mentions_dataset, n_mentions = mention_detection.find_mentions(docs, tagger_ner) + mentions_dataset, n_mentions = mention_detection.find_mentions(docs, tagger_ner) # TODO: here corefs have an impact! check how. + # but what we do in the mention detection here has no impact on what we below in ED. + # so would we expect an effect here, or only below? print("MD took: {}".format(time() - start)) # 3. Load model. @@ -74,9 +86,26 @@ "mode": "eval", "model_path": "{}/{}/generated/model".format(base_url, wiki_version), } - model = EntityDisambiguation(base_url, wiki_version, config) + model = EntityDisambiguation(base_url, wiki_version, config, use_corefs=args.use_corefs) + # model.coref is a training data set + # model.coref has method with_coref + # compare the training data sets when using corefs and when not + # note that the data are loaded elsewhere! so not sure this is the right place to add the option? # 4. Entity disambiguation. 
start = time() predictions, timing = model.predict(mentions_dataset) print("ED took: {}".format(time() - start)) + + output = { + "predictions": predictions, + "timing": timing + } + fn = f"{base_url}/efficiency_test/output" + if not args.use_corefs: + fn = f"{fn}_nocoref" + + with open(f"{fn}.pickle", "wb") as f: + pickle.dump(output, f, protocol=pickle.HIGHEST_PROTOCOL) + + diff --git a/src/REL/entity_disambiguation.py b/src/REL/entity_disambiguation.py index 838670c..e7b0e12 100644 --- a/src/REL/entity_disambiguation.py +++ b/src/REL/entity_disambiguation.py @@ -32,7 +32,7 @@ class EntityDisambiguation: Parent Entity Disambiguation class that directs the various subclasses used for the ED step. """ - def __init__(self, base_url, wiki_version, user_config, reset_embeddings=False): + def __init__(self, base_url, wiki_version, user_config, reset_embeddings=False, use_corefs=True): self.base_url = base_url self.wiki_version = wiki_version self.embeddings = {} @@ -53,7 +53,8 @@ def __init__(self, base_url, wiki_version, user_config, reset_embeddings=False): ), "Glove embeddings in wrong folder..? Test embedding not found.." self.__load_embeddings() - self.coref = TrainingEvaluationDatasets(base_url, wiki_version) + self.use_corefs = use_corefs + self.coref = TrainingEvaluationDatasets(base_url, wiki_version, use_corefs) self.prerank_model = PreRank(self.config).to(self.device) self.__max_conf = None @@ -470,7 +471,9 @@ def predict(self, data): :return: predictions and time taken for the ED step. """ - self.coref.with_coref(data) + if self.use_corefs: + self.coref.with_coref(data) + data = self.get_data_items(data, "raw", predict=True) predictions, timing = self.__predict(data, include_timing=True, eval_raw=True) diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index 0e41d62..9052322 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -9,11 +9,12 @@ class TrainingEvaluationDatasets: Reading dataset from CoNLL dataset, extracted by https://github.com/dalab/deep-ed/ """ - def __init__(self, base_url, wiki_version): + def __init__(self, base_url, wiki_version, use_corefs=True): self.person_names = self.__load_person_names( os.path.join(base_url, "generic/p_e_m_data/persons.txt") ) self.base_url = os.path.join(base_url, wiki_version) + self.use_corefs = use_corefs def load(self): """ @@ -44,7 +45,8 @@ def load(self): if "Jiří_Třanovský Jiří_Třanovský" in datasets[ds]: del datasets[ds]["Jiří_Třanovský Jiří_Třanovský"] - self.with_coref(datasets[ds]) + if self.use_corefs: + self.with_coref(datasets[ds]) return datasets @@ -109,7 +111,7 @@ def with_coref(self, dataset): :return: dataset """ - + print("with_coref() is called.") for data_name, content in dataset.items(): for cur_m in content: coref = self.__find_coref(cur_m, content) From cdd8c7b439256397205d4cab60ada28d1d1dbcde Mon Sep 17 00:00:00 2001 From: f-hafner Date: Wed, 7 Dec 2022 17:31:36 +0100 Subject: [PATCH 02/43] time ED for different dataset sizes --- scripts/efficiency_test.py | 41 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index 3190e17..f9eff3e 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -97,6 +97,39 @@ predictions, timing = model.predict(mentions_dataset) print("ED took: {}".format(time() - start)) + + # scale the number of mentions + # max_scaling_factor = 10 + # steps = 5 + + mentions_dataset_scaled = {} + + for k, data in mentions_dataset.items(): + 
mentions_dataset_scaled[k] = data # add the baseline data as in mentions_dataset + for f in [5, 10, 50, 100]: + d = data * f + key = f"{k}_{f}" + mentions_dataset_scaled[key] = d + + timing_by_dataset = {} + for name, mentions in mentions_dataset_scaled.items(): + print(f"predicting for dataset {name}") + tempdict = {name: mentions} # format so that model.predict() works + start = time() + predictions, timing = model.predict(tempdict) + t = time() - start + timing_by_dataset[name] = { + "n_mentions": len(mentions), + "time": t + } + + import cProfile + fn = f"{base_url}/efficiency_test/profile_predict" + if not args.use_corefs: + fn = f"{fn}_nocoref" + # cProfile.run("model.predict(mentions_dataset_scaled)", sort=1, filename=fn) + # breakpoint() + output = { "predictions": predictions, "timing": timing @@ -107,5 +140,13 @@ with open(f"{fn}.pickle", "wb") as f: pickle.dump(output, f, protocol=pickle.HIGHEST_PROTOCOL) + + # save timing by dataet + fn_time_dataset = f"{base_url}/efficiency_test/time_dataset" + if not args.use_corefs: + fn_time_dataset = f"{fn_time_dataset}_nocoref" + + with open(f"{fn_time_dataset}.pickle", "wb") as f: + pickle.dump(timing_by_dataset, f, protocol=pickle.HIGHEST_PROTOCOL) From aa9c7412a7d6607af9a42563d63d0ca346d707d0 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Wed, 14 Dec 2022 10:13:31 +0100 Subject: [PATCH 03/43] change coref switch name, more efficiency tests - with_coref option: change name to no_corefs, default False - add options to efficiency test - bash script for multiple runs with efficiency test and different options --- scripts/efficiency_test.py | 202 +++++++++++++++++++++++-------- scripts/run_efficiency_tests.sh | 31 +++++ src/REL/entity_disambiguation.py | 8 +- src/REL/training_datasets.py | 6 +- 4 files changed, 187 insertions(+), 60 deletions(-) create mode 100644 scripts/run_efficiency_tests.sh diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index f9eff3e..3410122 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -3,21 +3,86 @@ import argparse import pickle + from REL.training_datasets import TrainingEvaluationDatasets np.random.seed(seed=42) parser = argparse.ArgumentParser() -parser.add_argument("--use_corefs", action="store_true", help="use function with_coref()?", default=False) +parser.add_argument( + "--no_corefs", + action="store_true", + help="use function with_coref()?", + default=False) + +parser.add_argument( + "--profile", + action="store_true", + default=False, + help="Profile the disambiguation step." + ) +parser.add_argument( + "--scale_mentions", + action="store_true", + default=False, + help="""Stack mentions in each dataset and time the disambiguation step by document. + This is to assess the time complexity of the program.""" + ) +parser.add_argument( + "--name_dataset", + type=str, + default="aida_testB", + help="Name of the training dataset to be used" +) +parser.add_argument( + "--n_docs", + type=int, + default=50, + help="Number of documents to be processed." 
+) + +# helper function to profile a call and save the timing in a pd dataframe +def profile_to_df(call): + cProfile.run(call, filename="temp.txt") + st = pstats.Stats("temp.txt") + + keys_from_k = ['file', 'line', 'fn'] + keys_from_v = ['cc', 'ncalls', 'tottime', 'cumtime', 'callers'] + data = {k: [] for k in keys_from_k + keys_from_v} + + s = st.stats + + for k in s.keys(): + for i, kk in enumerate(keys_from_k): + data[kk].append(k[i]) + + for i, kk in enumerate(keys_from_v): + data[kk].append(s[k][i]) + + df = pd.DataFrame(data) + os.remove('temp.txt') + return df + + + +# TODO: +# make log files!? +# adjust folder structure on computer and in script + args = parser.parse_args() -print(f"args.use_corefs is {args.use_corefs}") +print(f"args.no_corefs is {args.no_corefs}") +if args.profile: + import cProfile + import pandas as pd + import pstats + import os base_url = "/home/flavio/projects/rel20/data" wiki_version = "wiki_2019" -datasets = TrainingEvaluationDatasets(base_url, wiki_version, args.use_corefs).load()["aida_testB"] - # datasets are loaded here, then processed and stored in docs, which is then used to check the efficiency +datasets = TrainingEvaluationDatasets(base_url, wiki_version, args.no_corefs).load()[args.name_dataset] +save_data_to = f"{base_url}/efficiency_test/" # save all recorded in this directory # random_docs = np.random.choice(list(datasets.keys()), 50) @@ -30,8 +95,8 @@ sentences.append(x["sentence"]) text = ". ".join([x for x in sentences]) - if len(docs) == 50: - print("length docs is 50.") + if len(docs) == args.n_docs: + print(f"length docs is {args.n_docs}.") print("====================") break @@ -86,7 +151,7 @@ "mode": "eval", "model_path": "{}/{}/generated/model".format(base_url, wiki_version), } - model = EntityDisambiguation(base_url, wiki_version, config, use_corefs=args.use_corefs) + model = EntityDisambiguation(base_url, wiki_version, config, no_corefs=args.no_corefs) # TODO: change to no_corefs to be consistent! 
# model.coref is a training data set # model.coref has method with_coref # compare the training data sets when using corefs and when not @@ -97,56 +162,87 @@ predictions, timing = model.predict(mentions_dataset) print("ED took: {}".format(time() - start)) - - # scale the number of mentions - # max_scaling_factor = 10 - # steps = 5 - - mentions_dataset_scaled = {} - - for k, data in mentions_dataset.items(): - mentions_dataset_scaled[k] = data # add the baseline data as in mentions_dataset - for f in [5, 10, 50, 100]: - d = data * f - key = f"{k}_{f}" - mentions_dataset_scaled[key] = d - - timing_by_dataset = {} - for name, mentions in mentions_dataset_scaled.items(): - print(f"predicting for dataset {name}") - tempdict = {name: mentions} # format so that model.predict() works - start = time() - predictions, timing = model.predict(tempdict) - t = time() - start - timing_by_dataset[name] = { - "n_mentions": len(mentions), - "time": t - } - - import cProfile - fn = f"{base_url}/efficiency_test/profile_predict" - if not args.use_corefs: - fn = f"{fn}_nocoref" - # cProfile.run("model.predict(mentions_dataset_scaled)", sort=1, filename=fn) - # breakpoint() - output = { "predictions": predictions, "timing": timing } - fn = f"{base_url}/efficiency_test/output" - if not args.use_corefs: - fn = f"{fn}_nocoref" - - with open(f"{fn}.pickle", "wb") as f: - pickle.dump(output, f, protocol=pickle.HIGHEST_PROTOCOL) - - # save timing by dataet - fn_time_dataset = f"{base_url}/efficiency_test/time_dataset" - if not args.use_corefs: - fn_time_dataset = f"{fn_time_dataset}_nocoref" - with open(f"{fn_time_dataset}.pickle", "wb") as f: - pickle.dump(timing_by_dataset, f, protocol=pickle.HIGHEST_PROTOCOL) + filename = f"{save_data_to}predictions/{args.name_dataset}_{args.n_docs}" + if args.no_corefs: + filename = f"{filename}_nocoref" + + with open(f"{filename}.pickle", "wb") as f: + pickle.dump(output, f, protocol=pickle.HIGHEST_PROTOCOL) + + # ## 4.b Profile disambiguation + if args.profile: + print("Profiling disambiguation") + filename = f"{save_data_to}profile/{args.name_dataset}_{args.n_docs}" + if args.no_corefs: + filename = f"{filename}_nocoref" + + df_stats = profile_to_df(call="model.predict(mentions_dataset)") + # cProfile.run("model.predict(mentions_dataset)", filename="temp.txt") + # st = pstats.Stats("temp.txt") + + # keys_from_k = ['file', 'line', 'fn'] + # keys_from_v = ['cc', 'ncalls', 'tottime', 'cumtime', 'callers'] + # data = {k: [] for k in keys_from_k + keys_from_v} + + # s = st.stats + + # for k in s.keys(): + # for i, kk in enumerate(keys_from_k): + # data[kk].append(k[i]) + + # for i, kk in enumerate(keys_from_v): + # data[kk].append(s[k][i]) + + # df_stats = pd.DataFrame(data) + # os.remove('temp.txt') + + df_stats.to_csv(f"{filename}.csv", index=False) + + + # ## 4.c time disambiguation by document, vary number of mentions + if args.scale_mentions: + print("Scaling the mentions per document") + mentions_dataset_scaled = {} + + for k, data in mentions_dataset.items(): + mentions_dataset_scaled[k] = data # add the baseline data as in mentions_dataset + for f in [5, 10, 50, 100]: + d = data * f + key = f"{k}_{f}" + mentions_dataset_scaled[key] = d + + print("Timing disambiguation per document") + timing_by_dataset = {} + for name, mentions in mentions_dataset_scaled.items(): + print(f"predicting for dataset {name}") + tempdict = {name: mentions} # format so that model.predict() works + start = time() + predictions, timing = model.predict(tempdict) + t = time() - start + + 
timing_by_dataset[name] = { + "n_mentions": len(mentions), + "time": t + } + + if args.profile: + df_profile = profile_to_df(call="model.predict(tempdict)") + timing_by_dataset[name]['profile'] = df_profile + + + # save timing by dataet + filename = f"{save_data_to}n_mentions_time/{args.name_dataset}" + if args.no_corefs: + filename = f"{filename}_nocoref" + + with open(f"{filename}.pickle", "wb") as f: + pickle.dump(timing_by_dataset, f, protocol=pickle.HIGHEST_PROTOCOL) + + diff --git a/scripts/run_efficiency_tests.sh b/scripts/run_efficiency_tests.sh new file mode 100644 index 0000000..b199c3a --- /dev/null +++ b/scripts/run_efficiency_tests.sh @@ -0,0 +1,31 @@ + + + +datasets=("aida_testB") + +docsizes=(50 500) + + +echo $datasets + +# do profiling and checking predictions in one +# for size in ${docsizes[@]}; do +# for ds in ${datasets[@]}; do +# echo $ds, echo $size +# python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" +# python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --no_corefs +# done +# done + + +for ds in ${datasets[@]}; do + echo $ds + python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile + python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --no_corefs +done + + + + + + diff --git a/src/REL/entity_disambiguation.py b/src/REL/entity_disambiguation.py index e7b0e12..14b29b1 100644 --- a/src/REL/entity_disambiguation.py +++ b/src/REL/entity_disambiguation.py @@ -32,7 +32,7 @@ class EntityDisambiguation: Parent Entity Disambiguation class that directs the various subclasses used for the ED step. """ - def __init__(self, base_url, wiki_version, user_config, reset_embeddings=False, use_corefs=True): + def __init__(self, base_url, wiki_version, user_config, reset_embeddings=False, no_corefs=False): self.base_url = base_url self.wiki_version = wiki_version self.embeddings = {} @@ -53,8 +53,8 @@ def __init__(self, base_url, wiki_version, user_config, reset_embeddings=False, ), "Glove embeddings in wrong folder..? Test embedding not found.." self.__load_embeddings() - self.use_corefs = use_corefs - self.coref = TrainingEvaluationDatasets(base_url, wiki_version, use_corefs) + self.no_corefs = no_corefs + self.coref = TrainingEvaluationDatasets(base_url, wiki_version, no_corefs) self.prerank_model = PreRank(self.config).to(self.device) self.__max_conf = None @@ -471,7 +471,7 @@ def predict(self, data): :return: predictions and time taken for the ED step. 
""" - if self.use_corefs: + if not self.no_corefs: self.coref.with_coref(data) data = self.get_data_items(data, "raw", predict=True) diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index 9052322..2e691ae 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -9,12 +9,12 @@ class TrainingEvaluationDatasets: Reading dataset from CoNLL dataset, extracted by https://github.com/dalab/deep-ed/ """ - def __init__(self, base_url, wiki_version, use_corefs=True): + def __init__(self, base_url, wiki_version, no_corefs=False): self.person_names = self.__load_person_names( os.path.join(base_url, "generic/p_e_m_data/persons.txt") ) self.base_url = os.path.join(base_url, wiki_version) - self.use_corefs = use_corefs + self.no_corefs = no_corefs def load(self): """ @@ -45,7 +45,7 @@ def load(self): if "Jiří_Třanovský Jiří_Třanovský" in datasets[ds]: del datasets[ds]["Jiří_Třanovský Jiří_Třanovský"] - if self.use_corefs: + if not self.no_corefs: self.with_coref(datasets[ds]) return datasets From 463245d80e4e6438da87cce7566f5d30c1a2f19f Mon Sep 17 00:00:00 2001 From: f-hafner Date: Fri, 16 Dec 2022 11:06:38 +0100 Subject: [PATCH 04/43] add coreference indicator to prediction output --- src/REL/entity_disambiguation.py | 11 +++++++++-- src/REL/training_datasets.py | 3 +++ src/REL/utils.py | 2 ++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/REL/entity_disambiguation.py b/src/REL/entity_disambiguation.py index 14b29b1..a1423bf 100644 --- a/src/REL/entity_disambiguation.py +++ b/src/REL/entity_disambiguation.py @@ -473,7 +473,7 @@ def predict(self, data): if not self.no_corefs: self.coref.with_coref(data) - + data = self.get_data_items(data, "raw", predict=True) predictions, timing = self.__predict(data, include_timing=True, eval_raw=True) @@ -667,7 +667,12 @@ def __predict(self, data, include_timing=False, eval_raw=False): ] doc_names = [m["doc_name"] for m in batch] - for dname, entity in zip(doc_names, pred_entities): + if not self.no_corefs: + coref_indicators = [m['raw']['is_coref'] for m in batch] + else: + coref_indicators = [None for m in batch] + + for dname, entity, is_coref in zip(doc_names, pred_entities, coref_indicators): if entity[0] != "NIL": predictions[dname].append( { @@ -676,6 +681,7 @@ def __predict(self, data, include_timing=False, eval_raw=False): "candidates": entity[2], "conf_ed": entity[4], "scores": list([str(x) for x in entity[3]]), + "is_coref": is_coref } ) @@ -686,6 +692,7 @@ def __predict(self, data, include_timing=False, eval_raw=False): "prediction": entity[0], "candidates": entity[2], "scores": [], + "is_coref": is_coref } ) diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index 2e691ae..005d43c 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -125,3 +125,6 @@ def with_coref(self, dataset): cur_m["candidates"] = sorted( list(cur_cands.items()), key=lambda x: x[1] )[::-1] + cur_m["is_coref"] = 1 + else: + cur_m["is_coref"] = 0 diff --git a/src/REL/utils.py b/src/REL/utils.py index 1e52cf6..2abef0d 100644 --- a/src/REL/utils.py +++ b/src/REL/utils.py @@ -90,6 +90,7 @@ def process_results( idx = ment["sent_idx"] start_pos = ment["pos"] mention_length = int(ment["end_pos"] - ment["pos"]) + is_coref = pred['is_coref'] if pred["prediction"] != "NIL": temp = ( @@ -100,6 +101,7 @@ def process_results( pred["conf_ed"], ment["conf_md"] if "conf_md" in ment else 0.0, ment["tag"] if "tag" in ment else "NULL", + is_coref, ) res_doc.append(temp) res[doc] = res_doc 
From 93fa8f5dcd24d315cfe6080fe7024a181a9ac8d2 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Tue, 20 Dec 2022 14:47:52 +0100 Subject: [PATCH 05/43] efficiency test: also pickle data after mention detection --- scripts/efficiency_test.py | 1 + scripts/run_efficiency_tests.sh | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index 3410122..4fc22a2 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -163,6 +163,7 @@ def profile_to_df(call): print("ED took: {}".format(time() - start)) output = { + "mentions": mentions_dataset, "predictions": predictions, "timing": timing } diff --git a/scripts/run_efficiency_tests.sh b/scripts/run_efficiency_tests.sh index b199c3a..1824e30 100644 --- a/scripts/run_efficiency_tests.sh +++ b/scripts/run_efficiency_tests.sh @@ -9,20 +9,20 @@ docsizes=(50 500) echo $datasets # do profiling and checking predictions in one -# for size in ${docsizes[@]}; do -# for ds in ${datasets[@]}; do -# echo $ds, echo $size -# python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" -# python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --no_corefs -# done -# done +for size in ${docsizes[@]}; do + for ds in ${datasets[@]}; do + echo $ds, echo $size + python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" + python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --no_corefs + done +done -for ds in ${datasets[@]}; do - echo $ds - python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile - python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --no_corefs -done +# for ds in ${datasets[@]}; do +# echo $ds +# python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile +# python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --no_corefs +# done From c218ce1b847be008ed8bf3d9d0ce20bf710029cf Mon Sep 17 00:00:00 2001 From: f-hafner Date: Fri, 6 Jan 2023 10:45:12 +0100 Subject: [PATCH 06/43] integrate lsh and first run --- src/REL/lsh.py | 190 +++++++++++++++++++++++++++++++++++ src/REL/training_datasets.py | 48 ++++++++- 2 files changed, 236 insertions(+), 2 deletions(-) create mode 100644 src/REL/lsh.py diff --git a/src/REL/lsh.py b/src/REL/lsh.py new file mode 100644 index 0000000..1e2c86a --- /dev/null +++ b/src/REL/lsh.py @@ -0,0 +1,190 @@ + +from random import shuffle, seed +import time +import numpy as np +seed(3) + +def split_mention(m): + return m.split(" ") + +def k_shingle(s, k): + "convert string s into shingles of length k" + shingle = [] + for i in range(len(s) - k + 1): + shingle.append(s[i:(i+k)]) + return shingle + + +def partition_signature(s, b): + "Convert signature s into b partitions of equal size" + assert len(s) % b == 0 + rg = int(len(s) / b) + partitions = [] + for i in range(0, len(s), rg): + v = s[i:i+rg] + partitions.append(v) + return partitions + +def cols_to_int(a): + "combine columns in all rows to an integer: [[1,20,3], [1,4,10]] becomes [1203,1410]" + existing_powers = np.floor(np.log10(a)) + nrows, ncols = a.shape + + cumsum_powers = np.fliplr(np.cumsum(np.fliplr(existing_powers), axis=1)) + + add_powers = [x for x in reversed(range(ncols))] + add_powers = np.tile(add_powers, (nrows, 1)) + + mult_factor = cumsum_powers - existing_powers + add_powers + summationvector = np.ones((ncols, 1)) + out = np.matmul(a * 10**mult_factor, 
summationvector) + return out + + + +def idx_unique_multidim(a): + "groups rows in a multidimensional arrays by their unique signature" + # a = cols_to_int(a).squeeze() # wrong + # a = cols_to_string(a).squeeze() # slow + a = cols_to_int(a).squeeze() + sort_idx = np.argsort(a) + sort_idx + a_sorted = a[sort_idx] + unq_first = np.concatenate(([True], a_sorted[1:] != a_sorted[:-1])) # "is the current value different from the previous?". the concat of [True]: because the first occurrence is always True (ie the first time it occur) + unq_items = a_sorted[unq_first] + unq_count = np.diff(np.nonzero(unq_first)[0]) # np.nonzero(unq_first)[0] gives the indices of first elements in a_sorted + unq_idx = np.split(sort_idx, np.cumsum(unq_count)) + return unq_idx + + +def reshape_rows_reps(a): + "reshape a 3-d array of n_reps x n_rows x n_cols to n_rows x n_reps x n_cols" + n_reps, n_rows, n_cols = a.shape + a = a.reshape(n_reps*n_rows, n_cols) + # extractor indices: for 3 reps, 2 rows: [0,2,4,1,3,5]. to reorder a + # in other words: goes from 0 to (n_reps * n_rows). step sizes are n_rows. starts are the row indices + idx = np.arange(n_reps*n_rows).reshape(n_reps, n_rows).T.reshape(-1,1) + a = np.take_along_axis(a, idx, axis=0) + a = a.reshape(n_rows, n_reps, n_cols) + return a + +def minhash_signature_np(x, n_reps): + """Make a minhash signature of array x with length n_reps. + + Inputs + ------ + x: axis 0 are observations, columns are binary one-hot encoded vectors + """ + # get indices + indices = np.arange(x.shape[1]) + rng = np.random.default_rng(12345) + + # expand by n_reps + indices_mult = np.tile(indices, (n_reps, 1)) # reorder the columns n_reps times + x_mult = np.tile(x, (n_reps, 1)).reshape((n_reps,) + x.shape) # new shape: (n_resp, x.shape[0], x.shape[1 + + # permute indices and apply to x_mult + permuted_indices = rng.permuted(indices_mult, axis=1) + x_mult_permuted = np.take_along_axis(x_mult, permuted_indices[:, np.newaxis], 2) + + # for the reduction below, need to have all samples of the same observation in one block + x_mult_permuted = reshape_rows_reps(x_mult_permuted) + + # make signature + sig = x_mult_permuted.argmax(axis=2) + return sig + + +class LSHBase: + # Important: order of occurences in shingles and vectors = order of input list (=order of occurrence in document) + def __init__(self, mentions, shingle_size): + if isinstance(mentions, dict): + self.shingles = [k_shingle(m, shingle_size) for m in mentions.values()] + elif isinstance(mentions, list): + self.shingles = [k_shingle(m, shingle_size) for m in mentions] + + def _build_vocab(self): + # shingles = [v["shingles"] for v in self.mentions.values()] + vocab = list(set([shingle for sublist in self.shingles for shingle in sublist])) + self.vocab = vocab + + def encode_binary(self, to_numpy=False): + vectors = [[1 if word in cur_shingles else 0 for word in self.vocab] for cur_shingles in self.shingles] + if not to_numpy: + self.vectors = vectors + else: + self.vectors = np.stack(vectors) + + +class LSHMinHash(LSHBase): + "LSH with MinHashing and numpy" + + def __init__(self, mentions, shingle_size, signature_size, band_length): + super().__init__(mentions, shingle_size) + if signature_size % band_length != 0: + raise ValueError("Signature needs to be divisible into equal-sized bands.") + self.signature_size = signature_size + self.band_length = band_length + + def make_signature(self): + "make array of dense vectors with MinHashing. 
each row is one mention" + templist = [] + rng = np.random.default_rng(seed=3) + i = 0 + while i < self.signature_size: + rng.shuffle(self.vectors, axis=1) + sig_i = 1 + self.vectors.argmax(axis=1) # add one for the log10 operations in idx_unique_multidim + templist.append(sig_i) + i += 1 + + self.signature = np.stack(templist, axis=1) + + def make_signature_np(self): + signature = minhash_signature_np(self.vectors, self.signature_size) + self.signature = signature + np.ones(signature.shape) # this is for the log10 operations: do not want to have 0s + + def all_candidates_to_all(self): + "fall-back option to return the non-clustered input: each mention is a candidate coreference for all" + n_mentions = self.vectors.shape[0] + self.candidates = [set(range(n_mentions)) for _ in range(n_mentions)] + + def get_candidates(self): + "extract similar candidates for each mention by comparing subsets of the signature" + n_bands = int(self.signature_size / self.band_length) + bands = np.split(ary=self.signature, indices_or_sections=n_bands, axis=1) + candidates = [set() for _ in range(self.vectors.shape[0])] + + if len(candidates) > 1: + for band in bands: + groups = idx_unique_multidim(band) + groups = [g for g in groups if g.shape[0] > 1] + for g in groups: + g = list(g) + for i in g: + for j in g: + if i != j: + candidates[i].add(j) + else: # idx_unique_multidim above does not work when there is only one candidate + candidates[0].add(0) + + self.candidates = candidates + + def cluster(self, numpy_signature=False): + "find similar records for each mention" + start = time.time() + self._build_vocab() + self.encode_binary(to_numpy=True) + if self.vectors.shape[1] == 0: # no signature possible b/c no mention is longer than the shingle size. + self.all_candidates_to_all() + else: + if numpy_signature: + self.make_signature_np() + else: + self.make_signature() + self.get_candidates() + self.time = time.time() - start + + def summarise(self): + sizes = [len(g) for g in self.candidates] + print(f"took {self.time} seconds for {len(self.candidates)} mentions") + print(f"average, min, max cluster size: {round(sum(sizes)/len(sizes),2)}, {min(sizes)}, {max(sizes)}") diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index 005d43c..666b707 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -1,6 +1,7 @@ import os import pickle - +import pdb +from REL.lsh import LSHMinHash class TrainingEvaluationDatasets: """ @@ -105,7 +106,7 @@ def __find_coref(self, ment, mentlist): return coref - def with_coref(self, dataset): + def with_coref_old(self, dataset): # TODO: need to update the calls to with_coref """ Parent function that checks if there are coreferences in the given dataset. @@ -128,3 +129,46 @@ def with_coref(self, dataset): cur_m["is_coref"] = 1 else: cur_m["is_coref"] = 0 + + def with_coref(self, dataset): # TODO: need to update the calls to with_coref + """ + Check if there are coreferences in the given dataset. Use LSH for dimensionality reduction. 
+ + :return: dataset + """ + print("with_coref_lsh() is called.") + for data_name, content in dataset.items(): + # pdb.set_trace() + # handle problem with only 1 input mention -- fall back to previous approach + # print(f"len content: {len(content)}") + # print(f"data name: {data_name}") + # if data_name == '937 OFFICIAL)': + # pdb.set_trace() + if len(content) == 0: + pass + else: + input_mentions = [m["mention"] for m in content] + lsh_corefs = LSHMinHash(mentions=input_mentions, shingle_size=4, signature_size=50, band_length=2) + lsh_corefs.cluster() + assert len(content) == len(lsh_corefs.candidates) + # lsh_corefs.candidates are the input for below. indices refer to index in input_mentions + # call lsh here on all mentions + for cur_m, idx_candidates in zip(content, lsh_corefs.candidates): + idx_candidates = list(idx_candidates) # lsh returns the indices of the candidate coreferences + candidates = [content[i] for i in idx_candidates] + coref = self.__find_coref(cur_m, candidates) + # update __find_coref: use indices from lsh call above + # pdb.set_trace() + if coref is not None and len(coref) > 0: + cur_cands = {} + for m in coref: + for c, p in m["candidates"]: + cur_cands[c] = cur_cands.get(c, 0) + p + for c in cur_cands.keys(): + cur_cands[c] /= len(coref) + cur_m["candidates"] = sorted( + list(cur_cands.items()), key=lambda x: x[1] + )[::-1] + cur_m["is_coref"] = 1 + else: + cur_m["is_coref"] = 0 From 9f5657d1fd1abf112003290ef8ca59cab5d277f1 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Fri, 6 Jan 2023 11:44:44 +0100 Subject: [PATCH 07/43] 3 options for coreferences - no coref search, 'all', 'lsh' - update ED class and efficiency_test.py accordingly --- scripts/efficiency_test.py | 41 ++++++++++------- src/REL/entity_disambiguation.py | 18 +++++--- src/REL/training_datasets.py | 77 +++++++++++++------------------- 3 files changed, 66 insertions(+), 70 deletions(-) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index 4fc22a2..a1da06d 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -9,11 +9,18 @@ np.random.seed(seed=42) parser = argparse.ArgumentParser() -parser.add_argument( - "--no_corefs", - action="store_true", - help="use function with_coref()?", - default=False) +# parser.add_argument( +# "--no_corefs", +# action="store_true", +# help="use function with_coref()?", +# default=False) +parser.add_argument( + '--search_corefs', + type=str, + choices=['all', 'lsh', 'off'], + default='all', + help="Setting for search_corefs in Entity Disambiguation." 
+) parser.add_argument( "--profile", @@ -70,7 +77,7 @@ def profile_to_df(call): # adjust folder structure on computer and in script args = parser.parse_args() -print(f"args.no_corefs is {args.no_corefs}") +print(f"args.search_corefs is {args.search_corefs}") if args.profile: import cProfile @@ -81,7 +88,7 @@ def profile_to_df(call): base_url = "/home/flavio/projects/rel20/data" wiki_version = "wiki_2019" -datasets = TrainingEvaluationDatasets(base_url, wiki_version, args.no_corefs).load()[args.name_dataset] +datasets = TrainingEvaluationDatasets(base_url, wiki_version, args.search_corefs).load()[args.name_dataset] save_data_to = f"{base_url}/efficiency_test/" # save all recorded in this directory # random_docs = np.random.choice(list(datasets.keys()), 50) @@ -151,7 +158,7 @@ def profile_to_df(call): "mode": "eval", "model_path": "{}/{}/generated/model".format(base_url, wiki_version), } - model = EntityDisambiguation(base_url, wiki_version, config, no_corefs=args.no_corefs) # TODO: change to no_corefs to be consistent! + model = EntityDisambiguation(base_url, wiki_version, config, search_corefs=args.search_corefs) # model.coref is a training data set # model.coref has method with_coref # compare the training data sets when using corefs and when not @@ -168,9 +175,9 @@ def profile_to_df(call): "timing": timing } - filename = f"{save_data_to}predictions/{args.name_dataset}_{args.n_docs}" - if args.no_corefs: - filename = f"{filename}_nocoref" + filename = f"{save_data_to}predictions/{args.name_dataset}_{args.n_docs}_{args.search_corefs}" + # if args.no_corefs: + # filename = f"{filename}_nocoref" with open(f"{filename}.pickle", "wb") as f: pickle.dump(output, f, protocol=pickle.HIGHEST_PROTOCOL) @@ -178,9 +185,9 @@ def profile_to_df(call): # ## 4.b Profile disambiguation if args.profile: print("Profiling disambiguation") - filename = f"{save_data_to}profile/{args.name_dataset}_{args.n_docs}" - if args.no_corefs: - filename = f"{filename}_nocoref" + filename = f"{save_data_to}profile/{args.name_dataset}_{args.n_docs}_{args.search_corefs}" + # if args.no_corefs: + # filename = f"{filename}_nocoref" df_stats = profile_to_df(call="model.predict(mentions_dataset)") # cProfile.run("model.predict(mentions_dataset)", filename="temp.txt") @@ -237,9 +244,9 @@ def profile_to_df(call): # save timing by dataet - filename = f"{save_data_to}n_mentions_time/{args.name_dataset}" - if args.no_corefs: - filename = f"{filename}_nocoref" + filename = f"{save_data_to}n_mentions_time/{args.name_dataset}_{args.search_corefs}" + # if args.no_corefs: + # filename = f"{filename}_nocoref" with open(f"{filename}.pickle", "wb") as f: pickle.dump(timing_by_dataset, f, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/src/REL/entity_disambiguation.py b/src/REL/entity_disambiguation.py index a1423bf..f07d70a 100644 --- a/src/REL/entity_disambiguation.py +++ b/src/REL/entity_disambiguation.py @@ -32,7 +32,12 @@ class EntityDisambiguation: Parent Entity Disambiguation class that directs the various subclasses used for the ED step. """ - def __init__(self, base_url, wiki_version, user_config, reset_embeddings=False, no_corefs=False): + def __init__(self, base_url, wiki_version, user_config, reset_embeddings=False, search_corefs="all"): + """ + Argument search_corefs: One of 'all' (default), 'lsh', 'off'. + If 'off', no coreference search is done. + Otherwise the arguments are passed to the argument `search_corefs_in` in `with_coref`. 
+ """ self.base_url = base_url self.wiki_version = wiki_version self.embeddings = {} @@ -53,8 +58,9 @@ def __init__(self, base_url, wiki_version, user_config, reset_embeddings=False, ), "Glove embeddings in wrong folder..? Test embedding not found.." self.__load_embeddings() - self.no_corefs = no_corefs - self.coref = TrainingEvaluationDatasets(base_url, wiki_version, no_corefs) + assert search_corefs in ['all', 'lsh', 'off'] + self.search_corefs = search_corefs + self.coref = TrainingEvaluationDatasets(base_url, wiki_version, search_corefs) self.prerank_model = PreRank(self.config).to(self.device) self.__max_conf = None @@ -471,8 +477,8 @@ def predict(self, data): :return: predictions and time taken for the ED step. """ - if not self.no_corefs: - self.coref.with_coref(data) + if self.search_corefs != "off": + self.coref.with_coref(data, search_corefs_in=self.search_corefs) data = self.get_data_items(data, "raw", predict=True) predictions, timing = self.__predict(data, include_timing=True, eval_raw=True) @@ -667,7 +673,7 @@ def __predict(self, data, include_timing=False, eval_raw=False): ] doc_names = [m["doc_name"] for m in batch] - if not self.no_corefs: + if self.search_corefs != 'off': coref_indicators = [m['raw']['is_coref'] for m in batch] else: coref_indicators = [None for m in batch] diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index 666b707..55b5413 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -10,12 +10,18 @@ class TrainingEvaluationDatasets: Reading dataset from CoNLL dataset, extracted by https://github.com/dalab/deep-ed/ """ - def __init__(self, base_url, wiki_version, no_corefs=False): + def __init__(self, base_url, wiki_version, search_corefs="all"): + """ + Argument search_corefs: One of 'all' (default), 'lsh', 'off'. + If 'off', no coreference search is done. + Otherwise the arguments are passed to the argument `search_corefs_in` in `with_coref`. + """ self.person_names = self.__load_person_names( os.path.join(base_url, "generic/p_e_m_data/persons.txt") ) self.base_url = os.path.join(base_url, wiki_version) - self.no_corefs = no_corefs + assert search_corefs in ['all', 'lsh', 'off'] + self.search_corefs = search_corefs def load(self): """ @@ -46,8 +52,8 @@ def load(self): if "Jiří_Třanovský Jiří_Třanovský" in datasets[ds]: del datasets[ds]["Jiří_Třanovský Jiří_Třanovský"] - if not self.no_corefs: - self.with_coref(datasets[ds]) + if self.search_corefs != "off": + self.with_coref(datasets[ds], search_corefs_in=self.search_corefs) return datasets @@ -106,59 +112,36 @@ def __find_coref(self, ment, mentlist): return coref - def with_coref_old(self, dataset): # TODO: need to update the calls to with_coref - """ - Parent function that checks if there are coreferences in the given dataset. 
- - :return: dataset - """ - print("with_coref() is called.") - for data_name, content in dataset.items(): - for cur_m in content: - coref = self.__find_coref(cur_m, content) - if coref is not None and len(coref) > 0: - cur_cands = {} - for m in coref: - for c, p in m["candidates"]: - cur_cands[c] = cur_cands.get(c, 0) + p - for c in cur_cands.keys(): - cur_cands[c] /= len(coref) - cur_m["candidates"] = sorted( - list(cur_cands.items()), key=lambda x: x[1] - )[::-1] - cur_m["is_coref"] = 1 - else: - cur_m["is_coref"] = 0 - - def with_coref(self, dataset): # TODO: need to update the calls to with_coref + def with_coref(self, dataset, search_corefs_in="all"): # TODO: need to update the calls to with_coref """ Check if there are coreferences in the given dataset. Use LSH for dimensionality reduction. + search_corefs_in: either of 'lsh' or all 'all'. + If 'all', search for coreferences among all mentions in document. This is what REL currently does by default. + If 'lsh', search for coreferences among a pre-selected set of candidates. The set is calculated with LSH. + :return: dataset """ - print("with_coref_lsh() is called.") + print(f"with_coref() is called with {search_corefs_in=}.") + assert search_corefs_in in ['lsh', 'all'] for data_name, content in dataset.items(): - # pdb.set_trace() - # handle problem with only 1 input mention -- fall back to previous approach - # print(f"len content: {len(content)}") - # print(f"data name: {data_name}") - # if data_name == '937 OFFICIAL)': - # pdb.set_trace() if len(content) == 0: pass else: - input_mentions = [m["mention"] for m in content] - lsh_corefs = LSHMinHash(mentions=input_mentions, shingle_size=4, signature_size=50, band_length=2) - lsh_corefs.cluster() - assert len(content) == len(lsh_corefs.candidates) - # lsh_corefs.candidates are the input for below. indices refer to index in input_mentions - # call lsh here on all mentions - for cur_m, idx_candidates in zip(content, lsh_corefs.candidates): - idx_candidates = list(idx_candidates) # lsh returns the indices of the candidate coreferences - candidates = [content[i] for i in idx_candidates] + if search_corefs_in == 'lsh': + input_mentions = [m["mention"] for m in content] + lsh_corefs = LSHMinHash(mentions=input_mentions, shingle_size=4, signature_size=50, band_length=2) + lsh_corefs.cluster() + assert len(content) == len(lsh_corefs.candidates) + # lsh_corefs.candidates are the input for below. 
indices refer to index in input_mentions + # call lsh here on all mentions + for idx_mention, cur_m in enumerate(content): + if search_corefs_in == "lsh": + idx_candidates = list(lsh_corefs.candidates[idx_mention]) # lsh returns the indices of the candidate coreferences + candidates = [content[i] for i in idx_candidates] + elif search_corefs_in == "all": + candidates = content coref = self.__find_coref(cur_m, candidates) - # update __find_coref: use indices from lsh call above - # pdb.set_trace() if coref is not None and len(coref) > 0: cur_cands = {} for m in coref: From 7d8a17c207570474769034b4f8bbce1f8c3c93c0 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Fri, 6 Jan 2023 13:29:06 +0100 Subject: [PATCH 08/43] adjust update_efficiency_tests.sh --- scripts/run_efficiency_tests.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/run_efficiency_tests.sh b/scripts/run_efficiency_tests.sh index 1824e30..2979a4b 100644 --- a/scripts/run_efficiency_tests.sh +++ b/scripts/run_efficiency_tests.sh @@ -12,16 +12,18 @@ echo $datasets for size in ${docsizes[@]}; do for ds in ${datasets[@]}; do echo $ds, echo $size - python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" - python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --no_corefs + python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "all" + python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "lsh" + python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "off" done done # for ds in ${datasets[@]}; do # echo $ds -# python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile -# python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --no_corefs +# python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "all" +# python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "lsh" +# python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "off" # done From f080855901c561674ad175723a6a5c572db7165d Mon Sep 17 00:00:00 2001 From: f-hafner Date: Mon, 9 Jan 2023 10:56:53 +0100 Subject: [PATCH 09/43] make printout backwards compatible --- src/REL/training_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index 55b5413..fe71f14 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -122,7 +122,7 @@ def with_coref(self, dataset, search_corefs_in="all"): # TODO: need to update th :return: dataset """ - print(f"with_coref() is called with {search_corefs_in=}.") + print(f"with_coref() is called with search_corefs_in={search_corefs_in}.") assert search_corefs_in in ['lsh', 'all'] for data_name, content in dataset.items(): if len(content) == 0: From 1a432d938f96e25916d3dd85f3db35c0fe2db83a Mon Sep 17 00:00:00 2001 From: f-hafner Date: Tue, 10 Jan 2023 15:27:42 +0100 Subject: [PATCH 10/43] add basic logging to lsh class --- src/REL/lsh.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/REL/lsh.py b/src/REL/lsh.py index 1e2c86a..30f676d 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -2,6 +2,7 @@ from random import shuffle, seed import time import numpy as np +import logging seed(3) def split_mention(m): @@ -109,10 +110,12 @@ def 
_build_vocab(self): self.vocab = vocab def encode_binary(self, to_numpy=False): + logging.debug(f"creating lists with binary vectors. Vocabulary size is {len(self.vocab)}") vectors = [[1 if word in cur_shingles else 0 for word in self.vocab] for cur_shingles in self.shingles] if not to_numpy: self.vectors = vectors else: + logging.debug("putting to numpy") self.vectors = np.stack(vectors) @@ -128,6 +131,7 @@ def __init__(self, mentions, shingle_size, signature_size, band_length): def make_signature(self): "make array of dense vectors with MinHashing. each row is one mention" + print(f"Making signature. vectors shape is {self.vectors.shape}") templist = [] rng = np.random.default_rng(seed=3) i = 0 @@ -148,8 +152,9 @@ def all_candidates_to_all(self): n_mentions = self.vectors.shape[0] self.candidates = [set(range(n_mentions)) for _ in range(n_mentions)] - def get_candidates(self): + def get_candidates(self): ## TODO: use itertools "extract similar candidates for each mention by comparing subsets of the signature" + print("getting candidates...") n_bands = int(self.signature_size / self.band_length) bands = np.split(ary=self.signature, indices_or_sections=n_bands, axis=1) candidates = [set() for _ in range(self.vectors.shape[0])] @@ -172,15 +177,20 @@ def get_candidates(self): def cluster(self, numpy_signature=False): "find similar records for each mention" start = time.time() + logging.debug("building vocabulary") self._build_vocab() + logging.debug("encoding to binary") self.encode_binary(to_numpy=True) + logging.debug("making signature") if self.vectors.shape[1] == 0: # no signature possible b/c no mention is longer than the shingle size. + print('self.vectors.shape[1] is 0.') self.all_candidates_to_all() else: if numpy_signature: self.make_signature_np() else: self.make_signature() + logging.debug("getting candidate groups") self.get_candidates() self.time = time.time() - start From ae8e9e1b69ddc5dbe5836f5166c92e0d1c04c585 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Wed, 11 Jan 2023 16:02:55 +0100 Subject: [PATCH 11/43] fix bug for single mention, add logging to efficiency test --- scripts/efficiency_test.py | 4 +- src/REL/lsh.py | 114 +++++++++++++++++++++++++++++-------- 2 files changed, 92 insertions(+), 26 deletions(-) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index a1da06d..0206035 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -2,7 +2,7 @@ import requests import argparse import pickle - +import logging from REL.training_datasets import TrainingEvaluationDatasets @@ -47,6 +47,8 @@ default=50, help="Number of documents to be processed." 
) +logging.basicConfig(level=logging.INFO) # do not print to file + # helper function to profile a call and save the timing in a pd dataframe def profile_to_df(call): diff --git a/src/REL/lsh.py b/src/REL/lsh.py index 30f676d..a4b7899 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -3,6 +3,9 @@ import time import numpy as np import logging +import pdb +import sys +from scipy import sparse seed(3) def split_mention(m): @@ -78,7 +81,7 @@ def minhash_signature_np(x, n_reps): """ # get indices indices = np.arange(x.shape[1]) - rng = np.random.default_rng(12345) + rng = np.random.default_rng(12345) # TODO: this should be defined at class instantiation # expand by n_reps indices_mult = np.tile(indices, (n_reps, 1)) # reorder the columns n_reps times @@ -109,39 +112,92 @@ def _build_vocab(self): vocab = list(set([shingle for sublist in self.shingles for shingle in sublist])) self.vocab = vocab - def encode_binary(self, to_numpy=False): - logging.debug(f"creating lists with binary vectors. Vocabulary size is {len(self.vocab)}") - vectors = [[1 if word in cur_shingles else 0 for word in self.vocab] for cur_shingles in self.shingles] - if not to_numpy: - self.vectors = vectors + # def encode_binary(self, to_numpy=False): + # logging.debug(f"creating lists with binary vectors. Vocabulary size is {len(self.vocab)}") + # # pdb.set_trace() + # vectors = [[1 if word in cur_shingles else 0 for word in self.vocab] for cur_shingles in self.shingles] + # logging.debug(f"size of vectors: {sys.getsizeof(vectors)}") + # if not to_numpy: + # self.vectors = vectors + # else: + # logging.debug("putting to numpy") + # self.vectors = np.stack(vectors) + def encode_binary(self, dest="sparse"): + """Create binary vectors for each mention. + + Parameters: + ---------- + dest: how to store the resulting matrix. One of 'list' (base python), 'numpy' (numpy array), or 'sparse' (sparse matrix) + """ + assert dest in ["list", "numpy", "sparse"] + if dest == "list": + raise NotImplementedError("Not implemented yet.") else: - logging.debug("putting to numpy") - self.vectors = np.stack(vectors) + # indices of ones + logging.debug("making indices from vocab")# at least this gives me now the MemoryError. + one_indices = [[i for i in range(len(self.vocab)) if self.vocab[i] in shingle] for shingle in self.shingles] + if dest == "numpy": + logging.debug("making id_array") + id_array = np.eye(len(self.vocab)) # identiy array https://stackoverflow.com/questions/29831489/convert-array-of-indices-to-one-hot-encoded-array-in-numpy + logging.debug("making vectors") + vectors = [np.sum(id_array[i], axis=0) for i in one_indices] + logging.debug("stacking") + self.vectors = np.stack(vectors) + elif dest == "sparse": + logging.debug("making sparse matrix") + vectors = [] + for idx in one_indices: + a = sparse.lil_matrix((1,len(self.vocab))) + a[0, idx] = 1 + vectors.append(a) + self.vectors = sparse.vstack(vectors) + class LSHMinHash(LSHBase): "LSH with MinHashing and numpy" - def __init__(self, mentions, shingle_size, signature_size, band_length): + def __init__(self, mentions, shingle_size, signature_size, band_length, sparse_binary=True): + # sparse_binary: should the sparse 0/1 matrix be stored with scipy sparse? 
takes more time, but less memory super().__init__(mentions, shingle_size) if signature_size % band_length != 0: raise ValueError("Signature needs to be divisible into equal-sized bands.") self.signature_size = signature_size self.band_length = band_length + self.sparse_binary = sparse_binary def make_signature(self): "make array of dense vectors with MinHashing. each row is one mention" - print(f"Making signature. vectors shape is {self.vectors.shape}") + logging.debug(f"Making signature. vectors shape is {self.vectors.shape}") + # pdb.set_trace() templist = [] rng = np.random.default_rng(seed=3) i = 0 - while i < self.signature_size: - rng.shuffle(self.vectors, axis=1) - sig_i = 1 + self.vectors.argmax(axis=1) # add one for the log10 operations in idx_unique_multidim - templist.append(sig_i) - i += 1 - - self.signature = np.stack(templist, axis=1) + if isinstance(self.vectors, np.ndarray): + logging.debug("using binary numpy arrays") + while i < self.signature_size: + rng.shuffle(self.vectors, axis=1) + sig_i = 1 + self.vectors.argmax(axis=1) # add one for the log10 operations in idx_unique_multidim + templist.append(sig_i) + i += 1 + self.signature = np.stack(templist, axis=1) + else: # older versions of scipy have not _coo attribute. TODO: fix this + # elif isinstance(self.vectors, sparse._coo.coo_matrix): + # not sure how efficient this is. switching a lot between data structures. + logging.debug('using binary sparse matrices') + indices = np.arange(self.vectors.shape[1]) + while i < self.signature_size: + shuffle(indices) + sig = sparse.lil_matrix(self.vectors) + sig = sig[:, list(indices)] + sig = sparse.csr_matrix(sig) + sig_i = 1 + sig.argmax(axis=1) + sig_i = np.asarray(sig_i) + templist.append(sig_i) + i += 1 + + self.signature = np.stack(templist, axis=1).squeeze() + def make_signature_np(self): signature = minhash_signature_np(self.vectors, self.signature_size) @@ -154,12 +210,17 @@ def all_candidates_to_all(self): def get_candidates(self): ## TODO: use itertools "extract similar candidates for each mention by comparing subsets of the signature" - print("getting candidates...") + logging.debug("getting candidates...") n_bands = int(self.signature_size / self.band_length) - bands = np.split(ary=self.signature, indices_or_sections=n_bands, axis=1) - candidates = [set() for _ in range(self.vectors.shape[0])] - - if len(candidates) > 1: + + if self.vectors.shape[0] == 1: + candidates = [set()] + candidates[0].add(0) + else: + bands = np.split(ary=self.signature, indices_or_sections=n_bands, axis=1) + candidates = [set() for _ in range(self.vectors.shape[0])] + + # if len(candidates) > 1: for band in bands: groups = idx_unique_multidim(band) groups = [g for g in groups if g.shape[0] > 1] @@ -169,8 +230,8 @@ def get_candidates(self): ## TODO: use itertools for j in g: if i != j: candidates[i].add(j) - else: # idx_unique_multidim above does not work when there is only one candidate - candidates[0].add(0) + # else: # idx_unique_multidim above does not work when there is only one candidate + # candidates[0].add(0) self.candidates = candidates @@ -180,7 +241,10 @@ def cluster(self, numpy_signature=False): logging.debug("building vocabulary") self._build_vocab() logging.debug("encoding to binary") - self.encode_binary(to_numpy=True) + if self.sparse_binary: + self.encode_binary(dest="sparse") + else: + self.encode_binary(dest="numpy") logging.debug("making signature") if self.vectors.shape[1] == 0: # no signature possible b/c no mention is longer than the shingle size. 
print('self.vectors.shape[1] is 0.') From b8b4ea0766874a9249b85a2f7d4518984e007be4 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Wed, 11 Jan 2023 17:13:48 +0100 Subject: [PATCH 12/43] restore run_efficiency_test.sh --- scripts/run_efficiency_tests.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/run_efficiency_tests.sh b/scripts/run_efficiency_tests.sh index 2979a4b..735fe34 100644 --- a/scripts/run_efficiency_tests.sh +++ b/scripts/run_efficiency_tests.sh @@ -19,15 +19,15 @@ for size in ${docsizes[@]}; do done -# for ds in ${datasets[@]}; do -# echo $ds -# python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "all" -# python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "lsh" -# python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "off" -# done - +for ds in ${datasets[@]}; do + echo $ds + python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "all" + python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "lsh" + python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "off" +done +echo "Done." From 0cc75986b59f96803245e784b0b8d996cc31caa9 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Mon, 16 Jan 2023 11:45:03 +0100 Subject: [PATCH 13/43] scale fake data more --- scripts/efficiency_test.py | 2 +- scripts/run_efficiency_tests.sh | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index 0206035..4948d1f 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -221,7 +221,7 @@ def profile_to_df(call): for k, data in mentions_dataset.items(): mentions_dataset_scaled[k] = data # add the baseline data as in mentions_dataset - for f in [5, 10, 50, 100]: + for f in [5, 50, 100, 300]: d = data * f key = f"{k}_{f}" mentions_dataset_scaled[key] = d diff --git a/scripts/run_efficiency_tests.sh b/scripts/run_efficiency_tests.sh index 735fe34..2d335a6 100644 --- a/scripts/run_efficiency_tests.sh +++ b/scripts/run_efficiency_tests.sh @@ -9,14 +9,14 @@ docsizes=(50 500) echo $datasets # do profiling and checking predictions in one -for size in ${docsizes[@]}; do - for ds in ${datasets[@]}; do - echo $ds, echo $size - python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "all" - python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "lsh" - python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "off" - done -done +# for size in ${docsizes[@]}; do +# for ds in ${datasets[@]}; do +# echo $ds, echo $size +# python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "all" +# python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "lsh" +# python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "off" +# done +# done for ds in ${datasets[@]}; do From 2ca1bbc393fa8d38522f3a222306c6e904805a37 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Mon, 16 Jan 2023 13:48:27 +0100 Subject: [PATCH 14/43] switch to hashing with random projections --- src/REL/lsh.py | 35 +++++++++++++++++++++++------------ src/REL/training_datasets.py | 2 +- 2 files changed, 24 insertions(+), 13 
deletions(-) diff --git a/src/REL/lsh.py b/src/REL/lsh.py index a4b7899..fe00300 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -185,18 +185,29 @@ def make_signature(self): # elif isinstance(self.vectors, sparse._coo.coo_matrix): # not sure how efficient this is. switching a lot between data structures. logging.debug('using binary sparse matrices') - indices = np.arange(self.vectors.shape[1]) - while i < self.signature_size: - shuffle(indices) - sig = sparse.lil_matrix(self.vectors) - sig = sig[:, list(indices)] - sig = sparse.csr_matrix(sig) - sig_i = 1 + sig.argmax(axis=1) - sig_i = np.asarray(sig_i) - templist.append(sig_i) - i += 1 - - self.signature = np.stack(templist, axis=1).squeeze() + rng = np.random.default_rng(seed=3) + # vectors = mylsh.vectors + hyperplanes = rng.choice([-1, 1], (self.signature_size, self.vectors.shape[1])) + # TODO: make vectors a csr matrix (?) + hyperplanes = sparse.csr_matrix(hyperplanes) + products = self.vectors.dot(hyperplanes.transpose()) + products = products.toarray() + sign = 1 + (products > 0) # TODO: can I change the downstream function for this? now it should be much easier to transform the signatures into a single string? + self.signature = sign + + + # while i < self.signature_size: + # plane = hyperplanes[i, :].transpose() + # out = self.vectors.dot(plane) + # out = out.toarray() + + # sig_i = (out > 0) + # sig_i = sig_i.astype(int) + # sig_i = 1 + sig_i # TODO: can I change the downstream function for this? now it should be much easier to transform the signatures into a single string? + # templist.append(sig_i) + # i += 1 + + # self.signature = np.stack(templist, axis=1).squeeze() def make_signature_np(self): diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index fe71f14..2d8af33 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -130,7 +130,7 @@ def with_coref(self, dataset, search_corefs_in="all"): # TODO: need to update th else: if search_corefs_in == 'lsh': input_mentions = [m["mention"] for m in content] - lsh_corefs = LSHMinHash(mentions=input_mentions, shingle_size=4, signature_size=50, band_length=2) + lsh_corefs = LSHMinHash(mentions=input_mentions, shingle_size=2, signature_size=800, band_length=10) lsh_corefs.cluster() assert len(content) == len(lsh_corefs.candidates) # lsh_corefs.candidates are the input for below. 
indices refer to index in input_mentions From 5fc4357484a088af609fece282ee157d9573159a Mon Sep 17 00:00:00 2001 From: f-hafner Date: Mon, 16 Jan 2023 15:36:14 +0100 Subject: [PATCH 15/43] add some more debugging to lsh --- scripts/run_efficiency_tests.sh | 24 ++++++++++++++---------- src/REL/lsh.py | 5 ++++- src/REL/training_datasets.py | 1 + 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/scripts/run_efficiency_tests.sh b/scripts/run_efficiency_tests.sh index 2d335a6..81a259b 100644 --- a/scripts/run_efficiency_tests.sh +++ b/scripts/run_efficiency_tests.sh @@ -8,22 +8,26 @@ docsizes=(50 500) echo $datasets + +echo "--Running efficiency tests by data set and n_docs--" + # do profiling and checking predictions in one -# for size in ${docsizes[@]}; do -# for ds in ${datasets[@]}; do -# echo $ds, echo $size -# python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "all" -# python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "lsh" -# python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "off" -# done -# done +for size in ${docsizes[@]}; do + for ds in ${datasets[@]}; do + echo $ds, echo $size + # python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "all" + python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "lsh" + # python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "off" + done +done +echo "--Scaling number of mentions--" for ds in ${datasets[@]}; do echo $ds - python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "all" + # python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "all" python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "lsh" - python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "off" + # python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "off" done diff --git a/src/REL/lsh.py b/src/REL/lsh.py index fe00300..7b883ba 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -185,12 +185,15 @@ def make_signature(self): # elif isinstance(self.vectors, sparse._coo.coo_matrix): # not sure how efficient this is. switching a lot between data structures. logging.debug('using binary sparse matrices') - rng = np.random.default_rng(seed=3) + rng = np.random.default_rng(seed=3) # TODO: put this to class instantiation # vectors = mylsh.vectors + logging.debug("making hyperplanes") hyperplanes = rng.choice([-1, 1], (self.signature_size, self.vectors.shape[1])) # TODO: make vectors a csr matrix (?) hyperplanes = sparse.csr_matrix(hyperplanes) + logging.debug("making dot product") products = self.vectors.dot(hyperplanes.transpose()) + logging.debug("making signature") products = products.toarray() sign = 1 + (products > 0) # TODO: can I change the downstream function for this? now it should be much easier to transform the signatures into a single string? 
self.signature = sign diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index 2d8af33..0bc3cc4 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -2,6 +2,7 @@ import pickle import pdb from REL.lsh import LSHMinHash +import logging class TrainingEvaluationDatasets: """ From e6894d5946298112228dad735ed2382cfef50212 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Wed, 18 Jan 2023 12:13:19 +0100 Subject: [PATCH 16/43] speed up get_candidates() --- scripts/efficiency_test.py | 3 ++- scripts/run_efficiency_tests.sh | 20 +++++++------- src/REL/lsh.py | 48 +++++++++++++++++++++++++++++++-- 3 files changed, 58 insertions(+), 13 deletions(-) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index 4948d1f..7e28335 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -217,6 +217,7 @@ def profile_to_df(call): # ## 4.c time disambiguation by document, vary number of mentions if args.scale_mentions: print("Scaling the mentions per document") + logging.basicConfig(level=logging.DEBUG) mentions_dataset_scaled = {} for k, data in mentions_dataset.items(): @@ -229,7 +230,7 @@ def profile_to_df(call): print("Timing disambiguation per document") timing_by_dataset = {} for name, mentions in mentions_dataset_scaled.items(): - print(f"predicting for dataset {name}") + print(f"predicting for dataset {name}", flush=True) tempdict = {name: mentions} # format so that model.predict() works start = time() predictions, timing = model.predict(tempdict) diff --git a/scripts/run_efficiency_tests.sh b/scripts/run_efficiency_tests.sh index 81a259b..b957e40 100644 --- a/scripts/run_efficiency_tests.sh +++ b/scripts/run_efficiency_tests.sh @@ -12,22 +12,22 @@ echo $datasets echo "--Running efficiency tests by data set and n_docs--" # do profiling and checking predictions in one -for size in ${docsizes[@]}; do - for ds in ${datasets[@]}; do - echo $ds, echo $size - # python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "all" - python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "lsh" - # python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "off" - done -done +# for size in ${docsizes[@]}; do +# for ds in ${datasets[@]}; do +# echo $ds, echo $size +# python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "all" +# python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "lsh" +# python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "off" +# done +# done echo "--Scaling number of mentions--" for ds in ${datasets[@]}; do echo $ds - # python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "all" + python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "all" python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "lsh" - # python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "off" + python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "off" done diff --git a/src/REL/lsh.py b/src/REL/lsh.py index 7b883ba..30a88dc 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -3,6 +3,8 @@ import time import numpy as np import logging +from collections import defaultdict +import itertools import 
pdb import sys from scipy import sparse @@ -99,6 +101,22 @@ def minhash_signature_np(x, n_reps): return sig +def signature_to_bucket(signature, n_bands): + "Collect items with same bands in buckets" + num_cols = signature.shape[0] # number of documents to classify + bands = np.split(signature, n_bands, axis=1) + buckets = [] + for band in bands: + items_buckets = defaultdict(list) + items = np.vsplit(band, num_cols) + for i, item in enumerate(items): # this orders the row indices into groups that have the same signature + item = tuple(item.flatten().astype(int)) + items_buckets[item].append(i) # assign row i to item--ie, groups observations into buckets with the same signature + buckets.append(items_buckets) + + return buckets + + class LSHBase: # Important: order of occurences in shingles and vectors = order of input list (=order of occurrence in document) def __init__(self, mentions, shingle_size): @@ -235,6 +253,7 @@ def get_candidates(self): ## TODO: use itertools candidates = [set() for _ in range(self.vectors.shape[0])] # if len(candidates) > 1: + # TODO: can I speed this up? for band in bands: groups = idx_unique_multidim(band) groups = [g for g in groups if g.shape[0] > 1] @@ -249,7 +268,29 @@ def get_candidates(self): ## TODO: use itertools self.candidates = candidates - def cluster(self, numpy_signature=False): + def get_candidates_new(self): + "extract similar candidates for each mention by comparing subsets of the signature" + logging.debug("getting candidates...") + n_bands = int(self.signature_size / self.band_length) + if self.vectors.shape[0] == 1: + candidates = [set()] + candidates[0].add(0) + else: + bands = np.split(ary=self.signature, indices_or_sections=n_bands, axis=1) + candidates = [set() for _ in range(self.vectors.shape[0])] + for band in bands: + groups = idx_unique_multidim(band) + # groups = [g for g in groups if g.shape[0] > 1] + groups = itertools.filterfalse(lambda x: len(x) == 1, groups) # does not change much to the comprehension above, but seems to scale better + for g in groups: + g = list(g) + for i in g: + candidates[i].update(g) # for row i, this also adds i to the candidates. 
would need to drop them later again, leading to another operation + [candidates[i].remove(i) for i in range(len(candidates))] + self.candidates = candidates + + + def cluster(self, numpy_signature=False, candidates="new"): # TODO: tidy this, only use the new function for getting candidates "find similar records for each mention" start = time.time() logging.debug("building vocabulary") @@ -269,7 +310,10 @@ def cluster(self, numpy_signature=False): else: self.make_signature() logging.debug("getting candidate groups") - self.get_candidates() + if candidates == "old": + self.get_candidates() + elif candidates == "new": # this seems to be slower than the old approach + self.get_candidates_new() self.time = time.time() - start def summarise(self): From 6cbb668c79aabf773da0f6885db58ddaf3f5c521 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Wed, 18 Jan 2023 15:04:14 +0100 Subject: [PATCH 17/43] use sklearn binarizer for encoding --- scripts/run_efficiency_tests.sh | 28 ++++++++--------- src/REL/lsh.py | 54 ++++++++++++++------------------- src/REL/training_datasets.py | 1 + 3 files changed, 37 insertions(+), 46 deletions(-) diff --git a/scripts/run_efficiency_tests.sh b/scripts/run_efficiency_tests.sh index b957e40..78f27bc 100644 --- a/scripts/run_efficiency_tests.sh +++ b/scripts/run_efficiency_tests.sh @@ -12,23 +12,23 @@ echo $datasets echo "--Running efficiency tests by data set and n_docs--" # do profiling and checking predictions in one -# for size in ${docsizes[@]}; do -# for ds in ${datasets[@]}; do -# echo $ds, echo $size -# python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "all" -# python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "lsh" -# python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "off" -# done -# done +for size in ${docsizes[@]}; do + for ds in ${datasets[@]}; do + echo $ds, echo $size + python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "all" + python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "lsh" + python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "off" + done +done echo "--Scaling number of mentions--" -for ds in ${datasets[@]}; do - echo $ds - python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "all" - python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "lsh" - python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "off" -done +# for ds in ${datasets[@]}; do +# echo $ds +# python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "all" +# python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "lsh" +# python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "off" +# done echo "Done." 
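Note: the next hunk replaces the hand-rolled one-hot encoding in encode_binary() with sklearn's MultiLabelBinarizer. The sketch below (not part of the patch) shows what that call produces for a few shingled mentions; the toy mention strings and the shingle size are made up for illustration.

    # Minimal sketch: sparse one-hot encoding of shingled mentions with
    # MultiLabelBinarizer. Toy mentions and shingle size are illustrative only.
    from sklearn.preprocessing import MultiLabelBinarizer

    def k_shingle(s, k):
        "convert string s into shingles of length k"
        return [s[i:i + k] for i in range(len(s) - k + 1)]

    mentions = ["Germany", "German", "France"]          # hypothetical input
    shingles = [k_shingle(m, 2) for m in mentions]      # e.g. ['Ge', 'er', 'rm', ...]

    binarizer = MultiLabelBinarizer(sparse_output=True)
    vectors = binarizer.fit_transform(shingles)         # scipy CSR matrix, one row per mention

    print(binarizer.classes_)    # shingle vocabulary, built implicitly by the binarizer
    print(vectors.toarray())     # 1 where the mention contains the shingle, 0 otherwise

This is what replaces the explicit vocabulary/index bookkeeping in the old encode_binary(): the binarizer builds the vocabulary and the sparse 0/1 matrix in one call.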
diff --git a/src/REL/lsh.py b/src/REL/lsh.py index 30a88dc..ba2e612 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -4,6 +4,7 @@ import numpy as np import logging from collections import defaultdict +from sklearn.preprocessing import MultiLabelBinarizer import itertools import pdb import sys @@ -140,43 +141,24 @@ def _build_vocab(self): # else: # logging.debug("putting to numpy") # self.vectors = np.stack(vectors) - def encode_binary(self, dest="sparse"): + def encode_binary(self, sparse_output=True): """Create binary vectors for each mention. Parameters: ---------- - dest: how to store the resulting matrix. One of 'list' (base python), 'numpy' (numpy array), or 'sparse' (sparse matrix) + sparse_output: Argument passed to `sklearn.preprocessing.MultiLabelBinarizer()`. """ - assert dest in ["list", "numpy", "sparse"] - if dest == "list": - raise NotImplementedError("Not implemented yet.") - else: - # indices of ones - logging.debug("making indices from vocab")# at least this gives me now the MemoryError. - one_indices = [[i for i in range(len(self.vocab)) if self.vocab[i] in shingle] for shingle in self.shingles] - if dest == "numpy": - logging.debug("making id_array") - id_array = np.eye(len(self.vocab)) # identiy array https://stackoverflow.com/questions/29831489/convert-array-of-indices-to-one-hot-encoded-array-in-numpy - logging.debug("making vectors") - vectors = [np.sum(id_array[i], axis=0) for i in one_indices] - logging.debug("stacking") - self.vectors = np.stack(vectors) - elif dest == "sparse": - logging.debug("making sparse matrix") - vectors = [] - for idx in one_indices: - a = sparse.lil_matrix((1,len(self.vocab))) - a[0, idx] = 1 - vectors.append(a) - self.vectors = sparse.vstack(vectors) - + logging.debug("making one-hot vectors") + binarizer = MultiLabelBinarizer(sparse_output=sparse_output) + vectors = binarizer.fit_transform(self.shingles) + self.vectors = vectors class LSHMinHash(LSHBase): "LSH with MinHashing and numpy" def __init__(self, mentions, shingle_size, signature_size, band_length, sparse_binary=True): - # sparse_binary: should the sparse 0/1 matrix be stored with scipy sparse? takes more time, but less memory + # sparse_binary: should the sparse 0/1 matrix be stored with scipy sparse? takes less memory. super().__init__(mentions, shingle_size) if signature_size % band_length != 0: raise ValueError("Signature needs to be divisible into equal-sized bands.") @@ -286,8 +268,8 @@ def get_candidates_new(self): g = list(g) for i in g: candidates[i].update(g) # for row i, this also adds i to the candidates. would need to drop them later again, leading to another operation - [candidates[i].remove(i) for i in range(len(candidates))] - self.candidates = candidates + [candidates[i].discard(i) for i in range(len(candidates))] + self.candidates = candidates def cluster(self, numpy_signature=False, candidates="new"): # TODO: tidy this, only use the new function for getting candidates @@ -296,10 +278,7 @@ def cluster(self, numpy_signature=False, candidates="new"): # TODO: tidy this, o logging.debug("building vocabulary") self._build_vocab() logging.debug("encoding to binary") - if self.sparse_binary: - self.encode_binary(dest="sparse") - else: - self.encode_binary(dest="numpy") + self.encode_binary(sparse_output=self.sparse_binary) logging.debug("making signature") if self.vectors.shape[1] == 0: # no signature possible b/c no mention is longer than the shingle size. 
print('self.vectors.shape[1] is 0.') @@ -320,3 +299,14 @@ def summarise(self): sizes = [len(g) for g in self.candidates] print(f"took {self.time} seconds for {len(self.candidates)} mentions") print(f"average, min, max cluster size: {round(sum(sizes)/len(sizes),2)}, {min(sizes)}, {max(sizes)}") + + def efficiency_gain_comparisons(self): + """ + Compare number of comparisons made for coreference search with option "lsh" and option "all". + Useful for understanding time complexity. + And to assess whether number of comparisons is meaningfully reduced + """ + sizes = [len(g) for g in self.candidates] + runtime_all = len(self.candidates)*len(self.candidates) + runtime_lsh = len(self.candidates)*(sum(sizes)/len(sizes)) + print(f"LSH makes fraction {round(runtime_lsh/runtime_all, 2)} of comparisons relative to option all.") diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index 0bc3cc4..9834fad 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -133,6 +133,7 @@ def with_coref(self, dataset, search_corefs_in="all"): # TODO: need to update th input_mentions = [m["mention"] for m in content] lsh_corefs = LSHMinHash(mentions=input_mentions, shingle_size=2, signature_size=800, band_length=10) lsh_corefs.cluster() + lsh_corefs.efficiency_gain_comparisons() assert len(content) == len(lsh_corefs.candidates) # lsh_corefs.candidates are the input for below. indices refer to index in input_mentions # call lsh here on all mentions From 6ac9ff0c1c02f269f1d4bf093f08bef5bdad3420 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Wed, 18 Jan 2023 21:50:46 +0100 Subject: [PATCH 18/43] test higher precision for lsh --- src/REL/training_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index 9834fad..22a46e2 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -131,7 +131,7 @@ def with_coref(self, dataset, search_corefs_in="all"): # TODO: need to update th else: if search_corefs_in == 'lsh': input_mentions = [m["mention"] for m in content] - lsh_corefs = LSHMinHash(mentions=input_mentions, shingle_size=2, signature_size=800, band_length=10) + lsh_corefs = LSHMinHash(mentions=input_mentions, shingle_size=3, signature_size=800, band_length=15) lsh_corefs.cluster() lsh_corefs.efficiency_gain_comparisons() assert len(content) == len(lsh_corefs.candidates) From f19c904187e107c06f33d993cbf2c4f50face545 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Thu, 19 Jan 2023 21:09:16 +0100 Subject: [PATCH 19/43] vectorize banding --- src/REL/lsh.py | 78 ++++++++++++++++++++++++++++++++---- src/REL/training_datasets.py | 4 +- 2 files changed, 72 insertions(+), 10 deletions(-) diff --git a/src/REL/lsh.py b/src/REL/lsh.py index ba2e612..e6c7111 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -50,7 +50,7 @@ def cols_to_int(a): def idx_unique_multidim(a): - "groups rows in a multidimensional arrays by their unique signature" + "groups row indices in a multidimensional arrays by their unique signature" # a = cols_to_int(a).squeeze() # wrong # a = cols_to_string(a).squeeze() # slow a = cols_to_int(a).squeeze() @@ -117,6 +117,65 @@ def signature_to_bucket(signature, n_bands): return buckets +## new stuff +def cols_to_int_multidim(a): + "combine columns in all rows to an integer: [[1,20,3], [1,4,10]] becomes [1203,1410]" + existing_powers = np.floor(np.log10(a)) + n_bands, nrows, ncols = a.shape + + cumsum_powers = np.fliplr(np.cumsum(np.fliplr(existing_powers), axis=1)) + + 
add_powers = [x for x in reversed(range(ncols))] + add_powers = np.tile(add_powers, (nrows, 1)) + + mult_factor = cumsum_powers - existing_powers + add_powers + summationvector = np.ones((ncols, 1)) + out = np.matmul(a * 10**mult_factor, summationvector) + return out + +def vectorize_signature_bands(a, n_bands, band_length): + """ + Convert a signature array of dimension (n_items, signature_length) into an array of (n_bands, n_items, band_length). + + This is a vectorized version for np.vstack(np.split(a, indices_or_sections=n_bands, axis=1)). + The idea is to then use a vectorized function to extract the indices, instead of looping over each element in the output of np.split(). + """ + n_items, signature_length = a.shape + + # stacked bands of each item, stacked together + stacked_bands = a.reshape(n_items*n_bands, band_length) + # reorder so that the first band of all items comes first, then the second band of all items + reordering_vector = np.arange(n_items*n_bands).reshape(n_items, n_bands).T.reshape(1, -1) + + result = stacked_bands[reordering_vector, :].reshape(n_bands, n_items, band_length) + + return result + +# this replaces idx_multidim +def group_unique_indices(a): + """ + calculate groups of indices of unique rows in a multidimensional array with the same signature + the groups are returned by band. + + Returns a list of lists. One list corresponds to each band, and it indicates the rows + of a that have the same band. + """ + n_bands, n_items, length_band = a.shape + a = cols_to_int_multidim(a).squeeze() + + sort_idx = np.argsort(a, axis=1) # necessary for later, need to calc anyway + a_sorted = np.sort(a, axis=1) # faster alternative to np.take_along_axis(b, sort_idx, axis=1) + + # indicators for where a sequence of different unique elements starts + indicators = a_sorted[:, 1:] != a_sorted[:, :-1] + first_element = np.tile([[True]], n_bands).T + unq_first = np.concatenate((first_element, indicators), axis=1) + + # calculate number of unique items + unq_count = [np.diff(np.nonzero(row)[0]) for row in unq_first] # iterate through rows. + unq_idx = [np.split(sort_idx[i], np.cumsum(count)) for i, count in enumerate(unq_count)] + + return unq_idx class LSHBase: # Important: order of occurences in shingles and vectors = order of input list (=order of occurrence in document) @@ -258,17 +317,20 @@ def get_candidates_new(self): candidates = [set()] candidates[0].add(0) else: - bands = np.split(ary=self.signature, indices_or_sections=n_bands, axis=1) + # bands = np.split(ary=self.signature, indices_or_sections=n_bands, axis=1) candidates = [set() for _ in range(self.vectors.shape[0])] - for band in bands: - groups = idx_unique_multidim(band) - # groups = [g for g in groups if g.shape[0] > 1] - groups = itertools.filterfalse(lambda x: len(x) == 1, groups) # does not change much to the comprehension above, but seems to scale better + + bands = vectorize_signature_bands(self.signature, n_bands=n_bands, band_length=self.band_length) + buckets_by_band = group_unique_indices(bands) + + for bucket in buckets_by_band: + # bucket = [list(group) for group in bucket] + groups = itertools.filterfalse(lambda x: len(x) == 1, bucket) # not sure this works for np arrays? for g in groups: g = list(g) for i in g: - candidates[i].update(g) # for row i, this also adds i to the candidates. 
would need to drop them later again, leading to another operation - [candidates[i].discard(i) for i in range(len(candidates))] + candidates[i].update(g) + [candidates[i].discard(i) for i in range(len(candidates))] self.candidates = candidates diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index 22a46e2..9f4a92a 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -131,9 +131,9 @@ def with_coref(self, dataset, search_corefs_in="all"): # TODO: need to update th else: if search_corefs_in == 'lsh': input_mentions = [m["mention"] for m in content] - lsh_corefs = LSHMinHash(mentions=input_mentions, shingle_size=3, signature_size=800, band_length=15) + lsh_corefs = LSHMinHash(mentions=input_mentions, shingle_size=3, signature_size=900, band_length=15) lsh_corefs.cluster() - lsh_corefs.efficiency_gain_comparisons() + # lsh_corefs.efficiency_gain_comparisons() assert len(content) == len(lsh_corefs.candidates) # lsh_corefs.candidates are the input for below. indices refer to index in input_mentions # call lsh here on all mentions From 9f417454b54ecdc6a613893a8b7b2f0165f1152a Mon Sep 17 00:00:00 2001 From: f-hafner Date: Fri, 20 Jan 2023 12:15:48 +0100 Subject: [PATCH 20/43] small speed ups for get_candidates_new() --- src/REL/lsh.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/REL/lsh.py b/src/REL/lsh.py index e6c7111..af20a47 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -144,7 +144,7 @@ def vectorize_signature_bands(a, n_bands, band_length): # stacked bands of each item, stacked together stacked_bands = a.reshape(n_items*n_bands, band_length) - # reorder so that the first band of all items comes first, then the second band of all items + # reorder so that the first band of all items comes first, then the second band of all items, etc. reordering_vector = np.arange(n_items*n_bands).reshape(n_items, n_bands).T.reshape(1, -1) result = stacked_bands[reordering_vector, :].reshape(n_bands, n_items, band_length) @@ -167,13 +167,14 @@ def group_unique_indices(a): a_sorted = np.sort(a, axis=1) # faster alternative to np.take_along_axis(b, sort_idx, axis=1) # indicators for where a sequence of different unique elements starts - indicators = a_sorted[:, 1:] != a_sorted[:, :-1] + indicators = a_sorted[:, 1:] != a_sorted[:, :-1] first_element = np.tile([[True]], n_bands).T unq_first = np.concatenate((first_element, indicators), axis=1) # calculate number of unique items - unq_count = [np.diff(np.nonzero(row)[0]) for row in unq_first] # iterate through rows. - unq_idx = [np.split(sort_idx[i], np.cumsum(count)) for i, count in enumerate(unq_count)] + unq_count = [np.diff(np.nonzero(row)[0]) for row in unq_first] # iterate through rows. + # split sorted array into groups of identical items. only keep groups with more than one item. 
+ unq_idx = [[a for a in np.split(sort_idx[i], np.cumsum(count)) if len(a) > 1] for i, count in enumerate(unq_count)] return unq_idx @@ -322,15 +323,14 @@ def get_candidates_new(self): bands = vectorize_signature_bands(self.signature, n_bands=n_bands, band_length=self.band_length) buckets_by_band = group_unique_indices(bands) + groups = [tuple(i) for i in itertools.chain.from_iterable(buckets_by_band)] # flatten group; use tuple for applying set() + groups = set(groups) # we only need the unique clusters - for bucket in buckets_by_band: - # bucket = [list(group) for group in bucket] - groups = itertools.filterfalse(lambda x: len(x) == 1, bucket) # not sure this works for np arrays? - for g in groups: - g = list(g) - for i in g: - candidates[i].update(g) - [candidates[i].discard(i) for i in range(len(candidates))] + for group in groups: + for i in group: + candidates[i].update(group) + + [candidates[i].discard(i) for i in range(len(candidates))] self.candidates = candidates From 7570688dc8628d1a6e107a7f34cf9b38d62ce07e Mon Sep 17 00:00:00 2001 From: f-hafner Date: Fri, 20 Jan 2023 17:11:49 +0100 Subject: [PATCH 21/43] small changes to efficiency tests --- scripts/efficiency_test.py | 3 ++- scripts/run_efficiency_tests.sh | 28 ++++++++++++++-------------- src/REL/lsh.py | 2 +- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index 7e28335..36b7007 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -222,7 +222,7 @@ def profile_to_df(call): for k, data in mentions_dataset.items(): mentions_dataset_scaled[k] = data # add the baseline data as in mentions_dataset - for f in [5, 50, 100, 300]: + for f in [5, 50, 100]: d = data * f key = f"{k}_{f}" mentions_dataset_scaled[key] = d @@ -242,6 +242,7 @@ def profile_to_df(call): } if args.profile: + print("Profiling disambiguation for synthetic data set") df_profile = profile_to_df(call="model.predict(tempdict)") timing_by_dataset[name]['profile'] = df_profile diff --git a/scripts/run_efficiency_tests.sh b/scripts/run_efficiency_tests.sh index 78f27bc..b957e40 100644 --- a/scripts/run_efficiency_tests.sh +++ b/scripts/run_efficiency_tests.sh @@ -12,23 +12,23 @@ echo $datasets echo "--Running efficiency tests by data set and n_docs--" # do profiling and checking predictions in one -for size in ${docsizes[@]}; do - for ds in ${datasets[@]}; do - echo $ds, echo $size - python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "all" - python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "lsh" - python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "off" - done -done +# for size in ${docsizes[@]}; do +# for ds in ${datasets[@]}; do +# echo $ds, echo $size +# python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "all" +# python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "lsh" +# python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "off" +# done +# done echo "--Scaling number of mentions--" -# for ds in ${datasets[@]}; do -# echo $ds -# python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "all" -# python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "lsh" -# python scripts/efficiency_test.py --name_dataset "$ds" 
--scale_mentions --profile --search_corefs "off" -# done +for ds in ${datasets[@]}; do + echo $ds + python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "all" + python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "lsh" + python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "off" +done echo "Done." diff --git a/src/REL/lsh.py b/src/REL/lsh.py index af20a47..b3b22dc 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -324,7 +324,7 @@ def get_candidates_new(self): bands = vectorize_signature_bands(self.signature, n_bands=n_bands, band_length=self.band_length) buckets_by_band = group_unique_indices(bands) groups = [tuple(i) for i in itertools.chain.from_iterable(buckets_by_band)] # flatten group; use tuple for applying set() - groups = set(groups) # we only need the unique clusters + groups = set(groups) # we only need the unique groups for group in groups: for i in group: From aa79b24e31faaaa3f5c289010f298d6449bbb58c Mon Sep 17 00:00:00 2001 From: f-hafner Date: Mon, 23 Jan 2023 16:39:42 +0100 Subject: [PATCH 22/43] start tidying lsh --- src/REL/lsh.py | 220 +++++++++++++++-------------------- src/REL/training_datasets.py | 4 +- 2 files changed, 93 insertions(+), 131 deletions(-) diff --git a/src/REL/lsh.py b/src/REL/lsh.py index b3b22dc..44dbfbd 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -11,8 +11,6 @@ from scipy import sparse seed(3) -def split_mention(m): - return m.split(" ") def k_shingle(s, k): "convert string s into shingles of length k" @@ -22,30 +20,20 @@ def k_shingle(s, k): return shingle -def partition_signature(s, b): - "Convert signature s into b partitions of equal size" - assert len(s) % b == 0 - rg = int(len(s) / b) - partitions = [] - for i in range(0, len(s), rg): - v = s[i:i+rg] - partitions.append(v) - return partitions +# def cols_to_int(a): +# "combine columns in all rows to an integer: [[1,20,3], [1,4,10]] becomes [1203,1410]" +# existing_powers = np.floor(np.log10(a)) +# nrows, ncols = a.shape -def cols_to_int(a): - "combine columns in all rows to an integer: [[1,20,3], [1,4,10]] becomes [1203,1410]" - existing_powers = np.floor(np.log10(a)) - nrows, ncols = a.shape +# cumsum_powers = np.fliplr(np.cumsum(np.fliplr(existing_powers), axis=1)) - cumsum_powers = np.fliplr(np.cumsum(np.fliplr(existing_powers), axis=1)) +# add_powers = [x for x in reversed(range(ncols))] +# add_powers = np.tile(add_powers, (nrows, 1)) - add_powers = [x for x in reversed(range(ncols))] - add_powers = np.tile(add_powers, (nrows, 1)) - - mult_factor = cumsum_powers - existing_powers + add_powers - summationvector = np.ones((ncols, 1)) - out = np.matmul(a * 10**mult_factor, summationvector) - return out +# mult_factor = cumsum_powers - existing_powers + add_powers +# summationvector = np.ones((ncols, 1)) +# out = np.matmul(a * 10**mult_factor, summationvector) +# return out @@ -64,58 +52,58 @@ def idx_unique_multidim(a): return unq_idx -def reshape_rows_reps(a): - "reshape a 3-d array of n_reps x n_rows x n_cols to n_rows x n_reps x n_cols" - n_reps, n_rows, n_cols = a.shape - a = a.reshape(n_reps*n_rows, n_cols) - # extractor indices: for 3 reps, 2 rows: [0,2,4,1,3,5]. to reorder a - # in other words: goes from 0 to (n_reps * n_rows). step sizes are n_rows. 
starts are the row indices - idx = np.arange(n_reps*n_rows).reshape(n_reps, n_rows).T.reshape(-1,1) - a = np.take_along_axis(a, idx, axis=0) - a = a.reshape(n_rows, n_reps, n_cols) - return a - -def minhash_signature_np(x, n_reps): - """Make a minhash signature of array x with length n_reps. - - Inputs - ------ - x: axis 0 are observations, columns are binary one-hot encoded vectors - """ - # get indices - indices = np.arange(x.shape[1]) - rng = np.random.default_rng(12345) # TODO: this should be defined at class instantiation - - # expand by n_reps - indices_mult = np.tile(indices, (n_reps, 1)) # reorder the columns n_reps times - x_mult = np.tile(x, (n_reps, 1)).reshape((n_reps,) + x.shape) # new shape: (n_resp, x.shape[0], x.shape[1 - - # permute indices and apply to x_mult - permuted_indices = rng.permuted(indices_mult, axis=1) - x_mult_permuted = np.take_along_axis(x_mult, permuted_indices[:, np.newaxis], 2) - - # for the reduction below, need to have all samples of the same observation in one block - x_mult_permuted = reshape_rows_reps(x_mult_permuted) - - # make signature - sig = x_mult_permuted.argmax(axis=2) - return sig - - -def signature_to_bucket(signature, n_bands): - "Collect items with same bands in buckets" - num_cols = signature.shape[0] # number of documents to classify - bands = np.split(signature, n_bands, axis=1) - buckets = [] - for band in bands: - items_buckets = defaultdict(list) - items = np.vsplit(band, num_cols) - for i, item in enumerate(items): # this orders the row indices into groups that have the same signature - item = tuple(item.flatten().astype(int)) - items_buckets[item].append(i) # assign row i to item--ie, groups observations into buckets with the same signature - buckets.append(items_buckets) - - return buckets +# def reshape_rows_reps(a): +# "reshape a 3-d array of n_reps x n_rows x n_cols to n_rows x n_reps x n_cols" +# n_reps, n_rows, n_cols = a.shape +# a = a.reshape(n_reps*n_rows, n_cols) +# # extractor indices: for 3 reps, 2 rows: [0,2,4,1,3,5]. to reorder a +# # in other words: goes from 0 to (n_reps * n_rows). step sizes are n_rows. starts are the row indices +# idx = np.arange(n_reps*n_rows).reshape(n_reps, n_rows).T.reshape(-1,1) +# a = np.take_along_axis(a, idx, axis=0) +# a = a.reshape(n_rows, n_reps, n_cols) +# return a + +# def minhash_signature_np(x, n_reps): +# """Make a minhash signature of array x with length n_reps. 
+ +# Inputs +# ------ +# x: axis 0 are observations, columns are binary one-hot encoded vectors +# """ +# # get indices +# indices = np.arange(x.shape[1]) +# rng = np.random.default_rng(12345) # TODO: this should be defined at class instantiation + +# # expand by n_reps +# indices_mult = np.tile(indices, (n_reps, 1)) # reorder the columns n_reps times +# x_mult = np.tile(x, (n_reps, 1)).reshape((n_reps,) + x.shape) # new shape: (n_resp, x.shape[0], x.shape[1 + +# # permute indices and apply to x_mult +# permuted_indices = rng.permuted(indices_mult, axis=1) +# x_mult_permuted = np.take_along_axis(x_mult, permuted_indices[:, np.newaxis], 2) + +# # for the reduction below, need to have all samples of the same observation in one block +# x_mult_permuted = reshape_rows_reps(x_mult_permuted) + +# # make signature +# sig = x_mult_permuted.argmax(axis=2) +# return sig + + +# def signature_to_bucket(signature, n_bands): +# "Collect items with same bands in buckets" +# num_cols = signature.shape[0] # number of documents to classify +# bands = np.split(signature, n_bands, axis=1) +# buckets = [] +# for band in bands: +# items_buckets = defaultdict(list) +# items = np.vsplit(band, num_cols) +# for i, item in enumerate(items): # this orders the row indices into groups that have the same signature +# item = tuple(item.flatten().astype(int)) +# items_buckets[item].append(i) # assign row i to item--ie, groups observations into buckets with the same signature +# buckets.append(items_buckets) + +# return buckets ## new stuff def cols_to_int_multidim(a): @@ -191,17 +179,7 @@ def _build_vocab(self): vocab = list(set([shingle for sublist in self.shingles for shingle in sublist])) self.vocab = vocab - # def encode_binary(self, to_numpy=False): - # logging.debug(f"creating lists with binary vectors. Vocabulary size is {len(self.vocab)}") - # # pdb.set_trace() - # vectors = [[1 if word in cur_shingles else 0 for word in self.vocab] for cur_shingles in self.shingles] - # logging.debug(f"size of vectors: {sys.getsizeof(vectors)}") - # if not to_numpy: - # self.vectors = vectors - # else: - # logging.debug("putting to numpy") - # self.vectors = np.stack(vectors) - def encode_binary(self, sparse_output=True): + def encode_binary(self, sparse_output=True): # TODO: remove this argument """Create binary vectors for each mention. Parameters: @@ -258,57 +236,43 @@ def make_signature(self): sign = 1 + (products > 0) # TODO: can I change the downstream function for this? now it should be much easier to transform the signatures into a single string? self.signature = sign - - # while i < self.signature_size: - # plane = hyperplanes[i, :].transpose() - # out = self.vectors.dot(plane) - # out = out.toarray() - - # sig_i = (out > 0) - # sig_i = sig_i.astype(int) - # sig_i = 1 + sig_i # TODO: can I change the downstream function for this? now it should be much easier to transform the signatures into a single string? 
- # templist.append(sig_i) - # i += 1 - - # self.signature = np.stack(templist, axis=1).squeeze() - - def make_signature_np(self): - signature = minhash_signature_np(self.vectors, self.signature_size) - self.signature = signature + np.ones(signature.shape) # this is for the log10 operations: do not want to have 0s + # def make_signature_np(self): + # signature = minhash_signature_np(self.vectors, self.signature_size) + # self.signature = signature + np.ones(signature.shape) # this is for the log10 operations: do not want to have 0s def all_candidates_to_all(self): "fall-back option to return the non-clustered input: each mention is a candidate coreference for all" n_mentions = self.vectors.shape[0] self.candidates = [set(range(n_mentions)) for _ in range(n_mentions)] - def get_candidates(self): ## TODO: use itertools - "extract similar candidates for each mention by comparing subsets of the signature" - logging.debug("getting candidates...") - n_bands = int(self.signature_size / self.band_length) + # def get_candidates(self): ## TODO: use itertools + # "extract similar candidates for each mention by comparing subsets of the signature" + # logging.debug("getting candidates...") + # n_bands = int(self.signature_size / self.band_length) - if self.vectors.shape[0] == 1: - candidates = [set()] - candidates[0].add(0) - else: - bands = np.split(ary=self.signature, indices_or_sections=n_bands, axis=1) - candidates = [set() for _ in range(self.vectors.shape[0])] + # if self.vectors.shape[0] == 1: + # candidates = [set()] + # candidates[0].add(0) + # else: + # bands = np.split(ary=self.signature, indices_or_sections=n_bands, axis=1) + # candidates = [set() for _ in range(self.vectors.shape[0])] - # if len(candidates) > 1: - # TODO: can I speed this up? - for band in bands: - groups = idx_unique_multidim(band) - groups = [g for g in groups if g.shape[0] > 1] - for g in groups: - g = list(g) - for i in g: - for j in g: - if i != j: - candidates[i].add(j) - # else: # idx_unique_multidim above does not work when there is only one candidate - # candidates[0].add(0) - - self.candidates = candidates + # # if len(candidates) > 1: + # # TODO: can I speed this up? + # for band in bands: + # groups = idx_unique_multidim(band) + # groups = [g for g in groups if g.shape[0] > 1] + # for g in groups: + # g = list(g) + # for i in g: + # for j in g: + # if i != j: + # candidates[i].add(j) + # # else: # idx_unique_multidim above does not work when there is only one candidate + # # candidates[0].add(0) + + # self.candidates = candidates def get_candidates_new(self): "extract similar candidates for each mention by comparing subsets of the signature" diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index 9f4a92a..1491636 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -131,12 +131,10 @@ def with_coref(self, dataset, search_corefs_in="all"): # TODO: need to update th else: if search_corefs_in == 'lsh': input_mentions = [m["mention"] for m in content] - lsh_corefs = LSHMinHash(mentions=input_mentions, shingle_size=3, signature_size=900, band_length=15) + lsh_corefs = LSHMinHash(mentions=input_mentions, shingle_size=2, signature_size=800, band_length=10) # TODO: set optimal parameters here lsh_corefs.cluster() - # lsh_corefs.efficiency_gain_comparisons() assert len(content) == len(lsh_corefs.candidates) # lsh_corefs.candidates are the input for below. 
indices refer to index in input_mentions - # call lsh here on all mentions for idx_mention, cur_m in enumerate(content): if search_corefs_in == "lsh": idx_candidates = list(lsh_corefs.candidates[idx_mention]) # lsh returns the indices of the candidate coreferences From 50f6bfdfe9b6f624abfaf9e053c76f9b34bf4d31 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Mon, 23 Jan 2023 17:09:52 +0100 Subject: [PATCH 23/43] drop most of old code --- scripts/run_efficiency_tests.sh | 30 ++--- src/REL/lsh.py | 190 ++++++-------------------------- 2 files changed, 49 insertions(+), 171 deletions(-) diff --git a/scripts/run_efficiency_tests.sh b/scripts/run_efficiency_tests.sh index b957e40..7fb1cb5 100644 --- a/scripts/run_efficiency_tests.sh +++ b/scripts/run_efficiency_tests.sh @@ -12,23 +12,23 @@ echo $datasets echo "--Running efficiency tests by data set and n_docs--" # do profiling and checking predictions in one -# for size in ${docsizes[@]}; do -# for ds in ${datasets[@]}; do -# echo $ds, echo $size -# python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "all" -# python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "lsh" -# python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "off" -# done -# done +for size in ${docsizes[@]}; do + for ds in ${datasets[@]}; do + echo $ds, echo $size + python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "all" + python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "lsh" + python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "off" + done +done -echo "--Scaling number of mentions--" +# echo "--Scaling number of mentions--" -for ds in ${datasets[@]}; do - echo $ds - python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "all" - python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "lsh" - python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "off" -done +# for ds in ${datasets[@]}; do +# echo $ds +# python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "all" +# python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "lsh" +# python scripts/efficiency_test.py --name_dataset "$ds" --scale_mentions --profile --search_corefs "off" +# done echo "Done." 
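Note: the following hunks drop the remaining MinHash/permutation code, keeping only the random-projection signature plus the vectorized banding. The self-contained sketch below shows that pipeline end to end; the toy binary matrix, signature size, and band length are made up for illustration, and the bucketing uses a plain dict instead of the patch's cols_to_int_multidim / group_unique_indices helpers.

    # Minimal sketch of the random-projection + banding scheme kept after this patch.
    # Toy data and parameter values are illustrative only.
    import numpy as np
    from scipy import sparse
    from collections import defaultdict

    rng = np.random.default_rng(seed=3)

    vectors = sparse.csr_matrix(rng.integers(0, 2, size=(6, 40)))   # stand-in for the binary shingle matrix
    signature_size, band_length = 8, 2
    n_bands = signature_size // band_length

    # project onto random +/-1 hyperplanes and keep the sign (1 or 2, as in the patch)
    hyperplanes = sparse.csr_matrix(rng.choice([-1, 1], (signature_size, vectors.shape[1])))
    signature = 1 + (vectors.dot(hyperplanes.transpose()).toarray() > 0)   # one row per mention

    # band the signature: mentions that agree on an entire band become candidates
    candidates = [set() for _ in range(vectors.shape[0])]
    for band in np.split(signature, n_bands, axis=1):
        buckets = defaultdict(list)
        for i, row in enumerate(band):
            buckets[tuple(row)].append(i)        # mentions sharing this band fall into one bucket
        for group in buckets.values():
            if len(group) > 1:
                for i in group:
                    candidates[i].update(j for j in group if j != i)

    print(candidates)    # candidate coreferences per mention

The dict-based bucketing above is the readable version of what the vectorized helpers do: cols_to_int_multidim collapses each band into a single integer key and group_unique_indices groups rows with equal keys, avoiding the Python-level loop over bands and rows.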
diff --git a/src/REL/lsh.py b/src/REL/lsh.py index 44dbfbd..07d1d69 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -1,15 +1,11 @@ -from random import shuffle, seed import time import numpy as np import logging -from collections import defaultdict from sklearn.preprocessing import MultiLabelBinarizer import itertools import pdb -import sys from scipy import sparse -seed(3) def k_shingle(s, k): @@ -20,92 +16,22 @@ def k_shingle(s, k): return shingle -# def cols_to_int(a): -# "combine columns in all rows to an integer: [[1,20,3], [1,4,10]] becomes [1203,1410]" -# existing_powers = np.floor(np.log10(a)) -# nrows, ncols = a.shape -# cumsum_powers = np.fliplr(np.cumsum(np.fliplr(existing_powers), axis=1)) +# def idx_unique_multidim(a): +# "groups row indices in a multidimensional arrays by their unique signature" +# # a = cols_to_int(a).squeeze() # wrong +# # a = cols_to_string(a).squeeze() # slow +# a = cols_to_int(a).squeeze() +# sort_idx = np.argsort(a) +# sort_idx +# a_sorted = a[sort_idx] +# unq_first = np.concatenate(([True], a_sorted[1:] != a_sorted[:-1])) # "is the current value different from the previous?". the concat of [True]: because the first occurrence is always True (ie the first time it occur) +# unq_items = a_sorted[unq_first] +# unq_count = np.diff(np.nonzero(unq_first)[0]) # np.nonzero(unq_first)[0] gives the indices of first elements in a_sorted +# unq_idx = np.split(sort_idx, np.cumsum(unq_count)) +# return unq_idx -# add_powers = [x for x in reversed(range(ncols))] -# add_powers = np.tile(add_powers, (nrows, 1)) -# mult_factor = cumsum_powers - existing_powers + add_powers -# summationvector = np.ones((ncols, 1)) -# out = np.matmul(a * 10**mult_factor, summationvector) -# return out - - - -def idx_unique_multidim(a): - "groups row indices in a multidimensional arrays by their unique signature" - # a = cols_to_int(a).squeeze() # wrong - # a = cols_to_string(a).squeeze() # slow - a = cols_to_int(a).squeeze() - sort_idx = np.argsort(a) - sort_idx - a_sorted = a[sort_idx] - unq_first = np.concatenate(([True], a_sorted[1:] != a_sorted[:-1])) # "is the current value different from the previous?". the concat of [True]: because the first occurrence is always True (ie the first time it occur) - unq_items = a_sorted[unq_first] - unq_count = np.diff(np.nonzero(unq_first)[0]) # np.nonzero(unq_first)[0] gives the indices of first elements in a_sorted - unq_idx = np.split(sort_idx, np.cumsum(unq_count)) - return unq_idx - - -# def reshape_rows_reps(a): -# "reshape a 3-d array of n_reps x n_rows x n_cols to n_rows x n_reps x n_cols" -# n_reps, n_rows, n_cols = a.shape -# a = a.reshape(n_reps*n_rows, n_cols) -# # extractor indices: for 3 reps, 2 rows: [0,2,4,1,3,5]. to reorder a -# # in other words: goes from 0 to (n_reps * n_rows). step sizes are n_rows. starts are the row indices -# idx = np.arange(n_reps*n_rows).reshape(n_reps, n_rows).T.reshape(-1,1) -# a = np.take_along_axis(a, idx, axis=0) -# a = a.reshape(n_rows, n_reps, n_cols) -# return a - -# def minhash_signature_np(x, n_reps): -# """Make a minhash signature of array x with length n_reps. 
- -# Inputs -# ------ -# x: axis 0 are observations, columns are binary one-hot encoded vectors -# """ -# # get indices -# indices = np.arange(x.shape[1]) -# rng = np.random.default_rng(12345) # TODO: this should be defined at class instantiation - -# # expand by n_reps -# indices_mult = np.tile(indices, (n_reps, 1)) # reorder the columns n_reps times -# x_mult = np.tile(x, (n_reps, 1)).reshape((n_reps,) + x.shape) # new shape: (n_resp, x.shape[0], x.shape[1 - -# # permute indices and apply to x_mult -# permuted_indices = rng.permuted(indices_mult, axis=1) -# x_mult_permuted = np.take_along_axis(x_mult, permuted_indices[:, np.newaxis], 2) - -# # for the reduction below, need to have all samples of the same observation in one block -# x_mult_permuted = reshape_rows_reps(x_mult_permuted) - -# # make signature -# sig = x_mult_permuted.argmax(axis=2) -# return sig - - -# def signature_to_bucket(signature, n_bands): -# "Collect items with same bands in buckets" -# num_cols = signature.shape[0] # number of documents to classify -# bands = np.split(signature, n_bands, axis=1) -# buckets = [] -# for band in bands: -# items_buckets = defaultdict(list) -# items = np.vsplit(band, num_cols) -# for i, item in enumerate(items): # this orders the row indices into groups that have the same signature -# item = tuple(item.flatten().astype(int)) -# items_buckets[item].append(i) # assign row i to item--ie, groups observations into buckets with the same signature -# buckets.append(items_buckets) - -# return buckets - -## new stuff def cols_to_int_multidim(a): "combine columns in all rows to an integer: [[1,20,3], [1,4,10]] becomes [1203,1410]" existing_powers = np.floor(np.log10(a)) @@ -195,7 +121,7 @@ def encode_binary(self, sparse_output=True): # TODO: remove this argument class LSHMinHash(LSHBase): "LSH with MinHashing and numpy" - def __init__(self, mentions, shingle_size, signature_size, band_length, sparse_binary=True): + def __init__(self, mentions, shingle_size, signature_size, band_length, sparse_binary=True, seed=3): # sparse_binary: should the sparse 0/1 matrix be stored with scipy sparse? takes less memory. super().__init__(mentions, shingle_size) if signature_size % band_length != 0: @@ -203,78 +129,35 @@ def __init__(self, mentions, shingle_size, signature_size, band_length, sparse_b self.signature_size = signature_size self.band_length = band_length self.sparse_binary = sparse_binary + self.rng = np.random.default_rng(seed=seed) def make_signature(self): "make array of dense vectors with MinHashing. each row is one mention" logging.debug(f"Making signature. vectors shape is {self.vectors.shape}") - # pdb.set_trace() - templist = [] - rng = np.random.default_rng(seed=3) - i = 0 - if isinstance(self.vectors, np.ndarray): - logging.debug("using binary numpy arrays") - while i < self.signature_size: - rng.shuffle(self.vectors, axis=1) - sig_i = 1 + self.vectors.argmax(axis=1) # add one for the log10 operations in idx_unique_multidim - templist.append(sig_i) - i += 1 - self.signature = np.stack(templist, axis=1) - else: # older versions of scipy have not _coo attribute. TODO: fix this + # rng = np.random.default_rng(seed=3) + # older versions of scipy have not _coo attribute. TODO: fix this # elif isinstance(self.vectors, sparse._coo.coo_matrix): # not sure how efficient this is. switching a lot between data structures. 
- logging.debug('using binary sparse matrices') - rng = np.random.default_rng(seed=3) # TODO: put this to class instantiation - # vectors = mylsh.vectors - logging.debug("making hyperplanes") - hyperplanes = rng.choice([-1, 1], (self.signature_size, self.vectors.shape[1])) - # TODO: make vectors a csr matrix (?) - hyperplanes = sparse.csr_matrix(hyperplanes) - logging.debug("making dot product") - products = self.vectors.dot(hyperplanes.transpose()) - logging.debug("making signature") - products = products.toarray() - sign = 1 + (products > 0) # TODO: can I change the downstream function for this? now it should be much easier to transform the signatures into a single string? - self.signature = sign - - - # def make_signature_np(self): - # signature = minhash_signature_np(self.vectors, self.signature_size) - # self.signature = signature + np.ones(signature.shape) # this is for the log10 operations: do not want to have 0s + logging.debug('using binary sparse matrices') + # rng = np.random.default_rng(seed=3) # TODO: put this to class instantiation + # vectors = mylsh.vectors + logging.debug("making hyperplanes") + hyperplanes = self.rng.choice([-1, 1], (self.signature_size, self.vectors.shape[1])) + # TODO: make vectors a csr matrix (?) + hyperplanes = sparse.csr_matrix(hyperplanes) + logging.debug("making dot product") + products = self.vectors.dot(hyperplanes.transpose()) + logging.debug("making signature") + products = products.toarray() + sign = 1 + (products > 0) # TODO: can I change the downstream function for this? now it should be much easier to transform the signatures into a single string? + self.signature = sign def all_candidates_to_all(self): "fall-back option to return the non-clustered input: each mention is a candidate coreference for all" n_mentions = self.vectors.shape[0] self.candidates = [set(range(n_mentions)) for _ in range(n_mentions)] - # def get_candidates(self): ## TODO: use itertools - # "extract similar candidates for each mention by comparing subsets of the signature" - # logging.debug("getting candidates...") - # n_bands = int(self.signature_size / self.band_length) - - # if self.vectors.shape[0] == 1: - # candidates = [set()] - # candidates[0].add(0) - # else: - # bands = np.split(ary=self.signature, indices_or_sections=n_bands, axis=1) - # candidates = [set() for _ in range(self.vectors.shape[0])] - - # # if len(candidates) > 1: - # # TODO: can I speed this up? - # for band in bands: - # groups = idx_unique_multidim(band) - # groups = [g for g in groups if g.shape[0] > 1] - # for g in groups: - # g = list(g) - # for i in g: - # for j in g: - # if i != j: - # candidates[i].add(j) - # # else: # idx_unique_multidim above does not work when there is only one candidate - # # candidates[0].add(0) - - # self.candidates = candidates - - def get_candidates_new(self): + def get_candidates(self): "extract similar candidates for each mention by comparing subsets of the signature" logging.debug("getting candidates...") n_bands = int(self.signature_size / self.band_length) @@ -306,19 +189,14 @@ def cluster(self, numpy_signature=False, candidates="new"): # TODO: tidy this, o logging.debug("encoding to binary") self.encode_binary(sparse_output=self.sparse_binary) logging.debug("making signature") + if self.vectors.shape[1] == 0: # no signature possible b/c no mention is longer than the shingle size. 
print('self.vectors.shape[1] is 0.') self.all_candidates_to_all() else: - if numpy_signature: - self.make_signature_np() - else: - self.make_signature() + self.make_signature() logging.debug("getting candidate groups") - if candidates == "old": - self.get_candidates() - elif candidates == "new": # this seems to be slower than the old approach - self.get_candidates_new() + self.get_candidates() self.time = time.time() - start def summarise(self): From 017b03e395ade5c76c01ab132125492243da131a Mon Sep 17 00:00:00 2001 From: f-hafner Date: Tue, 24 Jan 2023 10:22:19 +0100 Subject: [PATCH 24/43] lsh class: tidy, add docstrings --- src/REL/lsh.py | 92 ++++++++++++++++++++++---------------------------- 1 file changed, 41 insertions(+), 51 deletions(-) diff --git a/src/REL/lsh.py b/src/REL/lsh.py index 07d1d69..86ae30d 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -7,6 +7,10 @@ import pdb from scipy import sparse +# TODO: + # document the class? + # add the academic references + def k_shingle(s, k): "convert string s into shingles of length k" @@ -16,24 +20,18 @@ def k_shingle(s, k): return shingle +def cols_to_int_multidim(a): + """ + Combine columns in all rows to an integer: [[1,20,3], [1,4,10]] becomes [1203,1410]. -# def idx_unique_multidim(a): -# "groups row indices in a multidimensional arrays by their unique signature" -# # a = cols_to_int(a).squeeze() # wrong -# # a = cols_to_string(a).squeeze() # slow -# a = cols_to_int(a).squeeze() -# sort_idx = np.argsort(a) -# sort_idx -# a_sorted = a[sort_idx] -# unq_first = np.concatenate(([True], a_sorted[1:] != a_sorted[:-1])) # "is the current value different from the previous?". the concat of [True]: because the first occurrence is always True (ie the first time it occur) -# unq_items = a_sorted[unq_first] -# unq_count = np.diff(np.nonzero(unq_first)[0]) # np.nonzero(unq_first)[0] gives the indices of first elements in a_sorted -# unq_idx = np.split(sort_idx, np.cumsum(unq_count)) -# return unq_idx - + Notes + ------ + Advantage: uses vectorized numpy to create a unique signature. + Disadvantage: Because one additional row increases the size of the integer at least by an order of magnitude, + this only works for cases where the bands are not too large (otherwise integer overflow problems). -def cols_to_int_multidim(a): - "combine columns in all rows to an integer: [[1,20,3], [1,4,10]] becomes [1203,1410]" + In practice I have found that optimal bands are typically not long enough to cause problems. + """ existing_powers = np.floor(np.log10(a)) n_bands, nrows, ncols = a.shape @@ -68,7 +66,7 @@ def vectorize_signature_bands(a, n_bands, band_length): # this replaces idx_multidim def group_unique_indices(a): """ - calculate groups of indices of unique rows in a multidimensional array with the same signature + Calculate groups of indices of unique rows in a multidimensional array with the same signature the groups are returned by band. Returns a list of lists. One list corresponds to each band, and it indicates the rows @@ -101,71 +99,61 @@ def __init__(self, mentions, shingle_size): self.shingles = [k_shingle(m, shingle_size) for m in mentions] def _build_vocab(self): - # shingles = [v["shingles"] for v in self.mentions.values()] + "Build a vocabulary of the shingles in a document." vocab = list(set([shingle for sublist in self.shingles for shingle in sublist])) self.vocab = vocab - def encode_binary(self, sparse_output=True): # TODO: remove this argument - """Create binary vectors for each mention. 
- - Parameters: - ---------- - sparse_output: Argument passed to `sklearn.preprocessing.MultiLabelBinarizer()`. + def encode_binary(self): + """ + Create sparse binary vectors for each mention. + + Output: CSR sparse matrix. + Rows indicate mentions, columns indicate whether the mention contains the shingle. """ logging.debug("making one-hot vectors") - binarizer = MultiLabelBinarizer(sparse_output=sparse_output) - vectors = binarizer.fit_transform(self.shingles) - self.vectors = vectors + binarizer = MultiLabelBinarizer(sparse_output=True) + self.vectors = binarizer.fit_transform(self.shingles) class LSHMinHash(LSHBase): "LSH with MinHashing and numpy" - def __init__(self, mentions, shingle_size, signature_size, band_length, sparse_binary=True, seed=3): + def __init__(self, mentions, shingle_size, signature_size, band_length, seed=3): # sparse_binary: should the sparse 0/1 matrix be stored with scipy sparse? takes less memory. super().__init__(mentions, shingle_size) if signature_size % band_length != 0: raise ValueError("Signature needs to be divisible into equal-sized bands.") self.signature_size = signature_size self.band_length = band_length - self.sparse_binary = sparse_binary self.rng = np.random.default_rng(seed=seed) def make_signature(self): - "make array of dense vectors with MinHashing. each row is one mention" + """ + Create a signature for a given mention, using random projections. + """ logging.debug(f"Making signature. vectors shape is {self.vectors.shape}") - # rng = np.random.default_rng(seed=3) - # older versions of scipy have not _coo attribute. TODO: fix this - # elif isinstance(self.vectors, sparse._coo.coo_matrix): - # not sure how efficient this is. switching a lot between data structures. - logging.debug('using binary sparse matrices') - # rng = np.random.default_rng(seed=3) # TODO: put this to class instantiation - # vectors = mylsh.vectors - logging.debug("making hyperplanes") + # TODO: can this be more memory-efficient by generating directly the scipy sparse function? + # https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.random.html hyperplanes = self.rng.choice([-1, 1], (self.signature_size, self.vectors.shape[1])) - # TODO: make vectors a csr matrix (?) hyperplanes = sparse.csr_matrix(hyperplanes) - logging.debug("making dot product") - products = self.vectors.dot(hyperplanes.transpose()) - logging.debug("making signature") - products = products.toarray() + products = self.vectors.dot(hyperplanes.transpose()).toarray() + # products = products.toarray() sign = 1 + (products > 0) # TODO: can I change the downstream function for this? now it should be much easier to transform the signatures into a single string? 
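        # Illustrative editorial comment (not part of the original patch):
        # `products > 0` is a boolean array, so `sign` holds 1 where the dot
        # product with a hyperplane is non-positive and 2 where it is positive.
        # Keeping the entries at 1/2 rather than 0/1 means every signature value
        # is a single non-zero digit, which the log10-based cols_to_int_multidim()
        # defined above depends on when collapsing a band into one integer.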
self.signature = sign def all_candidates_to_all(self): - "fall-back option to return the non-clustered input: each mention is a candidate coreference for all" + "Fall-back option to return the non-clustered input: each mention is a candidate coreference for all" n_mentions = self.vectors.shape[0] self.candidates = [set(range(n_mentions)) for _ in range(n_mentions)] def get_candidates(self): - "extract similar candidates for each mention by comparing subsets of the signature" + "Extract similar candidates for each mention by comparing subsets of the signature" logging.debug("getting candidates...") n_bands = int(self.signature_size / self.band_length) if self.vectors.shape[0] == 1: candidates = [set()] candidates[0].add(0) else: - # bands = np.split(ary=self.signature, indices_or_sections=n_bands, axis=1) candidates = [set() for _ in range(self.vectors.shape[0])] bands = vectorize_signature_bands(self.signature, n_bands=n_bands, band_length=self.band_length) @@ -180,18 +168,19 @@ def get_candidates(self): [candidates[i].discard(i) for i in range(len(candidates))] self.candidates = candidates - - def cluster(self, numpy_signature=False, candidates="new"): # TODO: tidy this, only use the new function for getting candidates - "find similar records for each mention" + def cluster(self): + """ + Cluster mentions together based on their similarity. This is the main functionality of the LSH class. + """ start = time.time() logging.debug("building vocabulary") self._build_vocab() logging.debug("encoding to binary") - self.encode_binary(sparse_output=self.sparse_binary) + self.encode_binary() logging.debug("making signature") if self.vectors.shape[1] == 0: # no signature possible b/c no mention is longer than the shingle size. - print('self.vectors.shape[1] is 0.') + logging.debug('self.vectors.shape[1] is 0.') self.all_candidates_to_all() else: self.make_signature() @@ -200,6 +189,7 @@ def cluster(self, numpy_signature=False, candidates="new"): # TODO: tidy this, o self.time = time.time() - start def summarise(self): + "Summarise the time taken and output from clustering one LSH instance." sizes = [len(g) for g in self.candidates] print(f"took {self.time} seconds for {len(self.candidates)} mentions") print(f"average, min, max cluster size: {round(sum(sizes)/len(sizes),2)}, {min(sizes)}, {max(sizes)}") From 780cee225bbb5efb72e5a5412e3b8b8db9fac535 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Tue, 24 Jan 2023 10:36:40 +0100 Subject: [PATCH 25/43] give right name to main class: random projections --- src/REL/lsh.py | 16 ++++++++++++++-- src/REL/training_datasets.py | 4 ++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/REL/lsh.py b/src/REL/lsh.py index 86ae30d..d7d4909 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -115,8 +115,20 @@ def encode_binary(self): self.vectors = binarizer.fit_transform(self.shingles) -class LSHMinHash(LSHBase): - "LSH with MinHashing and numpy" +class LSHRandomProjections(LSHBase): + """ + A class for locality-sensitive hashing with random projections. + + + Parameters: + ----------- + mentions: + shingle_size: + signature_size: + band_length: + seed: + """ + # TODO: document more def __init__(self, mentions, shingle_size, signature_size, band_length, seed=3): # sparse_binary: should the sparse 0/1 matrix be stored with scipy sparse? takes less memory. 
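As an illustrative aside (not part of the patch series), a minimal usage sketch of the renamed class, mirroring the constructor arguments used in the training_datasets.py hunk just below; the exact candidate sets depend on the random hyperplanes:

    from REL.lsh import LSHRandomProjections

    mentions = ["Jimi Hendrix", "Hendrix", "Germany"]
    lsh = LSHRandomProjections(mentions=mentions, shingle_size=2,
                               signature_size=800, band_length=10)
    lsh.cluster()
    # lsh.candidates holds one set of mention indices per mention; typically the
    # set for "Hendrix" contains the index of "Jimi Hendrix", and each mention's
    # own index is discarded.
    print(lsh.candidates)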
diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index 1491636..4825ba6 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -1,7 +1,7 @@ import os import pickle import pdb -from REL.lsh import LSHMinHash +from REL.lsh import LSHRandomProjections import logging class TrainingEvaluationDatasets: @@ -131,7 +131,7 @@ def with_coref(self, dataset, search_corefs_in="all"): # TODO: need to update th else: if search_corefs_in == 'lsh': input_mentions = [m["mention"] for m in content] - lsh_corefs = LSHMinHash(mentions=input_mentions, shingle_size=2, signature_size=800, band_length=10) # TODO: set optimal parameters here + lsh_corefs = LSHRandomProjections(mentions=input_mentions, shingle_size=2, signature_size=800, band_length=10) # TODO: set optimal parameters here lsh_corefs.cluster() assert len(content) == len(lsh_corefs.candidates) # lsh_corefs.candidates are the input for below. indices refer to index in input_mentions From afa63d93ab967891953515b5285c787b1e32cadd Mon Sep 17 00:00:00 2001 From: f-hafner Date: Tue, 24 Jan 2023 13:09:55 +0100 Subject: [PATCH 26/43] start tests, fix bug in cols_to_int_multidim --- src/REL/lsh.py | 10 ++++---- tests/test_lsh.py | 59 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 5 deletions(-) create mode 100644 tests/test_lsh.py diff --git a/src/REL/lsh.py b/src/REL/lsh.py index d7d4909..3db6990 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -28,14 +28,15 @@ def cols_to_int_multidim(a): ------ Advantage: uses vectorized numpy to create a unique signature. Disadvantage: Because one additional row increases the size of the integer at least by an order of magnitude, - this only works for cases where the bands are not too large (otherwise integer overflow problems). + this only works for cases where the bands are not too large. In practice I have found that optimal bands are typically not long enough to cause problems. """ existing_powers = np.floor(np.log10(a)) n_bands, nrows, ncols = a.shape - cumsum_powers = np.fliplr(np.cumsum(np.fliplr(existing_powers), axis=1)) + # cumsum_powers = np.fliplr(np.cumsum(np.fliplr(existing_powers), axis=1)) + cumsum_powers = np.flip(np.cumsum(np.flip(existing_powers, axis=2), axis=2), axis=2) add_powers = [x for x in reversed(range(ncols))] add_powers = np.tile(add_powers, (nrows, 1)) @@ -66,11 +67,10 @@ def vectorize_signature_bands(a, n_bands, band_length): # this replaces idx_multidim def group_unique_indices(a): """ - Calculate groups of indices of unique rows in a multidimensional array with the same signature - the groups are returned by band. + Calculate groups of indices of unique rows in a multidimensional array with the same signature. Returns a list of lists. One list corresponds to each band, and it indicates the rows - of a that have the same band. + of `a` that have the same band. 
""" n_bands, n_items, length_band = a.shape a = cols_to_int_multidim(a).squeeze() diff --git a/tests/test_lsh.py b/tests/test_lsh.py new file mode 100644 index 0000000..9678bc9 --- /dev/null +++ b/tests/test_lsh.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +from pathlib import Path + +from REL.lsh import vectorize_signature_bands, group_unique_indices, cols_to_int_multidim +import numpy as np +import itertools + + + +def test_cols_to_int_multidim(): + a = np.array([[[1, 20, 3], [1, 4, 10]], + [[1, 3, 5], [100, 3, 50]]] + ) + output = cols_to_int_multidim(a) + expected = np.array( + [ + [[1203], [1410]], + [[135], [100350]] + ] + ) + assert np.all(output == expected), "rows do not convert correctly to integer" + +def test_vectorize_signature_bands(): + a = np.array([[1, 4, 7, 8, 10, 8], [5, 3, 2, 6, 11, 0], [1, 4, 2, 6, 13, 15]]) + + n_bands = 2 + n_items = a.shape[0] + band_length = int(a.shape[1]/n_bands) + result = vectorize_signature_bands(a, n_bands=n_bands, band_length=band_length) + + expected = np.vstack(np.split(a, n_bands, axis=1)).reshape(n_bands, n_items, -1) + assert np.all(result == expected), "signature bands not vectorized correctly" + + + +def test_group_unique_indices(): + a = np.array([[[1, 4], [1, 4], [5,3], [5, 3], [1 , 2]], + [[7,8], [2, 7], [2, 7], [7, 8], [10, 3]] + ]) + output = group_unique_indices(a) + + # build expected + groups_band0 = [[0, 1], [2, 3]] + groups_band1 = [[1, 2], [0, 3]] + # Notes: + # [1,2], [10,3] are not listed because their group is of size 1. + # [2,7] is before [7, 8] because 27 < 78 + groups_band0 = [np.array(i) for i in groups_band0] + groups_band1 = [np.array(i) for i in groups_band1] + expected = [groups_band0, groups_band1] + + o = itertools.chain.from_iterable(output) + e = itertools.chain.from_iterable(expected) + + # test + assert all([np.all(i==j) for i, j in zip(o, e)]), "unique indices not grouped correctly" + From 136659b4536c3a06b8a543a951efabea10ce21e5 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Tue, 24 Jan 2023 14:01:52 +0100 Subject: [PATCH 27/43] improve docstrings --- src/REL/lsh.py | 72 ++++++++++++++++++++++++++++++++++------------- tests/test_lsh.py | 24 ++++++++++++---- 2 files changed, 70 insertions(+), 26 deletions(-) diff --git a/src/REL/lsh.py b/src/REL/lsh.py index 3db6990..aa1b4f6 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -13,7 +13,11 @@ def k_shingle(s, k): - "convert string s into shingles of length k" + """ + Convert string s into shingles of length k + + :return: List of shingles + """ shingle = [] for i in range(len(s) - k + 1): shingle.append(s[i:(i+k)]) @@ -24,13 +28,16 @@ def cols_to_int_multidim(a): """ Combine columns in all rows to an integer: [[1,20,3], [1,4,10]] becomes [1203,1410]. + :return: An array of shape (n, 1), where the horizontally neighboring column values + are appended together. + Notes ------ Advantage: uses vectorized numpy to create a unique signature. Disadvantage: Because one additional row increases the size of the integer at least by an order of magnitude, this only works for cases where the bands are not too large. - In practice I have found that optimal bands are typically not long enough to cause problems. + In practice, optimal bands are typically not long enough to cause problems. 
""" existing_powers = np.floor(np.log10(a)) n_bands, nrows, ncols = a.shape @@ -46,12 +53,19 @@ def cols_to_int_multidim(a): out = np.matmul(a * 10**mult_factor, summationvector) return out -def vectorize_signature_bands(a, n_bands, band_length): +def signature_to_3d_bands(a, n_bands, band_length): """ - Convert a signature array of dimension (n_items, signature_length) into an array of (n_bands, n_items, band_length). + Convert a signature array of dimension (n_items, signature_length) into an array + of (n_bands, n_items, band_length). + + :return: An array of shape (n_bands, n_items, band_length) - This is a vectorized version for np.vstack(np.split(a, indices_or_sections=n_bands, axis=1)). - The idea is to then use a vectorized function to extract the indices, instead of looping over each element in the output of np.split(). + Details: + -------- + This produces the same output as np.vstack(np.split(a, indices_or_sections=n_bands, axis=1)). + When further processing the output, this is a useful alternative to looping on the output of + np.split(a, indices_or_sections=n_bands, axis=1) because a single vectorized call can be used, + while np.vstack(np.split(...)) is likely to be less efficient. """ n_items, signature_length = a.shape @@ -67,10 +81,13 @@ def vectorize_signature_bands(a, n_bands, band_length): # this replaces idx_multidim def group_unique_indices(a): """ - Calculate groups of indices of unique rows in a multidimensional array with the same signature. + In a 3-dimensional array, for each array (axis 0), + calculate the indices of rows (axis=1) that are identical. - Returns a list of lists. One list corresponds to each band, and it indicates the rows - of `a` that have the same band. + :return: a list of lists. Outer lists correspond to bands. + Inner lists correspond to the row indices that + have the same values in their columns. An item + in the inner list is an np.array. """ n_bands, n_items, length_band = a.shape a = cols_to_int_multidim(a).squeeze() @@ -91,6 +108,10 @@ def group_unique_indices(a): return unq_idx class LSHBase: + """ + Base class for locality-sensitive hashing, + with methods for one-hot encoding and building a vocabulary of shingles + """ # Important: order of occurences in shingles and vectors = order of input list (=order of occurrence in document) def __init__(self, mentions, shingle_size): if isinstance(mentions, dict): @@ -99,7 +120,9 @@ def __init__(self, mentions, shingle_size): self.shingles = [k_shingle(m, shingle_size) for m in mentions] def _build_vocab(self): - "Build a vocabulary of the shingles in a document." + """ + Build a vocabulary of the shingles in a document. + """ vocab = list(set([shingle for sublist in self.shingles for shingle in sublist])) self.vocab = vocab @@ -107,8 +130,8 @@ def encode_binary(self): """ Create sparse binary vectors for each mention. - Output: CSR sparse matrix. - Rows indicate mentions, columns indicate whether the mention contains the shingle. + :return: CSR sparse matrix. Rows indicate mentions, columns indicate whether + the mention contains the shingle. """ logging.debug("making one-hot vectors") binarizer = MultiLabelBinarizer(sparse_output=True) @@ -117,7 +140,7 @@ def encode_binary(self): class LSHRandomProjections(LSHBase): """ - A class for locality-sensitive hashing with random projections. + Class for locality-sensitive hashing with random projections. 
Parameters: @@ -149,17 +172,25 @@ def make_signature(self): hyperplanes = self.rng.choice([-1, 1], (self.signature_size, self.vectors.shape[1])) hyperplanes = sparse.csr_matrix(hyperplanes) products = self.vectors.dot(hyperplanes.transpose()).toarray() - # products = products.toarray() - sign = 1 + (products > 0) # TODO: can I change the downstream function for this? now it should be much easier to transform the signatures into a single string? + sign = 1 + (products > 0) self.signature = sign def all_candidates_to_all(self): - "Fall-back option to return the non-clustered input: each mention is a candidate coreference for all" + """ + Fall-back option to return the non-clustered input. + Each mention is a candidate coreference for all mentions. This is useful in + edge cases where no single mention is longer than the shingle size. + """ n_mentions = self.vectors.shape[0] self.candidates = [set(range(n_mentions)) for _ in range(n_mentions)] def get_candidates(self): - "Extract similar candidates for each mention by comparing subsets of the signature" + """ + For each mention, extract most similar mentions based on whether part + of their signatures overlap. + + :return: list of sets of candidate indices. + """ logging.debug("getting candidates...") n_bands = int(self.signature_size / self.band_length) if self.vectors.shape[0] == 1: @@ -168,7 +199,7 @@ def get_candidates(self): else: candidates = [set() for _ in range(self.vectors.shape[0])] - bands = vectorize_signature_bands(self.signature, n_bands=n_bands, band_length=self.band_length) + bands = signature_to_3d_bands(self.signature, n_bands=n_bands, band_length=self.band_length) buckets_by_band = group_unique_indices(bands) groups = [tuple(i) for i in itertools.chain.from_iterable(buckets_by_band)] # flatten group; use tuple for applying set() groups = set(groups) # we only need the unique groups @@ -182,7 +213,9 @@ def get_candidates(self): def cluster(self): """ - Cluster mentions together based on their similarity. This is the main functionality of the LSH class. + Main functionality of this class: cluster mentions together based on their similarity. + + :return: for each mention, mention index of most similar other mentions based on LSH. """ start = time.time() logging.debug("building vocabulary") @@ -190,7 +223,6 @@ def cluster(self): logging.debug("encoding to binary") self.encode_binary() logging.debug("making signature") - if self.vectors.shape[1] == 0: # no signature possible b/c no mention is longer than the shingle size. 
logging.debug('self.vectors.shape[1] is 0.') self.all_candidates_to_all() diff --git a/tests/test_lsh.py b/tests/test_lsh.py index 9678bc9..ba22137 100644 --- a/tests/test_lsh.py +++ b/tests/test_lsh.py @@ -3,17 +3,23 @@ from pathlib import Path -from REL.lsh import vectorize_signature_bands, group_unique_indices, cols_to_int_multidim +import REL.lsh as lsh +# from REL.lsh import vectorize_signature_bands, group_unique_indices, cols_to_int_multidim import numpy as np import itertools +def test_k_shingle(): + output = lsh.k_shingle("random string", 5) + expected = ["rando", "andom", "ndom ", "dom s", "om st", "m str", " stri", "strin", "tring"] + assert output == expected, "shingles not built correctly" + def test_cols_to_int_multidim(): a = np.array([[[1, 20, 3], [1, 4, 10]], [[1, 3, 5], [100, 3, 50]]] ) - output = cols_to_int_multidim(a) + output = lsh.cols_to_int_multidim(a) expected = np.array( [ [[1203], [1410]], @@ -22,13 +28,19 @@ def test_cols_to_int_multidim(): ) assert np.all(output == expected), "rows do not convert correctly to integer" -def test_vectorize_signature_bands(): - a = np.array([[1, 4, 7, 8, 10, 8], [5, 3, 2, 6, 11, 0], [1, 4, 2, 6, 13, 15]]) +def test_signature_to_3d_bands(): + a = np.array( + [ + [1, 4, 7, 8, 10, 8], + [5, 3, 2, 6, 11, 0], + [1, 4, 2, 6, 13, 15] + ] + ) n_bands = 2 n_items = a.shape[0] band_length = int(a.shape[1]/n_bands) - result = vectorize_signature_bands(a, n_bands=n_bands, band_length=band_length) + result = lsh.signature_to_3d_bands(a, n_bands=n_bands, band_length=band_length) expected = np.vstack(np.split(a, n_bands, axis=1)).reshape(n_bands, n_items, -1) assert np.all(result == expected), "signature bands not vectorized correctly" @@ -39,7 +51,7 @@ def test_group_unique_indices(): a = np.array([[[1, 4], [1, 4], [5,3], [5, 3], [1 , 2]], [[7,8], [2, 7], [2, 7], [7, 8], [10, 3]] ]) - output = group_unique_indices(a) + output = lsh.group_unique_indices(a) # build expected groups_band0 = [[0, 1], [2, 3]] From 35a0a32f73005fe67cf18c3270a897a0b0a331ba Mon Sep 17 00:00:00 2001 From: f-hafner Date: Tue, 24 Jan 2023 14:29:37 +0100 Subject: [PATCH 28/43] n_bands and band_length as main inputs to class --- src/REL/lsh.py | 21 +++++++++++++-------- src/REL/training_datasets.py | 8 +++++++- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/src/REL/lsh.py b/src/REL/lsh.py index aa1b4f6..d1c05e2 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -1,3 +1,10 @@ +""" +This implements a simple version of locality-sensitive hashing. +The main reference is chapter 3 in "Mining of Massive Datasets" (http://www.mmds.org/). + +To allow for high-dimensional data, it stores the feature vectors as sparse matrices, +and uses random projections as hash functions. +""" import time import numpy as np @@ -8,8 +15,7 @@ from scipy import sparse # TODO: - # document the class? - # add the academic references + # document the class? -- after swapping the arguments def k_shingle(s, k): @@ -153,13 +159,12 @@ class LSHRandomProjections(LSHBase): """ # TODO: document more - def __init__(self, mentions, shingle_size, signature_size, band_length, seed=3): + def __init__(self, mentions, shingle_size, n_bands, band_length, seed=3): # sparse_binary: should the sparse 0/1 matrix be stored with scipy sparse? takes less memory. 
super().__init__(mentions, shingle_size) - if signature_size % band_length != 0: - raise ValueError("Signature needs to be divisible into equal-sized bands.") - self.signature_size = signature_size + self.n_bands = n_bands self.band_length = band_length + self.signature_size = n_bands * band_length self.rng = np.random.default_rng(seed=seed) def make_signature(self): @@ -192,14 +197,14 @@ def get_candidates(self): :return: list of sets of candidate indices. """ logging.debug("getting candidates...") - n_bands = int(self.signature_size / self.band_length) + # n_bands = int(self.signature_size / self.band_length) if self.vectors.shape[0] == 1: candidates = [set()] candidates[0].add(0) else: candidates = [set() for _ in range(self.vectors.shape[0])] - bands = signature_to_3d_bands(self.signature, n_bands=n_bands, band_length=self.band_length) + bands = signature_to_3d_bands(self.signature, n_bands=self.n_bands, band_length=self.band_length) buckets_by_band = group_unique_indices(bands) groups = [tuple(i) for i in itertools.chain.from_iterable(buckets_by_band)] # flatten group; use tuple for applying set() groups = set(groups) # we only need the unique groups diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index 4825ba6..670795e 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -131,7 +131,13 @@ def with_coref(self, dataset, search_corefs_in="all"): # TODO: need to update th else: if search_corefs_in == 'lsh': input_mentions = [m["mention"] for m in content] - lsh_corefs = LSHRandomProjections(mentions=input_mentions, shingle_size=2, signature_size=800, band_length=10) # TODO: set optimal parameters here + # lsh_corefs = LSHRandomProjections(mentions=input_mentions, shingle_size=2, signature_size=800, band_length=10) # TODO: set optimal parameters here + lsh_corefs = LSHRandomProjections( + mentions=input_mentions, + shingle_size=2, + n_bands=80, + band_length=10 + ) lsh_corefs.cluster() assert len(content) == len(lsh_corefs.candidates) # lsh_corefs.candidates are the input for below. indices refer to index in input_mentions From ff6778c7ec57eae65ae6aeb2cef3fe0a330ad37d Mon Sep 17 00:00:00 2001 From: f-hafner Date: Tue, 24 Jan 2023 14:35:05 +0100 Subject: [PATCH 29/43] document the lsh class --- src/REL/lsh.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/REL/lsh.py b/src/REL/lsh.py index d1c05e2..a1d5fbb 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -11,12 +11,8 @@ import logging from sklearn.preprocessing import MultiLabelBinarizer import itertools -import pdb from scipy import sparse -# TODO: - # document the class? -- after swapping the arguments - def k_shingle(s, k): """ @@ -148,19 +144,15 @@ class LSHRandomProjections(LSHBase): """ Class for locality-sensitive hashing with random projections. - Parameters: ----------- - mentions: - shingle_size: - signature_size: - band_length: - seed: + mentions: list of strings. + shingle_size: length of the shingles to be constructed from each string in `mentions`. + n_bands, band_length: cut the hash signature into `n_bands` subsets of length `band_length`. + seed: random seed for np.random.default_rng """ - # TODO: document more def __init__(self, mentions, shingle_size, n_bands, band_length, seed=3): - # sparse_binary: should the sparse 0/1 matrix be stored with scipy sparse? takes less memory. 
super().__init__(mentions, shingle_size) self.n_bands = n_bands self.band_length = band_length From 2b6315a124c024a0064a10af2121a47c89139153 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Tue, 24 Jan 2023 14:42:12 +0100 Subject: [PATCH 30/43] update docstring for with_coref --- src/REL/training_datasets.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index 670795e..82046ab 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -2,7 +2,6 @@ import pickle import pdb from REL.lsh import LSHRandomProjections -import logging class TrainingEvaluationDatasets: """ @@ -115,13 +114,21 @@ def __find_coref(self, ment, mentlist): def with_coref(self, dataset, search_corefs_in="all"): # TODO: need to update the calls to with_coref """ - Check if there are coreferences in the given dataset. Use LSH for dimensionality reduction. + Check if there are coreferences in the given dataset, and replace + the candidate entity of a coreferring mention with the candidates from the main mention. + Example: If a document contains both "Jimi Hendrix" and "Hendrix" as a mention, + then the candidate entities of "Hendrix" will be replaced by the candidate + entities of "Jimi Hendrix". + + Parameters: + ----------- search_corefs_in: either of 'lsh' or all 'all'. - If 'all', search for coreferences among all mentions in document. This is what REL currently does by default. - If 'lsh', search for coreferences among a pre-selected set of candidates. The set is calculated with LSH. + If 'all', search for coreferences among all mentions in document + If 'lsh', search for coreferences among a pre-selected set of candidates. + The set is calculated with LSH. - :return: dataset + :return: dataset with updated candidate entities and p(e|m) scores. """ print(f"with_coref() is called with search_corefs_in={search_corefs_in}.") assert search_corefs_in in ['lsh', 'all'] @@ -131,7 +138,7 @@ def with_coref(self, dataset, search_corefs_in="all"): # TODO: need to update th else: if search_corefs_in == 'lsh': input_mentions = [m["mention"] for m in content] - # lsh_corefs = LSHRandomProjections(mentions=input_mentions, shingle_size=2, signature_size=800, band_length=10) # TODO: set optimal parameters here + # TODO: set optimal parameters here lsh_corefs = LSHRandomProjections( mentions=input_mentions, shingle_size=2, From 86f89c26c7fa4c49b4695e567f61117f10ac408c Mon Sep 17 00:00:00 2001 From: f-hafner Date: Tue, 24 Jan 2023 15:16:55 +0100 Subject: [PATCH 31/43] small fixes to lsh and training_datasets --- src/REL/lsh.py | 2 +- src/REL/training_datasets.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/REL/lsh.py b/src/REL/lsh.py index a1d5fbb..e660519 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -1,5 +1,5 @@ """ -This implements a simple version of locality-sensitive hashing. +Implement a simple version of locality-sensitive hashing. The main reference is chapter 3 in "Mining of Massive Datasets" (http://www.mmds.org/). 
To allow for high-dimensional data, it stores the feature vectors as sparse matrices, diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index 82046ab..ee6eed2 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -1,6 +1,5 @@ import os import pickle -import pdb from REL.lsh import LSHRandomProjections class TrainingEvaluationDatasets: From 7586fe6a2984880ddbc2e9bdf8d1d43b9a0566f7 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Tue, 24 Jan 2023 15:20:44 +0100 Subject: [PATCH 32/43] tidy efficiency_test --- scripts/efficiency_test.py | 54 ++++---------------------------------- 1 file changed, 5 insertions(+), 49 deletions(-) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index 36b7007..19477ee 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -9,11 +9,7 @@ np.random.seed(seed=42) parser = argparse.ArgumentParser() -# parser.add_argument( -# "--no_corefs", -# action="store_true", -# help="use function with_coref()?", -# default=False) + parser.add_argument( '--search_corefs', type=str, @@ -21,7 +17,6 @@ default='all', help="Setting for search_corefs in Entity Disambiguation." ) - parser.add_argument( "--profile", action="store_true", @@ -73,11 +68,6 @@ def profile_to_df(call): return df - -# TODO: -# make log files!? -# adjust folder structure on computer and in script - args = parser.parse_args() print(f"args.search_corefs is {args.search_corefs}") @@ -93,7 +83,6 @@ def profile_to_df(call): datasets = TrainingEvaluationDatasets(base_url, wiki_version, args.search_corefs).load()[args.name_dataset] save_data_to = f"{base_url}/efficiency_test/" # save all recorded in this directory -# random_docs = np.random.choice(list(datasets.keys()), 50) server = False docs = {} @@ -150,9 +139,7 @@ def profile_to_df(call): tagger_ner = SequenceTagger.load("ner-fast") start = time() - mentions_dataset, n_mentions = mention_detection.find_mentions(docs, tagger_ner) # TODO: here corefs have an impact! check how. - # but what we do in the mention detection here has no impact on what we below in ED. - # so would we expect an effect here, or only below? + mentions_dataset, n_mentions = mention_detection.find_mentions(docs, tagger_ner) print("MD took: {}".format(time() - start)) # 3. Load model. @@ -161,10 +148,6 @@ def profile_to_df(call): "model_path": "{}/{}/generated/model".format(base_url, wiki_version), } model = EntityDisambiguation(base_url, wiki_version, config, search_corefs=args.search_corefs) - # model.coref is a training data set - # model.coref has method with_coref - # compare the training data sets when using corefs and when not - # note that the data are loaded elsewhere! so not sure this is the right place to add the option? # 4. Entity disambiguation. 
start = time() @@ -178,42 +161,18 @@ def profile_to_df(call): } filename = f"{save_data_to}predictions/{args.name_dataset}_{args.n_docs}_{args.search_corefs}" - # if args.no_corefs: - # filename = f"{filename}_nocoref" with open(f"{filename}.pickle", "wb") as f: pickle.dump(output, f, protocol=pickle.HIGHEST_PROTOCOL) - # ## 4.b Profile disambiguation + # ## 4.b Profile the disambiguation part if args.profile: print("Profiling disambiguation") filename = f"{save_data_to}profile/{args.name_dataset}_{args.n_docs}_{args.search_corefs}" - # if args.no_corefs: - # filename = f"{filename}_nocoref" df_stats = profile_to_df(call="model.predict(mentions_dataset)") - # cProfile.run("model.predict(mentions_dataset)", filename="temp.txt") - # st = pstats.Stats("temp.txt") - - # keys_from_k = ['file', 'line', 'fn'] - # keys_from_v = ['cc', 'ncalls', 'tottime', 'cumtime', 'callers'] - # data = {k: [] for k in keys_from_k + keys_from_v} - - # s = st.stats - - # for k in s.keys(): - # for i, kk in enumerate(keys_from_k): - # data[kk].append(k[i]) - - # for i, kk in enumerate(keys_from_v): - # data[kk].append(s[k][i]) - - # df_stats = pd.DataFrame(data) - # os.remove('temp.txt') - df_stats.to_csv(f"{filename}.csv", index=False) - # ## 4.c time disambiguation by document, vary number of mentions if args.scale_mentions: print("Scaling the mentions per document") @@ -245,13 +204,10 @@ def profile_to_df(call): print("Profiling disambiguation for synthetic data set") df_profile = profile_to_df(call="model.predict(tempdict)") timing_by_dataset[name]['profile'] = df_profile - - # save timing by dataet + # save timing by dataset filename = f"{save_data_to}n_mentions_time/{args.name_dataset}_{args.search_corefs}" - # if args.no_corefs: - # filename = f"{filename}_nocoref" - + with open(f"{filename}.pickle", "wb") as f: pickle.dump(timing_by_dataset, f, protocol=pickle.HIGHEST_PROTOCOL) From d218d543b192c83c7d43f300fc93c2eb30b3ab93 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Tue, 24 Jan 2023 16:49:01 +0100 Subject: [PATCH 33/43] set lsh params according to validation data --- src/REL/training_datasets.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index ee6eed2..fa9c1a4 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -137,12 +137,11 @@ def with_coref(self, dataset, search_corefs_in="all"): # TODO: need to update th else: if search_corefs_in == 'lsh': input_mentions = [m["mention"] for m in content] - # TODO: set optimal parameters here lsh_corefs = LSHRandomProjections( mentions=input_mentions, shingle_size=2, - n_bands=80, - band_length=10 + n_bands=400, + band_length=15 ) lsh_corefs.cluster() assert len(content) == len(lsh_corefs.candidates) From 94146a76ead567fa83e24ba2fa98c7c35e699ede Mon Sep 17 00:00:00 2001 From: f-hafner Date: Wed, 25 Jan 2023 09:11:23 +0100 Subject: [PATCH 34/43] update docstrings; optimize lsh parameters --- src/REL/lsh.py | 25 +++++++++++++++++++------ src/REL/training_datasets.py | 8 +++++--- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/src/REL/lsh.py b/src/REL/lsh.py index e660519..179bf98 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -1,9 +1,12 @@ """ Implement a simple version of locality-sensitive hashing. The main reference is chapter 3 in "Mining of Massive Datasets" (http://www.mmds.org/). +The time complexity is explained at the end of this video: https://www.youtube.com/watch?v=Arni-zkqMBA +(number of hyperplanes = band length). 
+The video does not talk about amplification; see the book for this. -To allow for high-dimensional data, it stores the feature vectors as sparse matrices, -and uses random projections as hash functions. +To deal with high-dimensional data (=many mentions), the clas stores the feature vectors +as sparse matrices and uses random projections as hash functions. """ import time @@ -12,6 +15,7 @@ from sklearn.preprocessing import MultiLabelBinarizer import itertools from scipy import sparse +import math def k_shingle(s, k): @@ -146,16 +150,25 @@ class LSHRandomProjections(LSHBase): Parameters: ----------- - mentions: list of strings. + mentions: list of strings (mentions). + shingle_size: length of the shingles to be constructed from each string in `mentions`. - n_bands, band_length: cut the hash signature into `n_bands` subsets of length `band_length`. + + n_bands, band_length: the signature of a mention will be n_bands*band_length. + Longer bands increase precision, more bands increase recall. + If band_length is `None`, it is set as log(len(mentions)), which + will guarantee O(log(N)) time complexity. + seed: random seed for np.random.default_rng """ - def __init__(self, mentions, shingle_size, n_bands, band_length, seed=3): + def __init__(self, mentions, shingle_size, n_bands, band_length=None, seed=3): super().__init__(mentions, shingle_size) self.n_bands = n_bands - self.band_length = band_length + if band_length is None: + self.band_length = math.ceil(math.log(len(mentions))) # for O(log(N)) complexity + else: + self.band_length = band_length self.signature_size = n_bands * band_length self.rng = np.random.default_rng(seed=seed) diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index fa9c1a4..25e87af 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -1,6 +1,7 @@ import os import pickle from REL.lsh import LSHRandomProjections +import math class TrainingEvaluationDatasets: """ @@ -137,11 +138,12 @@ def with_coref(self, dataset, search_corefs_in="all"): # TODO: need to update th else: if search_corefs_in == 'lsh': input_mentions = [m["mention"] for m in content] + band_length = math.ceil(math.log(len(input_mentions))) lsh_corefs = LSHRandomProjections( mentions=input_mentions, - shingle_size=2, - n_bands=400, - band_length=15 + shingle_size=2, # best recall: 2. acceptable: 2 + n_bands=15, # best recall: 400. acceptable: 200. + band_length=band_length # best recall: 15. acceptable: 15 ) lsh_corefs.cluster() assert len(content) == len(lsh_corefs.candidates) From 575cb1d34409c09b613d872a620f13937a6322d8 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Wed, 25 Jan 2023 11:12:27 +0100 Subject: [PATCH 35/43] small changes in lsh.py --- src/REL/lsh.py | 13 +++++++++---- src/REL/training_datasets.py | 2 ++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/REL/lsh.py b/src/REL/lsh.py index 179bf98..ac2a246 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -17,6 +17,7 @@ from scipy import sparse import math +# First, define a bunch of functions. TODO: should they be defined elsewhere? utils? 
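As a brief illustrative aside (not part of the patch), the shingling step that these functions build on turns a mention into overlapping character n-grams:

    from REL.lsh import k_shingle

    k_shingle("Hendrix", 2)   # ['He', 'en', 'nd', 'dr', 'ri', 'ix']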
def k_shingle(s, k): """ @@ -84,7 +85,6 @@ def signature_to_3d_bands(a, n_bands, band_length): return result -# this replaces idx_multidim def group_unique_indices(a): """ In a 3-dimensional array, for each array (axis 0), @@ -113,6 +113,8 @@ def group_unique_indices(a): return unq_idx +# ## Here follow the classes + class LSHBase: """ Base class for locality-sensitive hashing, @@ -125,6 +127,10 @@ def __init__(self, mentions, shingle_size): elif isinstance(mentions, list): self.shingles = [k_shingle(m, shingle_size) for m in mentions] + def __repr__(self): + #return f"{type(self).__name__}({self.shingles})" + pass + def _build_vocab(self): """ Build a vocabulary of the shingles in a document. @@ -150,7 +156,7 @@ class LSHRandomProjections(LSHBase): Parameters: ----------- - mentions: list of strings (mentions). + mentions: list or dict of mentions. shingle_size: length of the shingles to be constructed from each string in `mentions`. @@ -251,8 +257,7 @@ def summarise(self): def efficiency_gain_comparisons(self): """ Compare number of comparisons made for coreference search with option "lsh" and option "all". - Useful for understanding time complexity. - And to assess whether number of comparisons is meaningfully reduced + Useful for understanding time complexity, and to assess whether number of comparisons is meaningfully reduced """ sizes = [len(g) for g in self.candidates] runtime_all = len(self.candidates)*len(self.candidates) diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index 25e87af..0813be2 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -2,6 +2,7 @@ import pickle from REL.lsh import LSHRandomProjections import math +import pdb class TrainingEvaluationDatasets: """ @@ -145,6 +146,7 @@ def with_coref(self, dataset, search_corefs_in="all"): # TODO: need to update th n_bands=15, # best recall: 400. acceptable: 200. band_length=band_length # best recall: 15. acceptable: 15 ) + pdb.set_trace() lsh_corefs.cluster() assert len(content) == len(lsh_corefs.candidates) # lsh_corefs.candidates are the input for below. 
indices refer to index in input_mentions From eb65beee8c56f877ee06cc5bb0331603b09b7f31 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Wed, 25 Jan 2023 11:58:46 +0100 Subject: [PATCH 36/43] add __repr__ to lsh --- src/REL/lsh.py | 14 +++++++++++--- src/REL/training_datasets.py | 2 -- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/REL/lsh.py b/src/REL/lsh.py index ac2a246..e9657d4 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -122,14 +122,20 @@ class LSHBase: """ # Important: order of occurences in shingles and vectors = order of input list (=order of occurrence in document) def __init__(self, mentions, shingle_size): + self.shingle_size = shingle_size if isinstance(mentions, dict): self.shingles = [k_shingle(m, shingle_size) for m in mentions.values()] elif isinstance(mentions, list): self.shingles = [k_shingle(m, shingle_size) for m in mentions] + self._rep_items_not_show = ["shingles"] def __repr__(self): - #return f"{type(self).__name__}({self.shingles})" - pass + items_dict_show = {k: v for k, v in self.__dict__.items() + if k not in self._rep_items_not_show + and k[0] != "_" + } + items_dict_show = [f"{k}={v}" for k, v in items_dict_show.items()] + return f"<{type(self).__name__}() with {', '.join(items_dict_show)}>" def _build_vocab(self): """ @@ -170,13 +176,15 @@ class LSHRandomProjections(LSHBase): def __init__(self, mentions, shingle_size, n_bands, band_length=None, seed=3): super().__init__(mentions, shingle_size) + self.seed = seed self.n_bands = n_bands if band_length is None: self.band_length = math.ceil(math.log(len(mentions))) # for O(log(N)) complexity else: self.band_length = band_length self.signature_size = n_bands * band_length - self.rng = np.random.default_rng(seed=seed) + self.rng = np.random.default_rng(seed=self.seed) + self._rep_items_not_show.extend(["signature_size", "rng"]) def make_signature(self): """ diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index 0813be2..25e87af 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -2,7 +2,6 @@ import pickle from REL.lsh import LSHRandomProjections import math -import pdb class TrainingEvaluationDatasets: """ @@ -146,7 +145,6 @@ def with_coref(self, dataset, search_corefs_in="all"): # TODO: need to update th n_bands=15, # best recall: 400. acceptable: 200. band_length=band_length # best recall: 15. acceptable: 15 ) - pdb.set_trace() lsh_corefs.cluster() assert len(content) == len(lsh_corefs.candidates) # lsh_corefs.candidates are the input for below. indices refer to index in input_mentions From 6907ecaaa4c8a84dc8302ce3d10504fb90dec898 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Wed, 25 Jan 2023 13:57:36 +0100 Subject: [PATCH 37/43] improve docstrings, reorder imports --- src/REL/lsh.py | 208 +++++++++++++++++++++++------------ src/REL/training_datasets.py | 3 +- 2 files changed, 139 insertions(+), 72 deletions(-) diff --git a/src/REL/lsh.py b/src/REL/lsh.py index e9657d4..110375c 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -1,30 +1,26 @@ -""" -Implement a simple version of locality-sensitive hashing. -The main reference is chapter 3 in "Mining of Massive Datasets" (http://www.mmds.org/). -The time complexity is explained at the end of this video: https://www.youtube.com/watch?v=Arni-zkqMBA -(number of hyperplanes = band length). -The video does not talk about amplification; see the book for this. +"""Implement a simple version of locality-sensitive hashing. 
To deal with high-dimensional data (=many mentions), the clas stores the feature vectors as sparse matrices and uses random projections as hash functions. + +See chapter 3 in "Mining of Massive Datasets" (http://www.mmds.org/). +The time complexity is explained at the end of this video: https://www.youtube.com/watch?v=Arni-zkqMBA +(number of hyperplanes = band length). +The video does not talk about amplification; see the book for this. """ -import time -import numpy as np -import logging -from sklearn.preprocessing import MultiLabelBinarizer import itertools -from scipy import sparse +import logging import math +import numpy as np +from scipy import sparse +from sklearn.preprocessing import MultiLabelBinarizer +import time # First, define a bunch of functions. TODO: should they be defined elsewhere? utils? def k_shingle(s, k): - """ - Convert string s into shingles of length k - - :return: List of shingles - """ + "Convert string s into shingles of length k" shingle = [] for i in range(len(s) - k + 1): shingle.append(s[i:(i+k)]) @@ -32,19 +28,22 @@ def k_shingle(s, k): def cols_to_int_multidim(a): - """ - Combine columns in all rows to an integer: [[1,20,3], [1,4,10]] becomes [1203,1410]. - - :return: An array of shape (n, 1), where the horizontally neighboring column values - are appended together. + """Combine columns in all rows to an integer + + For instance, [[1,20,3], [1,4,10]] becomes [1203,1410]. Notes - ------ - Advantage: uses vectorized numpy to create a unique signature. - Disadvantage: Because one additional row increases the size of the integer at least by an order of magnitude, - this only works for cases where the bands are not too large. - - In practice, optimal bands are typically not long enough to cause problems. + ----- + The addvantage is that uses vectorized numpy to create a unique signature. + The disadvantage is that because one additional row increases the size of the integer at least + by an order of magnitude, this only works for cases where the bands are not too large. + But in practice, optimal bands are typically not long enough to cause problems. + + :param a: 2-dimensional array + :type a: np.ndarray + :returns: An array of shape (n, 1), where the horizontally neighboring column values + are appended together. + :rtype: np.ndarray """ existing_powers = np.floor(np.log10(a)) n_bands, nrows, ncols = a.shape @@ -61,18 +60,26 @@ def cols_to_int_multidim(a): return out def signature_to_3d_bands(a, n_bands, band_length): - """ + """Convert a signature from 2d to 3d + Convert a signature array of dimension (n_items, signature_length) into an array of (n_bands, n_items, band_length). - :return: An array of shape (n_bands, n_items, band_length) - - Details: - -------- + Notes + ----- This produces the same output as np.vstack(np.split(a, indices_or_sections=n_bands, axis=1)). When further processing the output, this is a useful alternative to looping on the output of np.split(a, indices_or_sections=n_bands, axis=1) because a single vectorized call can be used, while np.vstack(np.split(...)) is likely to be less efficient. 
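    A small illustrative example of the reshaping described above (editorial; it mirrors the check used in tests/test_lsh.py):

        import numpy as np

        a = np.array([[1, 2, 3, 4],
                      [5, 6, 7, 8]])               # 2 mentions, signature length 4
        np.vstack(np.split(a, 2, axis=1)).reshape(2, 2, -1)
        # array([[[1, 2],
        #         [5, 6]],
        #        [[3, 4],
        #         [7, 8]]])  -> band 0 holds columns 0-1, band 1 holds columns 2-3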
+ + :param a: Array with 2 dimensions + :type a: np.ndarray + :param n_bands: Number of bands the columns to cut into + :type n_bands: int + :param band_length: Length of each band + :type band_length: int + :returns: Array of shape (n_bands, n_items, band_length) + :rtype: np.ndarray """ n_items, signature_length = a.shape @@ -86,14 +93,18 @@ def signature_to_3d_bands(a, n_bands, band_length): return result def group_unique_indices(a): - """ - In a 3-dimensional array, for each array (axis 0), - calculate the indices of rows (axis=1) that are identical. + """Compute indices of matching rows - :return: a list of lists. Outer lists correspond to bands. - Inner lists correspond to the row indices that - have the same values in their columns. An item - in the inner list is an np.array. + In a 3-dimensional array, for each array (axis 0), + compute the indices of rows (axis=1) that are identical. + + :param a: 3-dimensional array + :type a: np.ndarray + :returns: List of lists. Outer lists correspond to bands. + Inner lists correspond to the row indices that + have the same values in their columns. An item + in the inner list is an np.array. + :rtype: list """ n_bands, n_items, length_band = a.shape a = cols_to_int_multidim(a).squeeze() @@ -117,11 +128,31 @@ def group_unique_indices(a): class LSHBase: """ - Base class for locality-sensitive hashing, - with methods for one-hot encoding and building a vocabulary of shingles + Base class for locality-sensitive hashing. + + Attributes + ---------- + shingle_size + Size of shingles to be created from mentions + mentions + Mentions in which to search for similar items + + Methods + ------- + encode_binary() + One-hot encode mentions, based on shingles """ # Important: order of occurences in shingles and vectors = order of input list (=order of occurrence in document) def __init__(self, mentions, shingle_size): + """ + + Parameters + ---------- + :param mentions: Mentions in which to search for similar items + :type mentions: list or dict + :param shingle_size: Length of substrings to be created from mentions ("shingles") + :type shingle_size: int + """ self.shingle_size = shingle_size if isinstance(mentions, dict): self.shingles = [k_shingle(m, shingle_size) for m in mentions.values()] @@ -138,18 +169,17 @@ def __repr__(self): return f"<{type(self).__name__}() with {', '.join(items_dict_show)}>" def _build_vocab(self): - """ - Build a vocabulary of the shingles in a document. - """ + "Make vocabulary of unique shingles in all mentions" vocab = list(set([shingle for sublist in self.shingles for shingle in sublist])) self.vocab = vocab def encode_binary(self): - """ - Create sparse binary vectors for each mention. + """Create sparse binary vectors for each mention. - :return: CSR sparse matrix. Rows indicate mentions, columns indicate whether - the mention contains the shingle. + :return: Indicator matrix. + Rows indicate mentions, columns indicate whether + the mention contains the shingle. + :rtype: scipy.sparse.csr_matrix """ logging.debug("making one-hot vectors") binarizer = MultiLabelBinarizer(sparse_output=True) @@ -160,21 +190,52 @@ class LSHRandomProjections(LSHBase): """ Class for locality-sensitive hashing with random projections. - Parameters: + Attributes ----------- - mentions: list or dict of mentions. - - shingle_size: length of the shingles to be constructed from each string in `mentions`. - - n_bands, band_length: the signature of a mention will be n_bands*band_length. 
+ mentions + List or dict of mentions + shingle_size + Length of the shingles to be constructed from each string in `mentions` + n_bands, band_length + The signature of a mention will be n_bands*band_length. Longer bands increase precision, more bands increase recall. If band_length is `None`, it is set as log(len(mentions)), which will guarantee O(log(N)) time complexity. + seed + random seed for np.random.default_rng - seed: random seed for np.random.default_rng + Methods + -------- + make_signature() + Create a dense signature vector with random projections. + get_candidates() + Find groups of mentions overlapping signatures. + cluster() + End-to-end hashing from shingles to clusters. + This is the main functionality of the class. + summarise() + Summarise time and output of cluster() + efficiency_gain_comparisons() + Compare number of computations for coreference search with hashing + and without hashing """ def __init__(self, mentions, shingle_size, n_bands, band_length=None, seed=3): + """ + + Parameters + ---------- + :param mentions: Mentions in which to search for similar items + :type mentions: list or dict + :param shingle_size: Length of substrings to be created from mentions ("shingles") + :type shingle_size: int + :param n_bands: Number of signature bands (equal-sized cuts of the full signature) + :type n_bands: int + :param band_length: Length of bands + :type band_length: int or None + :seed: Random seed for random number generator from numpy + :type seed: int + """ super().__init__(mentions, shingle_size) self.seed = seed self.n_bands = n_bands @@ -187,33 +248,35 @@ def __init__(self, mentions, shingle_size, n_bands, band_length=None, seed=3): self._rep_items_not_show.extend(["signature_size", "rng"]) def make_signature(self): - """ - Create a signature for a given mention, using random projections. - """ + "Create a matrix of signatures with random projections" logging.debug(f"Making signature. vectors shape is {self.vectors.shape}") # TODO: can this be more memory-efficient by generating directly the scipy sparse function? # https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.random.html - hyperplanes = self.rng.choice([-1, 1], (self.signature_size, self.vectors.shape[1])) - hyperplanes = sparse.csr_matrix(hyperplanes) + n_rows = self.signature_size + n_cols = self.vectors.shape[1] + hyperplanes = sparse.csr_matrix( + self.rng.choice([-1, 1], (n_rows, n_cols)) + ) products = self.vectors.dot(hyperplanes.transpose()).toarray() sign = 1 + (products > 0) self.signature = sign - def all_candidates_to_all(self): - """ - Fall-back option to return the non-clustered input. - Each mention is a candidate coreference for all mentions. This is useful in - edge cases where no single mention is longer than the shingle size. + def _all_candidates_to_all(self): + """Assign all mentions as candidates to all other mentions. + For edge cases where no single mention is longer than the shingle size """ n_mentions = self.vectors.shape[0] self.candidates = [set(range(n_mentions)) for _ in range(n_mentions)] def get_candidates(self): - """ + """Extract most similar mentions from signature + For each mention, extract most similar mentions based on whether part of their signatures overlap. - :return: list of sets of candidate indices. + :return: Index of mentions that are similar to each other. + A list of the candidate set of similar mentions. 
+ :rtype: list """ logging.debug("getting candidates...") # n_bands = int(self.signature_size / self.band_length) @@ -236,10 +299,13 @@ def get_candidates(self): self.candidates = candidates def cluster(self): - """ - Main functionality of this class: cluster mentions together based on their similarity. + """End-to-end locality-sensitive hashing + + Cluster mentions together based on their similarity. - :return: for each mention, mention index of most similar other mentions based on LSH. + :return: Index of mentions that are similar to each other. + A list of the candidate set of similar mentions. + :rtype: list """ start = time.time() logging.debug("building vocabulary") @@ -249,7 +315,7 @@ def cluster(self): logging.debug("making signature") if self.vectors.shape[1] == 0: # no signature possible b/c no mention is longer than the shingle size. logging.debug('self.vectors.shape[1] is 0.') - self.all_candidates_to_all() + self._all_candidates_to_all() else: self.make_signature() logging.debug("getting candidate groups") diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index 25e87af..19898fe 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -1,8 +1,9 @@ import os import pickle -from REL.lsh import LSHRandomProjections import math +from REL.lsh import LSHRandomProjections + class TrainingEvaluationDatasets: """ Class responsible for loading training/evaluation datasets for local ED. From f39fa94fb7d12569119e7d9d0b75ea99e1521131 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Wed, 25 Jan 2023 14:39:23 +0100 Subject: [PATCH 38/43] further tidy efficiency tests --- scripts/efficiency_test.py | 61 +++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index 19477ee..ee536d2 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -1,15 +1,38 @@ -import numpy as np import requests import argparse import pickle import logging +import cProfile +import pandas as pd +import pstats +import os from REL.training_datasets import TrainingEvaluationDatasets -np.random.seed(seed=42) +def profile_to_df(call): + "Helper function to profile a function call and save the timing in a pd df" + cProfile.run(call, filename="temp.txt") + st = pstats.Stats("temp.txt") -parser = argparse.ArgumentParser() + keys_from_k = ['file', 'line', 'fn'] + keys_from_v = ['cc', 'ncalls', 'tottime', 'cumtime', 'callers'] + data = {k: [] for k in keys_from_k + keys_from_v} + + s = st.stats + for k in s.keys(): + for i, kk in enumerate(keys_from_k): + data[kk].append(k[i]) + + for i, kk in enumerate(keys_from_v): + data[kk].append(s[k][i]) + + df = pd.DataFrame(data) + os.remove('temp.txt') + return df + + +parser = argparse.ArgumentParser() parser.add_argument( '--search_corefs', type=str, @@ -44,44 +67,14 @@ ) logging.basicConfig(level=logging.INFO) # do not print to file - -# helper function to profile a call and save the timing in a pd dataframe -def profile_to_df(call): - cProfile.run(call, filename="temp.txt") - st = pstats.Stats("temp.txt") - - keys_from_k = ['file', 'line', 'fn'] - keys_from_v = ['cc', 'ncalls', 'tottime', 'cumtime', 'callers'] - data = {k: [] for k in keys_from_k + keys_from_v} - - s = st.stats - - for k in s.keys(): - for i, kk in enumerate(keys_from_k): - data[kk].append(k[i]) - - for i, kk in enumerate(keys_from_v): - data[kk].append(s[k][i]) - - df = pd.DataFrame(data) - os.remove('temp.txt') - return df - - args = parser.parse_args() 
print(f"args.search_corefs is {args.search_corefs}") -if args.profile: - import cProfile - import pandas as pd - import pstats - import os - base_url = "/home/flavio/projects/rel20/data" wiki_version = "wiki_2019" datasets = TrainingEvaluationDatasets(base_url, wiki_version, args.search_corefs).load()[args.name_dataset] -save_data_to = f"{base_url}/efficiency_test/" # save all recorded in this directory +save_data_to = f"{base_url}/efficiency_test/" # save all recorded data in this directory server = False From 20785e426c5bda0c0cdbe2aaa27ed0347999f42a Mon Sep 17 00:00:00 2001 From: f-hafner Date: Wed, 25 Jan 2023 15:10:26 +0100 Subject: [PATCH 39/43] tidy docstring, add test for short mentions --- scripts/efficiency_test.py | 11 ++++++---- src/REL/lsh.py | 42 ++++++++++++++++++------------------ src/REL/training_datasets.py | 23 ++++++++++++-------- tests/test_lsh.py | 16 ++++++++++---- 4 files changed, 54 insertions(+), 38 deletions(-) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index ee536d2..0df5724 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -1,14 +1,17 @@ -import requests import argparse -import pickle -import logging import cProfile +import logging +import numpy as np +import os +import pickle import pandas as pd import pstats -import os +import requests from REL.training_datasets import TrainingEvaluationDatasets +np.random.seed(seed=42) + def profile_to_df(call): "Helper function to profile a function call and save the timing in a pd df" cProfile.run(call, filename="temp.txt") diff --git a/src/REL/lsh.py b/src/REL/lsh.py index 110375c..1bc6c3a 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -20,7 +20,7 @@ # First, define a bunch of functions. TODO: should they be defined elsewhere? utils? def k_shingle(s, k): - "Convert string s into shingles of length k" + "Convert string s into shingles of length k." shingle = [] for i in range(len(s) - k + 1): shingle.append(s[i:(i+k)]) @@ -28,16 +28,17 @@ def k_shingle(s, k): def cols_to_int_multidim(a): - """Combine columns in all rows to an integer + """Combine columns in all rows to an integer. For instance, [[1,20,3], [1,4,10]] becomes [1203,1410]. Notes ----- - The addvantage is that uses vectorized numpy to create a unique signature. - The disadvantage is that because one additional row increases the size of the integer at least - by an order of magnitude, this only works for cases where the bands are not too large. - But in practice, optimal bands are typically not long enough to cause problems. + The advantage is that it uses vectorized numpy to collapse an + entire row into one integer. The disadvantage is that because one additional row increases + the size of the integer at least by an order of magnitude, this only works for cases where + the bands are not too large. But in practice, optimal bands are typically not long enough + to cause problems. :param a: 2-dimensional array :type a: np.ndarray @@ -60,7 +61,7 @@ def cols_to_int_multidim(a): return out def signature_to_3d_bands(a, n_bands, band_length): - """Convert a signature from 2d to 3d + """Convert a signature from 2d to 3d. Convert a signature array of dimension (n_items, signature_length) into an array of (n_bands, n_items, band_length). @@ -93,7 +94,7 @@ def signature_to_3d_bands(a, n_bands, band_length): return result def group_unique_indices(a): - """Compute indices of matching rows + """Compute indices of matching rows. 
In a 3-dimensional array, for each array (axis 0), compute the indices of rows (axis=1) that are identical. @@ -187,8 +188,7 @@ def encode_binary(self): class LSHRandomProjections(LSHBase): - """ - Class for locality-sensitive hashing with random projections. + """Class for locality-sensitive hashing with random projections. Attributes ----------- @@ -202,7 +202,7 @@ class LSHRandomProjections(LSHBase): If band_length is `None`, it is set as log(len(mentions)), which will guarantee O(log(N)) time complexity. seed - random seed for np.random.default_rng + Random seed for np.random.default_rng Methods -------- @@ -217,7 +217,7 @@ class LSHRandomProjections(LSHBase): Summarise time and output of cluster() efficiency_gain_comparisons() Compare number of computations for coreference search with hashing - and without hashing + and without hashing. """ def __init__(self, mentions, shingle_size, n_bands, band_length=None, seed=3): @@ -240,18 +240,17 @@ def __init__(self, mentions, shingle_size, n_bands, band_length=None, seed=3): self.seed = seed self.n_bands = n_bands if band_length is None: - self.band_length = math.ceil(math.log(len(mentions))) # for O(log(N)) complexity + log_n_mentions = math.ceil(math.log(len(mentions))) # for O(log(N)) complexity + self.band_length = max(1, log_n_mentions) else: self.band_length = band_length - self.signature_size = n_bands * band_length + self.signature_size = n_bands * self.band_length self.rng = np.random.default_rng(seed=self.seed) self._rep_items_not_show.extend(["signature_size", "rng"]) def make_signature(self): - "Create a matrix of signatures with random projections" + "Create a matrix of signatures with random projections." logging.debug(f"Making signature. vectors shape is {self.vectors.shape}") - # TODO: can this be more memory-efficient by generating directly the scipy sparse function? - # https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.random.html n_rows = self.signature_size n_cols = self.vectors.shape[1] hyperplanes = sparse.csr_matrix( @@ -269,7 +268,7 @@ def _all_candidates_to_all(self): self.candidates = [set(range(n_mentions)) for _ in range(n_mentions)] def get_candidates(self): - """Extract most similar mentions from signature + """Extract most similar mentions from signature. For each mention, extract most similar mentions based on whether part of their signatures overlap. @@ -299,7 +298,7 @@ def get_candidates(self): self.candidates = candidates def cluster(self): - """End-to-end locality-sensitive hashing + """End-to-end locality-sensitive hashing. Cluster mentions together based on their similarity. @@ -330,8 +329,9 @@ def summarise(self): def efficiency_gain_comparisons(self): """ - Compare number of comparisons made for coreference search with option "lsh" and option "all". - Useful for understanding time complexity, and to assess whether number of comparisons is meaningfully reduced + Compare number of comparisons made for coreference search with option + "lsh" and option "all". Useful for understanding time complexity, + and to assess whether number of comparisons is meaningfully reduced. 
""" sizes = [len(g) for g in self.candidates] runtime_all = len(self.candidates)*len(self.candidates) diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index 19898fe..0891626 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -113,22 +113,27 @@ def __find_coref(self, ment, mentlist): return coref - def with_coref(self, dataset, search_corefs_in="all"): # TODO: need to update the calls to with_coref - """ + def with_coref(self, dataset, search_corefs_in="all"): + """Replace candidates of coreferring mentions with main mention. + Check if there are coreferences in the given dataset, and replace - the candidate entity of a coreferring mention with the candidates from the main mention. + the candidate entity of a coreferring mention with the candidates + from the main mention. - Example: If a document contains both "Jimi Hendrix" and "Hendrix" as a mention, + Example + ------- + If a document contains both "Jimi Hendrix" and "Hendrix" as a mention, then the candidate entities of "Hendrix" will be replaced by the candidate entities of "Jimi Hendrix". Parameters: ----------- - search_corefs_in: either of 'lsh' or all 'all'. - If 'all', search for coreferences among all mentions in document - If 'lsh', search for coreferences among a pre-selected set of candidates. - The set is calculated with LSH. - + :param search_corefs_in: in which set to search for coreferences. + Either of "lsh" or "all". + If 'all', search for coreferences among all mentions in document + If 'lsh', search for coreferences among a pre-selected set of candidates. + The set is calculated with LSH. + :type search_corefs_in: string. :return: dataset with updated candidate entities and p(e|m) scores. """ print(f"with_coref() is called with search_corefs_in={search_corefs_in}.") diff --git a/tests/test_lsh.py b/tests/test_lsh.py index ba22137..1aa7ed6 100644 --- a/tests/test_lsh.py +++ b/tests/test_lsh.py @@ -4,7 +4,6 @@ from pathlib import Path import REL.lsh as lsh -# from REL.lsh import vectorize_signature_bands, group_unique_indices, cols_to_int_multidim import numpy as np import itertools @@ -43,9 +42,7 @@ def test_signature_to_3d_bands(): result = lsh.signature_to_3d_bands(a, n_bands=n_bands, band_length=band_length) expected = np.vstack(np.split(a, n_bands, axis=1)).reshape(n_bands, n_items, -1) - assert np.all(result == expected), "signature bands not vectorized correctly" - - + assert np.all(result == expected), "signature not correctly converted to 3d bands" def test_group_unique_indices(): a = np.array([[[1, 4], [1, 4], [5,3], [5, 3], [1 , 2]], @@ -69,3 +66,14 @@ def test_group_unique_indices(): # test assert all([np.all(i==j) for i, j in zip(o, e)]), "unique indices not grouped correctly" +def test_cluster_short_mentions(): + mentions = ['EEC', 'ABC'] + max_length = max([len(m) for m in mentions]) + mylsh = lsh.LSHRandomProjections( + mentions=mentions, + shingle_size=max_length + 1, + n_bands=15) + mylsh.cluster() + expected = [set((0, 1)), set((0, 1))] + assert expected == mylsh.candidates, \ + "lsh fails when shingle size longer than longest input mentions" From 5e19915050df7e082051675f878036ab2db0464f Mon Sep 17 00:00:00 2001 From: f-hafner Date: Wed, 25 Jan 2023 16:44:31 +0100 Subject: [PATCH 40/43] some more comments, and reference online sources --- scripts/efficiency_test.py | 5 +++- src/REL/lsh.py | 50 ++++++++++++++++++-------------------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py 
index 0df5724..36891c9 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -13,7 +13,10 @@ np.random.seed(seed=42) def profile_to_df(call): - "Helper function to profile a function call and save the timing in a pd df" + """Helper function to profile a function call and save the timing in a pd df. + + Source: https://stackoverflow.com/questions/44302726/pandas-how-to-store-cprofile-output-in-a-pandas-dataframe + """ cProfile.run(call, filename="temp.txt") st = pstats.Stats("temp.txt") diff --git a/src/REL/lsh.py b/src/REL/lsh.py index 1bc6c3a..e159211 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -1,12 +1,13 @@ """Implement a simple version of locality-sensitive hashing. -To deal with high-dimensional data (=many mentions), the clas stores the feature vectors +To deal with high-dimensional data (=many mentions), the class stores the feature vectors as sparse matrices and uses random projections as hash functions. See chapter 3 in "Mining of Massive Datasets" (http://www.mmds.org/). The time complexity is explained at the end of this video: https://www.youtube.com/watch?v=Arni-zkqMBA -(number of hyperplanes = band length). -The video does not talk about amplification; see the book for this. +(number of hyperplanes = band length). +The use of multiple bands is called amplification, which is discussed in the book +but not in the video. """ import itertools @@ -17,7 +18,7 @@ from sklearn.preprocessing import MultiLabelBinarizer import time -# First, define a bunch of functions. TODO: should they be defined elsewhere? utils? +# First, define a bunch of functions. TODO: should they be defined elsewhere? put in utils? def k_shingle(s, k): "Convert string s into shingles of length k." @@ -35,8 +36,8 @@ def cols_to_int_multidim(a): Notes ----- The advantage is that it uses vectorized numpy to collapse an - entire row into one integer. The disadvantage is that because one additional row increases - the size of the integer at least by an order of magnitude, this only works for cases where + entire row into one integer. The disadvantage is that one additional row increases + the size of the integer at least by an order of magnitude, which only works for cases where the bands are not too large. But in practice, optimal bands are typically not long enough to cause problems. @@ -49,9 +50,9 @@ def cols_to_int_multidim(a): existing_powers = np.floor(np.log10(a)) n_bands, nrows, ncols = a.shape - # cumsum_powers = np.fliplr(np.cumsum(np.fliplr(existing_powers), axis=1)) + # sum existing powers from right to left cumsum_powers = np.flip(np.cumsum(np.flip(existing_powers, axis=2), axis=2), axis=2) - + add_powers = [x for x in reversed(range(ncols))] add_powers = np.tile(add_powers, (nrows, 1)) @@ -88,9 +89,8 @@ def signature_to_3d_bands(a, n_bands, band_length): stacked_bands = a.reshape(n_items*n_bands, band_length) # reorder so that the first band of all items comes first, then the second band of all items, etc. reordering_vector = np.arange(n_items*n_bands).reshape(n_items, n_bands).T.reshape(1, -1) - - result = stacked_bands[reordering_vector, :].reshape(n_bands, n_items, band_length) + result = stacked_bands[reordering_vector, :].reshape(n_bands, n_items, band_length) return result def group_unique_indices(a): @@ -98,6 +98,7 @@ def group_unique_indices(a): In a 3-dimensional array, for each array (axis 0), compute the indices of rows (axis=1) that are identical. 
+ Based on 1d-version here: https://stackoverflow.com/questions/23268605/grouping-indices-of-unique-elements-in-numpy :param a: 3-dimensional array :type a: np.ndarray @@ -141,7 +142,7 @@ class LSHBase: Methods ------- encode_binary() - One-hot encode mentions, based on shingles + One-hot encode mentions, based on shingles """ # Important: order of occurences in shingles and vectors = order of input list (=order of occurrence in document) def __init__(self, mentions, shingle_size): @@ -159,18 +160,19 @@ def __init__(self, mentions, shingle_size): self.shingles = [k_shingle(m, shingle_size) for m in mentions.values()] elif isinstance(mentions, list): self.shingles = [k_shingle(m, shingle_size) for m in mentions] - self._rep_items_not_show = ["shingles"] + self._rep_items_not_show = ["shingles"] # do not show in __repr__ b/c too long def __repr__(self): items_dict_show = {k: v for k, v in self.__dict__.items() if k not in self._rep_items_not_show - and k[0] != "_" + and k[0] != "_" # omit private attributes } items_dict_show = [f"{k}={v}" for k, v in items_dict_show.items()] return f"<{type(self).__name__}() with {', '.join(items_dict_show)}>" def _build_vocab(self): - "Make vocabulary of unique shingles in all mentions" + "Make vocabulary of unique shingles in all mentions." + logging.debug("making vocabulary from shingles") vocab = list(set([shingle for sublist in self.shingles for shingle in sublist])) self.vocab = vocab @@ -241,7 +243,7 @@ def __init__(self, mentions, shingle_size, n_bands, band_length=None, seed=3): self.n_bands = n_bands if band_length is None: log_n_mentions = math.ceil(math.log(len(mentions))) # for O(log(N)) complexity - self.band_length = max(1, log_n_mentions) + self.band_length = max(1, log_n_mentions) # use 1 if exp(log(n_mentions)) < 1 else: self.band_length = band_length self.signature_size = n_bands * self.band_length @@ -257,20 +259,20 @@ def make_signature(self): self.rng.choice([-1, 1], (n_rows, n_cols)) ) products = self.vectors.dot(hyperplanes.transpose()).toarray() - sign = 1 + (products > 0) + sign = 1 + (products > 0) # need +1 for cols_to_int_multidim self.signature = sign def _all_candidates_to_all(self): """Assign all mentions as candidates to all other mentions. - For edge cases where no single mention is longer than the shingle size + For edge cases where no single mention is longer than the shingle size. """ n_mentions = self.vectors.shape[0] self.candidates = [set(range(n_mentions)) for _ in range(n_mentions)] def get_candidates(self): - """Extract most similar mentions from signature. + """Extract similar mentions from signature. - For each mention, extract most similar mentions based on whether part + For each mention, extract similar mentions based on whether part of their signatures overlap. :return: Index of mentions that are similar to each other. @@ -278,7 +280,6 @@ def get_candidates(self): :rtype: list """ logging.debug("getting candidates...") - # n_bands = int(self.signature_size / self.band_length) if self.vectors.shape[0] == 1: candidates = [set()] candidates[0].add(0) @@ -307,17 +308,14 @@ def cluster(self): :rtype: list """ start = time.time() - logging.debug("building vocabulary") self._build_vocab() - logging.debug("encoding to binary") self.encode_binary() + logging.debug("making signature") if self.vectors.shape[1] == 0: # no signature possible b/c no mention is longer than the shingle size. 
- logging.debug('self.vectors.shape[1] is 0.') self._all_candidates_to_all() else: self.make_signature() - logging.debug("getting candidate groups") self.get_candidates() self.time = time.time() - start @@ -334,6 +332,6 @@ def efficiency_gain_comparisons(self): and to assess whether number of comparisons is meaningfully reduced. """ sizes = [len(g) for g in self.candidates] - runtime_all = len(self.candidates)*len(self.candidates) - runtime_lsh = len(self.candidates)*(sum(sizes)/len(sizes)) + runtime_all = len(self.candidates) * len(self.candidates) + runtime_lsh = len(self.candidates) * (sum(sizes)/len(sizes)) print(f"LSH makes fraction {round(runtime_lsh/runtime_all, 2)} of comparisons relative to option all.") From 231ca4585d400dc7f83a70d2380032bd402d1968 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Wed, 25 Jan 2023 17:10:24 +0100 Subject: [PATCH 41/43] make dirs for output of efficiency test if necessary --- scripts/efficiency_test.py | 21 +++++++++++++++++---- src/REL/lsh.py | 10 +++++----- src/REL/training_datasets.py | 6 +++--- 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index 36891c9..8c318cd 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -80,7 +80,19 @@ def profile_to_df(call): base_url = "/home/flavio/projects/rel20/data" wiki_version = "wiki_2019" datasets = TrainingEvaluationDatasets(base_url, wiki_version, args.search_corefs).load()[args.name_dataset] -save_data_to = f"{base_url}/efficiency_test/" # save all recorded data in this directory + +# create directories where to save the output from the tests +dir_efficiency_test = os.path.join(base_url, "efficiency_test") +sub_directories = { + "profile": "profile", + "predictions": "predictions", + "n_mentions_time": "n_mentions_time" +} +sub_directories = {k: os.path.join(dir_efficiency_test, v) for k, v in sub_directories.items()} + +for d in sub_directories.values(): + if not os.path.exists(d): + os.makedirs(d) server = False @@ -159,7 +171,8 @@ def profile_to_df(call): "timing": timing } - filename = f"{save_data_to}predictions/{args.name_dataset}_{args.n_docs}_{args.search_corefs}" + iteration_identifier = f"{args.name_dataset}_{args.n_docs}_{args.search_corefs}" + filename = os.path.join(sub_directories["predictions"], iteration_identifier) with open(f"{filename}.pickle", "wb") as f: pickle.dump(output, f, protocol=pickle.HIGHEST_PROTOCOL) @@ -167,7 +180,7 @@ def profile_to_df(call): # ## 4.b Profile the disambiguation part if args.profile: print("Profiling disambiguation") - filename = f"{save_data_to}profile/{args.name_dataset}_{args.n_docs}_{args.search_corefs}" + filename = os.path.join(sub_directories["profile"], iteration_identifier) df_stats = profile_to_df(call="model.predict(mentions_dataset)") df_stats.to_csv(f"{filename}.csv", index=False) @@ -205,7 +218,7 @@ def profile_to_df(call): timing_by_dataset[name]['profile'] = df_profile # save timing by dataset - filename = f"{save_data_to}n_mentions_time/{args.name_dataset}_{args.search_corefs}" + filename = os.path.join(sub_directories["n_mentions_time"], f"{args.name_dataset}_{args.search_corefs}" ) with open(f"{filename}.pickle", "wb") as f: pickle.dump(timing_by_dataset, f, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/src/REL/lsh.py b/src/REL/lsh.py index e159211..bd174e1 100644 --- a/src/REL/lsh.py +++ b/src/REL/lsh.py @@ -14,9 +14,10 @@ import logging import math import numpy as np +import time + from scipy import sparse from sklearn.preprocessing import 
MultiLabelBinarizer -import time # First, define a bunch of functions. TODO: should they be defined elsewhere? put in utils? @@ -89,7 +90,7 @@ def signature_to_3d_bands(a, n_bands, band_length): stacked_bands = a.reshape(n_items*n_bands, band_length) # reorder so that the first band of all items comes first, then the second band of all items, etc. reordering_vector = np.arange(n_items*n_bands).reshape(n_items, n_bands).T.reshape(1, -1) - + result = stacked_bands[reordering_vector, :].reshape(n_bands, n_items, band_length) return result @@ -144,7 +145,7 @@ class LSHBase: encode_binary() One-hot encode mentions, based on shingles """ - # Important: order of occurences in shingles and vectors = order of input list (=order of occurrence in document) + def __init__(self, mentions, shingle_size): """ @@ -311,7 +312,6 @@ def cluster(self): self._build_vocab() self.encode_binary() - logging.debug("making signature") if self.vectors.shape[1] == 0: # no signature possible b/c no mention is longer than the shingle size. self._all_candidates_to_all() else: @@ -334,4 +334,4 @@ def efficiency_gain_comparisons(self): sizes = [len(g) for g in self.candidates] runtime_all = len(self.candidates) * len(self.candidates) runtime_lsh = len(self.candidates) * (sum(sizes)/len(sizes)) - print(f"LSH makes fraction {round(runtime_lsh/runtime_all, 2)} of comparisons relative to option all.") + print(f"option 'lsh' makes fraction {round(runtime_lsh/runtime_all, 2)} of comparisons relative to option 'all'.") diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index 0891626..c91e42d 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -147,9 +147,9 @@ def with_coref(self, dataset, search_corefs_in="all"): band_length = math.ceil(math.log(len(input_mentions))) lsh_corefs = LSHRandomProjections( mentions=input_mentions, - shingle_size=2, # best recall: 2. acceptable: 2 - n_bands=15, # best recall: 400. acceptable: 200. - band_length=band_length # best recall: 15. acceptable: 15 + shingle_size=2, + n_bands=15, + band_length=band_length ) lsh_corefs.cluster() assert len(content) == len(lsh_corefs.candidates) From 3f06dacc2725516adfa3be938fff1514a670bfef Mon Sep 17 00:00:00 2001 From: f-hafner Date: Wed, 25 Jan 2023 17:19:44 +0100 Subject: [PATCH 42/43] use logging in with_coref --- src/REL/training_datasets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/REL/training_datasets.py b/src/REL/training_datasets.py index c91e42d..eef6ec8 100644 --- a/src/REL/training_datasets.py +++ b/src/REL/training_datasets.py @@ -1,3 +1,4 @@ +import logging import os import pickle import math @@ -136,7 +137,7 @@ def with_coref(self, dataset, search_corefs_in="all"): :type search_corefs_in: string. :return: dataset with updated candidate entities and p(e|m) scores. 
""" - print(f"with_coref() is called with search_corefs_in={search_corefs_in}.") + logging.info(f"with_coref() is called with search_corefs_in={search_corefs_in}.") assert search_corefs_in in ['lsh', 'all'] for data_name, content in dataset.items(): if len(content) == 0: From 5aa84db2272f5cd8fd882d7d9a62d487fe2b237f Mon Sep 17 00:00:00 2001 From: f-hafner Date: Wed, 15 Feb 2023 18:08:24 +0100 Subject: [PATCH 43/43] add base_url argument to efficiency tests --- scripts/efficiency_test.py | 18 ++++++++++++------ scripts/run_efficiency_tests.sh | 29 +++++++++++++++++------------ 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index 8c318cd..c47d5ed 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -39,6 +39,12 @@ def profile_to_df(call): parser = argparse.ArgumentParser() +parser.add_argument( + "--url", + dest="base_url", + type=str, + help="path to input and output data" +) parser.add_argument( '--search_corefs', type=str, @@ -77,12 +83,12 @@ def profile_to_df(call): print(f"args.search_corefs is {args.search_corefs}") -base_url = "/home/flavio/projects/rel20/data" +# base_url = "/home/flavio/projects/rel20/data" wiki_version = "wiki_2019" -datasets = TrainingEvaluationDatasets(base_url, wiki_version, args.search_corefs).load()[args.name_dataset] +datasets = TrainingEvaluationDatasets(args.base_url, wiki_version, args.search_corefs).load()[args.name_dataset] # create directories where to save the output from the tests -dir_efficiency_test = os.path.join(base_url, "efficiency_test") +dir_efficiency_test = os.path.join(args.base_url, "efficiency_test") sub_directories = { "profile": "profile", "predictions": "predictions", @@ -144,7 +150,7 @@ def profile_to_df(call): flair.device = torch.device("cpu") - mention_detection = MentionDetection(base_url, wiki_version) + mention_detection = MentionDetection(args.base_url, wiki_version) # Alternatively use Flair NER tagger. tagger_ner = SequenceTagger.load("ner-fast") @@ -156,9 +162,9 @@ def profile_to_df(call): # 3. Load model. config = { "mode": "eval", - "model_path": "{}/{}/generated/model".format(base_url, wiki_version), + "model_path": "{}/{}/generated/model".format(args.base_url, wiki_version), } - model = EntityDisambiguation(base_url, wiki_version, config, search_corefs=args.search_corefs) + model = EntityDisambiguation(args.base_url, wiki_version, config, search_corefs=args.search_corefs) # 4. Entity disambiguation. 
start = time() diff --git a/scripts/run_efficiency_tests.sh b/scripts/run_efficiency_tests.sh index 7fb1cb5..85059e0 100644 --- a/scripts/run_efficiency_tests.sh +++ b/scripts/run_efficiency_tests.sh @@ -1,23 +1,28 @@ +BASE_URL="$1" +DATASETS=("aida_testB") +DOCSIZES=(50 500) +COREF_OPTIONS=("all" "off" "lsh") -datasets=("aida_testB") -docsizes=(50 500) +echo $DATASETS -echo $datasets - - -echo "--Running efficiency tests by data set and n_docs--" +echo "--Running efficiency tests by data set, n_docs and coref option--" # do profiling and checking predictions in one -for size in ${docsizes[@]}; do - for ds in ${datasets[@]}; do - echo $ds, echo $size - python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "all" - python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "lsh" - python scripts/efficiency_test.py --profile --n_docs $size --name_dataset "$ds" --search_corefs "off" +for size in ${DOCSIZES[@]}; do + for ds in ${DATASETS[@]}; do + for option in ${COREF_OPTIONS[@]}; do + echo $ds, echo $size, echo $option + python scripts/efficiency_test.py \ + --url "$BASE_URL" \ + --profile \ + --n_docs $size \ + --name_dataset "$ds" \ + --search_corefs $option + done done done
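
The banding scheme implemented in src/REL/lsh.py (and refined in the patches above) can be hard to see from the diffs alone. The sketch below is a minimal, self-contained illustration of the same idea, with hypothetical helper names (shingles, lsh_candidates) that are not part of the REL API; the actual implementation differs in that it works on sparse matrices, collapses each band to an integer with cols_to_int_multidim, and sets band_length to ceil(log(n_mentions)) by default.

import numpy as np


def shingles(s, k=2):
    """Character k-shingles of s; empty if s is shorter than k."""
    return {s[i:i + k] for i in range(len(s) - k + 1)}


def lsh_candidates(mentions, n_bands=15, band_length=2, seed=3):
    """Group mentions that agree on at least one signature band."""
    rng = np.random.default_rng(seed)

    # vocabulary of all shingles observed in the mentions
    vocab = sorted(set().union(*(shingles(m) for m in mentions)))
    index = {sh: i for i, sh in enumerate(vocab)}

    # one-hot encode each mention over the shingle vocabulary
    vectors = np.zeros((len(mentions), len(vocab)), dtype=int)
    for row, m in enumerate(mentions):
        for sh in shingles(m):
            vectors[row, index[sh]] = 1

    # random projections: the signature is the sign pattern of the dot
    # products with n_bands * band_length random +/-1 hyperplanes
    hyperplanes = rng.choice([-1, 1], size=(n_bands * band_length, len(vocab)))
    signature = (vectors @ hyperplanes.T > 0).astype(int)

    # two mentions become candidates whenever one of their bands is identical;
    # if no mention is long enough to produce a shingle, all signatures are
    # equal and every mention is a candidate of every other one (the same
    # edge case that _all_candidates_to_all handles)
    candidates = [set() for _ in mentions]
    for b in range(n_bands):
        band = signature[:, b * band_length:(b + 1) * band_length]
        buckets = {}
        for i, row in enumerate(band):
            buckets.setdefault(tuple(row), []).append(i)
        for group in buckets.values():
            for i in group:
                candidates[i].update(group)
    return candidates


# each set is a superset of the likely coreferences for that mention;
# with_coref(search_corefs_in="lsh") would then only compare mentions
# that ended up in the same set
print(lsh_candidates(["Jimi Hendrix", "Hendrix", "Germany", "German"]))

The candidate sets are deliberately permissive: for the coreference search only recall matters, since a missed pair cannot be recovered later, while a false positive only means one extra pairwise check during the coreference search.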
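
The band-matching step itself relies on the integer trick described in the cols_to_int_multidim docstring ([[1,20,3], [1,4,10]] becomes [1203,1410]): collapsing each row of a band into a single integer lets np.unique group identical bands without comparing rows element by element. Below is a small worked example of that grouping, simplified to single-digit signature entries (1 or 2, as make_signature produces) and using plain powers of ten instead of the REL functions themselves.

import numpy as np

# toy signature for three mentions, two bands of length two
signature = np.array([
    [1, 2, 2, 1],   # mention 0
    [1, 2, 2, 2],   # mention 1
    [1, 2, 1, 1],   # mention 2
])
n_bands, band_length = 2, 2

# split the (n_items, signature_length) array into bands, analogous to
# signature_to_3d_bands: shape becomes (n_bands, n_items, band_length)
bands = np.stack(np.split(signature, n_bands, axis=1))

# collapse each row of each band into one integer, e.g. [1, 2] -> 12
digits = 10 ** np.arange(band_length - 1, -1, -1)
collapsed = (bands * digits).sum(axis=2)
print(collapsed)   # band 0: [12 12 12], band 1: [21 22 11]

# group mentions whose collapsed band value is identical, analogous to
# group_unique_indices: all three mentions agree on band 0, none on band 1
for band in collapsed:
    _, inverse = np.unique(band, return_inverse=True)
    print([np.flatnonzero(inverse == g) for g in range(inverse.max() + 1)])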