From fd407c049b66254e0335ca64c39c336aba986a9e Mon Sep 17 00:00:00 2001 From: Sriharsha-hatwar Date: Sat, 11 Mar 2023 21:10:12 +0000 Subject: [PATCH 01/17] Changes to requirements.in --- requirements.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.in b/requirements.in index 509d2a8..727a17e 100644 --- a/requirements.in +++ b/requirements.in @@ -25,7 +25,7 @@ wandb ############################# # HummingBird Requirements -#awscli==1.25.90 +awscli==1.25.90 #git+https://github.com/microsoft/hummingbird.git@mainterl/fine-tune-trees # ECC Layer Requirements @@ -33,4 +33,4 @@ higra==0.6.4 cvxpy cvxpylayers pytorch_lightning -git+https://github.com/dhdhagar/NeuMiss.git@dev \ No newline at end of file +git+https://github.com/dhdhagar/NeuMiss.git@dev From bb5e19cbb8f8172f419268852a496045d92a1338 Mon Sep 17 00:00:00 2001 From: Dhruv Agarwal Date: Wed, 15 Mar 2023 15:23:09 -0400 Subject: [PATCH 02/17] Debugging, SDP fixes, grad_acc, no-sdp, bceloss, local runs, bug fixes (#39) * Add --debug and --track_errors to log cvxpylayer errors * Fix save_to_wandb call for hyperparameters * Fix tensor serializable error * Fix for --eval_only_split flow * Add `sdp_scale` hyperparameter to scale the weight matrix to the SDP layer by the maximum element * Add `gradient_accumulation` hyperparameter * Change run defaults and sweep configs * Add sweep prefix option to run_sweep * Increase sweep agent memory * Clamp cvxpy output to [0,1] * Address meshgrid warning * Add `weighted_loss` to e2e sweep config * Modify run_sweep.sh to take in seed start and end values * Add `use_sdp` hyperparam to control whether to use the SDP during training and inference or directly use the MLP output with HAC-cut * Log errors before crashing; make error tracking the default behavior * Exception handling improvements * Make tqdm verbose even in silent mode * Save best dev model before testing * Add *-nosdp sweep configurations * Add `e2e_loss` hyperparam to control whether to use Frobenius or BCE loss * Change default subsampling to 80 (train) and 100 (dev) * Add --local to run with wandb disabled, change default weighted_loss to false to stay consistent with icml23 submission --- add_agent.sh | 2 +- e2e_debug/solve.py | 140 +++++++++++ e2e_pipeline/hac_cut_layer.py | 22 +- e2e_pipeline/model.py | 13 +- e2e_pipeline/sdp_layer.py | 62 +++-- e2e_scripts/evaluate.py | 65 +++-- e2e_scripts/train.py | 291 +++++++++++++++------- e2e_scripts/train_utils.py | 49 +++- run_sweep.sh | 14 +- utils/parser.py | 12 + wandb_configs/sweeps/e2e-nosdp-warm.json | 16 ++ wandb_configs/sweeps/e2e-nosdp.json | 15 ++ wandb_configs/sweeps/e2e.json | 2 +- wandb_configs/sweeps/frac-nosdp-warm.json | 17 ++ wandb_configs/sweeps/frac-nosdp.json | 16 ++ wandb_configs/sweeps/frac.json | 2 +- wandb_configs/sweeps/mlp.json | 5 +- 17 files changed, 588 insertions(+), 155 deletions(-) create mode 100644 e2e_debug/solve.py create mode 100644 wandb_configs/sweeps/e2e-nosdp-warm.json create mode 100644 wandb_configs/sweeps/e2e-nosdp.json create mode 100644 wandb_configs/sweeps/frac-nosdp-warm.json create mode 100644 wandb_configs/sweeps/frac-nosdp.json diff --git a/add_agent.sh b/add_agent.sh index 19b0a2b..13b93f5 100644 --- a/add_agent.sh +++ b/add_agent.sh @@ -10,7 +10,7 @@ gpu_name=${6:-"gypsum-1080ti"} # "gypsum-1080ti" for ((i = 1; i <= ${n_agents}; i++)); do JOB_DESC=${model}_${dataset}_sweep${seed}-${i} && JOB_NAME=${JOB_DESC}_$(date +%s) && \ sbatch -J ${JOB_NAME} -e jobs/${JOB_NAME}.err -o jobs/${JOB_NAME}.log \ - 
--partition=${gpu_name} --gres=gpu:1 --mem=80G --time=12:00:00 \ + --partition=${gpu_name} --gres=gpu:1 --mem=100G --time=12:00:00 \ run_sbatch.sh e2e_scripts/train.py \ --dataset="${dataset}" \ --dataset_random_seed=${seed} \ diff --git a/e2e_debug/solve.py b/e2e_debug/solve.py new file mode 100644 index 0000000..6e47ef2 --- /dev/null +++ b/e2e_debug/solve.py @@ -0,0 +1,140 @@ +import json +import argparse +import cvxpy as cp +import logging +import numpy as np +import torch + +from IPython import embed + +from e2e_pipeline.hac_cut_layer import HACCutLayer + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) +logger = logging.getLogger(__name__) + + +class Parser(argparse.ArgumentParser): + def __init__(self): + super().__init__() + self.add_argument( + "--data_fpath", type=str + ) + self.add_argument( + "--data_idx", type=int, default=0 + ) + self.add_argument( + "--scs_max_sdp_iters", type=int, default=50000 + ) + self.add_argument( + "--scs_silent", action="store_true", + ) + self.add_argument( + "--scs_eps", type=float, default=1e-3 + ) + self.add_argument( + "--scs_scale", type=float, default=1e-1, + ) + self.add_argument( + "--scs_dont_normalize", action="store_true", + ) + self.add_argument( + "--scs_use_indirect", action="store_true", + ) + self.add_argument( + "--scs_dont_use_quad_obj", action="store_true", + ) + self.add_argument( + "--scs_alpha", type=float, default=1.5 + ) + self.add_argument( + "--scs_log_csv_filename", type=str, + ) + self.add_argument( + "--interactive", action="store_true", + ) + + +if __name__ == '__main__': + parser = Parser() + args = parser.parse_args() + logger.info("Script arguments:") + logger.info(args.__dict__) + + # Read error file + logger.info("Reading input data") + with open(args.data_fpath, 'r') as fh: + data = json.load(fh) + assert len(data['errors']) > 0 + # Pick specific error instance to process + error_data = data['errors'][args.data_idx] + + # Extract input data from the error instance + _raw = np.array(error_data['model_call_args']['data']) + _W_val = np.array(error_data['cvxpy_layer_args']['W_val']) + + # Construct cvxpy problem + logger.info('Constructing optimization problem') + # edge_weights = _W_val.tocoo() + n = _W_val.shape[0] + W = _W_val + # W = csr_matrix((edge_weights.data, (edge_weights.row, edge_weights.col)), shape=(n, n)) + X = cp.Variable((n, n), PSD=True) + # Build constraint set + constraints = [ + cp.diag(X) == np.ones((n,)), + X[:n, :] >= 0, + X[:n, :] <= 1 + ] + + # Setup HAC Cut + hac_cut = HACCutLayer() + hac_cut.eval() + + sdp_obj_value = float('inf') + result_idxs, results_X, results_clustering = [], [], [] + no_solution_scaling_factors = [] + for i in range(1, 10): # n + # Skipping 1; no scaling leads to non-convergence (infinite objective value) + if i == 1: + scaling_factor = np.max(W) + else: + scaling_factor = i + logger.info(f'Scaling factor={scaling_factor}') + # Create problem + W_scaled = W / scaling_factor + problem = cp.Problem(cp.Maximize(cp.trace(W_scaled @ X)), constraints) + # Solve problem + sdp_obj_value = problem.solve( + solver=cp.SCS, + verbose=not args.scs_silent, + max_iters=args.scs_max_sdp_iters, + eps=args.scs_eps, + normalize=not args.scs_dont_normalize, + alpha=args.scs_alpha, + scale=args.scs_scale, + use_indirect=args.scs_use_indirect, + use_quad_obj=not args.scs_dont_use_quad_obj, + log_csv_filename=args.scs_log_csv_filename + ) + logger.info(f"@scaling={scaling_factor}, objective value = 
{sdp_obj_value}, norm={np.linalg.norm(W_scaled)}") + if sdp_obj_value != float('inf'): + result_idxs.append(i) + results_X.append(X.value) + # Find clustering solution + hac_cut.get_rounded_solution(torch.tensor(X.value), torch.tensor(W_scaled)) + results_clustering.append(hac_cut.cluster_labels.numpy()) + else: + no_solution_scaling_factors.append(scaling_factor) + logger.info(f"Solution not found = {len(no_solution_scaling_factors)}") + logger.info(f"Solution found = {len(results_X)}") + + logger.info("Same clustering:") + for i in range(len(results_clustering)-1): + logger.info(np.array_equal(results_clustering[i], results_clustering[i + 1])) + # logger.info(f"Solution found with scaling factor = {scaling_factor}") + # if args.interactive and sdp_obj_value == float('inf'): + # embed() + + if args.interactive: + embed() diff --git a/e2e_pipeline/hac_cut_layer.py b/e2e_pipeline/hac_cut_layer.py index c2b5a99..f3e44a4 100644 --- a/e2e_pipeline/hac_cut_layer.py +++ b/e2e_pipeline/hac_cut_layer.py @@ -13,7 +13,7 @@ def __init__(self): Takes fractional SDP output as input, and simultaneously builds & cuts avg. HAC tree to get rounded solution. Executes straight-through estimator as the backward pass. """ - def get_rounded_solution(self, X, weights, _MAX_DIST=10, use_similarities=True, max_similarity=1, verbose=False): + def get_rounded_solution(self, X, weights, _MAX_DIST=1000, use_similarities=True, max_similarity=1, verbose=False): """ X is a symmetric NxN matrix of fractional, decision values with a 1-diagonal (output from the SDP layer) weights is an NxN upper-triangular (shift 1) matrix of edge weights @@ -34,7 +34,8 @@ def get_rounded_solution(self, X, weights, _MAX_DIST=10, use_similarities=True, round_matrix = torch.eye(D, device=device) # Take the upper triangular and mask the other values with a large number - Y = _MAX_DIST * torch.ones(D, D, device=device).tril() + (max_similarity-X if use_similarities else X).triu(1) + _MAX_DIST = torch.max(torch.abs(X)) * _MAX_DIST + Y = _MAX_DIST * torch.ones(D, D, device=device).tril() + (max_similarity - X if use_similarities else X).triu(1) # Compute the dissimilarity minima per row values, indices = torch.min(Y, dim=1) @@ -100,7 +101,7 @@ def get_rounded_solution(self, X, weights, _MAX_DIST=10, use_similarities=True, # Energy calculations clustering[max_node] = clustering[parent_1] + clustering[parent_2] leaf_indices = torch.where(clustering[max_node])[0] - leaf_edges = torch.meshgrid(leaf_indices, leaf_indices) + leaf_edges = torch.meshgrid(leaf_indices, leaf_indices, indexing='ij') energy[max_node] = energy[parent_1] + energy[parent_2] merge_energy = torch.sum(weights[leaf_edges]) if merge_energy >= energy[max_node]: @@ -123,9 +124,16 @@ def get_rounded_solution(self, X, weights, _MAX_DIST=10, use_similarities=True, self.round_matrix = round_matrix self.cluster_labels = clustering[-1] self.parents = parents - objective_matrix = weights * torch.triu(round_matrix, diagonal=1) - self.objective_value = (energy[max_node] - torch.sum(objective_matrix[objective_matrix < 0])).item() # MA + with torch.no_grad(): + objective_matrix = weights * torch.triu(round_matrix, diagonal=1) + self.objective_value = (energy[max_node] - torch.sum(objective_matrix[objective_matrix < 0])).item() # MA return self.round_matrix - def forward(self, X, W, use_similarities=True): - return X + (self.get_rounded_solution(X, W, use_similarities=use_similarities) - X).detach() + def forward(self, X, W, use_similarities=True, return_triu=False): + solution = X + 
(self.get_rounded_solution(X, W, + use_similarities=use_similarities, + max_similarity=torch.max(X)) - X).detach() + if return_triu: + triu_indices = torch.triu_indices(len(solution), len(solution), offset=1) + return solution[triu_indices[0], triu_indices[1]] + return solution diff --git a/e2e_pipeline/model.py b/e2e_pipeline/model.py index ddabe71..928ae21 100644 --- a/e2e_pipeline/model.py +++ b/e2e_pipeline/model.py @@ -15,7 +15,8 @@ class EntResModel(torch.nn.Module): def __init__(self, n_features, neumiss_depth, dropout_p, dropout_only_once, add_neumiss, neumiss_deq, hidden_dim, n_hidden_layers, add_batchnorm, activation, - negative_slope, hidden_config, sdp_max_iters, sdp_eps, use_rounded_loss=True): + negative_slope, hidden_config, sdp_max_iters, sdp_eps, sdp_scale=False, use_rounded_loss=True, + return_triu_on_train=False, use_sdp=True): super().__init__() # Layers self.mlp_layer = MLPLayer(n_features=n_features, neumiss_depth=neumiss_depth, dropout_p=dropout_p, @@ -23,10 +24,12 @@ def __init__(self, n_features, neumiss_depth, dropout_p, dropout_only_once, add_ hidden_dim=hidden_dim, n_hidden_layers=n_hidden_layers, add_batchnorm=add_batchnorm, activation=activation, negative_slope=negative_slope, hidden_config=hidden_config) self.uncompress_layer = UncompressTransformLayer() - self.sdp_layer = SDPLayer(max_iters=sdp_max_iters, eps=sdp_eps) + self.sdp_layer = SDPLayer(max_iters=sdp_max_iters, eps=sdp_eps, scale_input=sdp_scale) self.hac_cut_layer = HACCutLayer() # Configs self.use_rounded_loss = use_rounded_loss + self.return_triu_on_train = return_triu_on_train + self.use_sdp = use_sdp def forward(self, x, N, warmstart=False, verbose=False): edge_weights = torch.squeeze(self.mlp_layer(x)) @@ -41,14 +44,16 @@ def forward(self, x, N, warmstart=False, verbose=False): logger.info(f"Size of W_matrix = {edge_weights_uncompressed.size()}") logger.info(f"\n{edge_weights_uncompressed}") - output_probs = self.sdp_layer(edge_weights_uncompressed, N) + output_probs = self.sdp_layer(edge_weights_uncompressed, N, use_sdp=self.use_sdp, return_triu=( + self.training and not self.use_rounded_loss and self.return_triu_on_train)) if verbose: logger.info(f"Size of X = {output_probs.size()}") logger.info(f"\n{output_probs}") if self.training and not self.use_rounded_loss: return output_probs - pred_clustering = self.hac_cut_layer(output_probs, edge_weights_uncompressed) + pred_clustering = self.hac_cut_layer(output_probs, edge_weights_uncompressed, + return_triu=(self.training and self.return_triu_on_train)) if verbose: logger.info(f"Size of X_r = {pred_clustering.size()}") logger.info(f"\n{pred_clustering}") diff --git a/e2e_pipeline/sdp_layer.py b/e2e_pipeline/sdp_layer.py index 5a27ceb..5a57def 100644 --- a/e2e_pipeline/sdp_layer.py +++ b/e2e_pipeline/sdp_layer.py @@ -11,14 +11,28 @@ level=logging.INFO) logger = logging.getLogger(__name__) + class CvxpyException(Exception): - pass + def __init__(self, data=None): + self.data = data + + +def get_max_agree_objective(weights, probs, verbose=False): + with torch.no_grad(): + objective_matrix = weights * torch.triu(probs, diagonal=1) + objective_value_IC = torch.sum(objective_matrix).item() + objective_value_MA = objective_value_IC - torch.sum(objective_matrix[objective_matrix < 0]).item() + if verbose: + logger.info(f'SDP objective: intra-cluster={objective_value_IC}, max-agree={objective_value_MA}') + return objective_value_MA + class SDPLayer(torch.nn.Module): - def __init__(self, max_iters: int = 50000, eps: float = 1e-3): + def __init__(self, 
max_iters: int = 50000, eps: float = 1e-3, scale_input=False): super().__init__() self.max_iters = max_iters self.eps = eps + self.scale_input = scale_input self.objective_value = None # Stores the last run objective value def build_and_solve_sdp(self, W_val, N, verbose=False): @@ -46,26 +60,46 @@ def build_and_solve_sdp(self, W_val, N, verbose=False): # Forward pass through the SDP cvxpylayer try: - pw_probs = self.cvxpy_layer(W_val, solver_args={ + pw_prob_matrix = self.cvxpy_layer(W_val, solver_args={ "solve_method": "SCS", "verbose": verbose, "max_iters": self.max_iters, "eps": self.eps })[0] + # Fix to prevent invalid solution values close to 0 and 1 but outside the range + pw_prob_matrix = torch.clamp(pw_prob_matrix, min=0, max=1) except: logger.error(f'CvxpyException: Error running forward pass on W_val of shape {W_val.shape}') - raise CvxpyException() + raise CvxpyException(data={ + 'W_val': W_val.detach().tolist(), + 'solver_args': { + "solve_method": "SCS", + "verbose": verbose, + "max_iters": self.max_iters, + "eps": self.eps + } + }) + objective_value_MA = get_max_agree_objective(W_val, pw_prob_matrix, verbose=verbose) + return objective_value_MA, pw_prob_matrix + + def get_sigmoid_matrix(self, W_val, N, verbose=False): + pw_prob_matrix = torch.sigmoid(W_val) + objective_value_MA = get_max_agree_objective(W_val, pw_prob_matrix, verbose=verbose) + return objective_value_MA, pw_prob_matrix - with torch.no_grad(): - objective_matrix = W_val * torch.triu(pw_probs, diagonal=1) - objective_value_IC = torch.sum(objective_matrix).item() - objective_value_MA = objective_value_IC - torch.sum(objective_matrix[objective_matrix < 0]).item() + def forward(self, edge_weights_uncompressed, N, use_sdp=True, return_triu=False, verbose=False): + W_val = edge_weights_uncompressed + if self.scale_input: + with torch.no_grad(): + scale_factor = torch.max(torch.abs(W_val)) if verbose: - logger.info(f'SDP objective: intra-cluster={objective_value_IC}, max-agree={objective_value_MA}') + logger.info(f"Scaling W_val by {scale_factor}") + W_val /= scale_factor - return objective_value_MA, pw_probs + solver = self.build_and_solve_sdp if use_sdp else self.get_sigmoid_matrix + self.objective_value, pw_prob_matrix = solver(W_val, N, verbose) - def forward(self, edge_weights_uncompressed, N, verbose=False): - objective_value, pw_probs = self.build_and_solve_sdp(edge_weights_uncompressed, N, verbose) - self.objective_value = objective_value - return pw_probs + if return_triu: + triu_indices = torch.triu_indices(len(pw_prob_matrix), len(pw_prob_matrix), offset=1) + return pw_prob_matrix[triu_indices[0], triu_indices[1]] + return pw_prob_matrix diff --git a/e2e_scripts/evaluate.py b/e2e_scripts/evaluate.py index d1ae7fd..67105ae 100644 --- a/e2e_scripts/evaluate.py +++ b/e2e_scripts/evaluate.py @@ -13,7 +13,7 @@ from e2e_pipeline.cc_inference import CCInference from e2e_pipeline.hac_inference import HACInference from e2e_pipeline.sdp_layer import CvxpyException -from e2e_scripts.train_utils import compute_b3_f1 +from e2e_scripts.train_utils import compute_b3_f1, save_to_wandb_run from IPython import embed @@ -22,10 +22,13 @@ level=logging.INFO) logger = logging.getLogger(__name__) -def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, val_dataloader=None, - tqdm_label='', device=None, verbose=False): + +def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, clustering_threshold=None, + val_dataloader=None, tqdm_label='', device=None, verbose=False, debug=False, _errors=None, 
+ run_dir='./'): """ - clustering_fn, val_dataloader: unused when pairwise_mode is False (only added to keep fn signature identical) + clustering_fn, clustering_threshold, val_dataloader: unused when pairwise_mode is False + (only added to keep fn signature identical) """ device = device if device is not None else torch.device("cuda" if torch.cuda.is_available() else "cpu") n_features = dataloader.dataset[0][0].shape[1] @@ -39,7 +42,7 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, val_da } max_pred_id = -1 n_exceptions = 0 - for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}', disable=(not verbose))): + for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}')): if overfit_batch_idx > -1: if idx < overfit_batch_idx: continue @@ -56,10 +59,24 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, val_da # Forward pass through the e2e model data = data.to(device) try: - _ = model(data, block_size) - except CvxpyException: - if tqdm_label is not 'dev': - raise CvxpyException() + _ = model(data, block_size, verbose=verbose) + except CvxpyException as e: + logger.info(e) + _error_obj = { + 'method': 'eval', + 'model_type': 'e2e', + 'data_split': tqdm_label, + 'model_call_args': { + 'data': data.detach().tolist(), + 'block_size': block_size + }, + 'cvxpy_layer_args': e.data + } + if _errors is not None: + _errors.append(_error_obj) + save_to_wandb_run({'errors': _errors}, 'errors.json', run_dir, logger) + if not debug: # if tqdm_label is not 'dev' and not debug: + raise CvxpyException(data=_error_obj) # If split is dev, skip batch and continue all_gold = all_gold[:-len(cluster_ids)] n_exceptions += 1 @@ -79,7 +96,7 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, val_da def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", return_pred_only=False, thresh_for_f1=0.5, clustering_fn=None, clustering_threshold=None, val_dataloader=None, - tqdm_label='', device=None, verbose=False): + tqdm_label='', device=None, verbose=False, debug=False, _errors=None, run_dir='./'): device = device if device is not None else torch.device("cuda" if torch.cuda.is_available() else "cpu") n_features = dataloader.dataset[0][0].shape[1] @@ -97,7 +114,7 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret } max_pred_id = -1 # In each iteration, add to all blockwise predicted IDs to distinguish from previous blocks n_exceptions = 0 - for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}', disable=(not verbose))): + for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}')): if overfit_batch_idx > -1: if idx < overfit_batch_idx: continue @@ -114,11 +131,25 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret # Forward pass through the e2e model data = data.to(device) try: - pred_cluster_ids = clustering_fn(model(data), block_size, min_id=(max_pred_id + 1), + pred_cluster_ids = clustering_fn(model(data, verbose=verbose), block_size, min_id=(max_pred_id + 1), threshold=clustering_threshold) - except CvxpyException: - if tqdm_label is not 'dev': - raise CvxpyException() + except CvxpyException as e: + logger.info(e) + _error_obj = { + 'method': 'eval', + 'model_type': 'pairwise_cc', + 'data_split': tqdm_label, + 'model_call_args': { + 'data': data.detach().tolist(), + 'block_size': block_size + }, + 'cvxpy_layer_args': e.data + } + if _errors is not None: + 
_errors.append(_error_obj) + save_to_wandb_run({'errors': _errors}, 'errors.json', run_dir, logger) + if not debug: # if tqdm_label is not 'dev' and not debug: + raise CvxpyException(data=_error_obj) # If split is dev, skip batch and continue all_gold = all_gold[:-len(cluster_ids)] n_exceptions += 1 @@ -136,7 +167,7 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret return (b3_f1, vmeasure, cc_obj_vals) if clustering_fn.__class__ is CCInference else (b3_f1, vmeasure) y_pred, targets = [], [] - for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}', disable=(not verbose))): + for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}')): if overfit_batch_idx > -1: if idx < overfit_batch_idx: continue @@ -148,7 +179,7 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret target = target.flatten().float() # Forward pass through the pairwise model data = data.to(device) - y_pred.append(torch.sigmoid(model(data)).cpu().numpy()) + y_pred.append(torch.sigmoid(model(data, verbose=verbose)).cpu().numpy()) targets.append(target) y_pred = np.hstack(y_pred) targets = np.hstack(targets) diff --git a/e2e_scripts/train.py b/e2e_scripts/train.py index 8ea2ba0..1148587 100644 --- a/e2e_scripts/train.py +++ b/e2e_scripts/train.py @@ -18,7 +18,7 @@ from e2e_pipeline.sdp_layer import CvxpyException from e2e_scripts.evaluate import evaluate, evaluate_pairwise from e2e_scripts.train_utils import DEFAULT_HYPERPARAMS, get_dataloaders, get_matrix_size_from_triu, \ - uncompress_target_tensor, count_parameters, log_cc_objective_values + uncompress_target_tensor, count_parameters, log_cc_objective_values, save_to_wandb_run, FrobeniusLoss from utils.parser import Parser from IPython import embed @@ -31,8 +31,8 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, group=None, save_model=False, load_model_from_wandb_run=None, load_model_from_fpath=None, - load_hyp_from_wandb_run=None, eval_only_split=None, skip_initial_eval=False, - pairwise_eval_clustering=None): + eval_only_split=None, skip_initial_eval=False, pairwise_eval_clustering=None, + debug=False, track_errors=True, local=False): init_args = { 'config': DEFAULT_HYPERPARAMS } @@ -45,6 +45,8 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g init_args.update({'tags': tags}) if group is not None: init_args.update({'group': group}) + if local: + init_args.update({'mode': 'disabled'}) # Start wandb run with wandb.init(**init_args) as run: @@ -53,9 +55,10 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g logger.info("Run hyperparameters:") logger.info(hyp) # Save hyperparameters as a json file and store in wandb run - with open(os.path.join(run.dir, 'hyperparameters.json'), 'w') as fh: - json.dump(dict(hyp), fh) - wandb.save('hyperparameters.json') + save_to_wandb_run(dict(hyp), 'hyperparameters.json', run.dir, logger) + + # Track errors + _errors = [] if track_errors else None # Seed everything if hyp['run_random_seed'] is not None: @@ -65,6 +68,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g pairwise_mode = hyp['pairwise_mode'] weighted_loss = hyp['weighted_loss'] + e2e_loss = hyp['e2e_loss'] batch_size = hyp['batch_size'] if pairwise_mode else 1 # Force clustering runs to operate on 1 block only n_epochs = hyp['n_epochs'] n_warmstart_epochs = hyp['n_warmstart_epochs'] @@ -82,6 +86,8 @@ def train(hyperparams={}, verbose=False, 
project=None, entity=None, tags=None, g negative_slope = hyp["negative_slope"] sdp_max_iters = hyp["sdp_max_iters"] sdp_eps = hyp["sdp_eps"] + sdp_scale = hyp["sdp_scale"] + grad_acc = hyp['batch_size'] if hyp["gradient_accumulation"] else 1 overfit_batch_idx = hyp['overfit_batch_idx'] clustering_metrics = {'b3_f1': 0, 'vmeasure': 1} pairwise_metrics = {'auroc': 0, 'f1': 1} @@ -101,10 +107,25 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g if not pairwise_mode: model = EntResModel(n_features, neumiss_depth, dropout_p, dropout_only_once, add_neumiss, neumiss_deq, hidden_dim, n_hidden_layers, add_batchnorm, activation, - negative_slope, hidden_config, sdp_max_iters, sdp_eps, - use_rounded_loss=hyp["use_rounded_loss"]) + negative_slope, hidden_config, sdp_max_iters, sdp_eps, sdp_scale, + use_rounded_loss=hyp["use_rounded_loss"], return_triu_on_train=(e2e_loss == "bce"), + use_sdp=hyp["use_sdp"]) # Define loss - loss_fn_e2e = lambda pred, gold: torch.norm(gold - pred) + if e2e_loss not in ["frob", "bce"]: + raise ValueError("Invalid value for e2e_loss") + loss_fn_e2e = FrobeniusLoss() if e2e_loss == 'frob' else torch.nn.BCELoss() + + pos_weight = None + if weighted_loss: + if overfit_batch_idx > -1: + n_pos = train_dataloader.dataset[overfit_batch_idx][1].sum() + pos_weight = (len(train_dataloader.dataset[overfit_batch_idx][1]) - n_pos) / n_pos + else: + _n_pos, _n_total = 0., 0. + for _i in range(len(train_dataloader.dataset)): + _n_pos += train_dataloader.dataset[_i][1].sum() + _n_total += len(train_dataloader.dataset[_i][1]) + pos_weight = (_n_total - _n_pos) / _n_pos # Define eval eval_fn = evaluate pairwise_clustering_fns = [None] # Unused when pairwise_mode is False @@ -118,11 +139,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g hyp["subsample_sz_dev"], True, hyp['batch_size']) # Define loss - pos_weight = None - if weighted_loss and overfit_batch_idx == -1: - n_pos = train_dataloader_pairwise.dataset[:][1].sum() - pos_weight = torch.tensor((len(train_dataloader_pairwise.dataset) - n_pos) / n_pos) - loss_fn_pairwise = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight) + loss_fn_pairwise = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight)) else: model = PairwiseModel(n_features, neumiss_depth, dropout_p, dropout_only_once, add_neumiss, neumiss_deq, hidden_dim, n_hidden_layers, add_batchnorm, activation, @@ -144,14 +161,14 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g pairwise_clustering_fns = [None] if pairwise_eval_clustering is not None: if pairwise_eval_clustering == 'cc': - pairwise_clustering_fns = [CCInference(sdp_max_iters, sdp_eps)] + pairwise_clustering_fns = [CCInference(sdp_max_iters, sdp_eps, sdp_scale)] pairwise_clustering_fns[0].eval() pairwise_clustering_fn_labels = ['cc'] elif pairwise_eval_clustering == 'hac': pairwise_clustering_fns = [HACInference()] pairwise_clustering_fn_labels = ['hac'] elif pairwise_eval_clustering == 'both': - cc_inference = CCInference(sdp_max_iters, sdp_eps) + cc_inference = CCInference(sdp_max_iters, sdp_eps, sdp_scale) pairwise_clustering_fns = [cc_inference, HACInference(), cc_inference] pairwise_clustering_fns[0].eval() pairwise_clustering_fn_labels = ['cc', 'hac', 'cc-fixed'] @@ -188,33 +205,47 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g 'dev': val_dataloader, 'test': test_dataloader } + start_time = time.time() with torch.no_grad(): model.eval() + + eval_dataloader = 
dataloaders[eval_only_split] + eval_scores = eval_fn(model, eval_dataloader, tqdm_label=eval_only_split, device=device, verbose=verbose, + debug=debug, _errors=_errors) + logger.info(f"Eval: {eval_only_split}_{list(eval_metric_to_idx)[0]}={eval_scores[0]}, " + + f"{eval_only_split}_{list(eval_metric_to_idx)[1]}={eval_scores[1]}") + # Log eval metrics + wandb.log({f'best_{eval_only_split}_{list(eval_metric_to_idx)[0]}': eval_scores[0], + f'best_{eval_only_split}_{list(eval_metric_to_idx)[1]}': eval_scores[1]}) + if len(eval_scores) == 3: + log_cc_objective_values(scores=eval_scores, split_name=eval_only_split, log_prefix='Eval', + verbose=verbose, logger=logger) + + # For pairwise-mode: if pairwise_clustering_fns[0] is not None: - assert eval_only_split == 'test' # Clustering in --eval_only_split implemented only for test set - eval_metric_to_idx = clustering_metrics - eval_dataloader = test_dataloader_e2e - else: - eval_dataloader = dataloaders[eval_only_split] - start_time = time.time() - clustering_threshold = None - for i, pairwise_clustering_fn in enumerate(pairwise_clustering_fns): - eval_scores = eval_fn(model, eval_dataloader, clustering_fn=pairwise_clustering_fn, - clustering_threshold=clustering_threshold, val_dataloader=val_dataloader_e2e, - tqdm_label=eval_only_split, device=device, verbose=verbose) - if pairwise_clustering_fn.__class__ is HACInference: - clustering_threshold = pairwise_clustering_fn.cut_threshold - if verbose: + clustering_threshold = None + for i, pairwise_clustering_fn in enumerate(pairwise_clustering_fns): + clustering_scores = eval_fn(model, test_dataloader_e2e, # Clustering only implemented for TEST + clustering_fn=pairwise_clustering_fn, + clustering_threshold=clustering_threshold, + val_dataloader=val_dataloader_e2e, + tqdm_label='test clustering', device=device, verbose=verbose, + debug=debug, _errors=_errors) + if pairwise_clustering_fn.__class__ is HACInference: + clustering_threshold = pairwise_clustering_fn.cut_threshold logger.info( - f"Eval: {eval_only_split}_{list(eval_metric_to_idx)[0]}_{pairwise_clustering_fn_labels[i]}={eval_scores[0]}, " + - f"{eval_only_split}_{list(eval_metric_to_idx)[1]}_{pairwise_clustering_fn_labels[i]}={eval_scores[1]}") - wandb.log({'epoch': 0, f'{eval_only_split}_{list(eval_metric_to_idx)[0]}_{pairwise_clustering_fn_labels[i]}': eval_scores[0], - f'{eval_only_split}_{list(eval_metric_to_idx)[1]}_{pairwise_clustering_fn_labels[i]}': eval_scores[1]}) - if len(eval_scores) == 3: - log_cc_objective_values(scores=eval_scores, - split_name=f'{eval_only_split}_{pairwise_clustering_fn_labels[i]}', - log_prefix='Eval', verbose=verbose, logger=logger) - end_time = time.time() + f"Eval: test_{list(clustering_metrics)[0]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[0]}, " + + f"test_{list(clustering_metrics)[1]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[1]}") + # Log eval metrics + wandb.log({f'best_test_{list(clustering_metrics)[0]}_{pairwise_clustering_fn_labels[i]}': + clustering_scores[0], + f'best_test_{list(clustering_metrics)[1]}_{pairwise_clustering_fn_labels[i]}': + clustering_scores[1]}) + if len(clustering_scores) == 3: + log_cc_objective_values(scores=clustering_scores, + split_name=f'best_test_{pairwise_clustering_fn_labels[i]}', + log_prefix='Eval', verbose=verbose, logger=logger) + end_time = time.time() else: # Training wandb.watch(model) @@ -243,20 +274,35 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g model.eval() if overfit_batch_idx > -1: train_scores 
= eval_fn(model, train_dataloader, overfit_batch_idx=overfit_batch_idx, - tqdm_label='train', device=device, verbose=verbose) - if verbose: - logger.info(f"Initial: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + - f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}") + tqdm_label='train', device=device, verbose=verbose, debug=debug, + _errors=_errors) + logger.info(f"Initial: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + + f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}") wandb.log({'epoch': 0, f'train_{list(eval_metric_to_idx)[0]}': train_scores[0], f'train_{list(eval_metric_to_idx)[1]}': train_scores[1]}) else: - dev_scores = eval_fn(model, val_dataloader, tqdm_label='dev', device=device, verbose=verbose) - if verbose: - logger.info(f"Initial: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + - f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}") + dev_scores = eval_fn(model, val_dataloader, tqdm_label='dev', device=device, verbose=verbose, + debug=debug, _errors=_errors) + logger.info(f"Initial: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + + f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}") wandb.log({'epoch': 0, f'dev_{list(eval_metric_to_idx)[0]}': dev_scores[0], f'dev_{list(eval_metric_to_idx)[1]}': dev_scores[1]}) + if not pairwise_mode and grad_acc > 1: + grad_acc_steps = [] + _seen_pw = 0 + _seen_blk = 0 + for d in train_dataloader.dataset: + _blk_sz = len(d[1]) + _seen_pw += _blk_sz + _seen_blk += 1 + if _seen_pw >= grad_acc: + grad_acc_steps.append(_seen_blk) + _seen_pw = 0 + _seen_blk = 0 + if _seen_blk > 0: + grad_acc_steps.append(_seen_blk) + model.train() start_time = time.time() # Tracks full training runtime for i in range(n_epochs): @@ -271,9 +317,13 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g wandb.log({'epoch': i + 1}) running_loss = [] n_exceptions = 0 + + grad_acc_count = 0 + grad_acc_idx = 0 + optimizer.zero_grad() + for (idx, batch) in enumerate(tqdm(_train_dataloader, - desc=f"{'Warm-starting' if warmstart_mode else 'Training'} {i + 1}", - disable=(not verbose))): + desc=f"{'Warm-starting' if warmstart_mode else 'Training'} {i + 1}")): if overfit_batch_idx > -1: if idx < overfit_batch_idx: continue @@ -299,45 +349,94 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g # Forward pass through the e2e or pairwise model data, target = data.to(device), target.to(device) - output = model(data, N=block_size, warmstart=warmstart_mode, verbose=verbose) + try: + output = model(data, N=block_size, warmstart=warmstart_mode, verbose=verbose) + except CvxpyException as e: + logger.info(e) + _error_obj = { + 'method': 'train_forward', + 'model_type': 'e2e' if not pairwise_mode else 'pairwise', + 'data_split': 'train', + 'model_call_args': { + 'data': data.detach().cpu(), + 'block_size': block_size + }, + 'cvxpy_layer_args': e.data + } + if _errors is not None: + _errors.append(_error_obj) + save_to_wandb_run({'errors': _errors}, 'errors.json', run.dir, logger) + if debug: + n_exceptions += 1 + logger.info( + f'Caught CvxpyException in forward call (count -> {n_exceptions}): skipping batch') + continue # Calculate the loss if not pairwise_mode and not warmstart_mode: - gold_output = uncompress_target_tensor(target, device=device) + grad_acc_denom = 1 if grad_acc == 1 else grad_acc_steps[grad_acc_idx] + if e2e_loss != "bce": + target = uncompress_target_tensor(target, device=device) if verbose: - logger.info(f"Gold:\n{gold_output}") - try: - loss = 
loss_fn(output.view_as(gold_output), gold_output) / (2 * block_size) - except CvxpyException: - n_exceptions += 1 - logger.info(f'Caught CvxpyException {n_exceptions}: skipping batch') - continue + logger.info(f"Gold:\n{target}") + if pos_weight is not None: + loss_weight = target * pos_weight + (1 - target) + loss_fn.weight = loss_weight + loss = loss_fn(output.view_as(target), target) / grad_acc_denom else: + # Pairwise or warmstart mode if verbose: logger.info(f"Gold:\n{target}") loss = loss_fn(output.view_as(target), target) - optimizer.zero_grad() - loss.backward() - optimizer.step() + try: + loss.backward() + if not pairwise_mode and grad_acc > 1: + grad_acc_count += len(data) + except Exception as e: + logger.info(e) + if isinstance(e, CvxpyException): + _error_obj = { + 'method': 'train_backward', + 'model_type': 'e2e' if not pairwise_mode else 'pairwise', + 'data_split': 'train', + 'model_call_args': { + 'data': data.detach().cpu(), + 'block_size': block_size + } + } + if _errors is not None: + _errors.append(_error_obj) + save_to_wandb_run({'errors': _errors}, 'errors.json', run.dir, logger) + if debug: + n_exceptions += 1 + logger.info( + f'Caught CvxpyException in backward call (count -> {n_exceptions}): skipping batch') + continue + if pairwise_mode or ( + idx == len(_train_dataloader.dataset) - 1) or grad_acc == 1 or grad_acc_count >= grad_acc: + optimizer.step() + optimizer.zero_grad() + if grad_acc > 1: + grad_acc_count = 0 + grad_acc_idx += 1 if verbose: logger.info(f"Loss = {loss.item()}") running_loss.append(loss.item()) wandb.log({f'train_loss{"_warmstart" if warmstart_mode else ""}': np.mean(running_loss)}) - if verbose: - logger.info(f"Epoch loss = {np.mean(running_loss)}") + logger.info(f"Epoch loss = {np.mean(running_loss)}") # Get model performance on dev (or 'train' for overfitting runs) with torch.no_grad(): model.eval() if overfit_batch_idx > -1: train_scores = eval_fn(model, train_dataloader, overfit_batch_idx=overfit_batch_idx, - tqdm_label='train', device=device, verbose=verbose) - if verbose: - logger.info(f"Epoch {i + 1}: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + - f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}") + tqdm_label='train', device=device, verbose=verbose, debug=debug, + _errors=_errors) + logger.info(f"Epoch {i + 1}: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + + f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}") wandb.log({f'train_{list(eval_metric_to_idx)[0]}': train_scores[0], f'train_{list(eval_metric_to_idx)[1]}': train_scores[1]}) if use_lr_scheduler: @@ -346,17 +445,16 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g elif hyp['lr_scheduler'] == 'step': scheduler.step() else: - dev_scores = eval_fn(model, val_dataloader, tqdm_label='dev', device=device, verbose=verbose) - if verbose: - logger.info(f"Epoch {i + 1}: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + - f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}") + dev_scores = eval_fn(model, val_dataloader, tqdm_label='dev', device=device, verbose=verbose, + debug=debug, _errors=_errors) + logger.info(f"Epoch {i + 1}: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + + f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}") wandb.log({f'dev_{list(eval_metric_to_idx)[0]}': dev_scores[0], f'dev_{list(eval_metric_to_idx)[1]}': dev_scores[1], f'train_epoch_loss': np.mean(running_loss)}) dev_opt_score = dev_scores[eval_metric_to_idx[dev_opt_metric]] if dev_opt_score > best_dev_score: - if verbose: - 
logger.info(f"New best dev {dev_opt_metric} score @ epoch{i+1}: {dev_opt_score}") + logger.info(f"New best dev {dev_opt_metric} score @ epoch{i+1}: {dev_opt_score}") best_epoch = i best_dev_score = dev_opt_score best_dev_scores = dev_scores @@ -367,18 +465,23 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g elif hyp['lr_scheduler'] == 'step': scheduler.step() model.train() - end_time = time.time() + # Save model + if save_model: + torch.save(best_dev_state_dict, os.path.join(run.dir, 'model_state_dict_best.pt')) + wandb.save('model_state_dict_best.pt') + logger.info(f"Saved best model on dev to {os.path.join(run.dir, 'model_state_dict_best.pt')}") + + # Evaluate the best dev model on test if overfit_batch_idx == -1: - # Evaluate best dev model on test model.load_state_dict(best_dev_state_dict) with torch.no_grad(): model.eval() - test_scores = eval_fn(model, test_dataloader, tqdm_label='test', device=device, verbose=verbose) - if verbose: - logger.info(f"Final: test_{list(eval_metric_to_idx)[0]}={test_scores[0]}, " + - f"test_{list(eval_metric_to_idx)[1]}={test_scores[1]}") + test_scores = eval_fn(model, test_dataloader, tqdm_label='test', device=device, verbose=verbose, + debug=debug, _errors=_errors) + logger.info(f"Final: test_{list(eval_metric_to_idx)[0]}={test_scores[0]}, " + + f"test_{list(eval_metric_to_idx)[1]}={test_scores[1]}") # Log final metrics wandb.log({'best_dev_epoch': best_epoch + 1, f'best_dev_{list(eval_metric_to_idx)[0]}': best_dev_scores[0], @@ -387,7 +490,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g f'best_test_{list(eval_metric_to_idx)[1]}': test_scores[1]}) if len(test_scores) == 3: log_cc_objective_values(scores=test_scores, split_name='best_test', log_prefix='Final', - verbose=verbose, logger=logger) + verbose=True, logger=logger) # For pairwise-mode: if pairwise_clustering_fns[0] is not None: clustering_threshold = None @@ -396,30 +499,27 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g clustering_fn=pairwise_clustering_fn, clustering_threshold=clustering_threshold, val_dataloader=val_dataloader_e2e, - tqdm_label='test clustering', device=device, verbose=verbose) + tqdm_label='test clustering', device=device, verbose=verbose, + debug=debug, _errors=_errors) if pairwise_clustering_fn.__class__ is HACInference: clustering_threshold = pairwise_clustering_fn.cut_threshold - if verbose: - logger.info(f"Final: test_{list(clustering_metrics)[0]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[0]}, " + - f"test_{list(clustering_metrics)[1]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[1]}") + logger.info(f"Final: test_{list(clustering_metrics)[0]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[0]}, " + + f"test_{list(clustering_metrics)[1]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[1]}") # Log final metrics wandb.log({f'best_test_{list(clustering_metrics)[0]}_{pairwise_clustering_fn_labels[i]}': clustering_scores[0], f'best_test_{list(clustering_metrics)[1]}_{pairwise_clustering_fn_labels[i]}': clustering_scores[1]}) if len(clustering_scores) == 3: log_cc_objective_values(scores=clustering_scores, split_name=f'best_test_{pairwise_clustering_fn_labels[i]}', - log_prefix='Final', verbose=verbose, logger=logger) + log_prefix='Final', verbose=True, logger=logger) run.summary["z_model_parameters"] = count_parameters(model) run.summary["z_run_time"] = round(end_time - start_time) run.summary["z_run_dir_path"] = run.dir - # Save 
models - if save_model: - torch.save(best_dev_state_dict, os.path.join(run.dir, 'model_state_dict_best.pt')) - wandb.save('model_state_dict_best.pt') - logger.info(f"Saved best model on dev to {os.path.join(run.dir, 'model_state_dict_best.pt')}") + if _errors is not None: + save_to_wandb_run({'errors': _errors}, 'errors.json', run.dir, logger) logger.info(f"Run directory: {run.dir}") logger.info("End of train() call") @@ -496,7 +596,10 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g verbose=not args['silent'], tags=args['wandb_tags'], save_model=args['save_model'], - skip_initial_eval=args['skip_initial_eval']), + skip_initial_eval=args['skip_initial_eval'], + debug=args['debug'], + track_errors=not args['no_error_tracking'], + local=args['local']), count=args['wandb_max_runs']) logger.info("End of sweep") @@ -524,8 +627,10 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g save_model=args['save_model'], load_model_from_wandb_run=args['load_model_from_wandb_run'], load_model_from_fpath=args['load_model_from_fpath'], - load_hyp_from_wandb_run=args['load_hyp_from_wandb_run'], eval_only_split=args['eval_only_split'], skip_initial_eval=args['skip_initial_eval'], - pairwise_eval_clustering=args['pairwise_eval_clustering']) + pairwise_eval_clustering=args['pairwise_eval_clustering'], + debug=args['debug'], + track_errors=not args['no_error_tracking'], + local=args['local']) logger.info("End of run") diff --git a/e2e_scripts/train_utils.py b/e2e_scripts/train_utils.py index 0cf34b7..efeea6e 100644 --- a/e2e_scripts/train_utils.py +++ b/e2e_scripts/train_utils.py @@ -1,16 +1,18 @@ """ Helper functions and constants for e2e_scripts/train.py """ - +import os +import json from collections import defaultdict from typing import Dict -from typing import Tuple +from typing import Tuple, Optional import math import pickle from torch.utils.data import DataLoader from s2and.consts import PREPROCESSED_DATA_DIR from s2and.data import S2BlocksDataset from s2and.eval import b3_precision_recall_fscore +from torch import Tensor import torch import numpy as np import wandb @@ -23,8 +25,8 @@ # Dataset "dataset": "pubmed", "dataset_random_seed": 1, - "subsample_sz_train": -1, - "subsample_sz_dev": -1, + "subsample_sz_train": 80, + "subsample_sz_dev": 100, # Run config "run_random_seed": 17, "pairwise_mode": False, @@ -45,23 +47,27 @@ "activation": "leaky_relu", "negative_slope": 0.01, "use_rounded_loss": True, + "use_sdp": True, + "e2e_loss": "frob", # e2e only: "frob", "bce" # Solver config "sdp_max_iters": 50000, - "sdp_eps": 1e-1, + "sdp_eps": 1e-3, + "sdp_scale": True, # Training config - "batch_size": 10000, # For pairwise_mode only - "lr": 1e-4, + "batch_size": 10000, # pairwise only; used by e2e if gradient_accumulation is true + "lr": 4e-3, "n_epochs": 5, "n_warmstart_epochs": 0, - "weighted_loss": True, # For pairwise_mode only; TODO: Think about implementing for e2e + "weighted_loss": False, "use_lr_scheduler": True, - "lr_scheduler": "plateau", # "step" + "lr_scheduler": "plateau", # "plateau", "step" "lr_factor": 0.4, "lr_min": 1e-6, "lr_scheduler_patience": 2, "lr_step_size": 2, "lr_gamma": 0.4, "weight_decay": 0.01, + "gradient_accumulation": False, # e2e only; accumulate over pairwise examples "dev_opt_metric": 'b3_f1', # e2e: {'b3_f1', 'vmeasure'}; pairwise: {'auroc', 'f1'} "overfit_batch_idx": -1 } @@ -133,6 +139,7 @@ def compute_b3_f1(true_cluster_ids, pred_cluster_ids): pred_cluster_dict[pred_cluster_ids[i]].append(i) return 
b3_precision_recall_fscore(true_cluster_dict, pred_cluster_dict) + def log_cc_objective_values(scores, split_name, log_prefix, verbose, logger, plot=False): frac, round = np.array(scores[2]['sdp']), np.array(scores[2]['round']) # Objective across blocks @@ -151,3 +158,27 @@ def log_cc_objective_values(scores, split_name, log_prefix, verbose, logger, plo f'{split_name}_obj_ratio': mean_approx_ratio}) # TODO: Implement plotting the approx. ratio v/s block sizes + + +def save_to_wandb_run(file, fname, fpath, logger): + with open(os.path.join(fpath, fname), 'w') as fh: + json.dump(file, fh) + wandb.save(fname) + logger.info(f"Saved {fname} to {os.path.join(fpath, fname)}") + + +class FrobeniusLoss: + def __init__(self, weight: Optional[Tensor] = None, reduction: str = 'original') -> None: + self.weight = weight + self.reduction = reduction + + def __call__(self, input: Tensor, target: Tensor) -> Tensor: + n = len(target) + normalization = 1. + if self.reduction == 'mean': + normalization = n * (n - 1) + elif self.reduction == 'original': # TODO: Probably want to not use this + normalization = 2 * n + if self.weight is None: + return torch.norm((target - input)) / normalization + return torch.norm(self.weight * (target - input)) / normalization diff --git a/run_sweep.sh b/run_sweep.sh index 6c8776f..b61c509 100644 --- a/run_sweep.sh +++ b/run_sweep.sh @@ -1,21 +1,23 @@ #!/bin/bash -e dataset=${1:-"pubmed"} -n_seeds=${2:-5} -model=${3:-"e2e"} # Used as prefix and to pick up the right sweep file -gpu_name=${4:-"gypsum-1080ti"} +n_seed_start=${2:-1} +n_seed_end=${3:-5} +model=${4:-"e2e"} # Used as prefix and to pick up the right sweep file +gpu_name=${5:-"gypsum-1080ti"} +sweep_prefix=${6:-""} -for ((i = 1; i <= ${n_seeds}; i++)); do +for ((i = ${n_seed_start}; i <= ${n_seed_end}; i++)); do JOB_DESC=${model}_${dataset}_sweep${i} && JOB_NAME=${JOB_DESC}_$(date +%s) && \ sbatch -J ${JOB_NAME} -e jobs/${JOB_NAME}.err -o jobs/${JOB_NAME}.log \ - --partition=${gpu_name} --gres=gpu:1 --mem=80G --time=12:00:00 \ + --partition=${gpu_name} --gres=gpu:1 --mem=100G --time=12:00:00 \ run_sbatch.sh e2e_scripts/train.py \ --dataset="${dataset}" \ --dataset_random_seed=${i} \ --pairwise_eval_clustering="both" \ --skip_initial_eval \ --silent \ - --wandb_sweep_name="${model}_${dataset}_${i}" \ + --wandb_sweep_name="${sweep_prefix}_${model}_${dataset}_${i}" \ --wandb_sweep_params="wandb_configs/sweeps/${model}.json" \ --wandb_tags="${model},${dataset},seed_${i}" echo " Logs: jobs/${JOB_NAME}.err" diff --git a/utils/parser.py b/utils/parser.py index e4211e6..3291481 100644 --- a/utils/parser.py +++ b/utils/parser.py @@ -134,3 +134,15 @@ def add_training_args(self): help="(only in --pairwise_mode) Whether to run clustering during --eval_only_split and final test eval. 
" + "Accepts 'cc' for correlation clustering, 'hac' for agglomerative clustering, and 'both' to run both.", ) + parser.add_argument( + "--debug", action="store_true", + help="Enable debugging mode, where train-eval flows do not quit on known errors in order to allow tracking", + ) + parser.add_argument( + "--no_error_tracking", action="store_true", + help="Disable error logging for SDP forward and backward passes", + ) + parser.add_argument( + "--local", action="store_true", + help="Run script with wandb disabled", + ) diff --git a/wandb_configs/sweeps/e2e-nosdp-warm.json b/wandb_configs/sweeps/e2e-nosdp-warm.json new file mode 100644 index 0000000..3846f0d --- /dev/null +++ b/wandb_configs/sweeps/e2e-nosdp-warm.json @@ -0,0 +1,16 @@ +{ + "n_epochs": {"value": 10}, + "lr": {"max": 2e-1, "min": 1e-5}, + "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, + "dev_opt_metric": {"value": "b3_f1"}, + "neumiss_depth": {"values": [10, 20]}, + "hidden_dim": {"values": [512, 1024]}, + "n_hidden_layers": {"values": [1, 2]}, + "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, + "lr_scheduler": {"value": "plateau"}, + "subsample_sz_train": {"value": 80}, + "subsample_sz_dev": {"value": 100}, + "activation": {"values": ["leaky_relu", "relu"]}, + "use_sdp": {"value": false}, + "n_warmstart_epochs": {"value": 2} +} diff --git a/wandb_configs/sweeps/e2e-nosdp.json b/wandb_configs/sweeps/e2e-nosdp.json new file mode 100644 index 0000000..4e02afe --- /dev/null +++ b/wandb_configs/sweeps/e2e-nosdp.json @@ -0,0 +1,15 @@ +{ + "n_epochs": {"value": 10}, + "lr": {"max": 2e-1, "min": 1e-5}, + "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, + "dev_opt_metric": {"value": "b3_f1"}, + "neumiss_depth": {"values": [10, 20]}, + "hidden_dim": {"values": [512, 1024]}, + "n_hidden_layers": {"values": [1, 2]}, + "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, + "lr_scheduler": {"value": "plateau"}, + "subsample_sz_train": {"value": 80}, + "subsample_sz_dev": {"value": 100}, + "activation": {"values": ["leaky_relu", "relu"]}, + "use_sdp": {"value": false} +} diff --git a/wandb_configs/sweeps/e2e.json b/wandb_configs/sweeps/e2e.json index d3db142..20991ba 100644 --- a/wandb_configs/sweeps/e2e.json +++ b/wandb_configs/sweeps/e2e.json @@ -7,7 +7,7 @@ "hidden_dim": {"values": [512, 1024]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, - "lr_scheduler": {"values": ["plateau", "step"]}, + "lr_scheduler": {"value": "plateau"}, "subsample_sz_train": {"value": 80}, "subsample_sz_dev": {"value": 100}, "activation": {"values": ["leaky_relu", "relu"]} diff --git a/wandb_configs/sweeps/frac-nosdp-warm.json b/wandb_configs/sweeps/frac-nosdp-warm.json new file mode 100644 index 0000000..75503ce --- /dev/null +++ b/wandb_configs/sweeps/frac-nosdp-warm.json @@ -0,0 +1,17 @@ +{ + "n_epochs": {"value": 10}, + "lr": {"max": 2e-1, "min": 1e-5}, + "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, + "dev_opt_metric": {"value": "b3_f1"}, + "neumiss_depth": {"values": [10, 20]}, + "hidden_dim": {"values": [512, 1024]}, + "n_hidden_layers": {"values": [1, 2]}, + "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, + "lr_scheduler": {"value": "plateau"}, + "subsample_sz_train": {"value": 80}, + "subsample_sz_dev": {"value": 100}, + "activation": {"values": ["leaky_relu", "relu"]}, + "use_rounded_loss": {"value": false}, + "use_sdp": {"value": false}, + "n_warmstart_epochs": {"value": 2} +} diff --git a/wandb_configs/sweeps/frac-nosdp.json b/wandb_configs/sweeps/frac-nosdp.json new file 
mode 100644 index 0000000..f27ee08 --- /dev/null +++ b/wandb_configs/sweeps/frac-nosdp.json @@ -0,0 +1,16 @@ +{ + "n_epochs": {"value": 10}, + "lr": {"max": 2e-1, "min": 1e-5}, + "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, + "dev_opt_metric": {"value": "b3_f1"}, + "neumiss_depth": {"values": [10, 20]}, + "hidden_dim": {"values": [512, 1024]}, + "n_hidden_layers": {"values": [1, 2]}, + "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, + "lr_scheduler": {"value": "plateau"}, + "subsample_sz_train": {"value": 80}, + "subsample_sz_dev": {"value": 100}, + "activation": {"values": ["leaky_relu", "relu"]}, + "use_rounded_loss": {"value": false}, + "use_sdp": {"value": false} +} diff --git a/wandb_configs/sweeps/frac.json b/wandb_configs/sweeps/frac.json index d91acd6..7eb6812 100644 --- a/wandb_configs/sweeps/frac.json +++ b/wandb_configs/sweeps/frac.json @@ -7,7 +7,7 @@ "hidden_dim": {"values": [512, 1024]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, - "lr_scheduler": {"values": ["plateau", "step"]}, + "lr_scheduler": {"value": "plateau"}, "subsample_sz_train": {"value": 80}, "subsample_sz_dev": {"value": 100}, "activation": {"values": ["leaky_relu", "relu"]}, diff --git a/wandb_configs/sweeps/mlp.json b/wandb_configs/sweeps/mlp.json index 3df8746..a5f49fc 100644 --- a/wandb_configs/sweeps/mlp.json +++ b/wandb_configs/sweeps/mlp.json @@ -8,6 +8,7 @@ "hidden_dim": {"values": [512, 1024]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, - "lr_scheduler": {"values": ["plateau", "step"]}, - "activation": {"values": ["leaky_relu", "relu"]} + "lr_scheduler": {"value": "plateau"}, + "activation": {"values": ["leaky_relu", "relu"]}, + "weighted_loss": {"value": true} } From f1a19aa0cd2a78f2c6c4a7dd6846cd2c32d42d24 Mon Sep 17 00:00:00 2001 From: Sriharsha-hatwar Date: Wed, 15 Mar 2023 19:31:14 +0000 Subject: [PATCH 03/17] point feature creation implementation --- e2e_scripts/preprocess_s2and_pointwise.py | 94 ++++++++++++++++++ s2and/data.py | 10 +- s2and/featurizer.py | 111 ++++++++++++++++++++++ 3 files changed, 214 insertions(+), 1 deletion(-) create mode 100644 e2e_scripts/preprocess_s2and_pointwise.py diff --git a/e2e_scripts/preprocess_s2and_pointwise.py b/e2e_scripts/preprocess_s2and_pointwise.py new file mode 100644 index 0000000..6aac36e --- /dev/null +++ b/e2e_scripts/preprocess_s2and_pointwise.py @@ -0,0 +1,94 @@ +""" +Run from command line: + python e2e_scripts/preprocess_s2and_data.py --data_home_dir="./data" --dataset_name="pubmed" +""" +import sys + +from typing import Union, Dict +from typing import Tuple + +from s2and.consts import PREPROCESSED_DATA_DIR +from s2and.featurizer import FeaturizationInfo, store_featurized_pickles, many_pairs_featurize, pointwise_featurize +from os.path import join +from s2and.data import ANDData +import pickle +import numpy as np +from scipy.sparse import csr_matrix, coo_matrix +from utils.parser import Parser + +from s2and.data import ANDData +import logging +from s2and.featurizer import FeaturizationInfo, featurize + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) +logger = logging.getLogger(__name__) + +def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed): + """ + Fetch pointwise feature for dataset and store in a pickle. 
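+    The pickle holds a dict whose 'mention_level_features' key maps to a sparse
+    CSR matrix of shape (num_signatures, num_features), as built below.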
+    """
+    processed_data = {}
+    parent_dir = f"{data_home_dir}/{dataset_name}"
+    """
+    AND_dataset = ANDData(
+        signatures=join(parent_dir, f"{dataset_name}_signatures.json"),
+        papers=join(parent_dir, f"{dataset_name}_papers.json"),
+        mode="inference",
+        clusters=join(parent_dir, f"{dataset_name}_clusters.json"),
+        block_type="s2",
+        train_pairs_size=100000,
+        val_pairs_size=10000,
+        test_pairs_size=10000,
+        name=dataset_name,
+        n_jobs=16,
+        random_seed=random_seed,
+    )
+    """
+    #print("This is for pickling dataset....")
+    #with open(f'preprocess_dataset_{dataset_name}.pkl', 'wb') as f:
+    #    pickle.dump(AND_dataset, f)
+
+    print("getting pickled dataset...")
+    with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f:
+        AND_dataset = pickle.load(f)
+    print("Loaded pickle dataset...")
+
+
+
+    point_features_row, point_features_col, point_features_data, num_feats, num_points = pointwise_featurize(AND_dataset,
+                                                                                                             n_jobs=16,
+                                                                                                             use_cache=False,
+                                                                                                             random_seed=random_seed)
+    logger.info('converting feature indices to csr_matrix')
+    point_features = coo_matrix(
+        (point_features_data, (point_features_row, point_features_col)),
+        shape=(num_points, num_feats)
+    ).tocsr()
+
+    print("Matrix creation done.")
+    processed_data['mention_level_features'] = point_features
+
+    logger.info('Dumping processed data')
+
+    with open(f'{dataset_name}_feature_processed.pkl', 'wb') as f:
+        pickle.dump(processed_data, f)
+
+if __name__ == '__main__':
+    # Creates the pickles that store the preprocessed data
+    # Read cmd line args
+
+    parser = Parser(add_preprocessing_args=True)
+    parser.add_preprocessing_args()
+
+    args = parser.parse_args()
+    print(args)
+
+    params = args.__dict__
+    data_home_dir = params["data_home_dir"]
+    dataset = params["dataset_name"]
+
+    seed = 1211  # Dummy value; not needed and can be removed entirely.
+    print("Preprocessing started for seed value", seed)
+    save_pickled_pointwise_features(data_home_dir, dataset, seed)
+    print("Matrix")
diff --git a/s2and/data.py b/s2and/data.py
index 9d75eb1..ca0f7f1 100644
--- a/s2and/data.py
+++ b/s2and/data.py
@@ -495,7 +495,15 @@ def __init__(
             self.preprocess_signatures(name_counts_loaded)
             logger.info("preprocessed signatures")
-
+    def force_signature_to_cluster_mapping(self):
+        if self.clusters is not None:
+            self.signature_to_cluster_id = {}
+            logger.info("making signature to cluster id")
+            for cluster_id, cluster_info in self.clusters.items():
+                for signature in cluster_info["signature_ids"]:
+                    self.signature_to_cluster_id[signature] = cluster_id
+            logger.info("made signature to cluster id")
+
     def get_signature_objects(self, signature_ids: Dict[str, List[str]]) -> Dict[str, List[Signature]]:
         """
         Returns a dict of blockId with a list of it's Signature objects, useful for qualitative analysis
diff --git a/s2and/featurizer.py b/s2and/featurizer.py
index f306980..743d7e9 100644
--- a/s2and/featurizer.py
+++ b/s2and/featurizer.py
@@ -8,6 +8,9 @@ import functools
 import logging
 from collections import Counter
+from collections.abc import Iterable
+
+from sklearn import preprocessing
 
 from tqdm import tqdm
 
@@ -825,6 +828,114 @@ def featurize(
     logger.info("featurized test")
     return train_features, val_features, test_features
 
+
+def pointwise_featurize(
+    dataset: ANDData,
+    n_jobs: int = 1,
+    use_cache: bool = False,
+    chunk_size: int = DEFAULT_CHUNK_SIZE,
+    nan_value: float = np.nan,
+    delete_training_data: bool = False,
+    random_seed: int = 1,
+):
+    """
+    Featurizes the input dataset into per-signature (pointwise) sparse feature indices.
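+    Each signature contributes one row and each distinct (feature, value) pair one
+    column of the sparse matrix that the caller assembles from the returned indices.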
+ + Parameters + ---------- + dataset: ANDData + the dataset containing the relevant data + n_jobs: int + the number of cpus to use + use_cache: bool + whether or not to use write to/read from the features cache + chunk_size: int + the chunk size for multiprocessing + nan_value: float + the value to replace nans with + delete_training_data: bool + Whether to delete some suspicious training examples + + Returns + ------- + Returns three items: + 1. Row indices of the sparse matrix containing the data + 2. Column indices of the sparse matrix containing the data + 3. The data to be filled in the given row and column combination. + """ + # Do you think OrderedSet and OrderedDict should be used here? + signature_feature_set = set() + signature_dict = {} + + # We dont need to iterate signature per block as we need to create for all the signatures irrespective of the block. + + for signature_key, values in dataset.signatures.items(): + per_signature_features = dataset.signatures[signature_key]._asdict() + signature_dict[signature_key] = [] + for feature_key, value in per_signature_features.items(): + index_key = None + if (value is None + or (isinstance(value, Iterable) and len(value) == 0)): + continue + try: + if np.isnan(value): + print('\n!!!! Found a NaN !!!!\n') + exit() + continue + except: + pass + + # Let us check the type of value for each signatures. + + if isinstance(value, str) or isinstance(value, int): + index_key = (feature_key, value) + signature_feature_set.add(str(index_key)) + signature_dict[signature_key].append(index_key) + elif isinstance(value, Counter): + for val in value.keys(): + index_key = (feature_key, val) + signature_feature_set.add(str(index_key)) + signature_dict[signature_key].append(index_key) + elif isinstance(value, Iterable): + for val in value: + index_key = (feature_key, val) + signature_feature_set.add(str(index_key)) + signature_dict[signature_key].append(index_key) + else: + print('\n!!!! Found another type !!!!\n') + embed() + exit() + logger.info('Label encoding the values') + # Label encoding code --- + + """ + { + "signature_id_one" : [(feat_key_1, val_1), (feat_key_2, val_2) ...], + "signature_id_two" : [(feat_key_1, val_1), (feat_key_3, val_3) ...]
+ + } + """ + le_signature_feature_set = preprocessing.LabelEncoder() + le_signature_feature_set.fit(list(signature_feature_set)) + + le_signature_dict = preprocessing.LabelEncoder() + le_signature_dict.fit(list(signature_dict.keys())) + + point_features_row, point_features_col, point_features_data = [], [], [] + num_points = len(signature_dict.keys()) + num_feats = len(signature_feature_set) + for key, values in signature_dict.items(): + encoded_key_val = le_signature_dict.transform([key])[0] + val_strings = [str(val) for val in values] + encoded_values_val = le_signature_feature_set.transform(val_strings) + for val in encoded_values_val : + point_features_row.append(encoded_key_val) + point_features_col.append(val) + point_features_data.append(1) + + return point_features_row, point_features_col, point_features_data, num_feats, num_points + + def store_featurized_pickles( dataset: ANDData, featurizer_info: FeaturizationInfo, From f1efefb1b292f34433980be81e34907e1077cada Mon Sep 17 00:00:00 2001 From: Sriharsha-hatwar Date: Wed, 15 Mar 2023 19:37:15 +0000 Subject: [PATCH 04/17] minor changes , added comments --- e2e_scripts/preprocess_s2and_pointwise.py | 14 +++++++------- s2and/featurizer.py | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/e2e_scripts/preprocess_s2and_pointwise.py b/e2e_scripts/preprocess_s2and_pointwise.py index 6aac36e..9035372 100644 --- a/e2e_scripts/preprocess_s2and_pointwise.py +++ b/e2e_scripts/preprocess_s2and_pointwise.py @@ -1,6 +1,6 @@ """ Run from command line: - python e2e_scripts/preprocess_s2and_data.py --data_home_dir="./data" --dataset_name="pubmed" + python e2e_scripts/preprocess_s2and_pointwise.py --data_home_dir="./data" --dataset_name="pubmed" """ import sys @@ -30,7 +30,7 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed): """ processed_data = {} parent_dir = f"{data_home_dir}/{dataset_name}" - """ + AND_dataset = ANDData( signatures=join(parent_dir, f"{dataset_name}_signatures.json"), papers=join(parent_dir, f"{dataset_name}_papers.json"), @@ -44,15 +44,15 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed): n_jobs=16, random_seed=random_seed, ) - """ + #print("This is for pickling dataset....") #with open(f'preprocess_dataset_{dataset_name}.pkl', 'wb') as f: # pickle.dump(AND_dataset, f) - print("getting pickled dataset...") - with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f: - AND_dataset = pickle.load(f) - print("Loaded pickle dataset...") + #print("getting pickled dataset...") + #with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f: + # AND_dataset = pickle.load(f) + #print("Loaded pickle dataset...") diff --git a/s2and/featurizer.py b/s2and/featurizer.py index 743d7e9..bf41ed8 100644 --- a/s2and/featurizer.py +++ b/s2and/featurizer.py @@ -864,7 +864,7 @@ def pointwise_featurize( 3. The data to be filled in the given row and column combination. """ # Do you think OrderedSet and OrderedDict should be used here? - signature_feature_set = set() + signature_feature_set = set() # The feature is stored a str and not tuple to facilitate label encoding. signature_dict = {} # We dont need to iterate signature per block as we need to create for all the signatures irrespective of the block. @@ -889,7 +889,7 @@ def pointwise_featurize( if isinstance(value, str) or isinstance(value, int): index_key = (feature_key, value) - signature_feature_set.add(str(index_key)) + signature_feature_set.add(str(index_key)) # Converting to str from tuple. 
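The featurizer above leans on sklearn's LabelEncoder to turn each stringified (feature_key, value) pair into a stable integer column id; classes are sorted before encoding, so the mapping is deterministic. A small sketch with made-up feature strings:

    from sklearn import preprocessing

    feats = ["('venue', 'ACL')", "('year', 1999)", "('year', 2001)"]
    le = preprocessing.LabelEncoder()
    le.fit(feats)
    # Classes sort lexicographically, so the 'venue' string maps to id 0 here
    assert list(le.transform(["('venue', 'ACL')", "('year', 2001)"])) == [0, 2]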
signature_dict[signature_key].append(index_key) elif isinstance(value, Counter): for val in value.keys(): From 548f151c4e059280881a4d7c620cc23ded9ec2da Mon Sep 17 00:00:00 2001 From: Dhruv Agarwal Date: Fri, 17 Mar 2023 02:09:13 -0400 Subject: [PATCH 05/17] Multiprocessing dev, block size stats, eval_all method (#40) --- e2e_pipeline/cc_inference.py | 7 +- e2e_pipeline/hac_inference.py | 6 +- e2e_pipeline/sdp_layer.py | 14 +- e2e_scripts/evaluate.py | 17 +- e2e_scripts/get_block_sizes.py | 116 +++++++++ e2e_scripts/preprocess_s2and_data.py | 4 +- e2e_scripts/train.py | 340 ++++++++++++++++++++------- e2e_scripts/train_utils.py | 25 +- run_sweep.sh | 2 +- utils/parser.py | 8 + 10 files changed, 433 insertions(+), 106 deletions(-) create mode 100644 e2e_scripts/get_block_sizes.py diff --git a/e2e_pipeline/cc_inference.py b/e2e_pipeline/cc_inference.py index e486e0c..75700c2 100644 --- a/e2e_pipeline/cc_inference.py +++ b/e2e_pipeline/cc_inference.py @@ -17,11 +17,12 @@ class CCInference(torch.nn.Module): Correlation clustering inference-only model. Expects edge weights and the number of nodes as input. """ - def __init__(self, sdp_max_iters, sdp_eps): + def __init__(self, sdp_max_iters, sdp_eps, sdp_scale, use_sdp): super().__init__() self.uncompress_layer = UncompressTransformLayer() - self.sdp_layer = SDPLayer(max_iters=sdp_max_iters, eps=sdp_eps) + self.sdp_layer = SDPLayer(max_iters=sdp_max_iters, eps=sdp_eps, scale_input=sdp_scale) self.hac_cut_layer = HACCutLayer() + self.use_sdp = use_sdp def forward(self, edge_weights, N, min_id=0, threshold=None, verbose=False): edge_weights = torch.squeeze(edge_weights) @@ -29,7 +30,7 @@ def forward(self, edge_weights, N, min_id=0, threshold=None, verbose=False): # threshold is used to convert a similarity score (in [0,1]) into edge weights (in R, i.e. 
+ and -) edge_weights = torch.sigmoid(edge_weights) - threshold edge_weights_uncompressed = self.uncompress_layer(edge_weights, N) - output_probs = self.sdp_layer(edge_weights_uncompressed, N) + output_probs = self.sdp_layer(edge_weights_uncompressed, N, use_sdp=self.use_sdp) pred_clustering = self.hac_cut_layer(output_probs, edge_weights_uncompressed) if verbose: diff --git a/e2e_pipeline/hac_inference.py b/e2e_pipeline/hac_inference.py index cf20ad5..30e1b9d 100644 --- a/e2e_pipeline/hac_inference.py +++ b/e2e_pipeline/hac_inference.py @@ -33,7 +33,6 @@ def tune_threshold(self, model, dataloader, device, n_trials=1000): all_gold = [] blockwise_trees = [] all_dists = [] - max_pred_id = -1 # In each iteration, add to all blockwise predicted IDs to distinguish from previous blocks n_features = dataloader.dataset[0][0].shape[1] for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Tuning threshold on dev')): data, _, cluster_ids = batch @@ -46,7 +45,8 @@ def tune_threshold(self, model, dataloader, device, n_trials=1000): # Forward pass through the e2e model data = data.to(device) - tree_and_alts, dists = self.cluster(model(data), block_size, return_tree=True) + edge_weights = model(data, N=len(cluster_ids), warmstart=True) + tree_and_alts, dists = self.cluster(edge_weights, block_size, return_tree=True) blockwise_trees.append(tree_and_alts) all_dists.append(dists) @@ -61,7 +61,7 @@ def tune_threshold(self, model, dataloader, device, n_trials=1000): best_dev_metric = -1 for _thresh in tqdm(thresholds, desc="Finding best cut threshold"): all_pred = [] - max_pred_id = -1 + max_pred_id = -1 # In each iter, add to all blockwise predicted IDs to distinguish from previous blocks for (_hac, _hac_alts) in blockwise_trees: _cut_labels = self.cut_tree(_hac, _hac_alts, _thresh) pred_cluster_ids = _cut_labels + (max_pred_id + 1) diff --git a/e2e_pipeline/sdp_layer.py b/e2e_pipeline/sdp_layer.py index 5a57def..bdc10c0 100644 --- a/e2e_pipeline/sdp_layer.py +++ b/e2e_pipeline/sdp_layer.py @@ -41,26 +41,26 @@ def build_and_solve_sdp(self, W_val, N, verbose=False): Returns a symmetric NxN matrix of fractional, decision values with a 1-diagonal """ # Initialize the cvxpy layer - self.X = cp.Variable((N, N), PSD=True) - self.W = cp.Parameter((N, N)) + X = cp.Variable((N, N), PSD=True) + W = cp.Parameter((N, N)) # build out constraint set constraints = [ - cp.diag(self.X) == np.ones((N,)), - self.X[:N, :] >= 0, + cp.diag(X) == np.ones((N,)), + X[:N, :] >= 0, ] # create problem - self.prob = cp.Problem(cp.Maximize(cp.trace(self.W @ self.X)), constraints) + prob = cp.Problem(cp.Maximize(cp.trace(W @ X)), constraints) # Note: maximizing the trace is equivalent to maximizing the sum_E (w_uv * X_uv) objective # because W is upper-triangular and X is symmetric # Build the SDP cvxpylayer - self.cvxpy_layer = CvxpyLayer(self.prob, parameters=[self.W], variables=[self.X]) + cvxpy_layer = CvxpyLayer(prob, parameters=[W], variables=[X]) # Forward pass through the SDP cvxpylayer try: - pw_prob_matrix = self.cvxpy_layer(W_val, solver_args={ + pw_prob_matrix = cvxpy_layer(W_val, solver_args={ "solve_method": "SCS", "verbose": verbose, "max_iters": self.max_iters, diff --git a/e2e_scripts/evaluate.py b/e2e_scripts/evaluate.py index 67105ae..b7ba424 100644 --- a/e2e_scripts/evaluate.py +++ b/e2e_scripts/evaluate.py @@ -4,6 +4,7 @@ import logging from tqdm import tqdm +from time import time from sklearn.metrics.cluster import v_measure_score from sklearn.metrics import roc_curve, auc from sklearn.metrics import 
precision_recall_fscore_support @@ -25,7 +26,7 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, clustering_threshold=None, val_dataloader=None, tqdm_label='', device=None, verbose=False, debug=False, _errors=None, - run_dir='./'): + run_dir='./', tqdm_position=None): """ clustering_fn, clustering_threshold, val_dataloader: unused when pairwise_mode is False (only added to keep fn signature identical) @@ -42,7 +43,7 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, cluste } max_pred_id = -1 n_exceptions = 0 - for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}')): + for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}', position=tqdm_position)): if overfit_batch_idx > -1: if idx < overfit_batch_idx: continue @@ -63,6 +64,7 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, cluste except CvxpyException as e: logger.info(e) _error_obj = { + 'id': f'e_{int(time())}', 'method': 'eval', 'model_type': 'e2e', 'data_split': tqdm_label, @@ -96,7 +98,8 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, cluste def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", return_pred_only=False, thresh_for_f1=0.5, clustering_fn=None, clustering_threshold=None, val_dataloader=None, - tqdm_label='', device=None, verbose=False, debug=False, _errors=None, run_dir='./'): + tqdm_label='', device=None, verbose=False, debug=False, _errors=None, run_dir='./', + tqdm_position=None): device = device if device is not None else torch.device("cuda" if torch.cuda.is_available() else "cpu") n_features = dataloader.dataset[0][0].shape[1] @@ -114,7 +117,7 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret } max_pred_id = -1 # In each iteration, add to all blockwise predicted IDs to distinguish from previous blocks n_exceptions = 0 - for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}')): + for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}', position=tqdm_position)): if overfit_batch_idx > -1: if idx < overfit_batch_idx: continue @@ -131,11 +134,13 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret # Forward pass through the e2e model data = data.to(device) try: - pred_cluster_ids = clustering_fn(model(data, verbose=verbose), block_size, min_id=(max_pred_id + 1), + edge_weights = model(data, N=block_size, warmstart=True, verbose=verbose) + pred_cluster_ids = clustering_fn(edge_weights, block_size, min_id=(max_pred_id + 1), threshold=clustering_threshold) except CvxpyException as e: logger.info(e) _error_obj = { + 'id': f'e_{int(time())}', 'method': 'eval', 'model_type': 'pairwise_cc', 'data_split': tqdm_label, @@ -167,7 +172,7 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret return (b3_f1, vmeasure, cc_obj_vals) if clustering_fn.__class__ is CCInference else (b3_f1, vmeasure) y_pred, targets = [], [] - for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}')): + for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}', position=tqdm_position)): if overfit_batch_idx > -1: if idx < overfit_batch_idx: continue diff --git a/e2e_scripts/get_block_sizes.py b/e2e_scripts/get_block_sizes.py new file mode 100644 index 0000000..89dfd87 --- /dev/null +++ b/e2e_scripts/get_block_sizes.py @@ -0,0 +1,116 @@ +import argparse +import glob +import json +import logging 
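The tqdm_position argument threaded through the evaluation loops above pins each progress bar to its own console row, so concurrent training and eval bars do not overwrite each other. A minimal standalone illustration of the pattern (not the repo's code):

    from time import sleep
    from tqdm import tqdm

    for epoch in tqdm(range(2), desc="Training", position=1):
        # The inner bar stays on row 0 and is cleared after each pass
        for _ in tqdm(range(5), desc=f"Eval dev {epoch}", position=0, leave=False):
            sleep(0.01)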
+import os +import numpy as np +import pickle +from time import time +from tqdm import tqdm + +from IPython import embed + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) +logger = logging.getLogger(__name__) + + +class NpEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, np.integer): + return int(obj) + if isinstance(obj, np.floating): + return float(obj) + if isinstance(obj, np.ndarray): + return obj.tolist() + return super(NpEncoder, self).default(obj) + + +class Parser(argparse.ArgumentParser): + def __init__(self): + super().__init__() + self.add_argument( + "--src", type=str + ) + self.add_argument( + "--unique", action="store_true", + ) + self.add_argument( + "--silent", action="store_true", + ) + self.add_argument( + "--interactive", action="store_true", + ) + + +if __name__ == '__main__': + parser = Parser() + args = parser.parse_args() + logger.info("Script arguments:") + logger.info(args.__dict__) + + root = "data/preprocessed_data" + save_fpath = f'./data_block_sizes{"_" + int(time()) if args.unique else ""}.json' + ignore = ['pubmed_OLD'] + n_seeds = 5 + splits = ['train', 'val', 'test'] + + result = {} + + for dataset_path in tqdm(glob.glob(os.path.join(root, "*")), disable=args.silent): + dataset = dataset_path.split('/')[-1] + if dataset in ignore: + continue + result[dataset] = {} + _seen_blk_across = set() + for seed in range(1, n_seeds+1): + result[dataset][seed] = {} + _seen_blk = set() + _full_bkl_sizes = [] + for split in splits: + _blk_szs = [] + fpath = os.path.join(dataset_path, f'seed{seed}', f'{split}_features.pkl') + with open(fpath, 'rb') as fh: + block_dict = pickle.load(fh) + for k in block_dict.keys(): + assert k not in _seen_blk + _seen_blk.add(k) + _, _, cluster_ids = block_dict[k] + _blk_szs.append(len(cluster_ids)) + result[dataset][seed][split] = { + 'n_blocks': len(_blk_szs), + 'min': np.min(_blk_szs), + 'max': np.max(_blk_szs), + 'mean': np.mean(_blk_szs), + 'median': np.median(_blk_szs) + } + _full_bkl_sizes += _blk_szs + result[dataset][seed]['full'] = { + 'n_blocks': len(_full_bkl_sizes), + 'min': np.min(_full_bkl_sizes), + 'max': np.max(_full_bkl_sizes), + 'mean': np.mean(_full_bkl_sizes), + 'median': np.median(_full_bkl_sizes) + } + _seen_blk_across = _seen_blk_across.union(_seen_blk) + result[dataset]['mean_across_seeds'] = { + 'n_blocks': np.mean([result[dataset][seed]['full']['n_blocks'] for seed in range(1, n_seeds + 1)]), + 'min': np.mean([result[dataset][seed]['full']['min'] for seed in range(1, n_seeds + 1)]), + 'max': np.mean([result[dataset][seed]['full']['max'] for seed in range(1, n_seeds + 1)]), + 'mean': np.mean([result[dataset][seed]['full']['mean'] for seed in range(1, n_seeds + 1)]), + 'median': np.mean([result[dataset][seed]['full']['median'] for seed in range(1, n_seeds + 1)]) + } + result[dataset]['n_blocks'] = len(_seen_blk_across) + + logger.info(f'Dataset: {dataset}') + logger.info(f' Blocks covered: {result[dataset]["n_blocks"]}') + logger.info(f' Across seed stats (mean):') + for k, v in result[dataset]['mean_across_seeds'].items(): + logger.info(f' {k}: {v}') + + with open(save_fpath, 'w') as fh: + json.dump(result, fh, cls=NpEncoder) + logger.info(f'Saved results to {save_fpath}') + + if args.interactive: + embed() diff --git a/e2e_scripts/preprocess_s2and_data.py b/e2e_scripts/preprocess_s2and_data.py index 1322586..cb46521 100644 --- a/e2e_scripts/preprocess_s2and_data.py +++ 
b/e2e_scripts/preprocess_s2and_data.py @@ -38,10 +38,10 @@ def save_blockwise_featurized_data(dataset_name, random_seed): n_jobs=16, random_seed=random_seed, ) - + logger.info("Loaded ANDData object") # Load the featurizer, which calculates pairwise similarity scores featurization_info = FeaturizationInfo() - # the cache will make it faster to train multiple times - it stores the features on disk for you + logger.info("Loaded featurization info") train_pkl, val_pkl, test_pkl = store_featurized_pickles(AND_dataset, featurization_info, n_jobs=16, diff --git a/e2e_scripts/train.py b/e2e_scripts/train.py index 1148587..23b36d8 100644 --- a/e2e_scripts/train.py +++ b/e2e_scripts/train.py @@ -1,5 +1,7 @@ +import glob import json import os +import sys import time import logging import random @@ -18,9 +20,17 @@ from e2e_pipeline.sdp_layer import CvxpyException from e2e_scripts.evaluate import evaluate, evaluate_pairwise from e2e_scripts.train_utils import DEFAULT_HYPERPARAMS, get_dataloaders, get_matrix_size_from_triu, \ - uncompress_target_tensor, count_parameters, log_cc_objective_values, save_to_wandb_run, FrobeniusLoss + uncompress_target_tensor, count_parameters, log_cc_objective_values, save_to_wandb_run, FrobeniusLoss, \ + copy_and_load_model from utils.parser import Parser +from torch.multiprocessing import Process, set_start_method, Manager + +try: + set_start_method('spawn', force=True) +except RuntimeError: + pass + from IPython import embed @@ -29,10 +39,105 @@ logger = logging.getLogger(__name__) +def _check_process(_proc, _return_dict, logger, run, overfit_batch_idx, use_lr_scheduler, hyp, + scheduler, eval_metric_to_idx, dev_opt_metric, i, best_epoch, best_dev_score, + best_dev_scores, best_dev_state_dict, sync=False): + if _proc is not None: + if _return_dict['_state'] == 'done' or (sync and _return_dict['_state'] == 'start'): + _proc.join() + _return_dict['_state'] = 'finish' + if _return_dict['_method'] == 'init_eval': + logger.info(_return_dict['local']) + run.log(_return_dict['wandb']) + elif _return_dict['_method'] == 'dev_eval': + logger.info(_return_dict['local']) + run.log(_return_dict['wandb']) + if overfit_batch_idx > -1: + if use_lr_scheduler: + if hyp['lr_scheduler'] == 'plateau': + scheduler.step(_return_dict['train_scores'][eval_metric_to_idx[dev_opt_metric]]) + elif hyp['lr_scheduler'] == 'step': + scheduler.step() + else: + dev_scores = _return_dict['dev_scores'] + dev_opt_score = dev_scores[eval_metric_to_idx[dev_opt_metric]] + if dev_opt_score > best_dev_score: + logger.info(f"New best dev {dev_opt_metric} score @ epoch{i+1}: {dev_opt_score}") + best_epoch = i + best_dev_score = dev_opt_score + best_dev_scores = dev_scores + best_dev_state_dict = torch.load(_return_dict['state_dict_path'], device) + if use_lr_scheduler: + if hyp['lr_scheduler'] == 'plateau': + scheduler.step(dev_scores[eval_metric_to_idx[dev_opt_metric]]) + elif hyp['lr_scheduler'] == 'step': + scheduler.step() + return best_epoch, best_dev_score, best_dev_scores, best_dev_state_dict + + +def init_eval(model_class, model_args, state_dict_path, overfit_batch_idx, eval_fn, train_dataloader, device, verbose, + debug, _errors, eval_metric_to_idx, val_dataloader, return_dict): + return_dict['_state'] = 'start' + return_dict['_method'] = 'init_eval' + model = model_class(*model_args) + model.load_state_dict(torch.load(state_dict_path)) + model.to(device) + with torch.no_grad(): + model.eval() + if overfit_batch_idx > -1: + train_scores = eval_fn(model, train_dataloader, 
overfit_batch_idx=overfit_batch_idx, + tqdm_label='train', device=device, verbose=verbose, debug=debug, + _errors=_errors, tqdm_position=0) + return_dict['local'] = f"Initial: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + \ + f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}" + return_dict['wandb'] = {'epoch': 0, f'train_{list(eval_metric_to_idx)[0]}': train_scores[0], + f'train_{list(eval_metric_to_idx)[1]}': train_scores[1]} + else: + dev_scores = eval_fn(model, val_dataloader, tqdm_label='dev 0', device=device, verbose=verbose, + debug=debug, _errors=_errors, tqdm_position=0) + return_dict['local'] = f"Initial: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + \ + f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}" + return_dict['wandb'] = {'epoch': 0, f'dev_{list(eval_metric_to_idx)[0]}': dev_scores[0], + f'dev_{list(eval_metric_to_idx)[1]}': dev_scores[1]} + del model + return_dict['_state'] = 'done' + + +def dev_eval(model_class, model_args, state_dict_path, overfit_batch_idx, eval_fn, train_dataloader, device, verbose, + debug, _errors, eval_metric_to_idx, val_dataloader, return_dict, i, run_dir): + return_dict['_state'] = 'start' + return_dict['_method'] = 'dev_eval' + return_dict['state_dict_path'] = state_dict_path + model = model_class(*model_args) + model.load_state_dict(torch.load(state_dict_path)) + model.to(device) + with torch.no_grad(): + model.eval() + if overfit_batch_idx > -1: + train_scores = eval_fn(model, train_dataloader, overfit_batch_idx=overfit_batch_idx, + tqdm_label='train', device=device, verbose=verbose, debug=debug, + _errors=_errors) + return_dict['local'] = f"Epoch {i + 1}: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + \ + f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}" + return_dict['wandb'] = {f'train_{list(eval_metric_to_idx)[0]}': train_scores[0], + f'train_{list(eval_metric_to_idx)[1]}': train_scores[1]} + return_dict['train_scores'] = train_scores + else: + dev_scores = eval_fn(model, val_dataloader, tqdm_label=f'dev {i+1}', device=device, verbose=verbose, + debug=debug, _errors=_errors) + return_dict['local'] = f"Epoch {i + 1}: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + \ + f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}" + return_dict['wandb'] = {f'dev_{list(eval_metric_to_idx)[0]}': dev_scores[0], + f'dev_{list(eval_metric_to_idx)[1]}': dev_scores[1]} + return_dict['dev_scores'] = dev_scores + del model + return_dict['_state'] = 'done' + + def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, group=None, save_model=False, load_model_from_wandb_run=None, load_model_from_fpath=None, - eval_only_split=None, skip_initial_eval=False, pairwise_eval_clustering=None, - debug=False, track_errors=True, local=False): + eval_only_split=None, eval_all=False, skip_initial_eval=False, pairwise_eval_clustering=None, + debug=False, track_errors=True, local=False, sync_dev=False): init_args = { 'config': DEFAULT_HYPERPARAMS } @@ -48,6 +153,11 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g if local: init_args.update({'mode': 'disabled'}) + # Parallel process for validation runs + _proc = None + _return_dict = Manager().dict() + _return_dict['_state'] = 'initial' + # Start wandb run with wandb.init(**init_args) as run: wandb.config.update(hyperparams, allow_val_change=True) @@ -55,7 +165,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g logger.info("Run hyperparameters:") logger.info(hyp) # Save 
hyperparameters as a json file and store in wandb run - save_to_wandb_run(dict(hyp), 'hyperparameters.json', run.dir, logger) + save_to_wandb_run(dict(hyp), 'hyperparameters.json', run.dir, logger, error_logger=False) # Track errors _errors = [] if track_errors else None @@ -68,6 +178,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g pairwise_mode = hyp['pairwise_mode'] weighted_loss = hyp['weighted_loss'] + use_rounded_loss = hyp["use_rounded_loss"] e2e_loss = hyp['e2e_loss'] batch_size = hyp['batch_size'] if pairwise_mode else 1 # Force clustering runs to operate on 1 block only n_epochs = hyp['n_epochs'] @@ -84,6 +195,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g neumiss_depth = hyp["neumiss_depth"] add_neumiss = not hyp['convert_nan'] negative_slope = hyp["negative_slope"] + use_sdp = hyp["use_sdp"] sdp_max_iters = hyp["sdp_max_iters"] sdp_eps = hyp["sdp_eps"] sdp_scale = hyp["sdp_scale"] @@ -105,11 +217,11 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g # Create model with hyperparams if not pairwise_mode: - model = EntResModel(n_features, neumiss_depth, dropout_p, dropout_only_once, add_neumiss, - neumiss_deq, hidden_dim, n_hidden_layers, add_batchnorm, activation, - negative_slope, hidden_config, sdp_max_iters, sdp_eps, sdp_scale, - use_rounded_loss=hyp["use_rounded_loss"], return_triu_on_train=(e2e_loss == "bce"), - use_sdp=hyp["use_sdp"]) + model_args = (n_features, neumiss_depth, dropout_p, dropout_only_once, add_neumiss, + neumiss_deq, hidden_dim, n_hidden_layers, add_batchnorm, activation, + negative_slope, hidden_config, sdp_max_iters, sdp_eps, sdp_scale, + use_rounded_loss, (e2e_loss == "bce"), use_sdp) + model = EntResModel(*model_args) # Define loss if e2e_loss not in ["frob", "bce"]: raise ValueError("Invalid value for e2e_loss") @@ -141,9 +253,10 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g # Define loss loss_fn_pairwise = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight)) else: - model = PairwiseModel(n_features, neumiss_depth, dropout_p, dropout_only_once, add_neumiss, - neumiss_deq, hidden_dim, n_hidden_layers, add_batchnorm, activation, - negative_slope, hidden_config) + model_args = (n_features, neumiss_depth, dropout_p, dropout_only_once, add_neumiss, + neumiss_deq, hidden_dim, n_hidden_layers, add_batchnorm, activation, + negative_slope, hidden_config) + model = PairwiseModel(*model_args) # Define loss pos_weight = None if weighted_loss: @@ -161,14 +274,14 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g pairwise_clustering_fns = [None] if pairwise_eval_clustering is not None: if pairwise_eval_clustering == 'cc': - pairwise_clustering_fns = [CCInference(sdp_max_iters, sdp_eps, sdp_scale)] + pairwise_clustering_fns = [CCInference(sdp_max_iters, sdp_eps, sdp_scale, use_sdp)] pairwise_clustering_fns[0].eval() pairwise_clustering_fn_labels = ['cc'] elif pairwise_eval_clustering == 'hac': pairwise_clustering_fns = [HACInference()] pairwise_clustering_fn_labels = ['hac'] elif pairwise_eval_clustering == 'both': - cc_inference = CCInference(sdp_max_iters, sdp_eps, sdp_scale) + cc_inference = CCInference(sdp_max_iters, sdp_eps, sdp_scale, use_sdp) pairwise_clustering_fns = [cc_inference, HACInference(), cc_inference] pairwise_clustering_fns[0].eval() pairwise_clustering_fn_labels = ['cc', 'hac', 'cc-fixed'] @@ -181,7 +294,7 @@ def train(hyperparams={}, verbose=False, 
project=None, entity=None, tags=None, g hyp["normalize_data"], hyp["subsample_sz_train"], hyp["subsample_sz_dev"], - False, 1) + pairwise_mode=False, batch_size=1) logger.info(f"Model loaded: {model}", ) # Load stored model, if available @@ -198,8 +311,55 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g logger.info(f'Loaded stored model.') model.to(device) - if eval_only_split is not None: - # Run inference and exit + if eval_all: + # Run all inference variants on the test set and exit + cc_inference_sdp = CCInference(sdp_max_iters, sdp_eps, sdp_scale, use_sdp=True) + cc_inference_nosdp = CCInference(sdp_max_iters, sdp_eps, sdp_scale, use_sdp=False) + inference_fns = [HACInference(), + cc_inference_sdp, cc_inference_sdp, + cc_inference_nosdp, cc_inference_nosdp] + inference_fn_labels = ['hac', + 'cc', 'cc-fixed', + 'cc-nosdp', 'cc-nosdp-fixed'] + cc_inference_sdp.eval() + cc_inference_nosdp.eval() + _, val_dataloader_e2e, test_dataloader_e2e = get_dataloaders(hyp["dataset"], + hyp["dataset_random_seed"], + hyp["convert_nan"], + hyp["nan_value"], + hyp["normalize_data"], + hyp["subsample_sz_train"], + hyp["subsample_sz_dev"], + pairwise_mode=False, batch_size=1) + start_time = time.time() + with torch.no_grad(): + model.eval() + clustering_threshold = None + for i, inference_fn in enumerate(inference_fns): + logger.info(f'Inference method: {inference_fn_labels[i]}') + clustering_scores = evaluate_pairwise(model, test_dataloader_e2e, + clustering_fn=inference_fn, + clustering_threshold=clustering_threshold if i % 2 == 0 else None, + val_dataloader=val_dataloader_e2e, + tqdm_label='test clustering', device=device, verbose=verbose, + debug=debug, _errors=_errors) + if inference_fn.__class__ is HACInference: + clustering_threshold = inference_fn.cut_threshold + logger.info( + f"Eval: test_{list(clustering_metrics)[0]}_{inference_fn_labels[i]}={clustering_scores[0]}, " + + f"test_{list(clustering_metrics)[1]}_{inference_fn_labels[i]}={clustering_scores[1]}") + # Log eval metrics + wandb.log({f'best_test_{list(clustering_metrics)[0]}_{inference_fn_labels[i]}': + clustering_scores[0], + f'best_test_{list(clustering_metrics)[1]}_{inference_fn_labels[i]}': + clustering_scores[1]}) + if len(clustering_scores) == 3: + log_cc_objective_values(scores=clustering_scores, + split_name=f'best_test_{inference_fn_labels[i]}', + log_prefix='Eval', verbose=verbose, logger=logger) + end_time = time.time() + elif eval_only_split is not None: + # Run inference on the specified split and exit dataloaders = { 'train': train_dataloader, 'dev': val_dataloader, @@ -247,7 +407,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g log_prefix='Eval', verbose=verbose, logger=logger) end_time = time.time() else: - # Training + # Train and evaluate wandb.watch(model) optimizer = torch.optim.AdamW(model.parameters(), lr=hyp['lr']) @@ -263,31 +423,22 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=hyp['lr_step_size'], gamma=hyp['lr_gamma'], verbose=True) - best_dev_state_dict = None + best_dev_state_dict = copy.deepcopy(model.state_dict()) best_dev_score = -1 # Stores the score of only the specified optimization metric - best_dev_scores = None # Contains scores of all metrics + best_dev_scores = () # Contains scores of all metrics best_epoch = 0 if not skip_initial_eval: # Get initial model performance on dev (or 'train' for overfitting runs) - with 
torch.no_grad(): - model.eval() - if overfit_batch_idx > -1: - train_scores = eval_fn(model, train_dataloader, overfit_batch_idx=overfit_batch_idx, - tqdm_label='train', device=device, verbose=verbose, debug=debug, - _errors=_errors) - logger.info(f"Initial: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + - f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}") - wandb.log({'epoch': 0, f'train_{list(eval_metric_to_idx)[0]}': train_scores[0], - f'train_{list(eval_metric_to_idx)[1]}': train_scores[1]}) - else: - dev_scores = eval_fn(model, val_dataloader, tqdm_label='dev', device=device, verbose=verbose, - debug=debug, _errors=_errors) - logger.info(f"Initial: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + - f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}") - wandb.log({'epoch': 0, f'dev_{list(eval_metric_to_idx)[0]}': dev_scores[0], - f'dev_{list(eval_metric_to_idx)[1]}': dev_scores[1]}) - + _state_dict_path = copy_and_load_model(model, run.dir, device, store_only=True) + _proc = Process(target=init_eval, + kwargs=dict(model_class=model.__class__, model_args=model_args, + state_dict_path=_state_dict_path, + overfit_batch_idx=overfit_batch_idx, eval_fn=eval_fn, + train_dataloader=train_dataloader, device=device, verbose=verbose, + debug=debug, _errors=_errors, eval_metric_to_idx=eval_metric_to_idx, + val_dataloader=val_dataloader, return_dict=_return_dict)) + _proc.start() if not pairwise_mode and grad_acc > 1: grad_acc_steps = [] _seen_pw = 0 @@ -323,7 +474,21 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g optimizer.zero_grad() for (idx, batch) in enumerate(tqdm(_train_dataloader, - desc=f"{'Warm-starting' if warmstart_mode else 'Training'} {i + 1}")): + desc=f"{'Warm-starting' if warmstart_mode else 'Training'} {i + 1}", + position=1)): + best_epoch, best_dev_score, best_dev_scores, best_dev_state_dict = _check_process(_proc, + _return_dict, + logger, run, + overfit_batch_idx, + use_lr_scheduler, + hyp, scheduler, + eval_metric_to_idx, + dev_opt_metric, + i - 1, best_epoch, + best_dev_score, + best_dev_scores, + best_dev_state_dict, + sync=sync_dev) if overfit_batch_idx > -1: if idx < overfit_batch_idx: continue @@ -354,6 +519,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g except CvxpyException as e: logger.info(e) _error_obj = { + 'id': f'tf_{int(time.time())}', 'method': 'train_forward', 'model_type': 'e2e' if not pairwise_mode else 'pairwise', 'data_split': 'train', @@ -397,6 +563,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g logger.info(e) if isinstance(e, CvxpyException): _error_obj = { + 'id': f'tb_{int(time.time())}', 'method': 'train_backward', 'model_type': 'e2e' if not pairwise_mode else 'pairwise', 'data_split': 'train', @@ -426,47 +593,47 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g running_loss.append(loss.item()) wandb.log({f'train_loss{"_warmstart" if warmstart_mode else ""}': np.mean(running_loss)}) + # Sync to get previous epoch's dev eval + best_epoch, best_dev_score, best_dev_scores, best_dev_state_dict = _check_process(_proc, _return_dict, + logger, run, + overfit_batch_idx, + use_lr_scheduler, + hyp, scheduler, + eval_metric_to_idx, + dev_opt_metric, i - 1, + best_epoch, + best_dev_score, + best_dev_scores, + best_dev_state_dict, + sync=True) + logger.info(f"Epoch loss = {np.mean(running_loss)}") + wandb.log({f'train_epoch_loss': np.mean(running_loss)}) # Get model performance on dev (or 
'train' for overfitting runs) - with torch.no_grad(): - model.eval() - if overfit_batch_idx > -1: - train_scores = eval_fn(model, train_dataloader, overfit_batch_idx=overfit_batch_idx, - tqdm_label='train', device=device, verbose=verbose, debug=debug, - _errors=_errors) - logger.info(f"Epoch {i + 1}: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + - f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}") - wandb.log({f'train_{list(eval_metric_to_idx)[0]}': train_scores[0], - f'train_{list(eval_metric_to_idx)[1]}': train_scores[1]}) - if use_lr_scheduler: - if hyp['lr_scheduler'] == 'plateau': - scheduler.step(train_scores[eval_metric_to_idx[dev_opt_metric]]) - elif hyp['lr_scheduler'] == 'step': - scheduler.step() - else: - dev_scores = eval_fn(model, val_dataloader, tqdm_label='dev', device=device, verbose=verbose, - debug=debug, _errors=_errors) - logger.info(f"Epoch {i + 1}: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + - f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}") - wandb.log({f'dev_{list(eval_metric_to_idx)[0]}': dev_scores[0], - f'dev_{list(eval_metric_to_idx)[1]}': dev_scores[1], - f'train_epoch_loss': np.mean(running_loss)}) - dev_opt_score = dev_scores[eval_metric_to_idx[dev_opt_metric]] - if dev_opt_score > best_dev_score: - logger.info(f"New best dev {dev_opt_metric} score @ epoch{i+1}: {dev_opt_score}") - best_epoch = i - best_dev_score = dev_opt_score - best_dev_scores = dev_scores - best_dev_state_dict = copy.deepcopy(model.state_dict()) - if use_lr_scheduler: - if hyp['lr_scheduler'] == 'plateau': - scheduler.step(dev_scores[eval_metric_to_idx[dev_opt_metric]]) - elif hyp['lr_scheduler'] == 'step': - scheduler.step() - model.train() + _state_dict_path = copy_and_load_model(model, run.dir, device, store_only=True) + _proc = Process(target=dev_eval, + kwargs=dict(model_class=model.__class__, model_args=model_args, + state_dict_path=_state_dict_path, overfit_batch_idx=overfit_batch_idx, + eval_fn=eval_fn, train_dataloader=train_dataloader, device=device, + verbose=verbose, debug=debug, _errors=_errors, + eval_metric_to_idx=eval_metric_to_idx, val_dataloader=val_dataloader, + return_dict=_return_dict, i=i, run_dir=run.dir)) + _proc.start() end_time = time.time() + best_epoch, best_dev_score, best_dev_scores, best_dev_state_dict = _check_process(_proc, _return_dict, + logger, run, + overfit_batch_idx, + use_lr_scheduler, + hyp, scheduler, + eval_metric_to_idx, + dev_opt_metric, i, + best_epoch, + best_dev_score, + best_dev_scores, + best_dev_state_dict, + sync=True) # Save model if save_model: torch.save(best_dev_state_dict, os.path.join(run.dir, 'model_state_dict_best.pt')) @@ -479,7 +646,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g with torch.no_grad(): model.eval() test_scores = eval_fn(model, test_dataloader, tqdm_label='test', device=device, verbose=verbose, - debug=debug, _errors=_errors) + debug=debug, _errors=_errors, tqdm_position=2) logger.info(f"Final: test_{list(eval_metric_to_idx)[0]}={test_scores[0]}, " + f"test_{list(eval_metric_to_idx)[1]}={test_scores[1]}") # Log final metrics @@ -500,7 +667,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g clustering_threshold=clustering_threshold, val_dataloader=val_dataloader_e2e, tqdm_label='test clustering', device=device, verbose=verbose, - debug=debug, _errors=_errors) + debug=debug, _errors=_errors, tqdm_position=2) if pairwise_clustering_fn.__class__ is HACInference: clustering_threshold = 
pairwise_clustering_fn.cut_threshold logger.info(f"Final: test_{list(clustering_metrics)[0]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[0]}, " + @@ -513,14 +680,17 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g split_name=f'best_test_{pairwise_clustering_fn_labels[i]}', log_prefix='Final', verbose=True, logger=logger) - run.summary["z_model_parameters"] = count_parameters(model) run.summary["z_run_time"] = round(end_time - start_time) run.summary["z_run_dir_path"] = run.dir if _errors is not None: - save_to_wandb_run({'errors': _errors}, 'errors.json', run.dir, logger) - + _all_errors = save_to_wandb_run({'errors': _errors}, 'errors.json', run.dir, logger) + if len(_all_errors['errors']) > 0: + logger.warning(f'Errors were encountered during the run. LOGS: {os.path.join(run.dir, "errors.json")}') + # Cleanup + for filename in glob.glob(os.path.join(run.dir, "_temp_state_dict*")): + os.remove(filename) logger.info(f"Run directory: {run.dir}") logger.info("End of train() call") @@ -579,7 +749,8 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g sweep_config.update({ 'early_terminate': { 'type': 'hyperband', - 'min_iter': 5 + 'min_iter': 4, + 'eta': 2 } }) @@ -599,7 +770,8 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g skip_initial_eval=args['skip_initial_eval'], debug=args['debug'], track_errors=not args['no_error_tracking'], - local=args['local']), + local=args['local'], + sync_dev=args['sync_dev']), count=args['wandb_max_runs']) logger.info("End of sweep") @@ -628,9 +800,11 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g load_model_from_wandb_run=args['load_model_from_wandb_run'], load_model_from_fpath=args['load_model_from_fpath'], eval_only_split=args['eval_only_split'], + eval_all=args['eval_all'], skip_initial_eval=args['skip_initial_eval'], pairwise_eval_clustering=args['pairwise_eval_clustering'], debug=args['debug'], track_errors=not args['no_error_tracking'], - local=args['local']) + local=args['local'], + sync_dev=args['sync_dev']) logger.info("End of run") diff --git a/e2e_scripts/train_utils.py b/e2e_scripts/train_utils.py index efeea6e..c86c498 100644 --- a/e2e_scripts/train_utils.py +++ b/e2e_scripts/train_utils.py @@ -1,6 +1,7 @@ """ Helper functions and constants for e2e_scripts/train.py """ +import copy import os import json from collections import defaultdict @@ -8,6 +9,7 @@ from typing import Tuple, Optional import math import pickle +from time import time from torch.utils.data import DataLoader from s2and.consts import PREPROCESSED_DATA_DIR from s2and.data import S2BlocksDataset @@ -160,11 +162,20 @@ def log_cc_objective_values(scores, split_name, log_prefix, verbose, logger, plo # TODO: Implement plotting the approx. 
ratio v/s block sizes -def save_to_wandb_run(file, fname, fpath, logger): +def save_to_wandb_run(file, fname, fpath, logger, error_logger=True): + if error_logger and os.path.exists(os.path.join(fpath, fname)): + with open(os.path.join(fpath, fname), 'r') as fh: + all_errors = json.load(fh)['errors'] + all_ids = set([e['id'] for e in all_errors]) + for new_error in file['errors']: + if new_error['id'] not in all_ids: + all_errors.append(new_error) + file['errors'] = all_errors with open(os.path.join(fpath, fname), 'w') as fh: json.dump(file, fh) wandb.save(fname) logger.info(f"Saved {fname} to {os.path.join(fpath, fname)}") + return file class FrobeniusLoss: @@ -182,3 +193,15 @@ def __call__(self, input: Tensor, target: Tensor) -> Tensor: if self.weight is None: return torch.norm((target - input)) / normalization return torch.norm(self.weight * (target - input)) / normalization + + +def copy_and_load_model(model, run_dir, device, store_only=False): + _model = copy.deepcopy(model) + _PATH = os.path.join(run_dir, f'_temp_state_dict_{int(time())}.pt') + torch.save(model.state_dict(), _PATH) + if store_only: + return _PATH + _STATE_DICT = torch.load(_PATH, device) + _model.load_state_dict(_STATE_DICT) + os.remove(_PATH) + return _model diff --git a/run_sweep.sh b/run_sweep.sh index b61c509..c635c0f 100644 --- a/run_sweep.sh +++ b/run_sweep.sh @@ -17,7 +17,7 @@ for ((i = ${n_seed_start}; i <= ${n_seed_end}; i++)); do --pairwise_eval_clustering="both" \ --skip_initial_eval \ --silent \ - --wandb_sweep_name="${sweep_prefix}_${model}_${dataset}_${i}" \ + --wandb_sweep_name="${sweep_prefix}${model}_${dataset}_${i}" \ --wandb_sweep_params="wandb_configs/sweeps/${model}.json" \ --wandb_tags="${model},${dataset},seed_${i}" echo " Logs: jobs/${JOB_NAME}.err" diff --git a/utils/parser.py b/utils/parser.py index 3291481..f16f61f 100644 --- a/utils/parser.py +++ b/utils/parser.py @@ -125,6 +125,10 @@ def add_training_args(self): "--eval_only_split", type=str, help="Run script in inference-only mode on a particular data split (train / dev / test)", ) + parser.add_argument( + "--eval_all", action='store_true', + help="Evaluate model using all inference methods over the test set and exit", + ) parser.add_argument( "--skip_initial_eval", action='store_true', help="Whether to skip dev evaluation before training starts", @@ -146,3 +150,7 @@ def add_training_args(self): "--local", action="store_true", help="Run script with wandb disabled", ) + parser.add_argument( + "--sync_dev", action="store_true", + help="Whether to force dev evaluations to run synchronously", + ) From 64d3d28f9db45478807904f2a2ad24461ac2738d Mon Sep 17 00:00:00 2001 From: arana_umass_edu Date: Fri, 17 Mar 2023 18:17:51 +0000 Subject: [PATCH 06/17] optimizing pointwise feature code --- e2e_scripts/preprocess_s2and_pointwise.py | 26 +++++------ s2and/featurizer.py | 56 +++++++++++++---------- 2 files changed, 43 insertions(+), 39 deletions(-) diff --git a/e2e_scripts/preprocess_s2and_pointwise.py b/e2e_scripts/preprocess_s2and_pointwise.py index 9035372..002aeae 100644 --- a/e2e_scripts/preprocess_s2and_pointwise.py +++ b/e2e_scripts/preprocess_s2and_pointwise.py @@ -19,12 +19,13 @@ from s2and.data import ANDData import logging from s2and.featurizer import FeaturizationInfo, featurize +from IPython import embed logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) logger = logging.getLogger(__name__) -def save_pickled_pointwise_features(data_home_dir, dataset_name,
random_seed): +def save_pickled_pointwise_features(data_home_dir, dataset_name): """ Fetch pointwise feature for dataset and store in a pickle. """ @@ -45,27 +46,25 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed): random_seed=random_seed, ) - #print("This is for pickling dataset....") - #with open(f'preprocess_dataset_{dataset_name}.pkl', 'wb') as f: - # pickle.dump(AND_dataset, f) + # print("Storing pickled dataset....") + # with open(f'preprocess_dataset_{dataset_name}.pkl', 'wb') as f: + # pickle.dump(AND_dataset, f) - #print("getting pickled dataset...") - #with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f: - # AND_dataset = pickle.load(f) - #print("Loaded pickle dataset...") + # print("Loading pickled dataset...") + # with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f: + # AND_dataset = pickle.load(f) + # print("Loaded pickle dataset...") point_features_row, point_features_col, point_features_data, num_feats, num_points = pointwise_featurize(AND_dataset, n_jobs=16, - use_cache=False, - random_seed=random_seed) + use_cache=False) logger.info('converting feature indices to csr_matrix') point_features = coo_matrix( (point_features_data, (point_features_row, point_features_col)), shape=(num_points, num_feats) ).tocsr() - print("Matrix creation done.") processed_data['mention_level_features'] = point_features @@ -88,7 +87,6 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed): data_home_dir = params["data_home_dir"] dataset = params["dataset_name"] - seed = 1211 # Dummy not needed, can be totally removed. - print("Preprocessing started for seed value", seed) - save_pickled_pointwise_features(data_home_dir, dataset, seed) + print("Preprocessing started") + save_pickled_pointwise_features(data_home_dir, dataset) print("Matrix") diff --git a/s2and/featurizer.py b/s2and/featurizer.py index bf41ed8..022edb8 100644 --- a/s2and/featurizer.py +++ b/s2and/featurizer.py @@ -9,6 +9,7 @@ import logging from collections import Counter from collections.abc import Iterable +from IPython import embed from sklearn import preprocessing @@ -828,15 +829,14 @@ def featurize( logger.info("featurized test") return train_features, val_features, test_features + + def pointwise_featurize( dataset: ANDData, n_jobs: int = 1, use_cache: bool = False, chunk_size: int = DEFAULT_CHUNK_SIZE, - nan_value: float = np.nan, - delete_training_data: bool = False, - random_seed: int = 1, ): """ Featurizes the input dataset and stores as a unified pickle file. @@ -851,10 +851,6 @@ def pointwise_featurize( whether or not to use write to/read from the features cache chunk_size: int the chunk size for multiprocessing - nan_value: float - the value to replace nans with - delete_training_data: bool - Whether to delete some suspicious training examples Returns ------- @@ -874,6 +870,20 @@ def pointwise_featurize( signature_dict[signature_key] = [] for feature_key, value in per_signature_features.items(): index_key = None + + features_to_ignore = [ + 'author_info_name_counts', + 'author_info_position', + 'author_info_block', + 'author_info_given_block', + 'paper_id', + 'author_id', + 'sourced_author_source', + 'sourced_author_ids', + ] + if feature_key in features_to_ignore: + continue + if (value is None or (isinstance(value, Iterable) and len(value) == 0)): continue @@ -888,18 +898,18 @@ def pointwise_featurize( # Let us check the type of value for each signatures. 
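The type dispatch that follows turns each signature attribute into one or more stringified (feature_key, value) pairs: scalars contribute a single pair, while Counters and other iterables contribute one pair per element. A condensed standalone sketch of that logic, assuming the same type conventions:

    from collections import Counter
    from collections.abc import Iterable

    def expand(feature_key, value):
        # Scalars yield one pair; Counters and other iterables one pair per element
        if isinstance(value, (str, int)):
            return [str((feature_key, value))]
        if isinstance(value, Counter):
            return [str((feature_key, v)) for v in value.keys()]
        if isinstance(value, Iterable):
            return [str((feature_key, v)) for v in value]
        raise TypeError(f"unhandled feature type: {type(value)}")

    assert expand("year", 2001) == ["('year', 2001)"]
    assert expand("coauthors", ["a", "b"]) == ["('coauthors', 'a')", "('coauthors', 'b')"]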
if isinstance(value, str) or isinstance(value, int): - index_key = (feature_key, value) - signature_feature_set.add(str(index_key)) # Converting to str from tuple. + index_key = str((feature_key, value)) + signature_feature_set.add(index_key) # Converting to str from tuple. signature_dict[signature_key].append(index_key) elif isinstance(value, Counter): for val in value.keys(): - index_key = (feature_key, val) - signature_feature_set.add(str(index_key)) + index_key = str((feature_key, val)) + signature_feature_set.add(index_key) signature_dict[signature_key].append(index_key) elif isinstance(value, Iterable): for val in value: - index_key = (feature_key, val) - signature_feature_set.add(str(index_key)) + index_key = str((feature_key, val)) + signature_feature_set.add(index_key) signature_dict[signature_key].append(index_key) else: print('\n!!!! Found another type !!!!\n') @@ -918,21 +928,17 @@ def pointwise_featurize( le_signature_feature_set = preprocessing.LabelEncoder() le_signature_feature_set.fit(list(signature_feature_set)) - le_signature_dict = preprocessing.LabelEncoder() - le_signature_dict.fit(list(signature_dict.keys())) - point_features_row, point_features_col, point_features_data = [], [], [] num_points = len(signature_dict.keys()) - num_feats = len(signature_feature_set) - for key, values in signature_dict.items(): - encoded_key_val = le_signature_dict.transform([key])[0] - val_strings = [str(val) for val in values] - encoded_values_val = le_signature_feature_set.transform(val_strings) - for val in encoded_values_val : - point_features_row.append(encoded_key_val) - point_features_col.append(val) - point_features_data.append(1) + num_feats = len(signature_feature_set) + for index, (_, values) in tqdm(enumerate(signature_dict.items()), desc="Converting to sparse matrix"): + encoded_signature_features = le_signature_feature_set.transform(values) + for feature_label in encoded_signature_features: + point_features_row.append(index) + point_features_col.append(feature_label) + point_features_data.append(1) + return point_features_row, point_features_col, point_features_data, num_feats, num_points From c1e13ae21024d441b7a949d9e08baccbb58346c8 Mon Sep 17 00:00:00 2001 From: Dhruv Agarwal Date: Sat, 18 Mar 2023 04:23:50 -0400 Subject: [PATCH 07/17] Inference solver, parallel eval iterations, sweep config changes (#41) --- e2e_debug/solve.py | 41 +-- e2e_pipeline/sdp_layer.py | 41 ++- e2e_pipeline/uncompress_layer.py | 2 +- e2e_scripts/evaluate.py | 140 ++++++++-- e2e_scripts/preprocess_s2and_data.py | 2 +- e2e_scripts/train.py | 326 ++++++++-------------- e2e_scripts/train_utils.py | 183 ++++++++++-- s2and/data.py | 7 +- utils/parser.py | 5 +- wandb_configs/sweeps/e2e-nosdp-warm.json | 6 +- wandb_configs/sweeps/e2e-nosdp.json | 6 +- wandb_configs/sweeps/e2e-warm.json | 6 +- wandb_configs/sweeps/e2e.json | 6 +- wandb_configs/sweeps/frac-nosdp-warm.json | 6 +- wandb_configs/sweeps/frac-nosdp.json | 6 +- wandb_configs/sweeps/frac-warm.json | 6 +- wandb_configs/sweeps/frac.json | 6 +- wandb_configs/sweeps/mlp.json | 1 + 18 files changed, 487 insertions(+), 309 deletions(-) diff --git a/e2e_debug/solve.py b/e2e_debug/solve.py index 6e47ef2..ccefa35 100644 --- a/e2e_debug/solve.py +++ b/e2e_debug/solve.py @@ -50,6 +50,9 @@ def __init__(self): self.add_argument( "--scs_log_csv_filename", type=str, ) + self.add_argument( + "--max_scaling", action="store_true", + ) self.add_argument( "--interactive", action="store_true", ) @@ -63,15 +66,18 @@ def __init__(self): # Read error file
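One detail behind the featurizer simplification in the patch above: the second LabelEncoder over signature keys could be dropped because Python dicts preserve insertion order (3.7+), so enumerate() over signature_dict already yields a stable 0..N-1 row index per signature. A tiny check of that assumption:

    signature_dict = {"sig_a": ["f1", "f2"], "sig_b": ["f2"]}
    rows = {key: idx for idx, key in enumerate(signature_dict)}
    assert rows == {"sig_a": 0, "sig_b": 1}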
logger.info("Reading input data") - with open(args.data_fpath, 'r') as fh: - data = json.load(fh) - assert len(data['errors']) > 0 - # Pick specific error instance to process - error_data = data['errors'][args.data_idx] + if args.data_fpath.endswith('.pt'): + _W_val = torch.load(args.data_fpath, map_location='cpu').numpy() + else: + with open(args.data_fpath, 'r') as fh: + data = json.load(fh) + assert len(data['errors']) > 0 + # Pick specific error instance to process + error_data = data['errors'][args.data_idx] - # Extract input data from the error instance - _raw = np.array(error_data['model_call_args']['data']) - _W_val = np.array(error_data['cvxpy_layer_args']['W_val']) + # Extract input data from the error instance + _raw = np.array(error_data['model_call_args']['data']) + _W_val = np.array(error_data['cvxpy_layer_args']['W_val']) # Construct cvxpy problem logger.info('Constructing optimization problem') @@ -84,7 +90,7 @@ def __init__(self): constraints = [ cp.diag(X) == np.ones((n,)), X[:n, :] >= 0, - X[:n, :] <= 1 + # X[:n, :] <= 1 ] # Setup HAC Cut @@ -94,12 +100,14 @@ def __init__(self): sdp_obj_value = float('inf') result_idxs, results_X, results_clustering = [], [], [] no_solution_scaling_factors = [] - for i in range(1, 10): # n + for i in range(0, 10): # n # Skipping 1; no scaling leads to non-convergence (infinite objective value) - if i == 1: - scaling_factor = np.max(W) + if i == 0: + scaling_factor = np.max(np.abs(W)) else: scaling_factor = i + if args.max_scaling: + continue logger.info(f'Scaling factor={scaling_factor}') # Create problem W_scaled = W / scaling_factor @@ -114,8 +122,7 @@ def __init__(self): alpha=args.scs_alpha, scale=args.scs_scale, use_indirect=args.scs_use_indirect, - use_quad_obj=not args.scs_dont_use_quad_obj, - log_csv_filename=args.scs_log_csv_filename + # use_quad_obj=not args.scs_dont_use_quad_obj ) logger.info(f"@scaling={scaling_factor}, objective value = {sdp_obj_value}, norm={np.linalg.norm(W_scaled)}") if sdp_obj_value != float('inf'): @@ -129,9 +136,9 @@ def __init__(self): logger.info(f"Solution not found = {len(no_solution_scaling_factors)}") logger.info(f"Solution found = {len(results_X)}") - logger.info("Same clustering:") - for i in range(len(results_clustering)-1): - logger.info(np.array_equal(results_clustering[i], results_clustering[i + 1])) + # logger.info("Same clustering:") + # for i in range(len(results_clustering)-1): + # logger.info(np.array_equal(results_clustering[i], results_clustering[i + 1])) # logger.info(f"Solution found with scaling factor = {scaling_factor}") # if args.interactive and sdp_obj_value == float('inf'): # embed() diff --git a/e2e_pipeline/sdp_layer.py b/e2e_pipeline/sdp_layer.py index bdc10c0..be914b8 100644 --- a/e2e_pipeline/sdp_layer.py +++ b/e2e_pipeline/sdp_layer.py @@ -50,22 +50,33 @@ def build_and_solve_sdp(self, W_val, N, verbose=False): X[:N, :] >= 0, ] - # create problem - prob = cp.Problem(cp.Maximize(cp.trace(W @ X)), constraints) - # Note: maximizing the trace is equivalent to maximizing the sum_E (w_uv * X_uv) objective - # because W is upper-triangular and X is symmetric - - # Build the SDP cvxpylayer - cvxpy_layer = CvxpyLayer(prob, parameters=[W], variables=[X]) - - # Forward pass through the SDP cvxpylayer try: - pw_prob_matrix = cvxpy_layer(W_val, solver_args={ - "solve_method": "SCS", - "verbose": verbose, - "max_iters": self.max_iters, - "eps": self.eps - })[0] + if self.training: + # create problem + prob = cp.Problem(cp.Maximize(cp.trace(W @ X)), constraints) + # Note: maximizing 
the trace is equivalent to maximizing the sum_E (w_uv * X_uv) objective + # because W is upper-triangular and X is symmetric + # Build the SDP cvxpylayer + cvxpy_layer = CvxpyLayer(prob, parameters=[W], variables=[X]) + # Forward pass through the SDP cvxpylayer + pw_prob_matrix = cvxpy_layer(W_val, solver_args={ + "solve_method": "SCS", + "verbose": verbose, + "max_iters": self.max_iters, + "eps": self.eps + })[0] + else: + # create problem + prob = cp.Problem(cp.Maximize(cp.trace(W_val.cpu().numpy() @ X)), constraints) + _solve_val = prob.solve( + solver=cp.SCS, + verbose=verbose, + max_iters=self.max_iters, + eps=self.eps + ) + if _solve_val == float('inf'): + raise ValueError() + pw_prob_matrix = torch.tensor(X.value, device=W_val.device) # Fix to prevent invalid solution values close to 0 and 1 but outside the range pw_prob_matrix = torch.clamp(pw_prob_matrix, min=0, max=1) except: diff --git a/e2e_pipeline/uncompress_layer.py b/e2e_pipeline/uncompress_layer.py index 93d99dd..7198b07 100644 --- a/e2e_pipeline/uncompress_layer.py +++ b/e2e_pipeline/uncompress_layer.py @@ -6,7 +6,7 @@ def __init__(self): super().__init__() def forward(self, compressed_matrix, N, make_symmetric=False, ones_diagonal=False): - device = compressed_matrix.get_device() + device = compressed_matrix.device triu_indices = torch.triu_indices(N, N, offset=1, device=device) if make_symmetric: sym_indices = torch.stack((torch.cat((triu_indices[0], triu_indices[1])), diff --git a/e2e_scripts/evaluate.py b/e2e_scripts/evaluate.py index b7ba424..71ec3d6 100644 --- a/e2e_scripts/evaluate.py +++ b/e2e_scripts/evaluate.py @@ -8,13 +8,14 @@ from sklearn.metrics.cluster import v_measure_score from sklearn.metrics import roc_curve, auc from sklearn.metrics import precision_recall_fscore_support +from torch.multiprocessing import Process, Manager import numpy as np import torch from e2e_pipeline.cc_inference import CCInference from e2e_pipeline.hac_inference import HACInference from e2e_pipeline.sdp_layer import CvxpyException -from e2e_scripts.train_utils import compute_b3_f1, save_to_wandb_run +from e2e_scripts.train_utils import compute_b3_f1, save_to_wandb_run, copy_and_load_model from IPython import embed @@ -24,13 +25,50 @@ logger = logging.getLogger(__name__) +def _run_iter(model_class, state_dict_path, _fork_id, _shared_list, eval_fn, **kwargs): + model = model_class(*kwargs['model_args']) + model.load_state_dict(torch.load(state_dict_path)) + model.to('cpu') + model.eval() + with torch.no_grad(): + res = eval_fn(model=model, **kwargs) + _shared_list.append(res) + del model + + +def _fork_iter(batch_idx, _fork_id, _shared_list, eval_fn, **kwargs): + kwargs['model_class'] = kwargs['model'].__class__ + kwargs['state_dict_path'] = copy_and_load_model(kwargs['model'], kwargs['run_dir'], 'cpu', store_only=True) + del kwargs['model'] + kwargs['overfit_batch_idx'] = batch_idx + kwargs['tqdm_label'] = f'{kwargs["tqdm_label"]} (fork{_fork_id})' + kwargs['_fork_id'] = _fork_id + kwargs['tqdm_position'] = (0 if kwargs['tqdm_position'] is None else kwargs['tqdm_position']) + _fork_id + 1 + kwargs['return_iter'] = True + kwargs['fork_size'] = -1 + kwargs['_shared_list'] = _shared_list + kwargs['disable_tqdm'] = True + kwargs['device'] = 'cpu' + kwargs['eval_fn'] = eval_fn + _proc = Process(target=_run_iter, kwargs=kwargs) + _proc.start() + return _proc + + def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, clustering_threshold=None, val_dataloader=None, tqdm_label='', device=None, verbose=False, debug=False, 
diff --git a/e2e_scripts/evaluate.py b/e2e_scripts/evaluate.py
index b7ba424..71ec3d6 100644
--- a/e2e_scripts/evaluate.py
+++ b/e2e_scripts/evaluate.py
@@ -8,13 +8,14 @@
 from sklearn.metrics.cluster import v_measure_score
 from sklearn.metrics import roc_curve, auc
 from sklearn.metrics import precision_recall_fscore_support
+from torch.multiprocessing import Process, Manager
 import numpy as np
 import torch

 from e2e_pipeline.cc_inference import CCInference
 from e2e_pipeline.hac_inference import HACInference
 from e2e_pipeline.sdp_layer import CvxpyException
-from e2e_scripts.train_utils import compute_b3_f1, save_to_wandb_run
+from e2e_scripts.train_utils import compute_b3_f1, save_to_wandb_run, copy_and_load_model

 from IPython import embed

@@ -24,13 +25,50 @@
 logger = logging.getLogger(__name__)


+def _run_iter(model_class, state_dict_path, _fork_id, _shared_list, eval_fn, **kwargs):
+    model = model_class(*kwargs['model_args'])
+    model.load_state_dict(torch.load(state_dict_path))
+    model.to('cpu')
+    model.eval()
+    with torch.no_grad():
+        res = eval_fn(model=model, **kwargs)
+        _shared_list.append(res)
+    del model
+
+
+def _fork_iter(batch_idx, _fork_id, _shared_list, eval_fn, **kwargs):
+    kwargs['model_class'] = kwargs['model'].__class__
+    kwargs['state_dict_path'] = copy_and_load_model(kwargs['model'], kwargs['run_dir'], 'cpu', store_only=True)
+    del kwargs['model']
+    kwargs['overfit_batch_idx'] = batch_idx
+    kwargs['tqdm_label'] = f'{kwargs["tqdm_label"]} (fork{_fork_id})'
+    kwargs['_fork_id'] = _fork_id
+    kwargs['tqdm_position'] = (0 if kwargs['tqdm_position'] is None else kwargs['tqdm_position']) + _fork_id + 1
+    kwargs['return_iter'] = True
+    kwargs['fork_size'] = -1
+    kwargs['_shared_list'] = _shared_list
+    kwargs['disable_tqdm'] = True
+    kwargs['device'] = 'cpu'
+    kwargs['eval_fn'] = eval_fn
+    _proc = Process(target=_run_iter, kwargs=kwargs)
+    _proc.start()
+    return _proc
+
+
 def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None,
              clustering_threshold=None, val_dataloader=None, tqdm_label='',
             device=None, verbose=False, debug=False, _errors=None,
-            run_dir='./', tqdm_position=None):
+            run_dir='./', tqdm_position=None, model_args=None, return_iter=False, fork_size=500,
+            disable_tqdm=False):
     """
     clustering_fn, clustering_threshold, val_dataloader: unused when pairwise_mode is False
     (only added to keep fn signature identical)
     """
+    fn_args = locals()
+    fork_enabled = fork_size > -1 and model_args is not None
+    if fork_enabled:
+        _fork_id = 1
+        _shared_list = Manager().list()
+        _procs = []
     device = device if device is not None else torch.device("cuda" if torch.cuda.is_available() else "cpu")
     n_features = dataloader.dataset[0][0].shape[1]
@@ -43,7 +81,8 @@
     }
     max_pred_id = -1
     n_exceptions = 0
-    for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}', position=tqdm_position)):
+    pbar = tqdm(dataloader, desc=f'Eval {tqdm_label}', position=tqdm_position, disable=disable_tqdm)
+    for (idx, batch) in enumerate(pbar):
         if overfit_batch_idx > -1:
             if idx < overfit_batch_idx:
                 continue
@@ -51,11 +90,16 @@
             break
         data, _, cluster_ids = batch
         block_size = len(cluster_ids)
-        all_gold += list(np.reshape(cluster_ids, (block_size,)))
+        pbar.set_description(f'Eval {tqdm_label} (sz={block_size})')
         data = data.reshape(-1, n_features).float()
         if data.shape[0] == 0:
             # Only one signature in block; manually assign a unique cluster
             pred_cluster_ids = [max_pred_id + 1]
+        elif fork_enabled and block_size >= fork_size:
+            _proc = _fork_iter(idx, _fork_id, _shared_list, evaluate, **fn_args)
+            _fork_id += 1
+            _procs.append((_proc, block_size))
+            continue
         else:
             # Forward pass through the e2e model
             data = data.to(device)
@@ -79,8 +123,6 @@
                 save_to_wandb_run({'errors': _errors}, 'errors.json', run_dir, logger)
                 if not debug:  # if tqdm_label is not 'dev' and not debug:
                     raise CvxpyException(data=_error_obj)
-                # If split is dev, skip batch and continue
-                all_gold = all_gold[:-len(cluster_ids)]
                 n_exceptions += 1
                 logger.info(f'Caught CvxpyException {n_exceptions}: skipping batch')
                 continue
@@ -89,8 +131,34 @@
             cc_obj_vals['sdp'].append(model.sdp_layer.objective_value)
             cc_obj_vals['block_idxs'].append(idx)
             cc_obj_vals['block_sizes'].append(block_size)
+        all_gold += list(np.reshape(cluster_ids, (block_size,)))
         max_pred_id = max(pred_cluster_ids)
         all_pred += list(pred_cluster_ids)
+        if overfit_batch_idx > -1 and return_iter:
+            return {
+                'cluster_labels': model.hac_cut_layer.cluster_labels,
+                'round_objective_value': model.hac_cut_layer.objective_value,
+                'sdp_objective_value': model.sdp_layer.objective_value,
+                'block_idx': idx,
+                'block_size': block_size,
+                'cluster_ids': cluster_ids
+            }
+
+    if fork_enabled and len(_procs) > 0:
+        _procs.sort(key=lambda x: x[1])  # To visualize progress
+        for _proc in tqdm(_procs, desc=f'Eval {tqdm_label} (waiting for forks to join)', position=tqdm_position):
+            _proc[0].join()
+        assert len(_procs) == len(_shared_list), "All forked eval iterations did not return results"
+        for _data in _shared_list:
+            pred_cluster_ids = (_data['cluster_labels'] + (max_pred_id + 1)).tolist()
+            cc_obj_vals['round'].append(_data['round_objective_value'])
+            cc_obj_vals['sdp'].append(_data['sdp_objective_value'])
+            cc_obj_vals['block_idxs'].append(_data['block_idx'])
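# [Aside, not part of the patch] The branch above offloads large blocks to CPU
# worker processes and gathers their results through a Manager list. A minimal,
# runnable sketch of the same fork-and-join pattern (names like slow_eval and
# blocks are ours, standing in for the real eval function and data):
from torch.multiprocessing import Manager, Process, set_start_method

def slow_eval(block, shared):
    shared.append(sum(block))  # stand-in for one block's metrics

if __name__ == '__main__':
    set_start_method('spawn', force=True)  # spawn is safe with CUDA parents
    shared, procs = Manager().list(), []
    for block in ([1, 2, 3], [4, 5], [6]):
        p = Process(target=slow_eval, args=(block, shared))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()
    assert len(shared) == len(procs)  # every fork reported back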
cc_obj_vals['block_sizes'].append(_data['block_size']) + all_gold += list(np.reshape(_data['cluster_ids'], (_data['block_size'],))) + max_pred_id = max(pred_cluster_ids) + all_pred += list(pred_cluster_ids) + vmeasure = v_measure_score(all_gold, all_pred) b3_f1 = compute_b3_f1(all_gold, all_pred)[2] return b3_f1, vmeasure, cc_obj_vals @@ -99,7 +167,13 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, cluste def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", return_pred_only=False, thresh_for_f1=0.5, clustering_fn=None, clustering_threshold=None, val_dataloader=None, tqdm_label='', device=None, verbose=False, debug=False, _errors=None, run_dir='./', - tqdm_position=None): + tqdm_position=None, model_args=None, return_iter=False, fork_size=500, disable_tqdm=False): + fn_args = locals() + fork_enabled = fork_size > -1 and model_args is not None + if fork_enabled: + _fork_id = 1 + _shared_list = Manager().list() + _procs = [] device = device if device is not None else torch.device("cuda" if torch.cuda.is_available() else "cpu") n_features = dataloader.dataset[0][0].shape[1] @@ -117,7 +191,8 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret } max_pred_id = -1 # In each iteration, add to all blockwise predicted IDs to distinguish from previous blocks n_exceptions = 0 - for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}', position=tqdm_position)): + pbar = tqdm(dataloader, desc=f'Eval {tqdm_label}', position=tqdm_position, disable=disable_tqdm) + for (idx, batch) in enumerate(pbar): if overfit_batch_idx > -1: if idx < overfit_batch_idx: continue @@ -125,11 +200,16 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret break data, _, cluster_ids = batch block_size = len(cluster_ids) - all_gold += list(np.reshape(cluster_ids, (block_size,))) + pbar.set_description(f'Eval {tqdm_label} (sz={block_size})') data = data.reshape(-1, n_features).float() if data.shape[0] == 0: # Only one signature in block; manually assign a unique cluster pred_cluster_ids = [max_pred_id + 1] + elif fork_enabled and block_size >= fork_size and clustering_fn.__class__ is CCInference: + _proc = _fork_iter(idx, _fork_id, _shared_list, evaluate_pairwise, **fn_args) + _fork_id += 1 + _procs.append((_proc, block_size)) + continue else: # Forward pass through the e2e model data = data.to(device) @@ -155,24 +235,49 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret save_to_wandb_run({'errors': _errors}, 'errors.json', run_dir, logger) if not debug: # if tqdm_label is not 'dev' and not debug: raise CvxpyException(data=_error_obj) - # If split is dev, skip batch and continue - all_gold = all_gold[:-len(cluster_ids)] n_exceptions += 1 logger.info(f'Caught CvxpyException {n_exceptions}: skipping batch') continue + if clustering_fn.__class__ is CCInference: + cc_obj_vals['round'].append(clustering_fn.hac_cut_layer.objective_value) + cc_obj_vals['sdp'].append(clustering_fn.sdp_layer.objective_value) + cc_obj_vals['block_idxs'].append(idx) + cc_obj_vals['block_sizes'].append(block_size) + all_gold += list(np.reshape(cluster_ids, (block_size,))) max_pred_id = max(pred_cluster_ids) all_pred += list(pred_cluster_ids) - if clustering_fn.__class__ is CCInference: - cc_obj_vals['round'].append(clustering_fn.hac_cut_layer.objective_value) - cc_obj_vals['sdp'].append(clustering_fn.sdp_layer.objective_value) - cc_obj_vals['block_idxs'].append(idx) - 
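# [Aside, not part of the patch] Once the main loop and the forks finish, the
# per-block predictions are merged into one global id space before scoring:
# each block's labels are shifted past max_pred_id so ids never collide, and
# the merged lists feed v_measure_score and the B3 F1. A toy version (gold ids
# are assumed globally unique, as in the dataset):
from sklearn.metrics.cluster import v_measure_score

all_gold, all_pred, max_pred_id = [], [], -1
blocks = [([0, 0, 1], [0, 0, 1]), ([2, 3], [0, 0])]  # (gold, pred) per block
for gold, pred in blocks:
    pred = [p + max_pred_id + 1 for p in pred]  # offset into the global space
    max_pred_id = max(pred)
    all_gold += gold
    all_pred += pred
# all_pred == [0, 0, 1, 2, 2]: the second block's cluster 0 became cluster 2
print(v_measure_score(all_gold, all_pred))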
cc_obj_vals['block_sizes'].append(block_size) + if overfit_batch_idx > -1 and return_iter: + return { + 'cluster_labels': list(np.array(pred_cluster_ids) - (max_pred_id + 1)), + 'round_objective_value': clustering_fn.hac_cut_layer.objective_value, + 'sdp_objective_value': clustering_fn.sdp_layer.objective_value, + 'block_idx': idx, + 'block_size': block_size, + 'cluster_ids': cluster_ids + } + + if fork_enabled and len(_procs) > 0: + _procs.sort(key=lambda x: x[1]) # To visualize progress + for _proc in tqdm(_procs, desc=f'Eval {tqdm_label} (waiting for forks to join)', position=tqdm_position): + _proc[0].join() + assert len(_procs) == len(_shared_list), "All forked eval iterations did not return results" + for _data in _shared_list: + pred_cluster_ids = (_data['cluster_labels'] + (max_pred_id + 1)).tolist() + cc_obj_vals['round'].append(_data['round_objective_value']) + cc_obj_vals['sdp'].append(_data['sdp_objective_value']) + cc_obj_vals['block_idxs'].append(_data['block_idx']) + cc_obj_vals['block_sizes'].append(_data['block_size']) + all_gold += list(np.reshape(_data['cluster_ids'], (_data['block_size'],))) + max_pred_id = max(pred_cluster_ids) + all_pred += list(pred_cluster_ids) + vmeasure = v_measure_score(all_gold, all_pred) b3_f1 = compute_b3_f1(all_gold, all_pred)[2] return (b3_f1, vmeasure, cc_obj_vals) if clustering_fn.__class__ is CCInference else (b3_f1, vmeasure) y_pred, targets = [], [] - for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}', position=tqdm_position)): + pbar = tqdm(dataloader, desc=f'Eval {tqdm_label}', position=tqdm_position, disable=disable_tqdm) + for (idx, batch) in enumerate(pbar): if overfit_batch_idx > -1: if idx < overfit_batch_idx: continue @@ -180,6 +285,7 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret break data, target = batch data = data.reshape(-1, n_features).float() + pbar.set_description(f'Eval {tqdm_label} (sz={len(data)})') assert data.shape[0] != 0 target = target.flatten().float() # Forward pass through the pairwise model diff --git a/e2e_scripts/preprocess_s2and_data.py b/e2e_scripts/preprocess_s2and_data.py index cb46521..d97283b 100644 --- a/e2e_scripts/preprocess_s2and_data.py +++ b/e2e_scripts/preprocess_s2and_data.py @@ -118,7 +118,7 @@ def find_total_num_train_pairs(blockwise_data): DATA_HOME_DIR = params["data_home_dir"] dataset = params["dataset_name"] - random_seeds = {1, 2, 3, 4, 5} + random_seeds = [1, 2, 3, 4, 5] if params["dataset_seed"] is None else [params["dataset_seed"]] for seed in random_seeds: print("Preprocessing started for seed value", seed) save_blockwise_featurized_data(dataset, seed) diff --git a/e2e_scripts/train.py b/e2e_scripts/train.py index 23b36d8..a645257 100644 --- a/e2e_scripts/train.py +++ b/e2e_scripts/train.py @@ -1,7 +1,6 @@ import glob import json import os -import sys import time import logging import random @@ -10,8 +9,8 @@ import wandb import torch import numpy as np - from tqdm import tqdm +from torch.multiprocessing import set_start_method, Manager from e2e_pipeline.cc_inference import CCInference from e2e_pipeline.hac_inference import HACInference @@ -21,119 +20,21 @@ from e2e_scripts.evaluate import evaluate, evaluate_pairwise from e2e_scripts.train_utils import DEFAULT_HYPERPARAMS, get_dataloaders, get_matrix_size_from_triu, \ uncompress_target_tensor, count_parameters, log_cc_objective_values, save_to_wandb_run, FrobeniusLoss, \ - copy_and_load_model + get_feature_count, _check_process, fork_eval, init_eval, dev_eval from 
utils.parser import Parser -from torch.multiprocessing import Process, set_start_method, Manager +from IPython import embed try: set_start_method('spawn', force=True) except RuntimeError: pass -from IPython import embed - - logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) logger = logging.getLogger(__name__) -def _check_process(_proc, _return_dict, logger, run, overfit_batch_idx, use_lr_scheduler, hyp, - scheduler, eval_metric_to_idx, dev_opt_metric, i, best_epoch, best_dev_score, - best_dev_scores, best_dev_state_dict, sync=False): - if _proc is not None: - if _return_dict['_state'] == 'done' or (sync and _return_dict['_state'] == 'start'): - _proc.join() - _return_dict['_state'] = 'finish' - if _return_dict['_method'] == 'init_eval': - logger.info(_return_dict['local']) - run.log(_return_dict['wandb']) - elif _return_dict['_method'] == 'dev_eval': - logger.info(_return_dict['local']) - run.log(_return_dict['wandb']) - if overfit_batch_idx > -1: - if use_lr_scheduler: - if hyp['lr_scheduler'] == 'plateau': - scheduler.step(_return_dict['train_scores'][eval_metric_to_idx[dev_opt_metric]]) - elif hyp['lr_scheduler'] == 'step': - scheduler.step() - else: - dev_scores = _return_dict['dev_scores'] - dev_opt_score = dev_scores[eval_metric_to_idx[dev_opt_metric]] - if dev_opt_score > best_dev_score: - logger.info(f"New best dev {dev_opt_metric} score @ epoch{i+1}: {dev_opt_score}") - best_epoch = i - best_dev_score = dev_opt_score - best_dev_scores = dev_scores - best_dev_state_dict = torch.load(_return_dict['state_dict_path'], device) - if use_lr_scheduler: - if hyp['lr_scheduler'] == 'plateau': - scheduler.step(dev_scores[eval_metric_to_idx[dev_opt_metric]]) - elif hyp['lr_scheduler'] == 'step': - scheduler.step() - return best_epoch, best_dev_score, best_dev_scores, best_dev_state_dict - - -def init_eval(model_class, model_args, state_dict_path, overfit_batch_idx, eval_fn, train_dataloader, device, verbose, - debug, _errors, eval_metric_to_idx, val_dataloader, return_dict): - return_dict['_state'] = 'start' - return_dict['_method'] = 'init_eval' - model = model_class(*model_args) - model.load_state_dict(torch.load(state_dict_path)) - model.to(device) - with torch.no_grad(): - model.eval() - if overfit_batch_idx > -1: - train_scores = eval_fn(model, train_dataloader, overfit_batch_idx=overfit_batch_idx, - tqdm_label='train', device=device, verbose=verbose, debug=debug, - _errors=_errors, tqdm_position=0) - return_dict['local'] = f"Initial: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + \ - f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}" - return_dict['wandb'] = {'epoch': 0, f'train_{list(eval_metric_to_idx)[0]}': train_scores[0], - f'train_{list(eval_metric_to_idx)[1]}': train_scores[1]} - else: - dev_scores = eval_fn(model, val_dataloader, tqdm_label='dev 0', device=device, verbose=verbose, - debug=debug, _errors=_errors, tqdm_position=0) - return_dict['local'] = f"Initial: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + \ - f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}" - return_dict['wandb'] = {'epoch': 0, f'dev_{list(eval_metric_to_idx)[0]}': dev_scores[0], - f'dev_{list(eval_metric_to_idx)[1]}': dev_scores[1]} - del model - return_dict['_state'] = 'done' - - -def dev_eval(model_class, model_args, state_dict_path, overfit_batch_idx, eval_fn, train_dataloader, device, verbose, - debug, _errors, eval_metric_to_idx, val_dataloader, return_dict, i, run_dir): - 
return_dict['_state'] = 'start' - return_dict['_method'] = 'dev_eval' - return_dict['state_dict_path'] = state_dict_path - model = model_class(*model_args) - model.load_state_dict(torch.load(state_dict_path)) - model.to(device) - with torch.no_grad(): - model.eval() - if overfit_batch_idx > -1: - train_scores = eval_fn(model, train_dataloader, overfit_batch_idx=overfit_batch_idx, - tqdm_label='train', device=device, verbose=verbose, debug=debug, - _errors=_errors) - return_dict['local'] = f"Epoch {i + 1}: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + \ - f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}" - return_dict['wandb'] = {f'train_{list(eval_metric_to_idx)[0]}': train_scores[0], - f'train_{list(eval_metric_to_idx)[1]}': train_scores[1]} - return_dict['train_scores'] = train_scores - else: - dev_scores = eval_fn(model, val_dataloader, tqdm_label=f'dev {i+1}', device=device, verbose=verbose, - debug=debug, _errors=_errors) - return_dict['local'] = f"Epoch {i + 1}: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + \ - f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}" - return_dict['wandb'] = {f'dev_{list(eval_metric_to_idx)[0]}': dev_scores[0], - f'dev_{list(eval_metric_to_idx)[1]}': dev_scores[1]} - return_dict['dev_scores'] = dev_scores - del model - return_dict['_state'] = 'done' - - def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, group=None, save_model=False, load_model_from_wandb_run=None, load_model_from_fpath=None, eval_only_split=None, eval_all=False, skip_initial_eval=False, pairwise_eval_clustering=None, @@ -206,14 +107,20 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g eval_metric_to_idx = clustering_metrics if not pairwise_mode else pairwise_metrics dev_opt_metric = hyp['dev_opt_metric'] if hyp['dev_opt_metric'] in eval_metric_to_idx \ else list(eval_metric_to_idx)[0] + training_mode = not eval_all and eval_only_split is None # Get data loaders (optionally with imputation, normalization) - train_dataloader, val_dataloader, test_dataloader = get_dataloaders(hyp["dataset"], hyp["dataset_random_seed"], - hyp["convert_nan"], hyp["nan_value"], - hyp["normalize_data"], hyp["subsample_sz_train"], - hyp["subsample_sz_dev"], pairwise_mode, - batch_size) - n_features = train_dataloader.dataset[0][0].shape[1] + if training_mode: + train_dataloader, val_dataloader, test_dataloader = get_dataloaders(hyp["dataset"], + hyp["dataset_random_seed"], + hyp["convert_nan"], hyp["nan_value"], + hyp["normalize_data"], + hyp["subsample_sz_train"], + hyp["subsample_sz_dev"], pairwise_mode, + batch_size) + n_features = train_dataloader.dataset[0][0].shape[1] + else: + n_features = get_feature_count(hyp["dataset"], hyp["dataset_random_seed"]) # Create model with hyperparams if not pairwise_mode: @@ -222,53 +129,44 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g negative_slope, hidden_config, sdp_max_iters, sdp_eps, sdp_scale, use_rounded_loss, (e2e_loss == "bce"), use_sdp) model = EntResModel(*model_args) - # Define loss - if e2e_loss not in ["frob", "bce"]: - raise ValueError("Invalid value for e2e_loss") - loss_fn_e2e = FrobeniusLoss() if e2e_loss == 'frob' else torch.nn.BCELoss() - - pos_weight = None - if weighted_loss: - if overfit_batch_idx > -1: - n_pos = train_dataloader.dataset[overfit_batch_idx][1].sum() - pos_weight = (len(train_dataloader.dataset[overfit_batch_idx][1]) - n_pos) / n_pos - else: - _n_pos, _n_total = 0., 0. 
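# [Aside, not part of the patch] The surrounding hunk tallies positives across
# the training blocks to build the usual negatives-to-positives ratio for
# BCEWithLogitsLoss, so the rare "same author" pairs are not drowned out by
# negatives. A self-contained sketch with made-up labels:
import torch

labels = torch.tensor([1., 0., 0., 0., 0., 0., 0., 0., 1., 0.])
n_pos = labels.sum()
pos_weight = (len(labels) - n_pos) / n_pos if n_pos > 0 else torch.tensor(1.)
loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)  # pos_weight = 4.0
print(loss_fn(torch.zeros(10), labels))  # each positive now weighs 4 negatives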
- for _i in range(len(train_dataloader.dataset)): - _n_pos += train_dataloader.dataset[_i][1].sum() - _n_total += len(train_dataloader.dataset[_i][1]) - pos_weight = (_n_total - _n_pos) / _n_pos # Define eval eval_fn = evaluate pairwise_clustering_fns = [None] # Unused when pairwise_mode is False - if n_warmstart_epochs > 0: - train_dataloader_pairwise, _, _ = get_dataloaders(hyp["dataset"], - hyp["dataset_random_seed"], - hyp["convert_nan"], - hyp["nan_value"], - hyp["normalize_data"], - hyp["subsample_sz_train"], - hyp["subsample_sz_dev"], - True, hyp['batch_size']) + + if training_mode: # => model will be used for training # Define loss - loss_fn_pairwise = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight)) + if e2e_loss not in ["frob", "bce"]: + raise ValueError("Invalid value for e2e_loss") + loss_fn_e2e = FrobeniusLoss() if e2e_loss == 'frob' else torch.nn.BCELoss() + + pos_weight = None + if weighted_loss: + if overfit_batch_idx > -1: + n_pos = train_dataloader.dataset[overfit_batch_idx][1].sum() + pos_weight = (len(train_dataloader.dataset[overfit_batch_idx][1]) - n_pos) / n_pos + else: + _n_pos, _n_total = 0., 0. + for _i in range(len(train_dataloader.dataset)): + _n_pos += train_dataloader.dataset[_i][1].sum() + _n_total += len(train_dataloader.dataset[_i][1]) + pos_weight = (_n_total - _n_pos) / _n_pos if _n_pos > 0 else 1. + if n_warmstart_epochs > 0: + train_dataloader_pairwise = get_dataloaders(hyp["dataset"], + hyp["dataset_random_seed"], + hyp["convert_nan"], + hyp["nan_value"], + hyp["normalize_data"], + hyp["subsample_sz_train"], + hyp["subsample_sz_dev"], + pairwise_mode=True, batch_size=hyp['batch_size'], + split='train') + # Define loss + loss_fn_pairwise = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight)) else: model_args = (n_features, neumiss_depth, dropout_p, dropout_only_once, add_neumiss, - neumiss_deq, hidden_dim, n_hidden_layers, add_batchnorm, activation, - negative_slope, hidden_config) + neumiss_deq, hidden_dim, n_hidden_layers, add_batchnorm, activation, + negative_slope, hidden_config) model = PairwiseModel(*model_args) - # Define loss - pos_weight = None - if weighted_loss: - if overfit_batch_idx > -1: - n_pos = \ - train_dataloader.dataset[overfit_batch_idx * batch_size:(overfit_batch_idx + 1) * batch_size][ - 1].sum() - pos_weight = torch.tensor((batch_size - n_pos) / n_pos) - else: - n_pos = train_dataloader.dataset[:][1].sum() - pos_weight = torch.tensor((len(train_dataloader.dataset) - n_pos) / n_pos) - loss_fn_pairwise = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight) # Define eval eval_fn = evaluate_pairwise pairwise_clustering_fns = [None] @@ -287,14 +185,28 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g pairwise_clustering_fn_labels = ['cc', 'hac', 'cc-fixed'] else: raise ValueError('Invalid argument passed to --pairwise_eval_clustering') - _, val_dataloader_e2e, test_dataloader_e2e = get_dataloaders(hyp["dataset"], - hyp["dataset_random_seed"], - hyp["convert_nan"], - hyp["nan_value"], - hyp["normalize_data"], - hyp["subsample_sz_train"], - hyp["subsample_sz_dev"], - pairwise_mode=False, batch_size=1) + val_dataloader_e2e, test_dataloader_e2e = get_dataloaders(hyp["dataset"], + hyp["dataset_random_seed"], + hyp["convert_nan"], + hyp["nan_value"], + hyp["normalize_data"], + hyp["subsample_sz_train"], + hyp["subsample_sz_dev"], + pairwise_mode=False, batch_size=1, + split=['dev', 'test']) + if training_mode: # => model will be used for training + # Define loss + pos_weight = None + 
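# [Aside, not part of the patch] Several call sites above now request only the
# loaders they need via get_dataloaders(..., split=...), whose implementation
# lands in train_utils.py further down. A minimal version of that dispatch
# (the string loaders are stand-ins for real DataLoaders):
def get_loaders(split=None):
    build = {'train': lambda: 'train_dl',
             'dev': lambda: 'dev_dl',
             'test': lambda: 'test_dl'}
    if split is None:  # default: the full (train, dev, test) triple
        return build['train'](), build['dev'](), build['test']()
    if isinstance(split, str):  # a single loader
        return build[split]()
    if isinstance(split, list):  # any subset, in order
        return tuple(build[s]() for s in split)
    raise ValueError('Invalid argument to split')

val_dl, test_dl = get_loaders(split=['dev', 'test'])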
if weighted_loss: + if overfit_batch_idx > -1: + n_pos = \ + train_dataloader.dataset[overfit_batch_idx * batch_size:(overfit_batch_idx + 1) * batch_size][ + 1].sum() + pos_weight = torch.tensor((batch_size - n_pos) / n_pos if n_pos > 0 else 1.) + else: + n_pos = train_dataloader.dataset[:][1].sum() + pos_weight = torch.tensor((len(train_dataloader.dataset) - n_pos) / n_pos if n_pos > 0 else 1.) + loss_fn_pairwise = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight) logger.info(f"Model loaded: {model}", ) # Load stored model, if available @@ -323,14 +235,15 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g 'cc-nosdp', 'cc-nosdp-fixed'] cc_inference_sdp.eval() cc_inference_nosdp.eval() - _, val_dataloader_e2e, test_dataloader_e2e = get_dataloaders(hyp["dataset"], - hyp["dataset_random_seed"], - hyp["convert_nan"], - hyp["nan_value"], - hyp["normalize_data"], - hyp["subsample_sz_train"], - hyp["subsample_sz_dev"], - pairwise_mode=False, batch_size=1) + val_dataloader_e2e, test_dataloader_e2e = get_dataloaders(hyp["dataset"], + hyp["dataset_random_seed"], + hyp["convert_nan"], + hyp["nan_value"], + hyp["normalize_data"], + hyp["subsample_sz_train"], + hyp["subsample_sz_dev"], + pairwise_mode=False, batch_size=1, + split=['dev', 'test']) start_time = time.time() with torch.no_grad(): model.eval() @@ -342,7 +255,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g clustering_threshold=clustering_threshold if i % 2 == 0 else None, val_dataloader=val_dataloader_e2e, tqdm_label='test clustering', device=device, verbose=verbose, - debug=debug, _errors=_errors) + debug=debug, _errors=_errors, model_args=model_args) if inference_fn.__class__ is HACInference: clustering_threshold = inference_fn.cut_threshold logger.info( @@ -360,18 +273,16 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g end_time = time.time() elif eval_only_split is not None: # Run inference on the specified split and exit - dataloaders = { - 'train': train_dataloader, - 'dev': val_dataloader, - 'test': test_dataloader - } start_time = time.time() with torch.no_grad(): model.eval() - - eval_dataloader = dataloaders[eval_only_split] + eval_dataloader = get_dataloaders(hyp["dataset"], hyp["dataset_random_seed"], + hyp["convert_nan"], hyp["nan_value"], + hyp["normalize_data"], hyp["subsample_sz_train"], + hyp["subsample_sz_dev"], pairwise_mode, + batch_size, split=eval_only_split) eval_scores = eval_fn(model, eval_dataloader, tqdm_label=eval_only_split, device=device, verbose=verbose, - debug=debug, _errors=_errors) + debug=debug, _errors=_errors, model_args=model_args) logger.info(f"Eval: {eval_only_split}_{list(eval_metric_to_idx)[0]}={eval_scores[0]}, " + f"{eval_only_split}_{list(eval_metric_to_idx)[1]}={eval_scores[1]}") # Log eval metrics @@ -380,7 +291,6 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g if len(eval_scores) == 3: log_cc_objective_values(scores=eval_scores, split_name=eval_only_split, log_prefix='Eval', verbose=verbose, logger=logger) - # For pairwise-mode: if pairwise_clustering_fns[0] is not None: clustering_threshold = None @@ -390,7 +300,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g clustering_threshold=clustering_threshold, val_dataloader=val_dataloader_e2e, tqdm_label='test clustering', device=device, verbose=verbose, - debug=debug, _errors=_errors) + debug=debug, _errors=_errors, model_args=model_args) if 
pairwise_clustering_fn.__class__ is HACInference: clustering_threshold = pairwise_clustering_fn.cut_threshold logger.info( @@ -430,15 +340,14 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g if not skip_initial_eval: # Get initial model performance on dev (or 'train' for overfitting runs) - _state_dict_path = copy_and_load_model(model, run.dir, device, store_only=True) - _proc = Process(target=init_eval, - kwargs=dict(model_class=model.__class__, model_args=model_args, - state_dict_path=_state_dict_path, - overfit_batch_idx=overfit_batch_idx, eval_fn=eval_fn, - train_dataloader=train_dataloader, device=device, verbose=verbose, - debug=debug, _errors=_errors, eval_metric_to_idx=eval_metric_to_idx, - val_dataloader=val_dataloader, return_dict=_return_dict)) - _proc.start() + _proc = fork_eval(target=init_eval, args=dict(model_args=model_args, + overfit_batch_idx=overfit_batch_idx, eval_fn=eval_fn, + train_dataloader=train_dataloader, device=device, + verbose=verbose, + debug=debug, _errors=_errors, + eval_metric_to_idx=eval_metric_to_idx, + val_dataloader=val_dataloader, return_dict=_return_dict), + model=model, run_dir=run.dir, device=device, logger=logger) if not pairwise_mode and grad_acc > 1: grad_acc_steps = [] _seen_pw = 0 @@ -473,9 +382,9 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g grad_acc_idx = 0 optimizer.zero_grad() - for (idx, batch) in enumerate(tqdm(_train_dataloader, - desc=f"{'Warm-starting' if warmstart_mode else 'Training'} {i + 1}", - position=1)): + pbar = tqdm(_train_dataloader, desc=f"{'Warm-starting' if warmstart_mode else 'Training'} {i + 1}", + position=1) + for (idx, batch) in enumerate(pbar): best_epoch, best_dev_score, best_dev_scores, best_dev_state_dict = _check_process(_proc, _return_dict, logger, run, @@ -506,6 +415,8 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g # Block contains only one signature pair; batchnorm throws error continue block_size = get_matrix_size_from_triu(data) + pbar.set_description(f"{'Warm-starting' if warmstart_mode else 'Training'} {i + 1} " + \ + f"(sz={len(data) if (pairwise_mode or warmstart_mode) else block_size})") target = target.flatten().float() if verbose: logger.info(f"Batch shape: {data.shape}") @@ -611,29 +522,29 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g wandb.log({f'train_epoch_loss': np.mean(running_loss)}) # Get model performance on dev (or 'train' for overfitting runs) - _state_dict_path = copy_and_load_model(model, run.dir, device, store_only=True) - _proc = Process(target=dev_eval, - kwargs=dict(model_class=model.__class__, model_args=model_args, - state_dict_path=_state_dict_path, overfit_batch_idx=overfit_batch_idx, - eval_fn=eval_fn, train_dataloader=train_dataloader, device=device, + _proc = fork_eval(target=dev_eval, + args=dict(model_args=model_args, + overfit_batch_idx=overfit_batch_idx, eval_fn=eval_fn, + train_dataloader=train_dataloader, device=device, verbose=verbose, debug=debug, _errors=_errors, eval_metric_to_idx=eval_metric_to_idx, val_dataloader=val_dataloader, - return_dict=_return_dict, i=i, run_dir=run.dir)) - _proc.start() + return_dict=_return_dict, i=i), + model=model, run_dir=run.dir, device=device, logger=logger, + sync=(idx == len(_train_dataloader.dataset) - 1)) end_time = time.time() best_epoch, best_dev_score, best_dev_scores, best_dev_state_dict = _check_process(_proc, _return_dict, - logger, run, - overfit_batch_idx, - use_lr_scheduler, 
- hyp, scheduler, - eval_metric_to_idx, - dev_opt_metric, i, - best_epoch, - best_dev_score, - best_dev_scores, - best_dev_state_dict, - sync=True) + logger, run, + overfit_batch_idx, + use_lr_scheduler, + hyp, scheduler, + eval_metric_to_idx, + dev_opt_metric, i, + best_epoch, + best_dev_score, + best_dev_scores, + best_dev_state_dict, + sync=True) # Save model if save_model: torch.save(best_dev_state_dict, os.path.join(run.dir, 'model_state_dict_best.pt')) @@ -646,7 +557,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g with torch.no_grad(): model.eval() test_scores = eval_fn(model, test_dataloader, tqdm_label='test', device=device, verbose=verbose, - debug=debug, _errors=_errors, tqdm_position=2) + debug=debug, _errors=_errors, tqdm_position=2, model_args=model_args) logger.info(f"Final: test_{list(eval_metric_to_idx)[0]}={test_scores[0]}, " + f"test_{list(eval_metric_to_idx)[1]}={test_scores[1]}") # Log final metrics @@ -667,7 +578,8 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g clustering_threshold=clustering_threshold, val_dataloader=val_dataloader_e2e, tqdm_label='test clustering', device=device, verbose=verbose, - debug=debug, _errors=_errors, tqdm_position=2) + debug=debug, _errors=_errors, tqdm_position=2, + model_args=model_args) if pairwise_clustering_fn.__class__ is HACInference: clustering_threshold = pairwise_clustering_fn.cut_threshold logger.info(f"Final: test_{list(clustering_metrics)[0]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[0]}, " + diff --git a/e2e_scripts/train_utils.py b/e2e_scripts/train_utils.py index c86c498..0fe40c9 100644 --- a/e2e_scripts/train_utils.py +++ b/e2e_scripts/train_utils.py @@ -9,26 +9,27 @@ from typing import Tuple, Optional import math import pickle +import torch +import numpy as np +import wandb from time import time +from sklearn.preprocessing import StandardScaler from torch.utils.data import DataLoader from s2and.consts import PREPROCESSED_DATA_DIR from s2and.data import S2BlocksDataset from s2and.eval import b3_precision_recall_fscore from torch import Tensor -import torch -import numpy as np -import wandb +from torch.multiprocessing import Process from IPython import embed - # Default hyperparameters DEFAULT_HYPERPARAMS = { # Dataset "dataset": "pubmed", "dataset_random_seed": 1, - "subsample_sz_train": 80, - "subsample_sz_dev": 100, + "subsample_sz_train": 60, + "subsample_sz_dev": -1, # Run config "run_random_seed": 17, "pairwise_mode": False, @@ -56,11 +57,11 @@ "sdp_eps": 1e-3, "sdp_scale": True, # Training config - "batch_size": 10000, # pairwise only; used by e2e if gradient_accumulation is true - "lr": 4e-3, + "batch_size": 8000, # pairwise only; used by e2e if gradient_accumulation is true + "lr": 1e-3, "n_epochs": 5, "n_warmstart_epochs": 0, - "weighted_loss": False, + "weighted_loss": True, "use_lr_scheduler": True, "lr_scheduler": "plateau", # "plateau", "step" "lr_factor": 0.4, @@ -69,7 +70,7 @@ "lr_step_size": 2, "lr_gamma": 0.4, "weight_decay": 0.01, - "gradient_accumulation": False, # e2e only; accumulate over pairwise examples + "gradient_accumulation": True, # e2e only; accumulate over pairwise examples "dev_opt_metric": 'b3_f1', # e2e: {'b3_f1', 'vmeasure'}; pairwise: {'auroc', 'f1'} "overfit_batch_idx": -1 } @@ -83,25 +84,42 @@ def read_blockwise_features(pkl): def get_dataloaders(dataset, dataset_seed, convert_nan, nan_value, normalize, subsample_sz_train, subsample_sz_dev, - pairwise_mode, batch_size): - train_pkl = 
f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{dataset_seed}/train_features.pkl" - val_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{dataset_seed}/val_features.pkl" - test_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{dataset_seed}/test_features.pkl" + pairwise_mode, batch_size, shuffle=False, split=None): + pickle_path = { + 'train': f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{dataset_seed}/train_features.pkl", + 'dev': f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{dataset_seed}/val_features.pkl", + 'test': f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{dataset_seed}/test_features.pkl" + } + subsample_sz = { + 'train': subsample_sz_train, + 'dev': subsample_sz_dev, + 'test': -1 + } + train_scaler = StandardScaler() + train_X = np.concatenate(list(map(lambda x: x[0], read_blockwise_features(pickle_path['train']).values()))) + train_scaler.fit(train_X) - train_dataset = S2BlocksDataset(read_blockwise_features(train_pkl), convert_nan=convert_nan, nan_value=nan_value, - scale=normalize, subsample_sz=subsample_sz_train, pairwise_mode=pairwise_mode) - train_dataloader = DataLoader(train_dataset, shuffle=False, batch_size=batch_size) + def _get_dataloader(_split): + dataset = S2BlocksDataset(read_blockwise_features(pickle_path[_split]), convert_nan=convert_nan, + nan_value=nan_value, scale=normalize, scaler=train_scaler, + subsample_sz=subsample_sz[_split], + pairwise_mode=pairwise_mode, sort_desc=(_split in ['dev', 'test'])) + dataloader = DataLoader(dataset, shuffle=shuffle, batch_size=batch_size) + return dataloader - val_dataset = S2BlocksDataset(read_blockwise_features(val_pkl), convert_nan=convert_nan, nan_value=nan_value, - scale=normalize, scaler=train_dataset.scaler, subsample_sz=subsample_sz_dev, - pairwise_mode=pairwise_mode) - val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size) + if split is None: + return _get_dataloader('train'), _get_dataloader('dev'), _get_dataloader('test') + if type(split) is str: + return _get_dataloader(split) + if type(split) is list: + return tuple([_get_dataloader(_split) for _split in split]) + raise ValueError('Invalid argument to split') - test_dataset = S2BlocksDataset(read_blockwise_features(test_pkl), convert_nan=convert_nan, nan_value=nan_value, - scale=normalize, scaler=train_dataset.scaler, pairwise_mode=pairwise_mode) - test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size) - return train_dataloader, val_dataloader, test_dataloader +def get_feature_count(dataset, dataset_seed): + data_fpath = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{dataset_seed}/test_features.pkl" + block_dict = read_blockwise_features(data_fpath) + return next(iter(block_dict.values()))[0].shape[1] def uncompress_target_tensor(compressed_targets, make_symmetric=True, device=None): @@ -205,3 +223,118 @@ def copy_and_load_model(model, run_dir, device, store_only=False): _model.load_state_dict(_STATE_DICT) os.remove(_PATH) return _model + + +def _check_process(_proc, _return_dict, logger, run, overfit_batch_idx, use_lr_scheduler, hyp, + scheduler, eval_metric_to_idx, dev_opt_metric, i, best_epoch, best_dev_score, + best_dev_scores, best_dev_state_dict, sync=False): + if _proc is not None: + if _return_dict['_state'] == 'done' or (sync and _return_dict['_state'] != 'finish'): + _proc.join() + _return_dict['_state'] = 'finish' + if _return_dict['_method'] == 'init_eval': + logger.info(_return_dict['local']) + run.log(_return_dict['wandb']) + if overfit_batch_idx == -1: + best_dev_scores = _return_dict['dev_scores'] + best_dev_score = 
best_dev_scores[eval_metric_to_idx[dev_opt_metric]] + elif _return_dict['_method'] == 'dev_eval': + logger.info(_return_dict['local']) + run.log(_return_dict['wandb']) + if overfit_batch_idx > -1: + if use_lr_scheduler: + if hyp['lr_scheduler'] == 'plateau': + scheduler.step(_return_dict['train_scores'][eval_metric_to_idx[dev_opt_metric]]) + elif hyp['lr_scheduler'] == 'step': + scheduler.step() + else: + dev_scores = _return_dict['dev_scores'] + dev_opt_score = dev_scores[eval_metric_to_idx[dev_opt_metric]] + if dev_opt_score > best_dev_score: + logger.info(f"New best dev {dev_opt_metric} score @ epoch{i + 1}: {dev_opt_score}") + best_epoch = i + best_dev_score = dev_opt_score + best_dev_scores = dev_scores + best_dev_state_dict = torch.load(_return_dict['state_dict_path']) + if use_lr_scheduler: + if hyp['lr_scheduler'] == 'plateau': + scheduler.step(dev_scores[eval_metric_to_idx[dev_opt_metric]]) + elif hyp['lr_scheduler'] == 'step': + scheduler.step() + return best_epoch, best_dev_score, best_dev_scores, best_dev_state_dict + + +def init_eval(model_class, model_args, state_dict_path, overfit_batch_idx, eval_fn, train_dataloader, device, verbose, + debug, _errors, eval_metric_to_idx, val_dataloader, return_dict): + return_dict['_state'] = 'start' + return_dict['_method'] = 'init_eval' + model = model_class(*model_args) + model.load_state_dict(torch.load(state_dict_path)) + model.to(device) + with torch.no_grad(): + model.eval() + if overfit_batch_idx > -1: + train_scores = eval_fn(model, train_dataloader, overfit_batch_idx=overfit_batch_idx, + tqdm_label='train', device=device, verbose=verbose, debug=debug, + _errors=_errors, tqdm_position=0, model_args=model_args) + return_dict['local'] = f"Initial: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + \ + f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}" + return_dict['wandb'] = {'epoch': 0, f'train_{list(eval_metric_to_idx)[0]}': train_scores[0], + f'train_{list(eval_metric_to_idx)[1]}': train_scores[1]} + else: + dev_scores = eval_fn(model, val_dataloader, tqdm_label='dev 0', device=device, verbose=verbose, + debug=debug, _errors=_errors, tqdm_position=0, model_args=model_args) + return_dict['local'] = f"Initial: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + \ + f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}" + return_dict['wandb'] = {'epoch': 0, f'dev_{list(eval_metric_to_idx)[0]}': dev_scores[0], + f'dev_{list(eval_metric_to_idx)[1]}': dev_scores[1]} + return_dict['dev_scores'] = dev_scores + del model + return_dict['_state'] = 'done' + return return_dict + + +def dev_eval(model_class, model_args, state_dict_path, overfit_batch_idx, eval_fn, train_dataloader, device, verbose, + debug, _errors, eval_metric_to_idx, val_dataloader, return_dict, i): + return_dict['_state'] = 'start' + return_dict['_method'] = 'dev_eval' + return_dict['state_dict_path'] = state_dict_path + model = model_class(*model_args) + model.load_state_dict(torch.load(state_dict_path)) + model.to(device) + with torch.no_grad(): + model.eval() + if overfit_batch_idx > -1: + train_scores = eval_fn(model, train_dataloader, overfit_batch_idx=overfit_batch_idx, + tqdm_label='train', device=device, verbose=verbose, debug=debug, + _errors=_errors, model_args=model_args) + return_dict['local'] = f"Epoch {i + 1}: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + \ + f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}" + return_dict['wandb'] = {f'train_{list(eval_metric_to_idx)[0]}': train_scores[0], + 
f'train_{list(eval_metric_to_idx)[1]}': train_scores[1]} + return_dict['train_scores'] = train_scores + else: + dev_scores = eval_fn(model, val_dataloader, tqdm_label=f'dev {i + 1}', device=device, verbose=verbose, + debug=debug, _errors=_errors, model_args=model_args) + return_dict['local'] = f"Epoch {i + 1}: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + \ + f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}" + return_dict['wandb'] = {f'dev_{list(eval_metric_to_idx)[0]}': dev_scores[0], + f'dev_{list(eval_metric_to_idx)[1]}': dev_scores[1]} + return_dict['dev_scores'] = dev_scores + del model + return_dict['_state'] = 'done' + return return_dict + + +def fork_eval(target, args, model, run_dir, device, logger, sync=False): + state_dict_path = copy_and_load_model(model, run_dir, device, store_only=True) + args['model_class'] = model.__class__ + args['state_dict_path'] = state_dict_path + if sync: + target(**args) + proc = Process() + else: + proc = Process(target=target, kwargs=args) + logger.info('Forking eval') + proc.start() + return proc diff --git a/s2and/data.py b/s2and/data.py index 9d75eb1..745251b 100644 --- a/s2and/data.py +++ b/s2and/data.py @@ -128,7 +128,7 @@ class S2BlocksDataset(Dataset): """ def __init__(self, block_dict: Dict[str, Tuple[np.ndarray, np.ndarray, np.ndarray]], convert_nan=True, nan_value=-1, scale=False, scaler=None, subsample_sz=-1, - pairwise_mode=False): + pairwise_mode=False, sort_desc=False): self.pairwise_mode = pairwise_mode self.block_dict = block_dict self.convert_nan = convert_nan @@ -171,6 +171,11 @@ def __init__(self, block_dict: Dict[str, Tuple[np.ndarray, np.ndarray, np.ndarra else: self.blockwise_data.append((X, y, cluster_ids)) self.blockwise_keys.append(dict_key) + if sort_desc: + self.blockwise_keys = list(map(lambda x: x[1], sorted(enumerate(self.blockwise_keys), + key=lambda x: len(self.blockwise_data[x[0]][2]), + reverse=True))) + self.blockwise_data.sort(key=lambda x: -len(x[2])) if self.pairwise_mode: self.pairwise_data = {'X': [], 'y': []} self.cluster_ids = [] diff --git a/utils/parser.py b/utils/parser.py index f16f61f..de21db8 100644 --- a/utils/parser.py +++ b/utils/parser.py @@ -38,6 +38,9 @@ def add_preprocessing_args(self): parser.add_argument( "--dataset_name", type=str, help="name of AND dataset that you want to preprocess" ) + parser.add_argument( + "--dataset_seed", type=int + ) def add_training_args(self): """ @@ -94,7 +97,7 @@ def add_training_args(self): help="Whether to prevent wandb sweep early terminate or not", ) parser.add_argument( - "--wandb_max_runs", type=int, default=600, + "--wandb_max_runs", type=int, default=120, help="Maximum number of runs to try in the sweep", ) parser.add_argument( diff --git a/wandb_configs/sweeps/e2e-nosdp-warm.json b/wandb_configs/sweeps/e2e-nosdp-warm.json index 3846f0d..294b543 100644 --- a/wandb_configs/sweeps/e2e-nosdp-warm.json +++ b/wandb_configs/sweeps/e2e-nosdp-warm.json @@ -8,9 +8,9 @@ "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "subsample_sz_train": {"value": 80}, - "subsample_sz_dev": {"value": 100}, "activation": {"values": ["leaky_relu", "relu"]}, "use_sdp": {"value": false}, - "n_warmstart_epochs": {"value": 2} + "n_warmstart_epochs": {"value": 2}, + "gradient_accumulation": {"values": [true, false]}, + "weighted_loss": {"values": [true, false]} } diff --git a/wandb_configs/sweeps/e2e-nosdp.json b/wandb_configs/sweeps/e2e-nosdp.json index 4e02afe..5b47c39 100644 --- 
a/wandb_configs/sweeps/e2e-nosdp.json +++ b/wandb_configs/sweeps/e2e-nosdp.json @@ -8,8 +8,8 @@ "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "subsample_sz_train": {"value": 80}, - "subsample_sz_dev": {"value": 100}, "activation": {"values": ["leaky_relu", "relu"]}, - "use_sdp": {"value": false} + "use_sdp": {"value": false}, + "gradient_accumulation": {"values": [true, false]}, + "weighted_loss": {"values": [true, false]} } diff --git a/wandb_configs/sweeps/e2e-warm.json b/wandb_configs/sweeps/e2e-warm.json index 77de43c..19e511b 100644 --- a/wandb_configs/sweeps/e2e-warm.json +++ b/wandb_configs/sweeps/e2e-warm.json @@ -8,8 +8,8 @@ "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "subsample_sz_train": {"value": 80}, - "subsample_sz_dev": {"value": 100}, "activation": {"values": ["leaky_relu", "relu"]}, - "n_warmstart_epochs": {"value": 2} + "n_warmstart_epochs": {"value": 2}, + "gradient_accumulation": {"values": [true, false]}, + "weighted_loss": {"values": [true, false]} } diff --git a/wandb_configs/sweeps/e2e.json b/wandb_configs/sweeps/e2e.json index 20991ba..e084f00 100644 --- a/wandb_configs/sweeps/e2e.json +++ b/wandb_configs/sweeps/e2e.json @@ -8,7 +8,7 @@ "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "subsample_sz_train": {"value": 80}, - "subsample_sz_dev": {"value": 100}, - "activation": {"values": ["leaky_relu", "relu"]} + "activation": {"values": ["leaky_relu", "relu"]}, + "gradient_accumulation": {"values": [true, false]}, + "weighted_loss": {"values": [true, false]} } diff --git a/wandb_configs/sweeps/frac-nosdp-warm.json b/wandb_configs/sweeps/frac-nosdp-warm.json index 75503ce..491e04c 100644 --- a/wandb_configs/sweeps/frac-nosdp-warm.json +++ b/wandb_configs/sweeps/frac-nosdp-warm.json @@ -8,10 +8,10 @@ "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "subsample_sz_train": {"value": 80}, - "subsample_sz_dev": {"value": 100}, "activation": {"values": ["leaky_relu", "relu"]}, "use_rounded_loss": {"value": false}, "use_sdp": {"value": false}, - "n_warmstart_epochs": {"value": 2} + "n_warmstart_epochs": {"value": 2}, + "gradient_accumulation": {"values": [true, false]}, + "weighted_loss": {"values": [true, false]} } diff --git a/wandb_configs/sweeps/frac-nosdp.json b/wandb_configs/sweeps/frac-nosdp.json index f27ee08..d9a1e41 100644 --- a/wandb_configs/sweeps/frac-nosdp.json +++ b/wandb_configs/sweeps/frac-nosdp.json @@ -8,9 +8,9 @@ "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "subsample_sz_train": {"value": 80}, - "subsample_sz_dev": {"value": 100}, "activation": {"values": ["leaky_relu", "relu"]}, "use_rounded_loss": {"value": false}, - "use_sdp": {"value": false} + "use_sdp": {"value": false}, + "gradient_accumulation": {"values": [true, false]}, + "weighted_loss": {"values": [true, false]} } diff --git a/wandb_configs/sweeps/frac-warm.json b/wandb_configs/sweeps/frac-warm.json index fa4b935..b13efc5 100644 --- a/wandb_configs/sweeps/frac-warm.json +++ b/wandb_configs/sweeps/frac-warm.json @@ -8,9 +8,9 @@ "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - 
"subsample_sz_train": {"value": 80}, - "subsample_sz_dev": {"value": 100}, "activation": {"values": ["leaky_relu", "relu"]}, "use_rounded_loss": {"value": false}, - "n_warmstart_epochs": {"value": 2} + "n_warmstart_epochs": {"value": 2}, + "gradient_accumulation": {"values": [true, false]}, + "weighted_loss": {"values": [true, false]} } diff --git a/wandb_configs/sweeps/frac.json b/wandb_configs/sweeps/frac.json index 7eb6812..a572b76 100644 --- a/wandb_configs/sweeps/frac.json +++ b/wandb_configs/sweeps/frac.json @@ -8,8 +8,8 @@ "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "subsample_sz_train": {"value": 80}, - "subsample_sz_dev": {"value": 100}, "activation": {"values": ["leaky_relu", "relu"]}, - "use_rounded_loss": {"value": false} + "use_rounded_loss": {"value": false}, + "gradient_accumulation": {"values": [true, false]}, + "weighted_loss": {"values": [true, false]} } diff --git a/wandb_configs/sweeps/mlp.json b/wandb_configs/sweeps/mlp.json index a5f49fc..24274c7 100644 --- a/wandb_configs/sweeps/mlp.json +++ b/wandb_configs/sweeps/mlp.json @@ -10,5 +10,6 @@ "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, "activation": {"values": ["leaky_relu", "relu"]}, + "gradient_accumulation": {"value": false}, "weighted_loss": {"value": true} } From 69617d842773b56e39dfef985ab86f986e6a895c Mon Sep 17 00:00:00 2001 From: Dhruv Agarwal Date: Sat, 18 Mar 2023 14:44:53 -0400 Subject: [PATCH 08/17] Add gradient clipping with norm --- e2e_scripts/train.py | 4 ++++ e2e_scripts/train_utils.py | 1 + 2 files changed, 5 insertions(+) diff --git a/e2e_scripts/train.py b/e2e_scripts/train.py index a645257..eb0b483 100644 --- a/e2e_scripts/train.py +++ b/e2e_scripts/train.py @@ -493,6 +493,10 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g continue if pairwise_mode or ( idx == len(_train_dataloader.dataset) - 1) or grad_acc == 1 or grad_acc_count >= grad_acc: + if hyp["max_grad_norm"] != -1: + torch.nn.utils.clip_grad_norm_( + model.parameters(), hyp["max_grad_norm"] + ) optimizer.step() optimizer.zero_grad() if grad_acc > 1: diff --git a/e2e_scripts/train_utils.py b/e2e_scripts/train_utils.py index 0fe40c9..4083905 100644 --- a/e2e_scripts/train_utils.py +++ b/e2e_scripts/train_utils.py @@ -69,6 +69,7 @@ "lr_scheduler_patience": 2, "lr_step_size": 2, "lr_gamma": 0.4, + "max_grad_norm": 1, # Off if set to -1 "weight_decay": 0.01, "gradient_accumulation": True, # e2e only; accumulate over pairwise examples "dev_opt_metric": 'b3_f1', # e2e: {'b3_f1', 'vmeasure'}; pairwise: {'auroc', 'f1'} From d84ae96ba1536d43fd209c3656440c4eabc1cf05 Mon Sep 17 00:00:00 2001 From: Dhruv Agarwal Date: Sat, 18 Mar 2023 16:31:36 -0400 Subject: [PATCH 09/17] Set limit on parallel forks in parallel iterations --- e2e_scripts/evaluate.py | 145 ++++++++++++++++++++-------------------- 1 file changed, 73 insertions(+), 72 deletions(-) diff --git a/e2e_scripts/evaluate.py b/e2e_scripts/evaluate.py index 71ec3d6..f3d15bc 100644 --- a/e2e_scripts/evaluate.py +++ b/e2e_scripts/evaluate.py @@ -58,7 +58,7 @@ def _fork_iter(batch_idx, _fork_id, _shared_list, eval_fn, **kwargs): def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, clustering_threshold=None, val_dataloader=None, tqdm_label='', device=None, verbose=False, debug=False, _errors=None, run_dir='./', tqdm_position=None, model_args=None, return_iter=False, fork_size=500, - disable_tqdm=False): + 
max_parallel_forks=5, disable_tqdm=False): """ clustering_fn, clustering_threshold, val_dataloader: unused when pairwise_mode is False (only added to keep fn signature identical) @@ -96,41 +96,41 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, cluste # Only one signature in block; manually assign a unique cluster pred_cluster_ids = [max_pred_id + 1] elif fork_enabled and block_size >= fork_size: - _proc = _fork_iter(idx, _fork_id, _shared_list, evaluate, **fn_args) - _fork_id += 1 - _procs.append((_proc, block_size)) - continue - else: - # Forward pass through the e2e model - data = data.to(device) - try: - _ = model(data, block_size, verbose=verbose) - except CvxpyException as e: - logger.info(e) - _error_obj = { - 'id': f'e_{int(time())}', - 'method': 'eval', - 'model_type': 'e2e', - 'data_split': tqdm_label, - 'model_call_args': { - 'data': data.detach().tolist(), - 'block_size': block_size - }, - 'cvxpy_layer_args': e.data - } - if _errors is not None: - _errors.append(_error_obj) - save_to_wandb_run({'errors': _errors}, 'errors.json', run_dir, logger) - if not debug: # if tqdm_label is not 'dev' and not debug: - raise CvxpyException(data=_error_obj) - n_exceptions += 1 - logger.info(f'Caught CvxpyException {n_exceptions}: skipping batch') + if (len(_procs) - len(_shared_list)) < max_parallel_forks: + _proc = _fork_iter(idx, _fork_id, _shared_list, evaluate, **fn_args) + _fork_id += 1 + _procs.append((_proc, block_size)) continue - pred_cluster_ids = (model.hac_cut_layer.cluster_labels + (max_pred_id + 1)).tolist() - cc_obj_vals['round'].append(model.hac_cut_layer.objective_value) - cc_obj_vals['sdp'].append(model.sdp_layer.objective_value) - cc_obj_vals['block_idxs'].append(idx) - cc_obj_vals['block_sizes'].append(block_size) + # Forward pass through the e2e model + data = data.to(device) + try: + _ = model(data, block_size, verbose=verbose) + except CvxpyException as e: + logger.info(e) + _error_obj = { + 'id': f'e_{int(time())}', + 'method': 'eval', + 'model_type': 'e2e', + 'data_split': tqdm_label, + 'model_call_args': { + 'data': data.detach().tolist(), + 'block_size': block_size + }, + 'cvxpy_layer_args': e.data + } + if _errors is not None: + _errors.append(_error_obj) + save_to_wandb_run({'errors': _errors}, 'errors.json', run_dir, logger) + if not debug: # if tqdm_label is not 'dev' and not debug: + raise CvxpyException(data=_error_obj) + n_exceptions += 1 + logger.info(f'Caught CvxpyException {n_exceptions}: skipping batch') + continue + pred_cluster_ids = (model.hac_cut_layer.cluster_labels + (max_pred_id + 1)).tolist() + cc_obj_vals['round'].append(model.hac_cut_layer.objective_value) + cc_obj_vals['sdp'].append(model.sdp_layer.objective_value) + cc_obj_vals['block_idxs'].append(idx) + cc_obj_vals['block_sizes'].append(block_size) all_gold += list(np.reshape(cluster_ids, (block_size,))) max_pred_id = max(pred_cluster_ids) all_pred += list(pred_cluster_ids) @@ -167,7 +167,8 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, cluste def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", return_pred_only=False, thresh_for_f1=0.5, clustering_fn=None, clustering_threshold=None, val_dataloader=None, tqdm_label='', device=None, verbose=False, debug=False, _errors=None, run_dir='./', - tqdm_position=None, model_args=None, return_iter=False, fork_size=500, disable_tqdm=False): + tqdm_position=None, model_args=None, return_iter=False, fork_size=500, max_parallel_forks=5, + disable_tqdm=False): fn_args = 
locals() fork_enabled = fork_size > -1 and model_args is not None if fork_enabled: @@ -206,43 +207,43 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret # Only one signature in block; manually assign a unique cluster pred_cluster_ids = [max_pred_id + 1] elif fork_enabled and block_size >= fork_size and clustering_fn.__class__ is CCInference: - _proc = _fork_iter(idx, _fork_id, _shared_list, evaluate_pairwise, **fn_args) - _fork_id += 1 - _procs.append((_proc, block_size)) - continue - else: - # Forward pass through the e2e model - data = data.to(device) - try: - edge_weights = model(data, N=block_size, warmstart=True, verbose=verbose) - pred_cluster_ids = clustering_fn(edge_weights, block_size, min_id=(max_pred_id + 1), - threshold=clustering_threshold) - except CvxpyException as e: - logger.info(e) - _error_obj = { - 'id': f'e_{int(time())}', - 'method': 'eval', - 'model_type': 'pairwise_cc', - 'data_split': tqdm_label, - 'model_call_args': { - 'data': data.detach().tolist(), - 'block_size': block_size - }, - 'cvxpy_layer_args': e.data - } - if _errors is not None: - _errors.append(_error_obj) - save_to_wandb_run({'errors': _errors}, 'errors.json', run_dir, logger) - if not debug: # if tqdm_label is not 'dev' and not debug: - raise CvxpyException(data=_error_obj) - n_exceptions += 1 - logger.info(f'Caught CvxpyException {n_exceptions}: skipping batch') + if (len(_procs) - len(_shared_list)) < max_parallel_forks: + _proc = _fork_iter(idx, _fork_id, _shared_list, evaluate_pairwise, **fn_args) + _fork_id += 1 + _procs.append((_proc, block_size)) continue - if clustering_fn.__class__ is CCInference: - cc_obj_vals['round'].append(clustering_fn.hac_cut_layer.objective_value) - cc_obj_vals['sdp'].append(clustering_fn.sdp_layer.objective_value) - cc_obj_vals['block_idxs'].append(idx) - cc_obj_vals['block_sizes'].append(block_size) + # Forward pass through the e2e model + data = data.to(device) + try: + edge_weights = model(data, N=block_size, warmstart=True, verbose=verbose) + pred_cluster_ids = clustering_fn(edge_weights, block_size, min_id=(max_pred_id + 1), + threshold=clustering_threshold) + except CvxpyException as e: + logger.info(e) + _error_obj = { + 'id': f'e_{int(time())}', + 'method': 'eval', + 'model_type': 'pairwise_cc', + 'data_split': tqdm_label, + 'model_call_args': { + 'data': data.detach().tolist(), + 'block_size': block_size + }, + 'cvxpy_layer_args': e.data + } + if _errors is not None: + _errors.append(_error_obj) + save_to_wandb_run({'errors': _errors}, 'errors.json', run_dir, logger) + if not debug: # if tqdm_label is not 'dev' and not debug: + raise CvxpyException(data=_error_obj) + n_exceptions += 1 + logger.info(f'Caught CvxpyException {n_exceptions}: skipping batch') + continue + if clustering_fn.__class__ is CCInference: + cc_obj_vals['round'].append(clustering_fn.hac_cut_layer.objective_value) + cc_obj_vals['sdp'].append(clustering_fn.sdp_layer.objective_value) + cc_obj_vals['block_idxs'].append(idx) + cc_obj_vals['block_sizes'].append(block_size) all_gold += list(np.reshape(cluster_ids, (block_size,))) max_pred_id = max(pred_cluster_ids) all_pred += list(pred_cluster_ids) From 5bfa8a2b7257ffb0a876355ce4d517aca5892fef Mon Sep 17 00:00:00 2001 From: Dhruv Agarwal Date: Sat, 18 Mar 2023 16:54:31 -0400 Subject: [PATCH 10/17] Set limit on parallel forks in parallel iterations --- e2e_scripts/evaluate.py | 6 +++--- e2e_scripts/train_utils.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/e2e_scripts/evaluate.py 
b/e2e_scripts/evaluate.py index f3d15bc..340e14e 100644 --- a/e2e_scripts/evaluate.py +++ b/e2e_scripts/evaluate.py @@ -57,8 +57,8 @@ def _fork_iter(batch_idx, _fork_id, _shared_list, eval_fn, **kwargs): def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, clustering_threshold=None, val_dataloader=None, tqdm_label='', device=None, verbose=False, debug=False, _errors=None, - run_dir='./', tqdm_position=None, model_args=None, return_iter=False, fork_size=500, - max_parallel_forks=5, disable_tqdm=False): + run_dir='./', tqdm_position=None, model_args=None, return_iter=False, fork_size=300, + max_parallel_forks=4, disable_tqdm=False): """ clustering_fn, clustering_threshold, val_dataloader: unused when pairwise_mode is False (only added to keep fn signature identical) @@ -167,7 +167,7 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, cluste def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", return_pred_only=False, thresh_for_f1=0.5, clustering_fn=None, clustering_threshold=None, val_dataloader=None, tqdm_label='', device=None, verbose=False, debug=False, _errors=None, run_dir='./', - tqdm_position=None, model_args=None, return_iter=False, fork_size=500, max_parallel_forks=5, + tqdm_position=None, model_args=None, return_iter=False, fork_size=300, max_parallel_forks=4, disable_tqdm=False): fn_args = locals() fork_enabled = fork_size > -1 and model_args is not None diff --git a/e2e_scripts/train_utils.py b/e2e_scripts/train_utils.py index 4083905..617f064 100644 --- a/e2e_scripts/train_utils.py +++ b/e2e_scripts/train_utils.py @@ -29,7 +29,7 @@ "dataset": "pubmed", "dataset_random_seed": 1, "subsample_sz_train": 60, - "subsample_sz_dev": -1, + "subsample_sz_dev": 300, # Run config "run_random_seed": 17, "pairwise_mode": False, From b1bbb49e74c7232c1ceda0fd7ec5e05a06d1cc63 Mon Sep 17 00:00:00 2001 From: Dhruv Agarwal Date: Sat, 18 Mar 2023 17:00:38 -0400 Subject: [PATCH 11/17] Add log message for iteration fork --- e2e_scripts/evaluate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/e2e_scripts/evaluate.py b/e2e_scripts/evaluate.py index 340e14e..bfaf5e9 100644 --- a/e2e_scripts/evaluate.py +++ b/e2e_scripts/evaluate.py @@ -51,6 +51,7 @@ def _fork_iter(batch_idx, _fork_id, _shared_list, eval_fn, **kwargs): kwargs['device'] = 'cpu' kwargs['eval_fn'] = eval_fn _proc = Process(target=_run_iter, kwargs=kwargs) + logger.info('Forking eval iteration') _proc.start() return _proc From 87081d7908e8dc553fad31fab9e4407fa2d8fd84 Mon Sep 17 00:00:00 2001 From: Dhruv Agarwal Date: Mon, 20 Mar 2023 12:30:24 -0400 Subject: [PATCH 12/17] Larger datasets, hyperparameter search space, icml_final_eval (#42) --- add_agent.sh | 2 +- e2e_scripts/evaluate.py | 20 ++- e2e_scripts/train.py | 154 ++++++++++++++++------ e2e_scripts/train_utils.py | 3 +- get_wandb_results.py | 126 ++++++++++++++++++ rerun_batch.sh | 127 ++++++++++++++++++ rerun_best.sh | 19 +++ run_sweep.sh | 6 +- utils/parser.py | 4 + wandb_configs/sweeps/e2e-nosdp-warm.json | 8 +- wandb_configs/sweeps/e2e-nosdp.json | 6 +- wandb_configs/sweeps/e2e-warm.json | 8 +- wandb_configs/sweeps/e2e.json | 6 +- wandb_configs/sweeps/frac-nosdp-warm.json | 6 +- wandb_configs/sweeps/frac-nosdp.json | 6 +- wandb_configs/sweeps/frac-warm.json | 6 +- wandb_configs/sweeps/frac.json | 6 +- wandb_configs/sweeps/mlp.json | 6 +- 18 files changed, 441 insertions(+), 78 deletions(-) create mode 100644 get_wandb_results.py create mode 100644 rerun_batch.sh create mode 100644 rerun_best.sh diff --git 
a/add_agent.sh b/add_agent.sh index 13b93f5..eba0497 100644 --- a/add_agent.sh +++ b/add_agent.sh @@ -10,7 +10,7 @@ gpu_name=${6:-"gypsum-1080ti"} # "gypsum-1080ti" for ((i = 1; i <= ${n_agents}; i++)); do JOB_DESC=${model}_${dataset}_sweep${seed}-${i} && JOB_NAME=${JOB_DESC}_$(date +%s) && \ sbatch -J ${JOB_NAME} -e jobs/${JOB_NAME}.err -o jobs/${JOB_NAME}.log \ - --partition=${gpu_name} --gres=gpu:1 --mem=100G --time=12:00:00 \ + --partition=${gpu_name} --gres=gpu:1 --mem=120G --time=12:00:00 \ run_sbatch.sh e2e_scripts/train.py \ --dataset="${dataset}" \ --dataset_random_seed=${seed} \ diff --git a/e2e_scripts/evaluate.py b/e2e_scripts/evaluate.py index bfaf5e9..8f61c70 100644 --- a/e2e_scripts/evaluate.py +++ b/e2e_scripts/evaluate.py @@ -58,8 +58,8 @@ def _fork_iter(batch_idx, _fork_id, _shared_list, eval_fn, **kwargs): def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, clustering_threshold=None, val_dataloader=None, tqdm_label='', device=None, verbose=False, debug=False, _errors=None, - run_dir='./', tqdm_position=None, model_args=None, return_iter=False, fork_size=300, - max_parallel_forks=4, disable_tqdm=False): + run_dir='./', tqdm_position=None, model_args=None, return_iter=False, fork_size=500, + max_parallel_forks=3, disable_tqdm=False): """ clustering_fn, clustering_threshold, val_dataloader: unused when pairwise_mode is False (only added to keep fn signature identical) @@ -97,6 +97,7 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, cluste # Only one signature in block; manually assign a unique cluster pred_cluster_ids = [max_pred_id + 1] elif fork_enabled and block_size >= fork_size: + logger.info(f"Eval fork info: len(_procs)={len(_procs)}, len(_shared_list)={len(_shared_list)}") if (len(_procs) - len(_shared_list)) < max_parallel_forks: _proc = _fork_iter(idx, _fork_id, _shared_list, evaluate, **fn_args) _fork_id += 1 @@ -149,7 +150,11 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, cluste _procs.sort(key=lambda x: x[1]) # To visualize progress for _proc in tqdm(_procs, desc=f'Eval {tqdm_label} (waiting for forks to join)', position=tqdm_position): _proc[0].join() - assert len(_procs) == len(_shared_list), "All forked eval iterations did not return results" + try: + assert len(_procs) == len(_shared_list) + except: + logger.info("Error: All forked eval iterations did not return results") + raise ValueError("All forked eval iterations did not return results") for _data in _shared_list: pred_cluster_ids = (_data['cluster_labels'] + (max_pred_id + 1)).tolist() cc_obj_vals['round'].append(_data['round_objective_value']) @@ -168,7 +173,7 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, cluste def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", return_pred_only=False, thresh_for_f1=0.5, clustering_fn=None, clustering_threshold=None, val_dataloader=None, tqdm_label='', device=None, verbose=False, debug=False, _errors=None, run_dir='./', - tqdm_position=None, model_args=None, return_iter=False, fork_size=300, max_parallel_forks=4, + tqdm_position=None, model_args=None, return_iter=False, fork_size=500, max_parallel_forks=3, disable_tqdm=False): fn_args = locals() fork_enabled = fork_size > -1 and model_args is not None @@ -208,6 +213,7 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret # Only one signature in block; manually assign a unique cluster pred_cluster_ids = [max_pred_id + 1] elif fork_enabled and block_size 
>= fork_size and clustering_fn.__class__ is CCInference: + logger.info(f"Eval fork info: len(_procs)={len(_procs)}, len(_shared_list)={len(_shared_list)}") if (len(_procs) - len(_shared_list)) < max_parallel_forks: _proc = _fork_iter(idx, _fork_id, _shared_list, evaluate_pairwise, **fn_args) _fork_id += 1 @@ -262,7 +268,11 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret _procs.sort(key=lambda x: x[1]) # To visualize progress for _proc in tqdm(_procs, desc=f'Eval {tqdm_label} (waiting for forks to join)', position=tqdm_position): _proc[0].join() - assert len(_procs) == len(_shared_list), "All forked eval iterations did not return results" + try: + assert len(_procs) == len(_shared_list) + except: + logger.info("Error: All forked eval iterations did not return results") + raise ValueError("All forked eval iterations did not return results") for _data in _shared_list: pred_cluster_ids = (_data['cluster_labels'] + (max_pred_id + 1)).tolist() cc_obj_vals['round'].append(_data['round_objective_value']) diff --git a/e2e_scripts/train.py b/e2e_scripts/train.py index eb0b483..9bc1bc4 100644 --- a/e2e_scripts/train.py +++ b/e2e_scripts/train.py @@ -38,7 +38,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, group=None, save_model=False, load_model_from_wandb_run=None, load_model_from_fpath=None, eval_only_split=None, eval_all=False, skip_initial_eval=False, pairwise_eval_clustering=None, - debug=False, track_errors=True, local=False, sync_dev=False): + debug=False, track_errors=True, local=False, sync_dev=False, icml_final_eval=False): init_args = { 'config': DEFAULT_HYPERPARAMS } @@ -63,6 +63,27 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g with wandb.init(**init_args) as run: wandb.config.update(hyperparams, allow_val_change=True) hyp = wandb.config + + # Limit training epochs by dataset in e2e mode (for tractability) + max_epochs_by_dataset = { + 'e2e': { + 'aminer': 3, + 'kisti': 3, + 'arnetminer': 5 + }, + 'nosdp': { + 'aminer': 3, + 'kisti': 3, + 'arnetminer': 5 + } + } + n_epochs_override = None + if not hyp['pairwise_mode']: + _training_method = 'e2e' if hyp['use_sdp'] else 'nosdp' + if hyp['dataset'] in max_epochs_by_dataset[_training_method]: + n_epochs_override = max_epochs_by_dataset[_training_method][hyp['dataset']] + logger.info(f'Limiting number of epochs from {hyp["n_epochs"]} to {n_epochs_override}') + logger.info("Run hyperparameters:") logger.info(hyp) # Save hyperparameters as a json file and store in wandb run @@ -82,7 +103,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g use_rounded_loss = hyp["use_rounded_loss"] e2e_loss = hyp['e2e_loss'] batch_size = hyp['batch_size'] if pairwise_mode else 1 # Force clustering runs to operate on 1 block only - n_epochs = hyp['n_epochs'] + n_epochs = n_epochs_override if n_epochs_override is not None else hyp['n_epochs'] n_warmstart_epochs = hyp['n_warmstart_epochs'] use_lr_scheduler = hyp['use_lr_scheduler'] hidden_dim = hyp["hidden_dim"] @@ -282,7 +303,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g hyp["subsample_sz_dev"], pairwise_mode, batch_size, split=eval_only_split) eval_scores = eval_fn(model, eval_dataloader, tqdm_label=eval_only_split, device=device, verbose=verbose, - debug=debug, _errors=_errors, model_args=model_args) + debug=debug, _errors=_errors, model_args=model_args, run_dir=run.dir) logger.info(f"Eval: 
{eval_only_split}_{list(eval_metric_to_idx)[0]}={eval_scores[0]}, " + f"{eval_only_split}_{list(eval_metric_to_idx)[1]}={eval_scores[1]}") # Log eval metrics @@ -300,7 +321,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g clustering_threshold=clustering_threshold, val_dataloader=val_dataloader_e2e, tqdm_label='test clustering', device=device, verbose=verbose, - debug=debug, _errors=_errors, model_args=model_args) + debug=debug, _errors=_errors, model_args=model_args, run_dir=run.dir) if pairwise_clustering_fn.__class__ is HACInference: clustering_threshold = pairwise_clustering_fn.cut_threshold logger.info( @@ -558,43 +579,96 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g # Evaluate the best dev model on test if overfit_batch_idx == -1: model.load_state_dict(best_dev_state_dict) - with torch.no_grad(): - model.eval() - test_scores = eval_fn(model, test_dataloader, tqdm_label='test', device=device, verbose=verbose, - debug=debug, _errors=_errors, tqdm_position=2, model_args=model_args) - logger.info(f"Final: test_{list(eval_metric_to_idx)[0]}={test_scores[0]}, " + - f"test_{list(eval_metric_to_idx)[1]}={test_scores[1]}") - # Log final metrics - wandb.log({'best_dev_epoch': best_epoch + 1, - f'best_dev_{list(eval_metric_to_idx)[0]}': best_dev_scores[0], - f'best_dev_{list(eval_metric_to_idx)[1]}': best_dev_scores[1], - f'best_test_{list(eval_metric_to_idx)[0]}': test_scores[0], - f'best_test_{list(eval_metric_to_idx)[1]}': test_scores[1]}) - if len(test_scores) == 3: - log_cc_objective_values(scores=test_scores, split_name='best_test', log_prefix='Final', - verbose=True, logger=logger) - # For pairwise-mode: - if pairwise_clustering_fns[0] is not None: + + if icml_final_eval: + # Run all inference variants on the test set and exit + cc_inference_sdp = CCInference(sdp_max_iters, sdp_eps, sdp_scale, use_sdp=True) + cc_inference_nosdp = CCInference(sdp_max_iters, sdp_eps, sdp_scale, use_sdp=False) + inference_fns = [HACInference(), + cc_inference_sdp, cc_inference_sdp, + cc_inference_nosdp, cc_inference_nosdp] + inference_fn_labels = ['hac', + 'cc', 'cc-fixed', + 'cc-nosdp', 'cc-nosdp-fixed'] + cc_inference_sdp.eval() + cc_inference_nosdp.eval() + val_dataloader_e2e, test_dataloader_e2e = get_dataloaders(hyp["dataset"], + hyp["dataset_random_seed"], + hyp["convert_nan"], + hyp["nan_value"], + hyp["normalize_data"], + hyp["subsample_sz_train"], + hyp["subsample_sz_dev"], + pairwise_mode=False, batch_size=1, + split=['dev', 'test']) + inf_start_time = time.time() + with torch.no_grad(): + model.eval() clustering_threshold = None - for i, pairwise_clustering_fn in enumerate(pairwise_clustering_fns): - clustering_scores = eval_fn(model, test_dataloader_e2e, - clustering_fn=pairwise_clustering_fn, - clustering_threshold=clustering_threshold, - val_dataloader=val_dataloader_e2e, - tqdm_label='test clustering', device=device, verbose=verbose, - debug=debug, _errors=_errors, tqdm_position=2, - model_args=model_args) - if pairwise_clustering_fn.__class__ is HACInference: - clustering_threshold = pairwise_clustering_fn.cut_threshold - logger.info(f"Final: test_{list(clustering_metrics)[0]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[0]}, " + - f"test_{list(clustering_metrics)[1]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[1]}") - # Log final metrics - wandb.log({f'best_test_{list(clustering_metrics)[0]}_{pairwise_clustering_fn_labels[i]}': clustering_scores[0], - 
f'best_test_{list(clustering_metrics)[1]}_{pairwise_clustering_fn_labels[i]}': clustering_scores[1]}) + for i, inference_fn in enumerate(inference_fns): + logger.info(f'Inference method: {inference_fn_labels[i]}') + clustering_scores = evaluate_pairwise(model, test_dataloader_e2e, + clustering_fn=inference_fn, + clustering_threshold=clustering_threshold if i % 2 == 0 else None, + val_dataloader=val_dataloader_e2e, + tqdm_label='test clustering', device=device, + verbose=verbose, + debug=debug, _errors=_errors, model_args=model_args) + if inference_fn.__class__ is HACInference: + clustering_threshold = inference_fn.cut_threshold + logger.info( + f"Eval: test_{list(clustering_metrics)[0]}_{inference_fn_labels[i]}={clustering_scores[0]}, " + + f"test_{list(clustering_metrics)[1]}_{inference_fn_labels[i]}={clustering_scores[1]}") + # Log eval metrics + wandb.log({f'best_test_{list(clustering_metrics)[0]}_{inference_fn_labels[i]}': + clustering_scores[0], + f'best_test_{list(clustering_metrics)[1]}_{inference_fn_labels[i]}': + clustering_scores[1]}) if len(clustering_scores) == 3: log_cc_objective_values(scores=clustering_scores, - split_name=f'best_test_{pairwise_clustering_fn_labels[i]}', - log_prefix='Final', verbose=True, logger=logger) + split_name=f'best_test_{inference_fn_labels[i]}', + log_prefix='Eval', verbose=verbose, logger=logger) + inf_end_time = time.time() + run.summary["z_inf_time"] = round(inf_end_time - inf_start_time) + else: + with torch.no_grad(): + model.eval() + test_scores = eval_fn(model, test_dataloader, tqdm_label='test', device=device, verbose=verbose, + debug=debug, _errors=_errors, tqdm_position=2, model_args=model_args, + run_dir=run.dir) + logger.info(f"Final: test_{list(eval_metric_to_idx)[0]}={test_scores[0]}, " + + f"test_{list(eval_metric_to_idx)[1]}={test_scores[1]}") + # Log final metrics + wandb.log({'best_dev_epoch': best_epoch + 1, + f'best_dev_{list(eval_metric_to_idx)[0]}': best_dev_scores[0], + f'best_dev_{list(eval_metric_to_idx)[1]}': best_dev_scores[1], + f'best_test_{list(eval_metric_to_idx)[0]}': test_scores[0], + f'best_test_{list(eval_metric_to_idx)[1]}': test_scores[1]}) + if len(test_scores) == 3: + log_cc_objective_values(scores=test_scores, split_name='best_test', log_prefix='Final', + verbose=True, logger=logger) + # For pairwise-mode: + if pairwise_clustering_fns[0] is not None: + clustering_threshold = None + for i, pairwise_clustering_fn in enumerate(pairwise_clustering_fns): + clustering_scores = eval_fn(model, test_dataloader_e2e, + clustering_fn=pairwise_clustering_fn, + clustering_threshold=clustering_threshold, + val_dataloader=val_dataloader_e2e, + tqdm_label='test clustering', device=device, verbose=verbose, + debug=debug, _errors=_errors, tqdm_position=2, + model_args=model_args, run_dir=run.dir) + if pairwise_clustering_fn.__class__ is HACInference: + clustering_threshold = pairwise_clustering_fn.cut_threshold + logger.info(f"Final: test_{list(clustering_metrics)[0]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[0]}, " + + f"test_{list(clustering_metrics)[1]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[1]}") + # Log final metrics + wandb.log({f'best_test_{list(clustering_metrics)[0]}_{pairwise_clustering_fn_labels[i]}': clustering_scores[0], + f'best_test_{list(clustering_metrics)[1]}_{pairwise_clustering_fn_labels[i]}': clustering_scores[1]}) + if len(clustering_scores) == 3: + log_cc_objective_values(scores=clustering_scores, + split_name=f'best_test_{pairwise_clustering_fn_labels[i]}', + 
log_prefix='Final', verbose=True, logger=logger) run.summary["z_model_parameters"] = count_parameters(model) run.summary["z_run_time"] = round(end_time - start_time) @@ -676,6 +750,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g sweep_id = wandb.sweep(sweep=sweep_config, project=args['wandb_project'], entity=args['wandb_entity']) + logger.info(f"SWEEP_ID={sweep_id}") # Start sweep job wandb.agent(sweep_id, @@ -722,5 +797,6 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g debug=args['debug'], track_errors=not args['no_error_tracking'], local=args['local'], - sync_dev=args['sync_dev']) + sync_dev=args['sync_dev'], + icml_final_eval=args['icml_final_eval']) logger.info("End of run") diff --git a/e2e_scripts/train_utils.py b/e2e_scripts/train_utils.py index 617f064..d94701a 100644 --- a/e2e_scripts/train_utils.py +++ b/e2e_scripts/train_utils.py @@ -4,6 +4,7 @@ import copy import os import json +import random from collections import defaultdict from typing import Dict from typing import Tuple, Optional @@ -216,7 +217,7 @@ def __call__(self, input: Tensor, target: Tensor) -> Tensor: def copy_and_load_model(model, run_dir, device, store_only=False): _model = copy.deepcopy(model) - _PATH = os.path.join(run_dir, f'_temp_state_dict_{int(time())}.pt') + _PATH = os.path.join(run_dir, f'_temp_state_dict_{int(time())}-{random.randint(0, 100)}.pt') torch.save(model.state_dict(), _PATH) if store_only: return _PATH diff --git a/get_wandb_results.py b/get_wandb_results.py new file mode 100644 index 0000000..eda0eb2 --- /dev/null +++ b/get_wandb_results.py @@ -0,0 +1,126 @@ +import argparse +import json +import logging +import csv +from copy import deepcopy +import numpy as np +import pandas as pd + +from IPython import embed + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) +logger = logging.getLogger(__name__) + + +class Parser(argparse.ArgumentParser): + def __init__(self): + super().__init__() + self.add_argument( + "--data_fpath", type=str + ) + self.add_argument( + "--interactive", action="store_true", + ) + self.add_argument( + "--get_b3_f1_across", action="store_true", + ) + + +def get_df_by_dataset(res, dataset): + new_res = {} + for _r in res: + if dataset in _r: + new_res[_r.replace(f"{dataset}_", '')] = res[_r] + return pd.DataFrame(new_res).T + +if __name__ == '__main__': + parser = Parser() + args = parser.parse_args() + logger.info("Script arguments:") + logger.info(args.__dict__) + + if args.data_fpath is not None: + fpath = args.data_fpath + else: + # hardcoded during dev + fpath = 'wandb_export_2023-03-19T14_30_08.659-04_00.csv' + + results = [] + with open(fpath, mode='r') as csv_file: + csv_reader = csv.DictReader(csv_file) + line_count = 0 + for row in csv_reader: + if line_count == 0: + print(f'Column names are {", ".join(row)}') + else: + results.append(deepcopy(row)) + line_count += 1 + print(f'Processed {line_count} lines.') + + final = {} + out_keys = { + 'train_time': 'z_run_time', + 'inf_time': 'z_inf_time', + 'b3_f1_hac': 'best_test_b3_f1_hac', + 'b3_f1_cc': 'best_test_b3_f1_cc', + 'b3_f1_cc-fixed': 'best_test_b3_f1_cc-fixed', + 'b3_f1_cc-nosdp': 'best_test_b3_f1_cc-nosdp', + 'b3_f1_cc-nosdp-fixed': 'best_test_b3_f1_cc-nosdp-fixed', + 'vmeasure_hac': 'best_test_vmeasure_hac', + 'vmeasure_cc': 'best_test_vmeasure_cc', + 'vmeasure_cc-fixed': 'best_test_vmeasure_cc-fixed', + 'vmeasure_cc-nosdp': 
'best_test_vmeasure_cc-nosdp', + 'vmeasure_cc-nosdp-fixed': 'best_test_vmeasure_cc-nosdp-fixed' + } + + for r in results: + try: + method = f"{'mlp' if r['pairwise_mode']=='true' else 'e2e'}" + if r['pairwise_mode'] == 'false': + method += f"{'_nosdp' if r['use_sdp']=='false' else ''}" + method += f"{'_round' if r['use_rounded_loss'] == 'true' else '_frac'}" + key = f"{r['dataset']}_{method}" + + if key not in final: + final[key] = {o: [] for o in out_keys.keys()} + + for _key in out_keys: + final[key][_key].append(float(r[out_keys[_key]])) + except: + continue + + means, stds, comb = {}, {}, {} + for k in final: + if k is not means: + means[k] = {} + stds[k] = {} + comb[k] = {} + for _k in final[k]: + means[k][_k] = round(np.mean(final[k][_k])*(1 if 'time' in _k else 100), 2) + stds[k][_k] = round(np.std(final[k][_k])*(1 if 'time' in _k else 100), 2) + comb[k][_k] = f"{means[k][_k]}±{stds[k][_k]}" + + with open('results-mean.json', 'w') as fh: + json.dump(means, fh) + with open('results-std.json', 'w') as fh: + json.dump(stds, fh) + with open('results.json', 'w') as fh: + json.dump(comb, fh) + + res_df = pd.DataFrame(comb) + + if args.get_b3_f1_across: + # Average b3_f1 numbers of each training method over all inference methods + print() + print() + mean_dfs = {} + for d in ['pubmed', 'qian', 'zbmath', 'arnetminer', 'kisti']: + print(f'Dataset: {d}') + mean_dfs[d] = get_df_by_dataset(means, d).T[ + ['b3_f1_hac', 'b3_f1_cc', 'b3_f1_cc-fixed', 'b3_f1_cc-nosdp', 'b3_f1_cc-nosdp-fixed']].T.mean() + print(mean_dfs[d]) + print() + + if args.interactive: + embed() diff --git a/rerun_batch.sh b/rerun_batch.sh new file mode 100644 index 0000000..0d91868 --- /dev/null +++ b/rerun_batch.sh @@ -0,0 +1,127 @@ +#!/bin/bash -e + +sh rerun_best.sh gffw3aq7 gypsum-1080ti +sh rerun_best.sh 88qaxovr gypsum-1080ti +sh rerun_best.sh c4f4u06r gypsum-1080ti +sh rerun_best.sh tuojhxl9 gypsum-1080ti +sh rerun_best.sh 896wsqzi gypsum-1080ti +sh rerun_best.sh ugfvzuu3 gypsum-1080ti +sh rerun_best.sh xgovhwp2 gypsum-1080ti +sh rerun_best.sh xwei54ka gypsum-1080ti +sh rerun_best.sh ehg8oouh gypsum-1080ti +sh rerun_best.sh hbbp5yk5 gypsum-1080ti +sh rerun_best.sh 8r0o10am gypsum-1080ti +sh rerun_best.sh jlrho35c gypsum-1080ti +sh rerun_best.sh 7fq9ubkl gypsum-1080ti +sh rerun_best.sh 9li0p2xf gypsum-1080ti +sh rerun_best.sh 3v83ldl4 gypsum-1080ti +sh rerun_best.sh 8gmw28xf gypsum-1080ti +sh rerun_best.sh 5mobobvf gypsum-1080ti +sh rerun_best.sh w4lo7mic gypsum-1080ti +sh rerun_best.sh inlrt56m gypsum-1080ti +sh rerun_best.sh 5841jp68 gypsum-1080ti +sh rerun_best.sh 44ghc7aa gypsum-titanx +sh rerun_best.sh c36kghyo gypsum-titanx +sh rerun_best.sh 6w7t2y5m gypsum-titanx +sh rerun_best.sh bo5ww9oj gypsum-titanx +sh rerun_best.sh i1g1bwuz gypsum-titanx +sh rerun_best.sh vdxqpisp gypsum-titanx +sh rerun_best.sh sc4xc4lq gypsum-titanx +sh rerun_best.sh 41uylhgc gypsum-titanx +sh rerun_best.sh ellbgtzj gypsum-titanx +sh rerun_best.sh 20j5pp3p gypsum-titanx +sh rerun_best.sh mqwfys78 gypsum-titanx +sh rerun_best.sh 4cl8lvl5 gypsum-titanx +sh rerun_best.sh jhnlrb9b gypsum-titanx +sh rerun_best.sh d8gybu3j gypsum-titanx +sh rerun_best.sh i13k9nhb gypsum-titanx +sh rerun_best.sh yfc5xfq6 gypsum-titanx +sh rerun_best.sh by24aayn gypsum-titanx +sh rerun_best.sh ubiwtwso gypsum-titanx +sh rerun_best.sh o0y4csbo gypsum-titanx +sh rerun_best.sh wntemai3 gypsum-titanx +sh rerun_best.sh nmtlv76s gypsum-2080ti +sh rerun_best.sh prz43ogk gypsum-2080ti +sh rerun_best.sh 2edwecpz gypsum-2080ti +sh rerun_best.sh th5hl878 gypsum-2080ti +sh 
rerun_best.sh wtrrazuk gypsum-2080ti +sh rerun_best.sh oa404d8x gypsum-2080ti +sh rerun_best.sh 1ke4vxc7 gypsum-2080ti +sh rerun_best.sh s6rozj6y gypsum-2080ti +sh rerun_best.sh mz0gjtgm gypsum-2080ti +sh rerun_best.sh hodwzmv2 gypsum-2080ti +sh rerun_best.sh cxhocbc9 gypsum-2080ti +sh rerun_best.sh grajc9xd gypsum-2080ti +sh rerun_best.sh 8yin7z6k gypsum-2080ti +sh rerun_best.sh ykpeo4zt gypsum-2080ti +sh rerun_best.sh xpbybund gypsum-2080ti +sh rerun_best.sh j55f9ppp gypsum-2080ti +sh rerun_best.sh wuu45zhi gypsum-2080ti +sh rerun_best.sh f5t9ge27 gypsum-2080ti +sh rerun_best.sh kboyitfu gypsum-2080ti +sh rerun_best.sh s8v1grpa gypsum-2080ti +sh rerun_best.sh 2l8mjiei gypsum-m40 +sh rerun_best.sh ztng9hxr gypsum-m40 +sh rerun_best.sh 44dgz6e7 gypsum-m40 +sh rerun_best.sh 4uza846x gypsum-m40 +sh rerun_best.sh p7q0x2x4 gypsum-m40 +sh rerun_best.sh xqts82x9 gypsum-m40 +sh rerun_best.sh 85coxdiq gypsum-m40 +sh rerun_best.sh 4zlgu03n gypsum-m40 +sh rerun_best.sh 5y0yeyil gypsum-m40 +sh rerun_best.sh 33v38tro gypsum-m40 +sh rerun_best.sh 3cphu97j gypsum-m40 +sh rerun_best.sh rseqi816 gypsum-m40 +sh rerun_best.sh a4xdafqa gypsum-m40 +sh rerun_best.sh kxv70u0z gypsum-m40 +sh rerun_best.sh vjdcqev1 gypsum-m40 +sh rerun_best.sh fmx2rqe7 gypsum-m40 +sh rerun_best.sh ehrfeu8f gypsum-m40 +sh rerun_best.sh 3qja957g gypsum-m40 +sh rerun_best.sh y5lbmu6d gypsum-m40 +sh rerun_best.sh s21w56en gypsum-m40 +sh rerun_best.sh g40syomc gypsum-titanx +sh rerun_best.sh nucn0flw gypsum-titanx +sh rerun_best.sh j8211otn gypsum-titanx +sh rerun_best.sh 3ylnj3zg gypsum-titanx +sh rerun_best.sh 63pa6vn8 gypsum-titanx +sh rerun_best.sh ukvsewnh gypsum-titanx +sh rerun_best.sh hc3f7qsd gypsum-titanx +sh rerun_best.sh 37e6x0rx gypsum-titanx +sh rerun_best.sh nu08k76t gypsum-titanx +sh rerun_best.sh 8h4rjiok gypsum-titanx +sh rerun_best.sh z81s4dat gypsum-titanx +sh rerun_best.sh c81jlxii gypsum-titanx +sh rerun_best.sh kmidsylz gypsum-titanx +sh rerun_best.sh 8t6zp873 gypsum-titanx +sh rerun_best.sh rsve7a2h gypsum-titanx +sh rerun_best.sh sefe99yi gypsum-titanx +sh rerun_best.sh 6r1frbt4 gypsum-titanx +sh rerun_best.sh 0gs1obh1 gypsum-titanx +sh rerun_best.sh cspdnl7j gypsum-titanx +sh rerun_best.sh 8hcrk3n9 gypsum-titanx +sh rerun_best.sh c6drhs6a gypsum-m40 +sh rerun_best.sh ym7mdlep gypsum-m40 +sh rerun_best.sh wioahicm gypsum-m40 +sh rerun_best.sh z9k6elm0 gypsum-m40 +sh rerun_best.sh t74hzmfa gypsum-m40 +sh rerun_best.sh 7sgzno3w gypsum-m40 +sh rerun_best.sh y8ckivlk gypsum-m40 +sh rerun_best.sh 5qwq07l6 gypsum-m40 +sh rerun_best.sh uc4jgv30 gypsum-m40 +sh rerun_best.sh tziwf98r gypsum-m40 +sh rerun_best.sh 0zof37l4 gypsum-m40 +sh rerun_best.sh gn8osqi9 gypsum-m40 +sh rerun_best.sh 3j1zpwd2 gypsum-m40 +sh rerun_best.sh mcw4dk3x gypsum-m40 +sh rerun_best.sh d4kudukt gypsum-m40 +sh rerun_best.sh mi55qs26 gypsum-m40 +sh rerun_best.sh 1j7867tv gypsum-m40 +sh rerun_best.sh wb3de1t8 gypsum-m40 +sh rerun_best.sh 3w093bm5 gypsum-m40 +sh rerun_best.sh 1daqvigq gypsum-m40 +sh rerun_best.sh ymt0dxxd gypsum-m40 +sh rerun_best.sh ue2i5chg gypsum-m40 +sh rerun_best.sh 2vj2gvkc gypsum-m40 +sh rerun_best.sh 08uafgmw gypsum-m40 +sh rerun_best.sh 2ptqopi7 gypsum-m40 diff --git a/rerun_best.sh b/rerun_best.sh new file mode 100644 index 0000000..6549e38 --- /dev/null +++ b/rerun_best.sh @@ -0,0 +1,19 @@ +#!/bin/bash -e + +entity="dhdhagar" +project="prob-ent-resolution" +run_id=${1} +gpu_name=${2:-"gypsum-1080ti"} +run_tag=${3:-"icml_rebut_best"} + +JOB_DESC=rerun_${run_id} && JOB_NAME=${JOB_DESC}_$(date +%s) && \ + sbatch -J 
${JOB_NAME} -e jobs/${JOB_NAME}.err -o jobs/${JOB_NAME}.log \ + --partition=${gpu_name} --gres=gpu:1 --mem=120G --time=4:00:00 \ + run_sbatch.sh e2e_scripts/train.py \ + --load_hyp_from_wandb_run="${entity}/${project}/${run_id}" \ + --icml_final_eval \ + --skip_initial_eval \ + --silent \ + --wandb_tags="${run_tag},${run_id}" \ + --save_model + echo " Logs: jobs/${JOB_NAME}.err" diff --git a/run_sweep.sh b/run_sweep.sh index c635c0f..926e3ea 100644 --- a/run_sweep.sh +++ b/run_sweep.sh @@ -10,15 +10,15 @@ sweep_prefix=${6:-""} for ((i = ${n_seed_start}; i <= ${n_seed_end}; i++)); do JOB_DESC=${model}_${dataset}_sweep${i} && JOB_NAME=${JOB_DESC}_$(date +%s) && \ sbatch -J ${JOB_NAME} -e jobs/${JOB_NAME}.err -o jobs/${JOB_NAME}.log \ - --partition=${gpu_name} --gres=gpu:1 --mem=100G --time=12:00:00 \ + --partition=${gpu_name} --gres=gpu:1 --mem=120G --time=12:00:00 \ run_sbatch.sh e2e_scripts/train.py \ --dataset="${dataset}" \ --dataset_random_seed=${i} \ --pairwise_eval_clustering="both" \ --skip_initial_eval \ --silent \ - --wandb_sweep_name="${sweep_prefix}${model}_${dataset}_${i}" \ + --wandb_sweep_name="${sweep_prefix}_${model}_${dataset}_${i}" \ --wandb_sweep_params="wandb_configs/sweeps/${model}.json" \ - --wandb_tags="${model},${dataset},seed_${i}" + --wandb_tags="${model},${dataset},seed_${i},${sweep_prefix}" echo " Logs: jobs/${JOB_NAME}.err" done diff --git a/utils/parser.py b/utils/parser.py index de21db8..535f1af 100644 --- a/utils/parser.py +++ b/utils/parser.py @@ -157,3 +157,7 @@ def add_training_args(self): "--sync_dev", action="store_true", help="Whether to force dev evaluations to run synchronously", ) + parser.add_argument( + "--icml_final_eval", action="store_true", + help="ICML REBUTTAL ONLY: Run all eval after training", + ) diff --git a/wandb_configs/sweeps/e2e-nosdp-warm.json b/wandb_configs/sweeps/e2e-nosdp-warm.json index 294b543..c4f22cf 100644 --- a/wandb_configs/sweeps/e2e-nosdp-warm.json +++ b/wandb_configs/sweeps/e2e-nosdp-warm.json @@ -1,16 +1,16 @@ { "n_epochs": {"value": 10}, - "lr": {"max": 2e-1, "min": 1e-5}, + "lr": {"max": 7e-2, "min": 1e-5}, "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, "dev_opt_metric": {"value": "b3_f1"}, "neumiss_depth": {"values": [10, 20]}, - "hidden_dim": {"values": [512, 1024]}, + "hidden_dim": {"values": [256, 512]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "activation": {"values": ["leaky_relu", "relu"]}, + "activation": {"values": ["leaky_relu"]}, "use_sdp": {"value": false}, "n_warmstart_epochs": {"value": 2}, "gradient_accumulation": {"values": [true, false]}, "weighted_loss": {"values": [true, false]} -} +} \ No newline at end of file diff --git a/wandb_configs/sweeps/e2e-nosdp.json b/wandb_configs/sweeps/e2e-nosdp.json index 5b47c39..1c29da8 100644 --- a/wandb_configs/sweeps/e2e-nosdp.json +++ b/wandb_configs/sweeps/e2e-nosdp.json @@ -1,14 +1,14 @@ { "n_epochs": {"value": 10}, - "lr": {"max": 2e-1, "min": 1e-5}, + "lr": {"max": 7e-2, "min": 1e-5}, "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, "dev_opt_metric": {"value": "b3_f1"}, "neumiss_depth": {"values": [10, 20]}, - "hidden_dim": {"values": [512, 1024]}, + "hidden_dim": {"values": [256, 512]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "activation": {"values": ["leaky_relu", "relu"]}, + "activation": {"values": ["leaky_relu"]}, "use_sdp": {"value": false}, 
"gradient_accumulation": {"values": [true, false]}, "weighted_loss": {"values": [true, false]} diff --git a/wandb_configs/sweeps/e2e-warm.json b/wandb_configs/sweeps/e2e-warm.json index 19e511b..8f68ae6 100644 --- a/wandb_configs/sweeps/e2e-warm.json +++ b/wandb_configs/sweeps/e2e-warm.json @@ -1,15 +1,15 @@ { "n_epochs": {"value": 10}, - "lr": {"max": 2e-1, "min": 1e-5}, + "lr": {"max": 7e-2, "min": 1e-5}, "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, "dev_opt_metric": {"value": "b3_f1"}, "neumiss_depth": {"values": [10, 20]}, - "hidden_dim": {"values": [512, 1024]}, + "hidden_dim": {"values": [256, 512]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "activation": {"values": ["leaky_relu", "relu"]}, + "activation": {"values": ["leaky_relu"]}, "n_warmstart_epochs": {"value": 2}, "gradient_accumulation": {"values": [true, false]}, "weighted_loss": {"values": [true, false]} -} +} \ No newline at end of file diff --git a/wandb_configs/sweeps/e2e.json b/wandb_configs/sweeps/e2e.json index e084f00..b7948aa 100644 --- a/wandb_configs/sweeps/e2e.json +++ b/wandb_configs/sweeps/e2e.json @@ -1,14 +1,14 @@ { "n_epochs": {"value": 10}, - "lr": {"max": 2e-1, "min": 1e-5}, + "lr": {"max": 7e-2, "min": 1e-5}, "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, "dev_opt_metric": {"value": "b3_f1"}, "neumiss_depth": {"values": [10, 20]}, - "hidden_dim": {"values": [512, 1024]}, + "hidden_dim": {"values": [256, 512]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "activation": {"values": ["leaky_relu", "relu"]}, + "activation": {"values": ["leaky_relu"]}, "gradient_accumulation": {"values": [true, false]}, "weighted_loss": {"values": [true, false]} } diff --git a/wandb_configs/sweeps/frac-nosdp-warm.json b/wandb_configs/sweeps/frac-nosdp-warm.json index 491e04c..cc105d2 100644 --- a/wandb_configs/sweeps/frac-nosdp-warm.json +++ b/wandb_configs/sweeps/frac-nosdp-warm.json @@ -1,14 +1,14 @@ { "n_epochs": {"value": 10}, - "lr": {"max": 2e-1, "min": 1e-5}, + "lr": {"max": 7e-2, "min": 1e-5}, "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, "dev_opt_metric": {"value": "b3_f1"}, "neumiss_depth": {"values": [10, 20]}, - "hidden_dim": {"values": [512, 1024]}, + "hidden_dim": {"values": [256, 512]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "activation": {"values": ["leaky_relu", "relu"]}, + "activation": {"values": ["leaky_relu"]}, "use_rounded_loss": {"value": false}, "use_sdp": {"value": false}, "n_warmstart_epochs": {"value": 2}, diff --git a/wandb_configs/sweeps/frac-nosdp.json b/wandb_configs/sweeps/frac-nosdp.json index d9a1e41..0c6ad42 100644 --- a/wandb_configs/sweeps/frac-nosdp.json +++ b/wandb_configs/sweeps/frac-nosdp.json @@ -1,14 +1,14 @@ { "n_epochs": {"value": 10}, - "lr": {"max": 2e-1, "min": 1e-5}, + "lr": {"max": 7e-2, "min": 1e-5}, "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, "dev_opt_metric": {"value": "b3_f1"}, "neumiss_depth": {"values": [10, 20]}, - "hidden_dim": {"values": [512, 1024]}, + "hidden_dim": {"values": [256, 512]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "activation": {"values": ["leaky_relu", "relu"]}, + "activation": {"values": ["leaky_relu"]}, "use_rounded_loss": {"value": false}, "use_sdp": {"value": false}, 
"gradient_accumulation": {"values": [true, false]}, diff --git a/wandb_configs/sweeps/frac-warm.json b/wandb_configs/sweeps/frac-warm.json index b13efc5..cac98fd 100644 --- a/wandb_configs/sweeps/frac-warm.json +++ b/wandb_configs/sweeps/frac-warm.json @@ -1,14 +1,14 @@ { "n_epochs": {"value": 10}, - "lr": {"max": 2e-1, "min": 1e-5}, + "lr": {"max": 7e-2, "min": 1e-5}, "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, "dev_opt_metric": {"value": "b3_f1"}, "neumiss_depth": {"values": [10, 20]}, - "hidden_dim": {"values": [512, 1024]}, + "hidden_dim": {"values": [256, 512]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "activation": {"values": ["leaky_relu", "relu"]}, + "activation": {"values": ["leaky_relu"]}, "use_rounded_loss": {"value": false}, "n_warmstart_epochs": {"value": 2}, "gradient_accumulation": {"values": [true, false]}, diff --git a/wandb_configs/sweeps/frac.json b/wandb_configs/sweeps/frac.json index a572b76..3b4b277 100644 --- a/wandb_configs/sweeps/frac.json +++ b/wandb_configs/sweeps/frac.json @@ -1,14 +1,14 @@ { "n_epochs": {"value": 10}, - "lr": {"max": 2e-1, "min": 1e-5}, + "lr": {"max": 7e-2, "min": 1e-5}, "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, "dev_opt_metric": {"value": "b3_f1"}, "neumiss_depth": {"values": [10, 20]}, - "hidden_dim": {"values": [512, 1024]}, + "hidden_dim": {"values": [256, 512]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "activation": {"values": ["leaky_relu", "relu"]}, + "activation": {"values": ["leaky_relu"]}, "use_rounded_loss": {"value": false}, "gradient_accumulation": {"values": [true, false]}, "weighted_loss": {"values": [true, false]} diff --git a/wandb_configs/sweeps/mlp.json b/wandb_configs/sweeps/mlp.json index 24274c7..a0c1e4a 100644 --- a/wandb_configs/sweeps/mlp.json +++ b/wandb_configs/sweeps/mlp.json @@ -1,15 +1,15 @@ { "pairwise_mode": {"value": true}, "n_epochs": {"value": 10}, - "lr": {"max": 2e-1, "min": 1e-5}, + "lr": {"max": 7e-2, "min": 1e-5}, "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, "dev_opt_metric": {"value": "auroc"}, "neumiss_depth": {"values": [10, 20]}, - "hidden_dim": {"values": [512, 1024]}, + "hidden_dim": {"values": [256, 512]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "activation": {"values": ["leaky_relu", "relu"]}, + "activation": {"values": ["leaky_relu"]}, "gradient_accumulation": {"value": false}, "weighted_loss": {"value": true} } From b0679fcb54cdbe5066f857fa52d51b9f9e621cdf Mon Sep 17 00:00:00 2001 From: Sriharsha-hatwar Date: Thu, 23 Mar 2023 15:15:13 +0000 Subject: [PATCH 13/17] Initial commit for pickling sparse matrix for different splits --- e2e_scripts/preprocess_s2and_data.py | 18 +++--- e2e_scripts/preprocess_s2and_pointwise.py | 79 +++++++++++++++++------ s2and/featurizer.py | 11 +++- 3 files changed, 78 insertions(+), 30 deletions(-) diff --git a/e2e_scripts/preprocess_s2and_data.py b/e2e_scripts/preprocess_s2and_data.py index 1322586..acd19b6 100644 --- a/e2e_scripts/preprocess_s2and_data.py +++ b/e2e_scripts/preprocess_s2and_data.py @@ -16,14 +16,15 @@ from s2and.data import ANDData import logging from s2and.featurizer import FeaturizationInfo, featurize +from preprocess_s2and_pointwise import save_pickled_pointwise_features logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - 
%(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) logger = logging.getLogger(__name__) -def save_blockwise_featurized_data(dataset_name, random_seed): - parent_dir = f"{DATA_HOME_DIR}/{dataset_name}" +def save_blockwise_featurized_data(data_home_dir, dataset_name, random_seed): + parent_dir = f"{data_home_dir}/{dataset_name}" AND_dataset = ANDData( signatures=join(parent_dir, f"{dataset_name}_signatures.json"), papers=join(parent_dir, f"{dataset_name}_papers.json"), @@ -115,13 +116,17 @@ def find_total_num_train_pairs(blockwise_data): print(args) params = args.__dict__ - DATA_HOME_DIR = params["data_home_dir"] + data_home_dir = params["data_home_dir"] dataset = params["dataset_name"] random_seeds = {1, 2, 3, 4, 5} for seed in random_seeds: print("Preprocessing started for seed value", seed) - save_blockwise_featurized_data(dataset, seed) + # Create the AND Dataset for the particular seed. (write a function let it be in train_utils.py + # Provide the AND Dataset to the functions : save_pickled_pointwise_features and save_blockwise_featurized_data + #save_blockwise_featurized_data(data_home_dir, dataset, seed) + save_pickled_pointwise_features(data_home_dir, dataset, seed) + # Check the pickles are created OK train_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{seed}/train_features.pkl" @@ -129,7 +134,4 @@ def find_total_num_train_pairs(blockwise_data): test_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{seed}/test_features.pkl" blockwise_features = read_blockwise_features(train_pkl) find_total_num_train_pairs(blockwise_features) - #verify_diff_with_s2and(dataset, seed) - - - + #verify_diff_with_s2and(dataset, seed) \ No newline at end of file diff --git a/e2e_scripts/preprocess_s2and_pointwise.py b/e2e_scripts/preprocess_s2and_pointwise.py index 002aeae..5cd8d3e 100644 --- a/e2e_scripts/preprocess_s2and_pointwise.py +++ b/e2e_scripts/preprocess_s2and_pointwise.py @@ -12,6 +12,7 @@ from os.path import join from s2and.data import ANDData import pickle +import os import numpy as np from scipy.sparse import csr_matrix, coo_matrix from utils.parser import Parser @@ -25,13 +26,56 @@ level=logging.INFO) logger = logging.getLogger(__name__) -def save_pickled_pointwise_features(data_home_dir, dataset_name): +def save_pointwise_in_different_splits(AND_dataset, sparse_matrix, label_encoder_signatures, random_seed): + logger.info('extracting signature depending on different split') + + train_block, val_block, test_block = AND_dataset.split_cluster_signatures() + + train_pointwise_features = {} + validation_pointwise_features = {} + test_pointwise_features = {} + + # The above three should have a key-list(val) (where val is a list of signature IDs) under them. + + # Doing for training block : + for block_id, list_of_signatures in train_block.items(): + # Let us transform each of those using label encoder and index them from the sparse matrix. + encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures) + train_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :] + + # Doing for validation block : + for block_id, list_of_signatures in val_block.items(): + # Let us transform each of those using label encoder and index them from the sparse matrix. 
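Note: the encode-and-slice step used in each of these per-block loops reduces to the following self-contained sketch; the signature IDs and feature matrix below are toy values for illustration only, not repo data.

    from scipy.sparse import csr_matrix
    from sklearn import preprocessing

    le = preprocessing.LabelEncoder().fit(["sig_a", "sig_b", "sig_c"])
    features = csr_matrix([[1, 0], [0, 1], [1, 1]])  # one row per encoded signature
    rows = le.transform(["sig_c", "sig_a"])          # block's signature IDs -> row indices
    block_features = features[rows, :]               # sparse row slice for this block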
+ encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures) + validation_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :] + + for block_id, list_of_signatures in test_block.items(): + # Let us transform each of those using label encoder and index them from the sparse matrix. + encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures) + test_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :] + + if(not os.path.exists(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}")): + os.makedirs(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}") + + train_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/train_features.pkl" + val_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/val_features.pkl" + test_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/test_features.pkl" + + with open(train_pkl,"wb") as _pkl_file: + pickle.dump(train_pointwise_features, _pkl_file) + with open(val_pkl,"wb") as _pkl_file: + pickle.dump(validation_pointwise_features, _pkl_file) + with open(test_pkl,"wb") as _pkl_file: + pickle.dump(test_pointwise_features, _pkl_file) + + +def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed): """ Fetch pointwise feature for dataset and store in a pickle. """ processed_data = {} parent_dir = f"{data_home_dir}/{dataset_name}" - + """ AND_dataset = ANDData( signatures=join(parent_dir, f"{dataset_name}_signatures.json"), papers=join(parent_dir, f"{dataset_name}_papers.json"), @@ -43,21 +87,22 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name): test_pairs_size=10000, name=dataset_name, n_jobs=16, - random_seed=random_seed, + random_seed=random_seed ) - # print("Storing pickled dataset....") - # with open(f'preprocess_dataset_{dataset_name}.pkl', 'wb') as f: - # pickle.dump(AND_dataset, f) - - # print("Loading pickled dataset...") - # with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f: - # AND_dataset = pickle.load(f) - # print("Loaded pickle dataset...") + print("Storing pickled dataset....") + with open(f'preprocess_dataset_{dataset_name}.pkl', 'wb') as f: + pickle.dump(AND_dataset, f) + """ + # Use below line carefully. 
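Note: the commented store/load pair being toggled around this point caches the expensive ANDData construction across runs. A minimal load-or-build sketch of the same pattern; the helper name and cache path are illustrative, not part of the repo.

    import os
    import pickle

    def load_or_build(cache_path, build_fn):
        # Reload a previously pickled object if present; otherwise build and cache it.
        if os.path.exists(cache_path):
            with open(cache_path, 'rb') as f:
                return pickle.load(f)
        obj = build_fn()
        with open(cache_path, 'wb') as f:
            pickle.dump(obj, f)
        return obj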
+ print("Loading pickled dataset...") + with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f: + AND_dataset = pickle.load(f) + print("Loaded pickle dataset...") - point_features_row, point_features_col, point_features_data, num_feats, num_points = pointwise_featurize(AND_dataset, + point_features_row, point_features_col, point_features_data, num_feats, num_points, le_feature_dict = pointwise_featurize(AND_dataset, n_jobs=16, use_cache=False) logger.info('converting feature indices to csr_matrix') @@ -66,12 +111,7 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name): shape=(num_points, num_feats) ).tocsr() print("Matrix creation done.") - processed_data['mention_level_features'] = point_features - - logger.info('Dumping processed data') - - with open(f'{dataset_name}_feature_processed.pkl', 'wb') as f: - pickle.dump(processed_data, f) + save_pointwise_in_different_splits(AND_dataset, point_features, le_feature_dict, random_seed) if __name__=='__main__': # Creates the pickles that store the preprocessed data @@ -86,7 +126,8 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name): params = args.__dict__ data_home_dir = params["data_home_dir"] dataset = params["dataset_name"] + random_seed = 1000 print("Preprocessing started") - save_pickled_pointwise_features(data_home_dir, dataset) + save_pickled_pointwise_features(data_home_dir, dataset, random_seed) print("Matrix") diff --git a/s2and/featurizer.py b/s2and/featurizer.py index 022edb8..1ccc62b 100644 --- a/s2and/featurizer.py +++ b/s2and/featurizer.py @@ -927,19 +927,24 @@ def pointwise_featurize( """ le_signature_feature_set = preprocessing.LabelEncoder() le_signature_feature_set.fit(list(signature_feature_set)) + + # I am using this for easy retrieval for training, val and test block retrieval. 
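Note: the (row, col, data) triplets accumulated with these label encoders are later assembled into a sparse matrix. A minimal illustration of that construction, with toy triplets:

    from scipy.sparse import coo_matrix

    rows, cols, data = [0, 0, 1], [2, 5, 0], [1, 1, 1]  # (signature row, feature col, 1) hits
    mat = coo_matrix((data, (rows, cols)), shape=(2, 6)).tocsr()
    # CSR form supports the fast row slicing used when the matrix is split by block.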
+ le_signature_dict = preprocessing.LabelEncoder() + le_signature_dict.fit(list(signature_dict.keys())) point_features_row, point_features_col, point_features_data = [], [], [] num_points = len(signature_dict.keys()) num_feats = len(signature_feature_set) - for index, (_, values) in tqdm(enumerate(signature_dict.items()), desc="Converting to spare matrix"): + for _, (key, values) in tqdm(enumerate(signature_dict.items()), desc="Converting to sparse matrix"): encoded_signature_features = le_signature_feature_set.transform(values) + encoded_key_val = le_signature_dict.transform([key])[0] for feature_label in encoded_signature_features : - point_features_row.append(index) + point_features_row.append(encoded_key_val) point_features_col.append(feature_label) point_features_data.append(1) - return point_features_row, point_features_col, point_features_data, num_feats, num_points + return point_features_row, point_features_col, point_features_data, num_feats, num_points, le_signature_dict def store_featurized_pickles( From 1f2037b0e3c25b3d74d4014d03dddded7f5b11ab Mon Sep 17 00:00:00 2001 From: arana_umass_edu Date: Thu, 23 Mar 2023 21:46:31 +0000 Subject: [PATCH 14/17] create pointwise feature set then shuffle and split --- e2e_scripts/preprocess_s2and_data.py | 17 ++- e2e_scripts/preprocess_s2and_pointwise.py | 146 ++++++++++++---------- s2and/featurizer.py | 29 +++-- 3 files changed, 104 insertions(+), 88 deletions(-) diff --git a/e2e_scripts/preprocess_s2and_data.py b/e2e_scripts/preprocess_s2and_data.py index acd19b6..93fb2d0 100644 --- a/e2e_scripts/preprocess_s2and_data.py +++ b/e2e_scripts/preprocess_s2and_data.py @@ -16,14 +16,14 @@ from s2and.data import ANDData import logging from s2and.featurizer import FeaturizationInfo, featurize -from preprocess_s2and_pointwise import save_pickled_pointwise_features +from preprocess_s2and_pointwise import save_pickled_pointwise_features, create_signature_features_matrix logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) logger = logging.getLogger(__name__) -def save_blockwise_featurized_data(data_home_dir, dataset_name, random_seed): +def save_featurized_data(data_home_dir, dataset_name, random_seed, point_features_mat, le_signatures): parent_dir = f"{data_home_dir}/{dataset_name}" AND_dataset = ANDData( signatures=join(parent_dir, f"{dataset_name}_signatures.json"), @@ -43,6 +43,7 @@ def save_blockwise_featurized_data(data_home_dir, dataset_name, random_seed): # Load the featurizer, which calculates pairwise similarity scores featurization_info = FeaturizationInfo() # the cache will make it faster to train multiple times - it stores the features on disk for you + save_pickled_pointwise_features(AND_dataset, point_features_mat, le_signatures, random_seed) train_pkl, val_pkl, test_pkl = store_featurized_pickles(AND_dataset, featurization_info, n_jobs=16, @@ -65,8 +66,8 @@ def find_total_num_train_pairs(blockwise_data): for block_id in blockwise_data.keys(): count += len(blockwise_data[block_id][0]) - print("Total num of signature pairs", count) - + print("Total num of signature pairs", count) + # def verify_diff_with_s2and(dataset_name, random_seed): # parent_dir = f"{DATA_HOME_DIR}/{dataset_name}" # AND_dataset = ANDData( @@ -105,7 +106,6 @@ def find_total_num_train_pairs(blockwise_data): # # print("VERIFICATION STATUS: ", s2and_set==our_set) - if __name__=='__main__': # Creates the pickles that store the preprocessed data # Read cmd line args @@ -118,14 +118,13 @@ def 
find_total_num_train_pairs(blockwise_data): params = args.__dict__ data_home_dir = params["data_home_dir"] dataset = params["dataset_name"] + + point_features_mat, le_signatures = create_signature_features_matrix(data_home_dir, dataset) random_seeds = {1, 2, 3, 4, 5} for seed in random_seeds: print("Preprocessing started for seed value", seed) - # Create the AND Dataset for the particular seed. (write a function let it be in train_utils.py - # Provide the AND Dataset to the functions : save_pickled_pointwise_features and save_blockwise_featurized_data - #save_blockwise_featurized_data(data_home_dir, dataset, seed) - save_pickled_pointwise_features(data_home_dir, dataset, seed) + save_featurized_data(data_home_dir, dataset, seed, point_features_mat, le_signatures) # Check the pickles are created OK diff --git a/e2e_scripts/preprocess_s2and_pointwise.py b/e2e_scripts/preprocess_s2and_pointwise.py index 5cd8d3e..2918d7c 100644 --- a/e2e_scripts/preprocess_s2and_pointwise.py +++ b/e2e_scripts/preprocess_s2and_pointwise.py @@ -26,56 +26,71 @@ level=logging.INFO) logger = logging.getLogger(__name__) -def save_pointwise_in_different_splits(AND_dataset, sparse_matrix, label_encoder_signatures, random_seed): - logger.info('extracting signature depending on different split') - - train_block, val_block, test_block = AND_dataset.split_cluster_signatures() - - train_pointwise_features = {} - validation_pointwise_features = {} - test_pointwise_features = {} - - # The above three should have a key-list(val) (where val is a list of signature IDs) under them. - - # Doing for training block : - for block_id, list_of_signatures in train_block.items(): - # Let us transform each of those using label encoder and index them from the sparse matrix. - encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures) - train_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :] - - # Doing for validation block : - for block_id, list_of_signatures in val_block.items(): - # Let us transform each of those using label encoder and index them from the sparse matrix. - encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures) - validation_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :] +def save_pickled_pointwise_features(AND_dataset, sparse_matrix, + label_encoder_signatures, + random_seed: int = None): + """ + Fetch pointwise feature for dataset and store in a pickle. + """ - for block_id, list_of_signatures in test_block.items(): - # Let us transform each of those using label encoder and index them from the sparse matrix. 
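Note: the per-split pickles written later in this function can be sanity-checked independently of the pipeline. A hedged consumer-side sketch, assuming one of the pickle files produced by this function is available on disk:

    import pickle

    with open("train_signature_features.pkl", "rb") as f:  # path as written by this function
        block_to_features = pickle.load(f)
    for block_id, feats in block_to_features.items():
        # each value: sparse matrix of (signatures in block) x (pointwise features)
        print(block_id, feats.shape)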
- encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures) - test_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :] - - if(not os.path.exists(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}")): - os.makedirs(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}") - - train_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/train_features.pkl" - val_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/val_features.pkl" - test_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/test_features.pkl" - - with open(train_pkl,"wb") as _pkl_file: - pickle.dump(train_pointwise_features, _pkl_file) - with open(val_pkl,"wb") as _pkl_file: - pickle.dump(validation_pointwise_features, _pkl_file) - with open(test_pkl,"wb") as _pkl_file: - pickle.dump(test_pointwise_features, _pkl_file) + if random_seed: + train_block, val_block, test_block = AND_dataset.split_cluster_signatures() + + train_pointwise_features = {} + validation_pointwise_features = {} + test_pointwise_features = {} + + # The above three should have a key-list(val) (where val is a list of signature IDs) under them. + + # Doing for training block : + for block_id, list_of_signatures in train_block.items(): + # Let us transform each of those using label encoder and index them from the sparse matrix. + encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures) + train_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :] + + # Doing for validation block : + for block_id, list_of_signatures in val_block.items(): + # Let us transform each of those using label encoder and index them from the sparse matrix. + encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures) + validation_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :] + + for block_id, list_of_signatures in test_block.items(): + # Let us transform each of those using label encoder and index them from the sparse matrix. 
+            encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures)
+            test_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :]
+
+        if(not os.path.exists(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}")):
+            os.makedirs(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}")
+
+        train_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}/train_signature_features.pkl"
+        val_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}/val_signature_features.pkl"
+        test_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}/test_signature_features.pkl"
+
+        with open(train_pkl,"wb") as _pkl_file:
+            pickle.dump(train_pointwise_features, _pkl_file)
+        with open(val_pkl,"wb") as _pkl_file:
+            pickle.dump(validation_pointwise_features, _pkl_file)
+        with open(test_pkl,"wb") as _pkl_file:
+            pickle.dump(test_pointwise_features, _pkl_file)
+    else:
+        processed_data = {}
+        # NOTE: use the matrix passed in directly; recomputing it here would need
+        # data_home_dir and dataset_name, which are not in scope in this function.
+        processed_data['mention_level_features'] = sparse_matrix
+
+        logger.info('Dumping processed data')
+        file_name = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/{AND_dataset.name}_all_signature_features.pkl"
+
+        with open(file_name, 'wb') as f:
+            pickle.dump(processed_data, f)
 
-def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed):
+def create_signature_features_matrix(data_home_dir, dataset_name):
     """
-    Fetch pointwise feature for dataset and store in a pickle.
+    Generate the pointwise feature set for the entire dataset and return a sparse
+    matrix representation of each signature and its features.
     """
-    processed_data = {}
+    logger.info("Signature features pre-processing started")
     parent_dir = f"{data_home_dir}/{dataset_name}"
-    """
     AND_dataset = ANDData(
         signatures=join(parent_dir, f"{dataset_name}_signatures.json"),
         papers=join(parent_dir, f"{dataset_name}_papers.json"),
@@ -86,33 +101,27 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed):
         val_pairs_size=10000,
         test_pairs_size=10000,
         name=dataset_name,
-        n_jobs=16,
-        random_seed=random_seed
+        n_jobs=16
    )
 
-    print("Storing pickled dataset....")
-    with open(f'preprocess_dataset_{dataset_name}.pkl', 'wb') as f:
-        pickle.dump(AND_dataset, f)
-    """
-    # Use below line carefully. 
-    print("Loading pickled dataset...")
-    with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f:
-        AND_dataset = pickle.load(f)
-    print("Loaded pickle dataset...")
+# print("Storing pickled dataset....")
+# with open(f'preprocess_dataset_{dataset_name}.pkl', 'wb') as f:
+#     pickle.dump(AND_dataset, f)
+
+# # Use below line carefully. 
+# print("Loading pickled dataset...")
+# with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f:
+#     AND_dataset = pickle.load(f)
+# print("Loaded pickle dataset...")
+
+    point_features_mat, le_signatures = pointwise_featurize(AND_dataset,
+                                                            n_jobs=16,
+                                                            use_cache=False)
+
+    logger.info("Signature features pre-processing completed")
+    return point_features_mat, le_signatures
 
-    point_features_row, point_features_col, point_features_data, num_feats, num_points, le_feature_dict = pointwise_featurize(AND_dataset,
-                                                                                                                              n_jobs=16,
-                                                                                                                              use_cache=False)
-    logger.info('converting feature indices to csr_matrix')
-    point_features = coo_matrix(
-        (point_features_data, (point_features_row, point_features_col)),
-        shape=(num_points, num_feats)
-    ).tocsr()
-    print("Matrix creation done.")
-    save_pointwise_in_different_splits(AND_dataset, point_features, le_feature_dict, random_seed)
-
 if __name__=='__main__':
     # Creates the pickles that store the preprocessed data
     # Read cmd line args
@@ -129,5 +138,4 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed):
     params = args.__dict__
     data_home_dir = params["data_home_dir"]
     dataset = params["dataset_name"]
     random_seed = 1000
 
     print("Preprocessing started")
-    save_pickled_pointwise_features(data_home_dir, dataset, random_seed)
-    print("Matrix")
+    save_pickled_pointwise_features(data_home_dir, dataset)
diff --git a/s2and/featurizer.py b/s2and/featurizer.py
index 1ccc62b..15278e4 100644
--- a/s2and/featurizer.py
+++ b/s2and/featurizer.py
@@ -10,6 +10,7 @@
 from collections import Counter
 from collections.abc import Iterable
 from IPython import embed
+from scipy.sparse import csr_matrix, coo_matrix
 
 from sklearn import preprocessing
 
@@ -839,7 +840,7 @@ def pointwise_featurize(
     chunk_size: int = DEFAULT_CHUNK_SIZE,
 ):
     """
-    Featurizes the input dataset and stores as a unified pickle file.
+    Extract pointwise features from the dataset.
 
     Parameters
     ----------
@@ -855,16 +856,15 @@ def pointwise_featurize(
     Returns
     -------
-    Returns the three items :
-    1. Row indices of the sparse matrix containing the data
-    2. Column indices of the sparse matrix containing the data
-    3. The data to be filled in the given row and column combination.
+    Returns two items:
+    1. A sparse matrix holding the pointwise feature representation of all the signatures in a dataset.
+    2. A label encoder mapping signature IDs to matrix row indices.
     """
     # Do you think OrderedSet and OrderedDict should be used here?
     signature_feature_set = set()
     # The feature is stored a str and not tuple to facilitate label encoding.
     signature_dict = {}
     # We dont need to iterate signature per block as we need to create for all the signatures irrespective of the block.
-
+    logger.info('Creating signatures feature set...')
     for signature_key, values in dataset.signatures.items():
         per_signature_features = dataset.signatures[signature_key]._asdict()
         signature_dict[signature_key] = []
@@ -915,7 +915,9 @@ def pointwise_featurize(
             print('\n!!!! Found another type !!!!\n')
             embed()
             exit()
-    logger.info('Label encoding the values')
+    logger.info('Created signatures feature set...')
+
+    logger.info('Label encoding signature features...')
 
     # Label encoding code ---
     """"
@@ -927,7 +929,7 @@ def pointwise_featurize(
     """
     le_signature_feature_set = preprocessing.LabelEncoder()
     le_signature_feature_set.fit(list(signature_feature_set))
-
+    
     # I am using this for easy retrieval for training, val and test block retrieval. 
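Note: one property of this encoder matters downstream: sklearn's LabelEncoder sorts its classes, so row indices follow the lexicographic order of the signature IDs rather than their insertion order. A quick self-contained illustration:

    from sklearn import preprocessing

    le = preprocessing.LabelEncoder().fit(["s2", "s0", "s1"])
    print(le.classes_)                    # ['s0' 's1' 's2'] -- sorted, not insertion order
    print(le.transform(["s2"]))          # [2]
    print(le.inverse_transform([0, 2]))  # ['s0' 's2']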
     le_signature_dict = preprocessing.LabelEncoder()
     le_signature_dict.fit(list(signature_dict.keys()))
 
@@ -936,15 +938,22 @@
     num_points = len(signature_dict.keys())
     num_feats = len(signature_feature_set)
 
-    for _, (key, values) in tqdm(enumerate(signature_dict.items()), desc="Converting to sparse matrix"):
+    for key, values in tqdm(signature_dict.items(), desc="Converting to spare matrix"):
         encoded_signature_features = le_signature_feature_set.transform(values)
         encoded_key_val = le_signature_dict.transform([key])[0]
         for feature_label in encoded_signature_features :
             point_features_row.append(encoded_key_val)
             point_features_col.append(feature_label)
             point_features_data.append(1)
-
-    return point_features_row, point_features_col, point_features_data, num_feats, num_points, le_signature_dict
+    logger.info('Label encoding completed...')
+
+    logger.info('converting feature indices to csr_matrix')
+    point_features = coo_matrix(
+        (point_features_data, (point_features_row, point_features_col)),
+        shape=(num_points, num_feats)
+    ).tocsr()
+    print("Matrix creation done.")
+    return point_features, le_signature_dict
 
 
 def store_featurized_pickles(
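For reference, a minimal sketch (not part of the patch series, with made-up toy inputs) of the label-encode-and-stack construction that pointwise_featurize performs:

    # Toy illustration of building the per-signature sparse feature matrix.
    from scipy.sparse import coo_matrix
    from sklearn import preprocessing

    sig_features = {                      # signature id -> flattened string features
        "sig_2": ["('year', 2001)", "('venue', 'ACL')"],
        "sig_1": ["('year', 1999)"],
    }
    feat_le = preprocessing.LabelEncoder().fit(
        [f for feats in sig_features.values() for f in feats])
    sig_le = preprocessing.LabelEncoder().fit(list(sig_features.keys()))

    rows, cols, data = [], [], []
    for sig_id, feats in sig_features.items():
        r = sig_le.transform([sig_id])[0]     # row index for this signature
        for c in feat_le.transform(feats):    # one column per distinct feature
            rows.append(r); cols.append(c); data.append(1)

    X = coo_matrix((data, (rows, cols)),
                   shape=(len(sig_le.classes_), len(feat_le.classes_))).tocsr()
    # Note: LabelEncoder assigns indices by *sorted* class order, so row order
    # follows sorted signature ids, not insertion order.

The note in the last comment matters for the ordering checks added later in this series.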
") + indices = list(range(pointwise_matrix.shape[0])) + inverse_transformed_signature_ids = list(le_signature_ids.inverse_transform(indices)) + ordered = True + length = len(dict_obj) + if len(dict_obj) == pointwise_matrix.shape[0]: + print("The lengths are same") + index = 0 + while (ordered and index < length): + if inverse_transformed_signature_ids[index] == file_keys[index]: + index += 1 + else: + print("inverse_transformed_signature_ids[index] :", inverse_transformed_signature_ids[index]) + print("file_keys[index] : ", file_keys[index]) + ordered = False + print("The order is not same") + if ordered: + print("The order is same.") + + else: + print("The lengths are not same..") def save_featurized_data(data_home_dir, dataset_name, random_seed, point_features_mat, le_signatures): parent_dir = f"{data_home_dir}/{dataset_name}" @@ -46,6 +83,8 @@ def save_featurized_data(data_home_dir, dataset_name, random_seed, point_feature save_pickled_pointwise_features(AND_dataset, point_features_mat, le_signatures, random_seed) + validate_pointwise_featurizer(AND_dataset, point_features_mat, le_signatures) + train_pkl, val_pkl, test_pkl = store_featurized_pickles(AND_dataset, featurization_info, n_jobs=16, @@ -123,6 +162,17 @@ def find_total_num_train_pairs(blockwise_data): point_features_mat, le_signatures = create_signature_features_matrix(data_home_dir, dataset) + # Added this for speeding up while testing. + matrix_pickle_file_location = "./matrix_pickle.pkl" + + with open(matrix_pickle_file_location,"wb") as _pkl_file: + pickle.dump((point_features_mat, le_signatures), _pkl_file) + + """ + with open(matrix_pickle_file_location, 'rb') as f: + point_features_mat, le_signatures = pickle.load(f) + """ + random_seeds = [1, 2, 3, 4, 5] if params["dataset_seed"] is None else [params["dataset_seed"]] for seed in random_seeds: print("Preprocessing started for seed value", seed) @@ -135,4 +185,4 @@ def find_total_num_train_pairs(blockwise_data): test_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{seed}/test_features.pkl" blockwise_features = read_blockwise_features(train_pkl) find_total_num_train_pairs(blockwise_features) - #verify_diff_with_s2and(dataset, seed) \ No newline at end of file + #verify_diff_with_s2and(dataset, seed) diff --git a/e2e_scripts/preprocess_s2and_pointwise.py b/e2e_scripts/preprocess_s2and_pointwise.py index 2918d7c..50c482c 100644 --- a/e2e_scripts/preprocess_s2and_pointwise.py +++ b/e2e_scripts/preprocess_s2and_pointwise.py @@ -34,6 +34,7 @@ def save_pickled_pointwise_features(AND_dataset, sparse_matrix, """ if random_seed: + # This splits the signatures per three different blocks train_block, val_block, test_block = AND_dataset.split_cluster_signatures() train_pointwise_features = {} @@ -41,7 +42,8 @@ def save_pickled_pointwise_features(AND_dataset, sparse_matrix, test_pointwise_features = {} # The above three should have a key-list(val) (where val is a list of signature IDs) under them. - + # Below three for loops go through the blocks, gets the corresponding row index of the signature + # from the label encoder, splices the matrix with only those rows and stores per block. # Doing for training block : for block_id, list_of_signatures in train_block.items(): # Let us transform each of those using label encoder and index them from the sparse matrix. 
@@ -78,7 +80,7 @@ def save_pickled_pointwise_features(AND_dataset, sparse_matrix,
         processed_data['mention_level_features'] = point_features_mat
 
         logger.info('Dumping processed data')
-        file_name = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/{dataset_name}_all_signature_features.pkl"
+        file_name = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/{AND_dataset.name}_all_signature_features.pkl"
 
         with open(file_name, 'wb') as f:
             pickle.dump(processed_data, f)
diff --git a/s2and/featurizer.py b/s2and/featurizer.py
index 15278e4..e4c7a4b 100644
--- a/s2and/featurizer.py
+++ b/s2and/featurizer.py
@@ -871,6 +871,7 @@
 
         for feature_key, value in per_signature_features.items():
             index_key = None
+            # TODO : Why do we ignore these?
             features_to_ignore = [
                 'author_info_name_counts',
                 'author_info_position',
@@ -896,6 +897,8 @@
                 pass
 
         # Let us check the type of value for each signatures.
+        # This goes through each signature and, depending on the type of the key-val
+        # pair, flattens it so it can be used as a single feature in the sparse matrix.
         if isinstance(value, str) or isinstance(value, int):
             index_key = str((feature_key, value))
@@ -938,7 +941,7 @@
     num_points = len(signature_dict.keys())
     num_feats = len(signature_feature_set)
 
-    for key, values in tqdm(signature_dict.items(), desc="Converting to spare matrix"):
+    for key, values in tqdm(signature_dict.items(), desc="Converting to sparse matrix"):
         encoded_signature_features = le_signature_feature_set.transform(values)
         encoded_key_val = le_signature_dict.transform([key])[0]
         for feature_label in encoded_signature_features :
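A minimal sketch (outside the patch, with illustrative names) of the per-block slicing that save_pickled_pointwise_features performs above, assuming a fitted signature label encoder `sig_le` and a CSR matrix `X` with one row per signature:

    def split_matrix_by_block(blocks, X, sig_le):
        # blocks: block id -> list of signature ids
        per_block = {}
        for block_id, sig_ids in blocks.items():
            row_idx = sig_le.transform(sig_ids)   # signature ids -> row indices
            per_block[block_id] = X[row_idx, :]   # fancy indexing keeps sig_ids order
        return per_block

The row slice preserves the order of `sig_ids`, which is what the later validation commits rely on.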
From 2fc77e81a19fa24d8b03c899a9c907987c8b50d7 Mon Sep 17 00:00:00 2001
From: Sriharsha-hatwar
Date: Tue, 4 Apr 2023 23:27:50 +0000
Subject: [PATCH 16/17] Adding the folder for storing the pointwise matrix

---
 e2e_scripts/preprocess_s2and_pointwise.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/e2e_scripts/preprocess_s2and_pointwise.py b/e2e_scripts/preprocess_s2and_pointwise.py
index 50c482c..27242cd 100644
--- a/e2e_scripts/preprocess_s2and_pointwise.py
+++ b/e2e_scripts/preprocess_s2and_pointwise.py
@@ -61,12 +61,12 @@ def save_pickled_pointwise_features(AND_dataset, sparse_matrix,
             encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures)
             test_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :]
 
-        if(not os.path.exists(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}")):
-            os.makedirs(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}")
+        if(not os.path.exists(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}")):
+            os.makedirs(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}")
 
-        train_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}/train_signature_features.pkl"
-        val_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}/val_signature_features.pkl"
-        test_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}/test_signature_features.pkl"
+        train_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/train_signature_features.pkl"
+        val_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/val_signature_features.pkl"
+        test_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/test_signature_features.pkl"
 
         with open(train_pkl,"wb") as _pkl_file:
             pickle.dump(train_pointwise_features, _pkl_file)
@@ -120,6 +120,16 @@ def create_signature_features_matrix(data_home_dir, dataset_name):
                                                             n_jobs=16,
                                                             use_cache=False)
 
+    matrix_pickle_file_location = f'preprocess_matrix_{dataset_name}.pkl'
+    print("Storing pickled matrix ....")
+    with open(matrix_pickle_file_location, 'wb') as f:
+        pickle.dump((point_features_mat, le_signatures), f)
+
+    print("### loading from pickle")
+    with open(matrix_pickle_file_location, 'rb') as f:
+        point_features_mat, le_signatures = pickle.load(f)
+
+
     logger.info("Signature features pre-processing completed")
     return point_features_mat, le_signatures
 
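The dump-then-reload above always rebuilds the matrix; a guarded cache is the natural next step. A minimal sketch, outside the patch, where `build_fn` and the path are illustrative:

    import os
    import pickle

    def load_or_build(cache_path, build_fn):
        # Return the cached object if present, otherwise build and cache it.
        if os.path.exists(cache_path):
            with open(cache_path, "rb") as f:
                return pickle.load(f)
        result = build_fn()
        with open(cache_path, "wb") as f:
            pickle.dump(result, f)
        return result

    # e.g. point_features_mat, le_signatures = load_or_build(
    #     f"preprocess_matrix_{dataset_name}.pkl",
    #     lambda: pointwise_featurize(AND_dataset, n_jobs=16, use_cache=False))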
") - indices = list(range(pointwise_matrix.shape[0])) - inverse_transformed_signature_ids = list(le_signature_ids.inverse_transform(indices)) - ordered = True - length = len(dict_obj) - if len(dict_obj) == pointwise_matrix.shape[0]: - print("The lengths are same") +def validate_order_all_block(pointwise_block, features_block): + # First, check the blockwise similarity + keys_pointwise = list(pointwise_block.keys()) + keys_features = list(features_block.keys()) + + if len(keys_pointwise) == len(keys_features): + print("The number of blocks are same across the features") + ordered = True index = 0 - while (ordered and index < length): - if inverse_transformed_signature_ids[index] == file_keys[index]: - index += 1 + len_of_blocks = len(keys_pointwise) + while ordered and index < len_of_blocks: + if keys_pointwise[index] == keys_features[index]: + index+=1 else: - print("inverse_transformed_signature_ids[index] :", inverse_transformed_signature_ids[index]) - print("file_keys[index] : ", file_keys[index]) ordered = False - print("The order is not same") - if ordered: - print("The order is same.") + if not ordered: + print("The blocks are not in order.") + else: + print("The blocks are in order.") + return ordered + else: + print("The number of blocks in seed : ", seed, "are not the same across features") + return False + +def validate_order_inside_block(pointwise_block, features_block): + pointwise_signature_list = [] + signature_id_list = [] + + for block, val in pointwise_block.items(): + list_of_sig = val[0] + pointwise_signature_list.extend(list_of_sig) + + #print("pointwise_signature_list : ", pointwise_signature_list) + + for block, val in features_block.items(): + list_of_sig = [sigs.signature_id for sigs in val] + signature_id_list.extend(list_of_sig) + + #print("signature_id_list : ", signature_id_list) + + # Now for validation part. + ordered = True + index = 0 + len_of_sigs = len(pointwise_signature_list) + while ordered and index < len_of_sigs: + if pointwise_signature_list[index] == signature_id_list[index]: + index += 1 + else: + ordered = False + if not ordered: + print("The Signatures are not in order.") else: - print("The lengths are not same..") + print("The Signatures are in order.") + return ordered + +def validate_pointwise_featurizer(dataset): + print("### --- Validating the pointwise matrix creation") + # Need to go through each pickle file in all the seeds. + seeds = [1] + + for seed in seeds: + train_point_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/pointwise/seed{seed}/train_signature_features.pkl" + val_point_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/pointwise/seed{seed}/val_signature_features.pkl" + test_point_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/pointwise/seed{seed}/test_signature_features.pkl" + + + train_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/seed{seed}/train_signatures.pkl" + val_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/seed{seed}/val_signatures.pkl" + test_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/seed{seed}/test_signatures.pkl" + + # This is what the pointwise has created. + with open(train_point_loc, 'rb') as f: + train_block_pointwise_data = pickle.load(f) + + with open(train_loc, 'rb') as f: + train_block_data = pickle.load(f) + + + # For training block. 
+def validate_pointwise_featurizer(dataset):
+    print("### --- Validating the pointwise matrix creation")
+    # Need to go through each pickle file in all the seeds.
+    seeds = [1]
+
+    for seed in seeds:
+        train_point_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/pointwise/seed{seed}/train_signature_features.pkl"
+        val_point_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/pointwise/seed{seed}/val_signature_features.pkl"
+        test_point_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/pointwise/seed{seed}/test_signature_features.pkl"
+
+
+        train_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/seed{seed}/train_signatures.pkl"
+        val_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/seed{seed}/val_signatures.pkl"
+        test_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/seed{seed}/test_signatures.pkl"
+
+        # This is what the pointwise featurizer created.
+        with open(train_point_loc, 'rb') as f:
+            train_block_pointwise_data = pickle.load(f)
+
+        with open(train_loc, 'rb') as f:
+            train_block_data = pickle.load(f)
+
+
+        # For the training block.
+        train_blocks_in_order = validate_order_all_block(train_block_pointwise_data, train_block_data)
+
+        if train_blocks_in_order:
+            print("Training blocks of seed", seed, "are in order")
+        else:
+            print("Training blocks of seed", seed, "are not in order")
+
+        is_signature_train_in_order = validate_order_inside_block(train_block_pointwise_data, train_block_data)
+
+        if is_signature_train_in_order:
+            print("Training signatures of seed", seed, "are in order")
+        else:
+            print("Training signatures of seed", seed, "are not in order")
+
+
+
+        # For the validation split.
+        with open(val_point_loc, 'rb') as f:
+            val_block_pointwise_data = pickle.load(f)
+        with open(val_loc, 'rb') as f:
+            val_block_data = pickle.load(f)
+
+        val_blocks_in_order = validate_order_all_block(val_block_pointwise_data, val_block_data)
+
+        if val_blocks_in_order:
+            print("Validation blocks of seed", seed, "are in order")
+        else:
+            print("Validation blocks of seed", seed, "are not in order")
+
+        is_signature_val_in_order = validate_order_inside_block(val_block_pointwise_data, val_block_data)
+
+        if is_signature_val_in_order:
+            print("Validation signatures of seed", seed, "are in order")
+        else:
+            print("Validation signatures of seed", seed, "are not in order")
+
+
+        # For the test split.
+        with open(test_point_loc, 'rb') as f:
+            test_block_pointwise_data = pickle.load(f)
+        with open(test_loc, 'rb') as f:
+            test_block_data = pickle.load(f)
+
+        test_blocks_in_order = validate_order_all_block(test_block_pointwise_data, test_block_data)
+
+        if test_blocks_in_order:
+            print("Test blocks of seed", seed, "are in order")
+        else:
+            print("Test blocks of seed", seed, "are not in order")
+
+        is_signature_test_in_order = validate_order_inside_block(test_block_pointwise_data, test_block_data)
+
+        if is_signature_test_in_order:
+            print("Test signatures of seed", seed, "are in order")
+        else:
+            print("Test signatures of seed", seed, "are not in order")
+
 
 def save_featurized_data(data_home_dir, dataset_name, random_seed, point_features_mat, le_signatures):
     parent_dir = f"{data_home_dir}/{dataset_name}"
@@ -81,15 +183,20 @@ def save_featurized_data(data_home_dir, dataset_name, random_seed, point_feature
     featurization_info = FeaturizationInfo()
     logger.info("Loaded featurization info")
 
-    save_pickled_pointwise_features(AND_dataset, point_features_mat, le_signatures, random_seed)
+    #save_pickled_pointwise_features(AND_dataset, point_features_mat, le_signatures, random_seed)
 
-    validate_pointwise_featurizer(AND_dataset, point_features_mat, le_signatures)
+
     train_pkl, val_pkl, test_pkl = store_featurized_pickles(AND_dataset,
                                                             featurization_info,
                                                             n_jobs=16,
                                                             use_cache=False,
-                                                            random_seed=random_seed)
+                                                            random_seed=random_seed,
+                                                            pointwise_matrix=point_features_mat,
+                                                            le_signatures=le_signatures)
+
+    validate_pointwise_featurizer(AND_dataset)
+    print(" ## Validation and save process completed.")
 
     return train_pkl, val_pkl, test_pkl
 
@@ -173,7 +280,7 @@ def find_total_num_train_pairs(blockwise_data):
         point_features_mat, le_signatures = pickle.load(f)
     """
 
-    random_seeds = [1, 2, 3, 4, 5] if params["dataset_seed"] is None else [params["dataset_seed"]]
+    random_seeds = [1] if params["dataset_seed"] is None else [params["dataset_seed"]]
     for seed in random_seeds:
         print("Preprocessing started for seed value", seed)
         save_featurized_data(data_home_dir, dataset, seed, point_features_mat, le_signatures)
@@ -183,6 +290,6 @@
         train_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{seed}/train_features.pkl"
         val_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{seed}/val_features.pkl"
f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{seed}/val_features.pkl" test_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{seed}/test_features.pkl" - blockwise_features = read_blockwise_features(train_pkl) - find_total_num_train_pairs(blockwise_features) + #blockwise_features = read_blockwise_features(train_pkl) + #find_total_num_train_pairs(blockwise_features) #verify_diff_with_s2and(dataset, seed) diff --git a/e2e_scripts/preprocess_s2and_pointwise.py b/e2e_scripts/preprocess_s2and_pointwise.py index 27242cd..dfe8f6e 100644 --- a/e2e_scripts/preprocess_s2and_pointwise.py +++ b/e2e_scripts/preprocess_s2and_pointwise.py @@ -48,18 +48,18 @@ def save_pickled_pointwise_features(AND_dataset, sparse_matrix, for block_id, list_of_signatures in train_block.items(): # Let us transform each of those using label encoder and index them from the sparse matrix. encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures) - train_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :] + train_pointwise_features[block_id] = (list_of_signatures, sparse_matrix[encoded_signature_id_list, :]) # Doing for validation block : for block_id, list_of_signatures in val_block.items(): # Let us transform each of those using label encoder and index them from the sparse matrix. encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures) - validation_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :] + validation_pointwise_features[block_id] = (list_of_signatures, sparse_matrix[encoded_signature_id_list, :]) for block_id, list_of_signatures in test_block.items(): # Let us transform each of those using label encoder and index them from the sparse matrix. encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures) - test_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :] + test_pointwise_features[block_id] = (list_of_signatures, sparse_matrix[encoded_signature_id_list, :]) if(not os.path.exists(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}")): os.makedirs(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}") diff --git a/s2and/featurizer.py b/s2and/featurizer.py index e4c7a4b..a72b5d7 100644 --- a/s2and/featurizer.py +++ b/s2and/featurizer.py @@ -9,6 +9,7 @@ import logging from collections import Counter from collections.abc import Iterable +from collections import OrderedDict from IPython import embed from scipy.sparse import csr_matrix, coo_matrix @@ -856,12 +857,16 @@ def pointwise_featurize( Returns ------- Returns the three items : - 1. Sparse matrix poitwise feature representation of all the signatures in a dataset. + 1. Sparse matrix pointwise feature representation of all the signatures in a dataset. 2. Label encoder to index signature according to their ids """ # Do you think OrderedSet and OrderedDict should be used here? - signature_feature_set = set() # The feature is stored a str and not tuple to facilitate label encoding. - signature_dict = {} + # I am using this to facilitate the order to be maintained. + # signature_feature_dict is facilitating an ordered set storage by using an ordered dict + # and is used to store the feature + # signature_dict - Is an ordered storage of Signature IDs. + signature_feature_dict = OrderedDict() # The feature is stored a str and not tuple to facilitate label encoding. 
diff --git a/s2and/featurizer.py b/s2and/featurizer.py
index e4c7a4b..a72b5d7 100644
--- a/s2and/featurizer.py
+++ b/s2and/featurizer.py
@@ -9,6 +9,7 @@
 import logging
 from collections import Counter
 from collections.abc import Iterable
+from collections import OrderedDict
 from IPython import embed
 
 from scipy.sparse import csr_matrix, coo_matrix
@@ -856,12 +857,16 @@
     Returns
     -------
     Returns two items:
-    1. Sparse matrix poitwise feature representation of all the signatures in a dataset.
+    1. Sparse matrix pointwise feature representation of all the signatures in a dataset.
     2. Label encoder for indexing signatures by their IDs.
     """
     # Do you think OrderedSet and OrderedDict should be used here?
-    signature_feature_set = set()  # The feature is stored as a str, not a tuple, to facilitate label encoding.
-    signature_dict = {}
+    # Using OrderedDicts so that insertion order is maintained.
+    # signature_feature_dict acts as an ordered set (an OrderedDict with unused values)
+    # and stores the features.
+    # signature_dict is an ordered mapping from signature IDs to their features.
+    signature_feature_dict = OrderedDict()  # The feature is stored as a str, not a tuple, to facilitate label encoding.
+    signature_dict = OrderedDict()
     # We don't need to iterate signatures per block, since we create features for all signatures irrespective of the block.
     logger.info('Creating signatures feature set...')
@@ -902,17 +907,23 @@
 
         if isinstance(value, str) or isinstance(value, int):
             index_key = str((feature_key, value))
-            signature_feature_set.add(index_key) # Converting to str from tuple.
+            #signature_feature_set.add(index_key) # Converting to str from tuple.
+            if index_key not in signature_feature_dict:
+                signature_feature_dict[index_key] = None
             signature_dict[signature_key].append(index_key)
         elif isinstance(value, Counter):
             for val in value.keys():
                 index_key = str((feature_key, val))
-                signature_feature_set.add(index_key)
+                #signature_feature_set.add(index_key)
+                if index_key not in signature_feature_dict:
+                    signature_feature_dict[index_key] = None
                 signature_dict[signature_key].append(index_key)
         elif isinstance(value, Iterable):
             for val in value:
                 index_key = str((feature_key, val))
-                signature_feature_set.add(index_key)
+                #signature_feature_set.add(index_key)
+                if index_key not in signature_feature_dict:
+                    signature_feature_dict[index_key] = None
                 signature_dict[signature_key].append(index_key)
         else:
             print('\n!!!! Found another type !!!!\n')
@@ -930,8 +941,12 @@
     }
     """
 
+    # Before label encoding, convert signature_feature_dict (an ordered dict)
+    # into a list of features by just taking its keys.
+    signature_feature_set = list(signature_feature_dict.keys())
+
     le_signature_feature_set = preprocessing.LabelEncoder()
-    le_signature_feature_set.fit(list(signature_feature_set))
+    le_signature_feature_set.fit(signature_feature_set)
 
     # Using this for easy retrieval of the training, val, and test blocks.
@@ -958,6 +973,21 @@
     print("Matrix creation done.")
     return point_features, le_signature_dict
 
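The OrderedDict-with-None-values pattern used above is the standard ordered-set idiom in Python. A minimal illustration, outside the patch (plain dicts also preserve insertion order on Python 3.7+):

    from collections import OrderedDict

    seen = OrderedDict()
    for feat in ["b", "a", "b", "c"]:
        if feat not in seen:
            seen[feat] = None
    print(list(seen.keys()))   # ['b', 'a', 'c'] -- duplicates dropped, order kept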
+def create_pointwise_block(object_list, pointwise_matrix, le_signatures):
+    # object_list maps block ids to lists of signature objects.
+    # For each block, collect the signature IDs, label-encode them to row indices,
+    # and slice those rows out of the pointwise matrix.
+
+    pointwise_features = {}
+    for key, val in object_list.items():
+        # Here val is a list of signature objects; get the ID from each of them.
+        signatures_ids = [sig.signature_id for sig in val]
+        encoded_signature_id_list = le_signatures.transform(signatures_ids)
+        pointwise_features[key] = (signatures_ids, pointwise_matrix[encoded_signature_id_list, :])
+
+    return pointwise_features
+
 
 def store_featurized_pickles(
     dataset: ANDData,
@@ -969,6 +999,8 @@
     nan_value: float = np.nan,
     delete_training_data: bool = False,
     random_seed: int = 1,
+    pointwise_matrix = None,
+    le_signatures = None,
 ) -> Union[Tuple[TupleOfArrays, TupleOfArrays, TupleOfArrays], TupleOfArrays]:
     """
     Featurizes the input dataset and stores as preprocessed data in pickle files
@@ -1119,18 +1151,41 @@
     val_signatures_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/seed{random_seed}/val_signatures.pkl"
     test_signatures_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/seed{random_seed}/test_signatures.pkl"
 
+    train_object_list: Dict[str, List[Signature]] = dataset.get_signature_objects(train_signatures)
+    val_object_list: Dict[str, List[Signature]] = dataset.get_signature_objects(val_signatures)
+    test_object_list: Dict[str, List[Signature]] = dataset.get_signature_objects(test_signatures)
+
     if(not os.path.isfile(train_signatures_pkl)):
-        train_object_list: Dict[str, List[Signature]] = dataset.get_signature_objects(train_signatures)
+        #train_object_list = dataset.get_signature_objects(train_signatures)
         with open(train_signatures_pkl, "wb") as _pkl_file:
             pickle.dump(train_object_list, _pkl_file)
 
-        val_object_list: Dict[str, List[Signature]] = dataset.get_signature_objects(val_signatures)
+        #val_object_list = dataset.get_signature_objects(val_signatures)
         with open(val_signatures_pkl, "wb") as _pkl_file:
             pickle.dump(val_object_list, _pkl_file)
 
-        test_object_list: Dict[str, List[Signature]] = dataset.get_signature_objects(test_signatures)
+        #test_object_list = dataset.get_signature_objects(test_signatures)
         with open(test_signatures_pkl, "wb") as _pkl_file:
             pickle.dump(test_object_list, _pkl_file)
 
+    # Now utilize this to create the train, val, and test pointwise features per block.
+
+    train_pointwise_features = create_pointwise_block(train_object_list, pointwise_matrix, le_signatures)
+    validation_pointwise_features = create_pointwise_block(val_object_list, pointwise_matrix, le_signatures)
+    test_pointwise_features = create_pointwise_block(test_object_list, pointwise_matrix, le_signatures)
+
+    if(not os.path.exists(f"{PREPROCESSED_DATA_DIR}/{dataset.name}/pointwise/seed{random_seed}")):
+        os.makedirs(f"{PREPROCESSED_DATA_DIR}/{dataset.name}/pointwise/seed{random_seed}")
+
+    train_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/pointwise/seed{random_seed}/train_signature_features.pkl"
+    val_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/pointwise/seed{random_seed}/val_signature_features.pkl"
+    test_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/pointwise/seed{random_seed}/test_signature_features.pkl"
+
+    with open(train_pkl,"wb") as _pkl_file:
+        pickle.dump(train_pointwise_features, _pkl_file)
+    with open(val_pkl,"wb") as _pkl_file:
+        pickle.dump(validation_pointwise_features, _pkl_file)
+    with open(test_pkl,"wb") as _pkl_file:
+        pickle.dump(test_pointwise_features, _pkl_file)
 
     return train_pkl, val_pkl, test_pkl
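A sketch of the end-to-end call order implied by this series; the names mirror the scripts above, but the wiring shown here is illustrative rather than a verbatim excerpt:

    # Build the global signature feature matrix and its signature label encoder once.
    point_features_mat, le_signatures = create_signature_features_matrix(data_home_dir, dataset)

    # Then, per seed: split, featurize pairs, write per-block pointwise pickles,
    # and validate that block and signature order match the signature pickles.
    for seed in random_seeds:
        train_pkl, val_pkl, test_pkl = store_featurized_pickles(
            AND_dataset, featurization_info, n_jobs=16, use_cache=False,
            random_seed=seed, pointwise_matrix=point_features_mat,
            le_signatures=le_signatures)
        validate_pointwise_featurizer(AND_dataset)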