From fd407c049b66254e0335ca64c39c336aba986a9e Mon Sep 17 00:00:00 2001 From: Sriharsha-hatwar Date: Sat, 11 Mar 2023 21:10:12 +0000 Subject: [PATCH 01/17] Changes to requirements.in --- requirements.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.in b/requirements.in index 509d2a8..727a17e 100644 --- a/requirements.in +++ b/requirements.in @@ -25,7 +25,7 @@ wandb ############################# # HummingBird Requirements -#awscli==1.25.90 +awscli==1.25.90 #git+https://github.com/microsoft/hummingbird.git@mainterl/fine-tune-trees # ECC Layer Requirements @@ -33,4 +33,4 @@ higra==0.6.4 cvxpy cvxpylayers pytorch_lightning -git+https://github.com/dhdhagar/NeuMiss.git@dev \ No newline at end of file +git+https://github.com/dhdhagar/NeuMiss.git@dev From bb5e19cbb8f8172f419268852a496045d92a1338 Mon Sep 17 00:00:00 2001 From: Dhruv Agarwal Date: Wed, 15 Mar 2023 15:23:09 -0400 Subject: [PATCH 02/17] Debugging, SDP fixes, grad_acc, no-sdp, bceloss, local runs, bug fixes (#39) * Add --debug and --track_errors to log cvxpylayer errors * Fix save_to_wandb call for hyperparameters * Fix tensor serializable error * Fix for --eval_only_split flow * Add `sdp_scale` hyperparameter to scale the weight matrix to the SDP layer by the maximum element * Add `gradient_accumulation` hyperparameter * Change run defaults and sweep configs * Add sweep prefix option to run_sweep * Increase sweep agent memory * Clamp cvxpy output to [0,1] * Address meshgrid warning * Add `weighted_loss` to e2e sweep config * Modify run_sweep.sh to take in seed start and end values * Add `use_sdp` hyperparam to control whether to use the SDP during training and inference or directly use the MLP output with HAC-cut * Log errors before crashing; make error tracking the default behavior * Exception handling improvements * Make tqdm verbose even in silent mode * Save best dev model before testing * Add *-nosdp sweep configurations * Add `e2e_loss` hyperparam to control whether to use Frobenius or BCE loss * Change default subsampling to 80 (train) and 100 (dev) * Add --local to run with wandb disabled, change default weighted_loss to false to stay consistent with icml23 submission --- add_agent.sh | 2 +- e2e_debug/solve.py | 140 +++++++++++ e2e_pipeline/hac_cut_layer.py | 22 +- e2e_pipeline/model.py | 13 +- e2e_pipeline/sdp_layer.py | 62 +++-- e2e_scripts/evaluate.py | 65 +++-- e2e_scripts/train.py | 291 +++++++++++++++------- e2e_scripts/train_utils.py | 49 +++- run_sweep.sh | 14 +- utils/parser.py | 12 + wandb_configs/sweeps/e2e-nosdp-warm.json | 16 ++ wandb_configs/sweeps/e2e-nosdp.json | 15 ++ wandb_configs/sweeps/e2e.json | 2 +- wandb_configs/sweeps/frac-nosdp-warm.json | 17 ++ wandb_configs/sweeps/frac-nosdp.json | 16 ++ wandb_configs/sweeps/frac.json | 2 +- wandb_configs/sweeps/mlp.json | 5 +- 17 files changed, 588 insertions(+), 155 deletions(-) create mode 100644 e2e_debug/solve.py create mode 100644 wandb_configs/sweeps/e2e-nosdp-warm.json create mode 100644 wandb_configs/sweeps/e2e-nosdp.json create mode 100644 wandb_configs/sweeps/frac-nosdp-warm.json create mode 100644 wandb_configs/sweeps/frac-nosdp.json diff --git a/add_agent.sh b/add_agent.sh index 19b0a2b..13b93f5 100644 --- a/add_agent.sh +++ b/add_agent.sh @@ -10,7 +10,7 @@ gpu_name=${6:-"gypsum-1080ti"} # "gypsum-1080ti" for ((i = 1; i <= ${n_agents}; i++)); do JOB_DESC=${model}_${dataset}_sweep${seed}-${i} && JOB_NAME=${JOB_DESC}_$(date +%s) && \ sbatch -J ${JOB_NAME} -e jobs/${JOB_NAME}.err -o jobs/${JOB_NAME}.log \ - 
--partition=${gpu_name} --gres=gpu:1 --mem=80G --time=12:00:00 \ + --partition=${gpu_name} --gres=gpu:1 --mem=100G --time=12:00:00 \ run_sbatch.sh e2e_scripts/train.py \ --dataset="${dataset}" \ --dataset_random_seed=${seed} \ diff --git a/e2e_debug/solve.py b/e2e_debug/solve.py new file mode 100644 index 0000000..6e47ef2 --- /dev/null +++ b/e2e_debug/solve.py @@ -0,0 +1,140 @@ +import json +import argparse +import cvxpy as cp +import logging +import numpy as np +import torch + +from IPython import embed + +from e2e_pipeline.hac_cut_layer import HACCutLayer + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) +logger = logging.getLogger(__name__) + + +class Parser(argparse.ArgumentParser): + def __init__(self): + super().__init__() + self.add_argument( + "--data_fpath", type=str + ) + self.add_argument( + "--data_idx", type=int, default=0 + ) + self.add_argument( + "--scs_max_sdp_iters", type=int, default=50000 + ) + self.add_argument( + "--scs_silent", action="store_true", + ) + self.add_argument( + "--scs_eps", type=float, default=1e-3 + ) + self.add_argument( + "--scs_scale", type=float, default=1e-1, + ) + self.add_argument( + "--scs_dont_normalize", action="store_true", + ) + self.add_argument( + "--scs_use_indirect", action="store_true", + ) + self.add_argument( + "--scs_dont_use_quad_obj", action="store_true", + ) + self.add_argument( + "--scs_alpha", type=float, default=1.5 + ) + self.add_argument( + "--scs_log_csv_filename", type=str, + ) + self.add_argument( + "--interactive", action="store_true", + ) + + +if __name__ == '__main__': + parser = Parser() + args = parser.parse_args() + logger.info("Script arguments:") + logger.info(args.__dict__) + + # Read error file + logger.info("Reading input data") + with open(args.data_fpath, 'r') as fh: + data = json.load(fh) + assert len(data['errors']) > 0 + # Pick specific error instance to process + error_data = data['errors'][args.data_idx] + + # Extract input data from the error instance + _raw = np.array(error_data['model_call_args']['data']) + _W_val = np.array(error_data['cvxpy_layer_args']['W_val']) + + # Construct cvxpy problem + logger.info('Constructing optimization problem') + # edge_weights = _W_val.tocoo() + n = _W_val.shape[0] + W = _W_val + # W = csr_matrix((edge_weights.data, (edge_weights.row, edge_weights.col)), shape=(n, n)) + X = cp.Variable((n, n), PSD=True) + # Build constraint set + constraints = [ + cp.diag(X) == np.ones((n,)), + X[:n, :] >= 0, + X[:n, :] <= 1 + ] + + # Setup HAC Cut + hac_cut = HACCutLayer() + hac_cut.eval() + + sdp_obj_value = float('inf') + result_idxs, results_X, results_clustering = [], [], [] + no_solution_scaling_factors = [] + for i in range(1, 10): # n + # Skipping 1; no scaling leads to non-convergence (infinite objective value) + if i == 1: + scaling_factor = np.max(W) + else: + scaling_factor = i + logger.info(f'Scaling factor={scaling_factor}') + # Create problem + W_scaled = W / scaling_factor + problem = cp.Problem(cp.Maximize(cp.trace(W_scaled @ X)), constraints) + # Solve problem + sdp_obj_value = problem.solve( + solver=cp.SCS, + verbose=not args.scs_silent, + max_iters=args.scs_max_sdp_iters, + eps=args.scs_eps, + normalize=not args.scs_dont_normalize, + alpha=args.scs_alpha, + scale=args.scs_scale, + use_indirect=args.scs_use_indirect, + use_quad_obj=not args.scs_dont_use_quad_obj, + log_csv_filename=args.scs_log_csv_filename + ) + logger.info(f"@scaling={scaling_factor}, objective value = 
{sdp_obj_value}, norm={np.linalg.norm(W_scaled)}") + if sdp_obj_value != float('inf'): + result_idxs.append(i) + results_X.append(X.value) + # Find clustering solution + hac_cut.get_rounded_solution(torch.tensor(X.value), torch.tensor(W_scaled)) + results_clustering.append(hac_cut.cluster_labels.numpy()) + else: + no_solution_scaling_factors.append(scaling_factor) + logger.info(f"Solution not found = {len(no_solution_scaling_factors)}") + logger.info(f"Solution found = {len(results_X)}") + + logger.info("Same clustering:") + for i in range(len(results_clustering)-1): + logger.info(np.array_equal(results_clustering[i], results_clustering[i + 1])) + # logger.info(f"Solution found with scaling factor = {scaling_factor}") + # if args.interactive and sdp_obj_value == float('inf'): + # embed() + + if args.interactive: + embed() diff --git a/e2e_pipeline/hac_cut_layer.py b/e2e_pipeline/hac_cut_layer.py index c2b5a99..f3e44a4 100644 --- a/e2e_pipeline/hac_cut_layer.py +++ b/e2e_pipeline/hac_cut_layer.py @@ -13,7 +13,7 @@ def __init__(self): Takes fractional SDP output as input, and simultaneously builds & cuts avg. HAC tree to get rounded solution. Executes straight-through estimator as the backward pass. """ - def get_rounded_solution(self, X, weights, _MAX_DIST=10, use_similarities=True, max_similarity=1, verbose=False): + def get_rounded_solution(self, X, weights, _MAX_DIST=1000, use_similarities=True, max_similarity=1, verbose=False): """ X is a symmetric NxN matrix of fractional, decision values with a 1-diagonal (output from the SDP layer) weights is an NxN upper-triangular (shift 1) matrix of edge weights @@ -34,7 +34,8 @@ def get_rounded_solution(self, X, weights, _MAX_DIST=10, use_similarities=True, round_matrix = torch.eye(D, device=device) # Take the upper triangular and mask the other values with a large number - Y = _MAX_DIST * torch.ones(D, D, device=device).tril() + (max_similarity-X if use_similarities else X).triu(1) + _MAX_DIST = torch.max(torch.abs(X)) * _MAX_DIST + Y = _MAX_DIST * torch.ones(D, D, device=device).tril() + (max_similarity - X if use_similarities else X).triu(1) # Compute the dissimilarity minima per row values, indices = torch.min(Y, dim=1) @@ -100,7 +101,7 @@ def get_rounded_solution(self, X, weights, _MAX_DIST=10, use_similarities=True, # Energy calculations clustering[max_node] = clustering[parent_1] + clustering[parent_2] leaf_indices = torch.where(clustering[max_node])[0] - leaf_edges = torch.meshgrid(leaf_indices, leaf_indices) + leaf_edges = torch.meshgrid(leaf_indices, leaf_indices, indexing='ij') energy[max_node] = energy[parent_1] + energy[parent_2] merge_energy = torch.sum(weights[leaf_edges]) if merge_energy >= energy[max_node]: @@ -123,9 +124,16 @@ def get_rounded_solution(self, X, weights, _MAX_DIST=10, use_similarities=True, self.round_matrix = round_matrix self.cluster_labels = clustering[-1] self.parents = parents - objective_matrix = weights * torch.triu(round_matrix, diagonal=1) - self.objective_value = (energy[max_node] - torch.sum(objective_matrix[objective_matrix < 0])).item() # MA + with torch.no_grad(): + objective_matrix = weights * torch.triu(round_matrix, diagonal=1) + self.objective_value = (energy[max_node] - torch.sum(objective_matrix[objective_matrix < 0])).item() # MA return self.round_matrix - def forward(self, X, W, use_similarities=True): - return X + (self.get_rounded_solution(X, W, use_similarities=use_similarities) - X).detach() + def forward(self, X, W, use_similarities=True, return_triu=False): + solution = X + 
(self.get_rounded_solution(X, W, + use_similarities=use_similarities, + max_similarity=torch.max(X)) - X).detach() + if return_triu: + triu_indices = torch.triu_indices(len(solution), len(solution), offset=1) + return solution[triu_indices[0], triu_indices[1]] + return solution diff --git a/e2e_pipeline/model.py b/e2e_pipeline/model.py index ddabe71..928ae21 100644 --- a/e2e_pipeline/model.py +++ b/e2e_pipeline/model.py @@ -15,7 +15,8 @@ class EntResModel(torch.nn.Module): def __init__(self, n_features, neumiss_depth, dropout_p, dropout_only_once, add_neumiss, neumiss_deq, hidden_dim, n_hidden_layers, add_batchnorm, activation, - negative_slope, hidden_config, sdp_max_iters, sdp_eps, use_rounded_loss=True): + negative_slope, hidden_config, sdp_max_iters, sdp_eps, sdp_scale=False, use_rounded_loss=True, + return_triu_on_train=False, use_sdp=True): super().__init__() # Layers self.mlp_layer = MLPLayer(n_features=n_features, neumiss_depth=neumiss_depth, dropout_p=dropout_p, @@ -23,10 +24,12 @@ def __init__(self, n_features, neumiss_depth, dropout_p, dropout_only_once, add_ hidden_dim=hidden_dim, n_hidden_layers=n_hidden_layers, add_batchnorm=add_batchnorm, activation=activation, negative_slope=negative_slope, hidden_config=hidden_config) self.uncompress_layer = UncompressTransformLayer() - self.sdp_layer = SDPLayer(max_iters=sdp_max_iters, eps=sdp_eps) + self.sdp_layer = SDPLayer(max_iters=sdp_max_iters, eps=sdp_eps, scale_input=sdp_scale) self.hac_cut_layer = HACCutLayer() # Configs self.use_rounded_loss = use_rounded_loss + self.return_triu_on_train = return_triu_on_train + self.use_sdp = use_sdp def forward(self, x, N, warmstart=False, verbose=False): edge_weights = torch.squeeze(self.mlp_layer(x)) @@ -41,14 +44,16 @@ def forward(self, x, N, warmstart=False, verbose=False): logger.info(f"Size of W_matrix = {edge_weights_uncompressed.size()}") logger.info(f"\n{edge_weights_uncompressed}") - output_probs = self.sdp_layer(edge_weights_uncompressed, N) + output_probs = self.sdp_layer(edge_weights_uncompressed, N, use_sdp=self.use_sdp, return_triu=( + self.training and not self.use_rounded_loss and self.return_triu_on_train)) if verbose: logger.info(f"Size of X = {output_probs.size()}") logger.info(f"\n{output_probs}") if self.training and not self.use_rounded_loss: return output_probs - pred_clustering = self.hac_cut_layer(output_probs, edge_weights_uncompressed) + pred_clustering = self.hac_cut_layer(output_probs, edge_weights_uncompressed, + return_triu=(self.training and self.return_triu_on_train)) if verbose: logger.info(f"Size of X_r = {pred_clustering.size()}") logger.info(f"\n{pred_clustering}") diff --git a/e2e_pipeline/sdp_layer.py b/e2e_pipeline/sdp_layer.py index 5a27ceb..5a57def 100644 --- a/e2e_pipeline/sdp_layer.py +++ b/e2e_pipeline/sdp_layer.py @@ -11,14 +11,28 @@ level=logging.INFO) logger = logging.getLogger(__name__) + class CvxpyException(Exception): - pass + def __init__(self, data=None): + self.data = data + + +def get_max_agree_objective(weights, probs, verbose=False): + with torch.no_grad(): + objective_matrix = weights * torch.triu(probs, diagonal=1) + objective_value_IC = torch.sum(objective_matrix).item() + objective_value_MA = objective_value_IC - torch.sum(objective_matrix[objective_matrix < 0]).item() + if verbose: + logger.info(f'SDP objective: intra-cluster={objective_value_IC}, max-agree={objective_value_MA}') + return objective_value_MA + class SDPLayer(torch.nn.Module): - def __init__(self, max_iters: int = 50000, eps: float = 1e-3): + def __init__(self, 
max_iters: int = 50000, eps: float = 1e-3, scale_input=False): super().__init__() self.max_iters = max_iters self.eps = eps + self.scale_input = scale_input self.objective_value = None # Stores the last run objective value def build_and_solve_sdp(self, W_val, N, verbose=False): @@ -46,26 +60,46 @@ def build_and_solve_sdp(self, W_val, N, verbose=False): # Forward pass through the SDP cvxpylayer try: - pw_probs = self.cvxpy_layer(W_val, solver_args={ + pw_prob_matrix = self.cvxpy_layer(W_val, solver_args={ "solve_method": "SCS", "verbose": verbose, "max_iters": self.max_iters, "eps": self.eps })[0] + # Fix to prevent invalid solution values close to 0 and 1 but outside the range + pw_prob_matrix = torch.clamp(pw_prob_matrix, min=0, max=1) except: logger.error(f'CvxpyException: Error running forward pass on W_val of shape {W_val.shape}') - raise CvxpyException() + raise CvxpyException(data={ + 'W_val': W_val.detach().tolist(), + 'solver_args': { + "solve_method": "SCS", + "verbose": verbose, + "max_iters": self.max_iters, + "eps": self.eps + } + }) + objective_value_MA = get_max_agree_objective(W_val, pw_prob_matrix, verbose=verbose) + return objective_value_MA, pw_prob_matrix + + def get_sigmoid_matrix(self, W_val, N, verbose=False): + pw_prob_matrix = torch.sigmoid(W_val) + objective_value_MA = get_max_agree_objective(W_val, pw_prob_matrix, verbose=verbose) + return objective_value_MA, pw_prob_matrix - with torch.no_grad(): - objective_matrix = W_val * torch.triu(pw_probs, diagonal=1) - objective_value_IC = torch.sum(objective_matrix).item() - objective_value_MA = objective_value_IC - torch.sum(objective_matrix[objective_matrix < 0]).item() + def forward(self, edge_weights_uncompressed, N, use_sdp=True, return_triu=False, verbose=False): + W_val = edge_weights_uncompressed + if self.scale_input: + with torch.no_grad(): + scale_factor = torch.max(torch.abs(W_val)) if verbose: - logger.info(f'SDP objective: intra-cluster={objective_value_IC}, max-agree={objective_value_MA}') + logger.info(f"Scaling W_val by {scale_factor}") + W_val /= scale_factor - return objective_value_MA, pw_probs + solver = self.build_and_solve_sdp if use_sdp else self.get_sigmoid_matrix + self.objective_value, pw_prob_matrix = solver(W_val, N, verbose) - def forward(self, edge_weights_uncompressed, N, verbose=False): - objective_value, pw_probs = self.build_and_solve_sdp(edge_weights_uncompressed, N, verbose) - self.objective_value = objective_value - return pw_probs + if return_triu: + triu_indices = torch.triu_indices(len(pw_prob_matrix), len(pw_prob_matrix), offset=1) + return pw_prob_matrix[triu_indices[0], triu_indices[1]] + return pw_prob_matrix diff --git a/e2e_scripts/evaluate.py b/e2e_scripts/evaluate.py index d1ae7fd..67105ae 100644 --- a/e2e_scripts/evaluate.py +++ b/e2e_scripts/evaluate.py @@ -13,7 +13,7 @@ from e2e_pipeline.cc_inference import CCInference from e2e_pipeline.hac_inference import HACInference from e2e_pipeline.sdp_layer import CvxpyException -from e2e_scripts.train_utils import compute_b3_f1 +from e2e_scripts.train_utils import compute_b3_f1, save_to_wandb_run from IPython import embed @@ -22,10 +22,13 @@ level=logging.INFO) logger = logging.getLogger(__name__) -def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, val_dataloader=None, - tqdm_label='', device=None, verbose=False): + +def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, clustering_threshold=None, + val_dataloader=None, tqdm_label='', device=None, verbose=False, debug=False, _errors=None, 
+ run_dir='./'): """ - clustering_fn, val_dataloader: unused when pairwise_mode is False (only added to keep fn signature identical) + clustering_fn, clustering_threshold, val_dataloader: unused when pairwise_mode is False + (only added to keep fn signature identical) """ device = device if device is not None else torch.device("cuda" if torch.cuda.is_available() else "cpu") n_features = dataloader.dataset[0][0].shape[1] @@ -39,7 +42,7 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, val_da } max_pred_id = -1 n_exceptions = 0 - for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}', disable=(not verbose))): + for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}')): if overfit_batch_idx > -1: if idx < overfit_batch_idx: continue @@ -56,10 +59,24 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, val_da # Forward pass through the e2e model data = data.to(device) try: - _ = model(data, block_size) - except CvxpyException: - if tqdm_label is not 'dev': - raise CvxpyException() + _ = model(data, block_size, verbose=verbose) + except CvxpyException as e: + logger.info(e) + _error_obj = { + 'method': 'eval', + 'model_type': 'e2e', + 'data_split': tqdm_label, + 'model_call_args': { + 'data': data.detach().tolist(), + 'block_size': block_size + }, + 'cvxpy_layer_args': e.data + } + if _errors is not None: + _errors.append(_error_obj) + save_to_wandb_run({'errors': _errors}, 'errors.json', run_dir, logger) + if not debug: # if tqdm_label is not 'dev' and not debug: + raise CvxpyException(data=_error_obj) # If split is dev, skip batch and continue all_gold = all_gold[:-len(cluster_ids)] n_exceptions += 1 @@ -79,7 +96,7 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, val_da def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", return_pred_only=False, thresh_for_f1=0.5, clustering_fn=None, clustering_threshold=None, val_dataloader=None, - tqdm_label='', device=None, verbose=False): + tqdm_label='', device=None, verbose=False, debug=False, _errors=None, run_dir='./'): device = device if device is not None else torch.device("cuda" if torch.cuda.is_available() else "cpu") n_features = dataloader.dataset[0][0].shape[1] @@ -97,7 +114,7 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret } max_pred_id = -1 # In each iteration, add to all blockwise predicted IDs to distinguish from previous blocks n_exceptions = 0 - for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}', disable=(not verbose))): + for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}')): if overfit_batch_idx > -1: if idx < overfit_batch_idx: continue @@ -114,11 +131,25 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret # Forward pass through the e2e model data = data.to(device) try: - pred_cluster_ids = clustering_fn(model(data), block_size, min_id=(max_pred_id + 1), + pred_cluster_ids = clustering_fn(model(data, verbose=verbose), block_size, min_id=(max_pred_id + 1), threshold=clustering_threshold) - except CvxpyException: - if tqdm_label is not 'dev': - raise CvxpyException() + except CvxpyException as e: + logger.info(e) + _error_obj = { + 'method': 'eval', + 'model_type': 'pairwise_cc', + 'data_split': tqdm_label, + 'model_call_args': { + 'data': data.detach().tolist(), + 'block_size': block_size + }, + 'cvxpy_layer_args': e.data + } + if _errors is not None: + 
_errors.append(_error_obj) + save_to_wandb_run({'errors': _errors}, 'errors.json', run_dir, logger) + if not debug: # if tqdm_label is not 'dev' and not debug: + raise CvxpyException(data=_error_obj) # If split is dev, skip batch and continue all_gold = all_gold[:-len(cluster_ids)] n_exceptions += 1 @@ -136,7 +167,7 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret return (b3_f1, vmeasure, cc_obj_vals) if clustering_fn.__class__ is CCInference else (b3_f1, vmeasure) y_pred, targets = [], [] - for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}', disable=(not verbose))): + for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}')): if overfit_batch_idx > -1: if idx < overfit_batch_idx: continue @@ -148,7 +179,7 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret target = target.flatten().float() # Forward pass through the pairwise model data = data.to(device) - y_pred.append(torch.sigmoid(model(data)).cpu().numpy()) + y_pred.append(torch.sigmoid(model(data, verbose=verbose)).cpu().numpy()) targets.append(target) y_pred = np.hstack(y_pred) targets = np.hstack(targets) diff --git a/e2e_scripts/train.py b/e2e_scripts/train.py index 8ea2ba0..1148587 100644 --- a/e2e_scripts/train.py +++ b/e2e_scripts/train.py @@ -18,7 +18,7 @@ from e2e_pipeline.sdp_layer import CvxpyException from e2e_scripts.evaluate import evaluate, evaluate_pairwise from e2e_scripts.train_utils import DEFAULT_HYPERPARAMS, get_dataloaders, get_matrix_size_from_triu, \ - uncompress_target_tensor, count_parameters, log_cc_objective_values + uncompress_target_tensor, count_parameters, log_cc_objective_values, save_to_wandb_run, FrobeniusLoss from utils.parser import Parser from IPython import embed @@ -31,8 +31,8 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, group=None, save_model=False, load_model_from_wandb_run=None, load_model_from_fpath=None, - load_hyp_from_wandb_run=None, eval_only_split=None, skip_initial_eval=False, - pairwise_eval_clustering=None): + eval_only_split=None, skip_initial_eval=False, pairwise_eval_clustering=None, + debug=False, track_errors=True, local=False): init_args = { 'config': DEFAULT_HYPERPARAMS } @@ -45,6 +45,8 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g init_args.update({'tags': tags}) if group is not None: init_args.update({'group': group}) + if local: + init_args.update({'mode': 'disabled'}) # Start wandb run with wandb.init(**init_args) as run: @@ -53,9 +55,10 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g logger.info("Run hyperparameters:") logger.info(hyp) # Save hyperparameters as a json file and store in wandb run - with open(os.path.join(run.dir, 'hyperparameters.json'), 'w') as fh: - json.dump(dict(hyp), fh) - wandb.save('hyperparameters.json') + save_to_wandb_run(dict(hyp), 'hyperparameters.json', run.dir, logger) + + # Track errors + _errors = [] if track_errors else None # Seed everything if hyp['run_random_seed'] is not None: @@ -65,6 +68,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g pairwise_mode = hyp['pairwise_mode'] weighted_loss = hyp['weighted_loss'] + e2e_loss = hyp['e2e_loss'] batch_size = hyp['batch_size'] if pairwise_mode else 1 # Force clustering runs to operate on 1 block only n_epochs = hyp['n_epochs'] n_warmstart_epochs = hyp['n_warmstart_epochs'] @@ -82,6 +86,8 @@ def train(hyperparams={}, verbose=False, 
project=None, entity=None, tags=None, g negative_slope = hyp["negative_slope"] sdp_max_iters = hyp["sdp_max_iters"] sdp_eps = hyp["sdp_eps"] + sdp_scale = hyp["sdp_scale"] + grad_acc = hyp['batch_size'] if hyp["gradient_accumulation"] else 1 overfit_batch_idx = hyp['overfit_batch_idx'] clustering_metrics = {'b3_f1': 0, 'vmeasure': 1} pairwise_metrics = {'auroc': 0, 'f1': 1} @@ -101,10 +107,25 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g if not pairwise_mode: model = EntResModel(n_features, neumiss_depth, dropout_p, dropout_only_once, add_neumiss, neumiss_deq, hidden_dim, n_hidden_layers, add_batchnorm, activation, - negative_slope, hidden_config, sdp_max_iters, sdp_eps, - use_rounded_loss=hyp["use_rounded_loss"]) + negative_slope, hidden_config, sdp_max_iters, sdp_eps, sdp_scale, + use_rounded_loss=hyp["use_rounded_loss"], return_triu_on_train=(e2e_loss == "bce"), + use_sdp=hyp["use_sdp"]) # Define loss - loss_fn_e2e = lambda pred, gold: torch.norm(gold - pred) + if e2e_loss not in ["frob", "bce"]: + raise ValueError("Invalid value for e2e_loss") + loss_fn_e2e = FrobeniusLoss() if e2e_loss == 'frob' else torch.nn.BCELoss() + + pos_weight = None + if weighted_loss: + if overfit_batch_idx > -1: + n_pos = train_dataloader.dataset[overfit_batch_idx][1].sum() + pos_weight = (len(train_dataloader.dataset[overfit_batch_idx][1]) - n_pos) / n_pos + else: + _n_pos, _n_total = 0., 0. + for _i in range(len(train_dataloader.dataset)): + _n_pos += train_dataloader.dataset[_i][1].sum() + _n_total += len(train_dataloader.dataset[_i][1]) + pos_weight = (_n_total - _n_pos) / _n_pos # Define eval eval_fn = evaluate pairwise_clustering_fns = [None] # Unused when pairwise_mode is False @@ -118,11 +139,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g hyp["subsample_sz_dev"], True, hyp['batch_size']) # Define loss - pos_weight = None - if weighted_loss and overfit_batch_idx == -1: - n_pos = train_dataloader_pairwise.dataset[:][1].sum() - pos_weight = torch.tensor((len(train_dataloader_pairwise.dataset) - n_pos) / n_pos) - loss_fn_pairwise = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight) + loss_fn_pairwise = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight)) else: model = PairwiseModel(n_features, neumiss_depth, dropout_p, dropout_only_once, add_neumiss, neumiss_deq, hidden_dim, n_hidden_layers, add_batchnorm, activation, @@ -144,14 +161,14 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g pairwise_clustering_fns = [None] if pairwise_eval_clustering is not None: if pairwise_eval_clustering == 'cc': - pairwise_clustering_fns = [CCInference(sdp_max_iters, sdp_eps)] + pairwise_clustering_fns = [CCInference(sdp_max_iters, sdp_eps, sdp_scale)] pairwise_clustering_fns[0].eval() pairwise_clustering_fn_labels = ['cc'] elif pairwise_eval_clustering == 'hac': pairwise_clustering_fns = [HACInference()] pairwise_clustering_fn_labels = ['hac'] elif pairwise_eval_clustering == 'both': - cc_inference = CCInference(sdp_max_iters, sdp_eps) + cc_inference = CCInference(sdp_max_iters, sdp_eps, sdp_scale) pairwise_clustering_fns = [cc_inference, HACInference(), cc_inference] pairwise_clustering_fns[0].eval() pairwise_clustering_fn_labels = ['cc', 'hac', 'cc-fixed'] @@ -188,33 +205,47 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g 'dev': val_dataloader, 'test': test_dataloader } + start_time = time.time() with torch.no_grad(): model.eval() + + eval_dataloader = 
dataloaders[eval_only_split] + eval_scores = eval_fn(model, eval_dataloader, tqdm_label=eval_only_split, device=device, verbose=verbose, + debug=debug, _errors=_errors) + logger.info(f"Eval: {eval_only_split}_{list(eval_metric_to_idx)[0]}={eval_scores[0]}, " + + f"{eval_only_split}_{list(eval_metric_to_idx)[1]}={eval_scores[1]}") + # Log eval metrics + wandb.log({f'best_{eval_only_split}_{list(eval_metric_to_idx)[0]}': eval_scores[0], + f'best_{eval_only_split}_{list(eval_metric_to_idx)[1]}': eval_scores[1]}) + if len(eval_scores) == 3: + log_cc_objective_values(scores=eval_scores, split_name=eval_only_split, log_prefix='Eval', + verbose=verbose, logger=logger) + + # For pairwise-mode: if pairwise_clustering_fns[0] is not None: - assert eval_only_split == 'test' # Clustering in --eval_only_split implemented only for test set - eval_metric_to_idx = clustering_metrics - eval_dataloader = test_dataloader_e2e - else: - eval_dataloader = dataloaders[eval_only_split] - start_time = time.time() - clustering_threshold = None - for i, pairwise_clustering_fn in enumerate(pairwise_clustering_fns): - eval_scores = eval_fn(model, eval_dataloader, clustering_fn=pairwise_clustering_fn, - clustering_threshold=clustering_threshold, val_dataloader=val_dataloader_e2e, - tqdm_label=eval_only_split, device=device, verbose=verbose) - if pairwise_clustering_fn.__class__ is HACInference: - clustering_threshold = pairwise_clustering_fn.cut_threshold - if verbose: + clustering_threshold = None + for i, pairwise_clustering_fn in enumerate(pairwise_clustering_fns): + clustering_scores = eval_fn(model, test_dataloader_e2e, # Clustering only implemented for TEST + clustering_fn=pairwise_clustering_fn, + clustering_threshold=clustering_threshold, + val_dataloader=val_dataloader_e2e, + tqdm_label='test clustering', device=device, verbose=verbose, + debug=debug, _errors=_errors) + if pairwise_clustering_fn.__class__ is HACInference: + clustering_threshold = pairwise_clustering_fn.cut_threshold logger.info( - f"Eval: {eval_only_split}_{list(eval_metric_to_idx)[0]}_{pairwise_clustering_fn_labels[i]}={eval_scores[0]}, " + - f"{eval_only_split}_{list(eval_metric_to_idx)[1]}_{pairwise_clustering_fn_labels[i]}={eval_scores[1]}") - wandb.log({'epoch': 0, f'{eval_only_split}_{list(eval_metric_to_idx)[0]}_{pairwise_clustering_fn_labels[i]}': eval_scores[0], - f'{eval_only_split}_{list(eval_metric_to_idx)[1]}_{pairwise_clustering_fn_labels[i]}': eval_scores[1]}) - if len(eval_scores) == 3: - log_cc_objective_values(scores=eval_scores, - split_name=f'{eval_only_split}_{pairwise_clustering_fn_labels[i]}', - log_prefix='Eval', verbose=verbose, logger=logger) - end_time = time.time() + f"Eval: test_{list(clustering_metrics)[0]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[0]}, " + + f"test_{list(clustering_metrics)[1]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[1]}") + # Log eval metrics + wandb.log({f'best_test_{list(clustering_metrics)[0]}_{pairwise_clustering_fn_labels[i]}': + clustering_scores[0], + f'best_test_{list(clustering_metrics)[1]}_{pairwise_clustering_fn_labels[i]}': + clustering_scores[1]}) + if len(clustering_scores) == 3: + log_cc_objective_values(scores=clustering_scores, + split_name=f'best_test_{pairwise_clustering_fn_labels[i]}', + log_prefix='Eval', verbose=verbose, logger=logger) + end_time = time.time() else: # Training wandb.watch(model) @@ -243,20 +274,35 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g model.eval() if overfit_batch_idx > -1: train_scores 
= eval_fn(model, train_dataloader, overfit_batch_idx=overfit_batch_idx, - tqdm_label='train', device=device, verbose=verbose) - if verbose: - logger.info(f"Initial: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + - f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}") + tqdm_label='train', device=device, verbose=verbose, debug=debug, + _errors=_errors) + logger.info(f"Initial: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + + f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}") wandb.log({'epoch': 0, f'train_{list(eval_metric_to_idx)[0]}': train_scores[0], f'train_{list(eval_metric_to_idx)[1]}': train_scores[1]}) else: - dev_scores = eval_fn(model, val_dataloader, tqdm_label='dev', device=device, verbose=verbose) - if verbose: - logger.info(f"Initial: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + - f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}") + dev_scores = eval_fn(model, val_dataloader, tqdm_label='dev', device=device, verbose=verbose, + debug=debug, _errors=_errors) + logger.info(f"Initial: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + + f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}") wandb.log({'epoch': 0, f'dev_{list(eval_metric_to_idx)[0]}': dev_scores[0], f'dev_{list(eval_metric_to_idx)[1]}': dev_scores[1]}) + if not pairwise_mode and grad_acc > 1: + grad_acc_steps = [] + _seen_pw = 0 + _seen_blk = 0 + for d in train_dataloader.dataset: + _blk_sz = len(d[1]) + _seen_pw += _blk_sz + _seen_blk += 1 + if _seen_pw >= grad_acc: + grad_acc_steps.append(_seen_blk) + _seen_pw = 0 + _seen_blk = 0 + if _seen_blk > 0: + grad_acc_steps.append(_seen_blk) + model.train() start_time = time.time() # Tracks full training runtime for i in range(n_epochs): @@ -271,9 +317,13 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g wandb.log({'epoch': i + 1}) running_loss = [] n_exceptions = 0 + + grad_acc_count = 0 + grad_acc_idx = 0 + optimizer.zero_grad() + for (idx, batch) in enumerate(tqdm(_train_dataloader, - desc=f"{'Warm-starting' if warmstart_mode else 'Training'} {i + 1}", - disable=(not verbose))): + desc=f"{'Warm-starting' if warmstart_mode else 'Training'} {i + 1}")): if overfit_batch_idx > -1: if idx < overfit_batch_idx: continue @@ -299,45 +349,94 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g # Forward pass through the e2e or pairwise model data, target = data.to(device), target.to(device) - output = model(data, N=block_size, warmstart=warmstart_mode, verbose=verbose) + try: + output = model(data, N=block_size, warmstart=warmstart_mode, verbose=verbose) + except CvxpyException as e: + logger.info(e) + _error_obj = { + 'method': 'train_forward', + 'model_type': 'e2e' if not pairwise_mode else 'pairwise', + 'data_split': 'train', + 'model_call_args': { + 'data': data.detach().cpu(), + 'block_size': block_size + }, + 'cvxpy_layer_args': e.data + } + if _errors is not None: + _errors.append(_error_obj) + save_to_wandb_run({'errors': _errors}, 'errors.json', run.dir, logger) + if debug: + n_exceptions += 1 + logger.info( + f'Caught CvxpyException in forward call (count -> {n_exceptions}): skipping batch') + continue # Calculate the loss if not pairwise_mode and not warmstart_mode: - gold_output = uncompress_target_tensor(target, device=device) + grad_acc_denom = 1 if grad_acc == 1 else grad_acc_steps[grad_acc_idx] + if e2e_loss != "bce": + target = uncompress_target_tensor(target, device=device) if verbose: - logger.info(f"Gold:\n{gold_output}") - try: - loss = 
loss_fn(output.view_as(gold_output), gold_output) / (2 * block_size) - except CvxpyException: - n_exceptions += 1 - logger.info(f'Caught CvxpyException {n_exceptions}: skipping batch') - continue + logger.info(f"Gold:\n{target}") + if pos_weight is not None: + loss_weight = target * pos_weight + (1 - target) + loss_fn.weight = loss_weight + loss = loss_fn(output.view_as(target), target) / grad_acc_denom else: + # Pairwise or warmstart mode if verbose: logger.info(f"Gold:\n{target}") loss = loss_fn(output.view_as(target), target) - optimizer.zero_grad() - loss.backward() - optimizer.step() + try: + loss.backward() + if not pairwise_mode and grad_acc > 1: + grad_acc_count += len(data) + except Exception as e: + logger.info(e) + if isinstance(e, CvxpyException): + _error_obj = { + 'method': 'train_backward', + 'model_type': 'e2e' if not pairwise_mode else 'pairwise', + 'data_split': 'train', + 'model_call_args': { + 'data': data.detach().cpu(), + 'block_size': block_size + } + } + if _errors is not None: + _errors.append(_error_obj) + save_to_wandb_run({'errors': _errors}, 'errors.json', run.dir, logger) + if debug: + n_exceptions += 1 + logger.info( + f'Caught CvxpyException in backward call (count -> {n_exceptions}): skipping batch') + continue + if pairwise_mode or ( + idx == len(_train_dataloader.dataset) - 1) or grad_acc == 1 or grad_acc_count >= grad_acc: + optimizer.step() + optimizer.zero_grad() + if grad_acc > 1: + grad_acc_count = 0 + grad_acc_idx += 1 if verbose: logger.info(f"Loss = {loss.item()}") running_loss.append(loss.item()) wandb.log({f'train_loss{"_warmstart" if warmstart_mode else ""}': np.mean(running_loss)}) - if verbose: - logger.info(f"Epoch loss = {np.mean(running_loss)}") + logger.info(f"Epoch loss = {np.mean(running_loss)}") # Get model performance on dev (or 'train' for overfitting runs) with torch.no_grad(): model.eval() if overfit_batch_idx > -1: train_scores = eval_fn(model, train_dataloader, overfit_batch_idx=overfit_batch_idx, - tqdm_label='train', device=device, verbose=verbose) - if verbose: - logger.info(f"Epoch {i + 1}: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + - f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}") + tqdm_label='train', device=device, verbose=verbose, debug=debug, + _errors=_errors) + logger.info(f"Epoch {i + 1}: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + + f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}") wandb.log({f'train_{list(eval_metric_to_idx)[0]}': train_scores[0], f'train_{list(eval_metric_to_idx)[1]}': train_scores[1]}) if use_lr_scheduler: @@ -346,17 +445,16 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g elif hyp['lr_scheduler'] == 'step': scheduler.step() else: - dev_scores = eval_fn(model, val_dataloader, tqdm_label='dev', device=device, verbose=verbose) - if verbose: - logger.info(f"Epoch {i + 1}: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + - f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}") + dev_scores = eval_fn(model, val_dataloader, tqdm_label='dev', device=device, verbose=verbose, + debug=debug, _errors=_errors) + logger.info(f"Epoch {i + 1}: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + + f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}") wandb.log({f'dev_{list(eval_metric_to_idx)[0]}': dev_scores[0], f'dev_{list(eval_metric_to_idx)[1]}': dev_scores[1], f'train_epoch_loss': np.mean(running_loss)}) dev_opt_score = dev_scores[eval_metric_to_idx[dev_opt_metric]] if dev_opt_score > best_dev_score: - if verbose: - 
logger.info(f"New best dev {dev_opt_metric} score @ epoch{i+1}: {dev_opt_score}") + logger.info(f"New best dev {dev_opt_metric} score @ epoch{i+1}: {dev_opt_score}") best_epoch = i best_dev_score = dev_opt_score best_dev_scores = dev_scores @@ -367,18 +465,23 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g elif hyp['lr_scheduler'] == 'step': scheduler.step() model.train() - end_time = time.time() + # Save model + if save_model: + torch.save(best_dev_state_dict, os.path.join(run.dir, 'model_state_dict_best.pt')) + wandb.save('model_state_dict_best.pt') + logger.info(f"Saved best model on dev to {os.path.join(run.dir, 'model_state_dict_best.pt')}") + + # Evaluate the best dev model on test if overfit_batch_idx == -1: - # Evaluate best dev model on test model.load_state_dict(best_dev_state_dict) with torch.no_grad(): model.eval() - test_scores = eval_fn(model, test_dataloader, tqdm_label='test', device=device, verbose=verbose) - if verbose: - logger.info(f"Final: test_{list(eval_metric_to_idx)[0]}={test_scores[0]}, " + - f"test_{list(eval_metric_to_idx)[1]}={test_scores[1]}") + test_scores = eval_fn(model, test_dataloader, tqdm_label='test', device=device, verbose=verbose, + debug=debug, _errors=_errors) + logger.info(f"Final: test_{list(eval_metric_to_idx)[0]}={test_scores[0]}, " + + f"test_{list(eval_metric_to_idx)[1]}={test_scores[1]}") # Log final metrics wandb.log({'best_dev_epoch': best_epoch + 1, f'best_dev_{list(eval_metric_to_idx)[0]}': best_dev_scores[0], @@ -387,7 +490,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g f'best_test_{list(eval_metric_to_idx)[1]}': test_scores[1]}) if len(test_scores) == 3: log_cc_objective_values(scores=test_scores, split_name='best_test', log_prefix='Final', - verbose=verbose, logger=logger) + verbose=True, logger=logger) # For pairwise-mode: if pairwise_clustering_fns[0] is not None: clustering_threshold = None @@ -396,30 +499,27 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g clustering_fn=pairwise_clustering_fn, clustering_threshold=clustering_threshold, val_dataloader=val_dataloader_e2e, - tqdm_label='test clustering', device=device, verbose=verbose) + tqdm_label='test clustering', device=device, verbose=verbose, + debug=debug, _errors=_errors) if pairwise_clustering_fn.__class__ is HACInference: clustering_threshold = pairwise_clustering_fn.cut_threshold - if verbose: - logger.info(f"Final: test_{list(clustering_metrics)[0]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[0]}, " + - f"test_{list(clustering_metrics)[1]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[1]}") + logger.info(f"Final: test_{list(clustering_metrics)[0]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[0]}, " + + f"test_{list(clustering_metrics)[1]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[1]}") # Log final metrics wandb.log({f'best_test_{list(clustering_metrics)[0]}_{pairwise_clustering_fn_labels[i]}': clustering_scores[0], f'best_test_{list(clustering_metrics)[1]}_{pairwise_clustering_fn_labels[i]}': clustering_scores[1]}) if len(clustering_scores) == 3: log_cc_objective_values(scores=clustering_scores, split_name=f'best_test_{pairwise_clustering_fn_labels[i]}', - log_prefix='Final', verbose=verbose, logger=logger) + log_prefix='Final', verbose=True, logger=logger) run.summary["z_model_parameters"] = count_parameters(model) run.summary["z_run_time"] = round(end_time - start_time) run.summary["z_run_dir_path"] = run.dir - # Save 
models - if save_model: - torch.save(best_dev_state_dict, os.path.join(run.dir, 'model_state_dict_best.pt')) - wandb.save('model_state_dict_best.pt') - logger.info(f"Saved best model on dev to {os.path.join(run.dir, 'model_state_dict_best.pt')}") + if _errors is not None: + save_to_wandb_run({'errors': _errors}, 'errors.json', run.dir, logger) logger.info(f"Run directory: {run.dir}") logger.info("End of train() call") @@ -496,7 +596,10 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g verbose=not args['silent'], tags=args['wandb_tags'], save_model=args['save_model'], - skip_initial_eval=args['skip_initial_eval']), + skip_initial_eval=args['skip_initial_eval'], + debug=args['debug'], + track_errors=not args['no_error_tracking'], + local=args['local']), count=args['wandb_max_runs']) logger.info("End of sweep") @@ -524,8 +627,10 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g save_model=args['save_model'], load_model_from_wandb_run=args['load_model_from_wandb_run'], load_model_from_fpath=args['load_model_from_fpath'], - load_hyp_from_wandb_run=args['load_hyp_from_wandb_run'], eval_only_split=args['eval_only_split'], skip_initial_eval=args['skip_initial_eval'], - pairwise_eval_clustering=args['pairwise_eval_clustering']) + pairwise_eval_clustering=args['pairwise_eval_clustering'], + debug=args['debug'], + track_errors=not args['no_error_tracking'], + local=args['local']) logger.info("End of run") diff --git a/e2e_scripts/train_utils.py b/e2e_scripts/train_utils.py index 0cf34b7..efeea6e 100644 --- a/e2e_scripts/train_utils.py +++ b/e2e_scripts/train_utils.py @@ -1,16 +1,18 @@ """ Helper functions and constants for e2e_scripts/train.py """ - +import os +import json from collections import defaultdict from typing import Dict -from typing import Tuple +from typing import Tuple, Optional import math import pickle from torch.utils.data import DataLoader from s2and.consts import PREPROCESSED_DATA_DIR from s2and.data import S2BlocksDataset from s2and.eval import b3_precision_recall_fscore +from torch import Tensor import torch import numpy as np import wandb @@ -23,8 +25,8 @@ # Dataset "dataset": "pubmed", "dataset_random_seed": 1, - "subsample_sz_train": -1, - "subsample_sz_dev": -1, + "subsample_sz_train": 80, + "subsample_sz_dev": 100, # Run config "run_random_seed": 17, "pairwise_mode": False, @@ -45,23 +47,27 @@ "activation": "leaky_relu", "negative_slope": 0.01, "use_rounded_loss": True, + "use_sdp": True, + "e2e_loss": "frob", # e2e only: "frob", "bce" # Solver config "sdp_max_iters": 50000, - "sdp_eps": 1e-1, + "sdp_eps": 1e-3, + "sdp_scale": True, # Training config - "batch_size": 10000, # For pairwise_mode only - "lr": 1e-4, + "batch_size": 10000, # pairwise only; used by e2e if gradient_accumulation is true + "lr": 4e-3, "n_epochs": 5, "n_warmstart_epochs": 0, - "weighted_loss": True, # For pairwise_mode only; TODO: Think about implementing for e2e + "weighted_loss": False, "use_lr_scheduler": True, - "lr_scheduler": "plateau", # "step" + "lr_scheduler": "plateau", # "plateau", "step" "lr_factor": 0.4, "lr_min": 1e-6, "lr_scheduler_patience": 2, "lr_step_size": 2, "lr_gamma": 0.4, "weight_decay": 0.01, + "gradient_accumulation": False, # e2e only; accumulate over pairwise examples "dev_opt_metric": 'b3_f1', # e2e: {'b3_f1', 'vmeasure'}; pairwise: {'auroc', 'f1'} "overfit_batch_idx": -1 } @@ -133,6 +139,7 @@ def compute_b3_f1(true_cluster_ids, pred_cluster_ids): pred_cluster_dict[pred_cluster_ids[i]].append(i) return 
b3_precision_recall_fscore(true_cluster_dict, pred_cluster_dict) + def log_cc_objective_values(scores, split_name, log_prefix, verbose, logger, plot=False): frac, round = np.array(scores[2]['sdp']), np.array(scores[2]['round']) # Objective across blocks @@ -151,3 +158,27 @@ def log_cc_objective_values(scores, split_name, log_prefix, verbose, logger, plo f'{split_name}_obj_ratio': mean_approx_ratio}) # TODO: Implement plotting the approx. ratio v/s block sizes + + +def save_to_wandb_run(file, fname, fpath, logger): + with open(os.path.join(fpath, fname), 'w') as fh: + json.dump(file, fh) + wandb.save(fname) + logger.info(f"Saved {fname} to {os.path.join(fpath, fname)}") + + +class FrobeniusLoss: + def __init__(self, weight: Optional[Tensor] = None, reduction: str = 'original') -> None: + self.weight = weight + self.reduction = reduction + + def __call__(self, input: Tensor, target: Tensor) -> Tensor: + n = len(target) + normalization = 1. + if self.reduction == 'mean': + normalization = n * (n - 1) + elif self.reduction == 'original': # TODO: Probably want to not use this + normalization = 2 * n + if self.weight is None: + return torch.norm((target - input)) / normalization + return torch.norm(self.weight * (target - input)) / normalization diff --git a/run_sweep.sh b/run_sweep.sh index 6c8776f..b61c509 100644 --- a/run_sweep.sh +++ b/run_sweep.sh @@ -1,21 +1,23 @@ #!/bin/bash -e dataset=${1:-"pubmed"} -n_seeds=${2:-5} -model=${3:-"e2e"} # Used as prefix and to pick up the right sweep file -gpu_name=${4:-"gypsum-1080ti"} +n_seed_start=${2:-1} +n_seed_end=${3:-5} +model=${4:-"e2e"} # Used as prefix and to pick up the right sweep file +gpu_name=${5:-"gypsum-1080ti"} +sweep_prefix=${6:-""} -for ((i = 1; i <= ${n_seeds}; i++)); do +for ((i = ${n_seed_start}; i <= ${n_seed_end}; i++)); do JOB_DESC=${model}_${dataset}_sweep${i} && JOB_NAME=${JOB_DESC}_$(date +%s) && \ sbatch -J ${JOB_NAME} -e jobs/${JOB_NAME}.err -o jobs/${JOB_NAME}.log \ - --partition=${gpu_name} --gres=gpu:1 --mem=80G --time=12:00:00 \ + --partition=${gpu_name} --gres=gpu:1 --mem=100G --time=12:00:00 \ run_sbatch.sh e2e_scripts/train.py \ --dataset="${dataset}" \ --dataset_random_seed=${i} \ --pairwise_eval_clustering="both" \ --skip_initial_eval \ --silent \ - --wandb_sweep_name="${model}_${dataset}_${i}" \ + --wandb_sweep_name="${sweep_prefix}_${model}_${dataset}_${i}" \ --wandb_sweep_params="wandb_configs/sweeps/${model}.json" \ --wandb_tags="${model},${dataset},seed_${i}" echo " Logs: jobs/${JOB_NAME}.err" diff --git a/utils/parser.py b/utils/parser.py index e4211e6..3291481 100644 --- a/utils/parser.py +++ b/utils/parser.py @@ -134,3 +134,15 @@ def add_training_args(self): help="(only in --pairwise_mode) Whether to run clustering during --eval_only_split and final test eval. 
" + "Accepts 'cc' for correlation clustering, 'hac' for agglomerative clustering, and 'both' to run both.", ) + parser.add_argument( + "--debug", action="store_true", + help="Enable debugging mode, where train-eval flows do not quit on known errors in order to allow tracking", + ) + parser.add_argument( + "--no_error_tracking", action="store_true", + help="Disable error logging for SDP forward and backward passes", + ) + parser.add_argument( + "--local", action="store_true", + help="Run script with wandb disabled", + ) diff --git a/wandb_configs/sweeps/e2e-nosdp-warm.json b/wandb_configs/sweeps/e2e-nosdp-warm.json new file mode 100644 index 0000000..3846f0d --- /dev/null +++ b/wandb_configs/sweeps/e2e-nosdp-warm.json @@ -0,0 +1,16 @@ +{ + "n_epochs": {"value": 10}, + "lr": {"max": 2e-1, "min": 1e-5}, + "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, + "dev_opt_metric": {"value": "b3_f1"}, + "neumiss_depth": {"values": [10, 20]}, + "hidden_dim": {"values": [512, 1024]}, + "n_hidden_layers": {"values": [1, 2]}, + "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, + "lr_scheduler": {"value": "plateau"}, + "subsample_sz_train": {"value": 80}, + "subsample_sz_dev": {"value": 100}, + "activation": {"values": ["leaky_relu", "relu"]}, + "use_sdp": {"value": false}, + "n_warmstart_epochs": {"value": 2} +} diff --git a/wandb_configs/sweeps/e2e-nosdp.json b/wandb_configs/sweeps/e2e-nosdp.json new file mode 100644 index 0000000..4e02afe --- /dev/null +++ b/wandb_configs/sweeps/e2e-nosdp.json @@ -0,0 +1,15 @@ +{ + "n_epochs": {"value": 10}, + "lr": {"max": 2e-1, "min": 1e-5}, + "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, + "dev_opt_metric": {"value": "b3_f1"}, + "neumiss_depth": {"values": [10, 20]}, + "hidden_dim": {"values": [512, 1024]}, + "n_hidden_layers": {"values": [1, 2]}, + "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, + "lr_scheduler": {"value": "plateau"}, + "subsample_sz_train": {"value": 80}, + "subsample_sz_dev": {"value": 100}, + "activation": {"values": ["leaky_relu", "relu"]}, + "use_sdp": {"value": false} +} diff --git a/wandb_configs/sweeps/e2e.json b/wandb_configs/sweeps/e2e.json index d3db142..20991ba 100644 --- a/wandb_configs/sweeps/e2e.json +++ b/wandb_configs/sweeps/e2e.json @@ -7,7 +7,7 @@ "hidden_dim": {"values": [512, 1024]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, - "lr_scheduler": {"values": ["plateau", "step"]}, + "lr_scheduler": {"value": "plateau"}, "subsample_sz_train": {"value": 80}, "subsample_sz_dev": {"value": 100}, "activation": {"values": ["leaky_relu", "relu"]} diff --git a/wandb_configs/sweeps/frac-nosdp-warm.json b/wandb_configs/sweeps/frac-nosdp-warm.json new file mode 100644 index 0000000..75503ce --- /dev/null +++ b/wandb_configs/sweeps/frac-nosdp-warm.json @@ -0,0 +1,17 @@ +{ + "n_epochs": {"value": 10}, + "lr": {"max": 2e-1, "min": 1e-5}, + "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, + "dev_opt_metric": {"value": "b3_f1"}, + "neumiss_depth": {"values": [10, 20]}, + "hidden_dim": {"values": [512, 1024]}, + "n_hidden_layers": {"values": [1, 2]}, + "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, + "lr_scheduler": {"value": "plateau"}, + "subsample_sz_train": {"value": 80}, + "subsample_sz_dev": {"value": 100}, + "activation": {"values": ["leaky_relu", "relu"]}, + "use_rounded_loss": {"value": false}, + "use_sdp": {"value": false}, + "n_warmstart_epochs": {"value": 2} +} diff --git a/wandb_configs/sweeps/frac-nosdp.json b/wandb_configs/sweeps/frac-nosdp.json new file 
mode 100644 index 0000000..f27ee08 --- /dev/null +++ b/wandb_configs/sweeps/frac-nosdp.json @@ -0,0 +1,16 @@ +{ + "n_epochs": {"value": 10}, + "lr": {"max": 2e-1, "min": 1e-5}, + "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, + "dev_opt_metric": {"value": "b3_f1"}, + "neumiss_depth": {"values": [10, 20]}, + "hidden_dim": {"values": [512, 1024]}, + "n_hidden_layers": {"values": [1, 2]}, + "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, + "lr_scheduler": {"value": "plateau"}, + "subsample_sz_train": {"value": 80}, + "subsample_sz_dev": {"value": 100}, + "activation": {"values": ["leaky_relu", "relu"]}, + "use_rounded_loss": {"value": false}, + "use_sdp": {"value": false} +} diff --git a/wandb_configs/sweeps/frac.json b/wandb_configs/sweeps/frac.json index d91acd6..7eb6812 100644 --- a/wandb_configs/sweeps/frac.json +++ b/wandb_configs/sweeps/frac.json @@ -7,7 +7,7 @@ "hidden_dim": {"values": [512, 1024]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, - "lr_scheduler": {"values": ["plateau", "step"]}, + "lr_scheduler": {"value": "plateau"}, "subsample_sz_train": {"value": 80}, "subsample_sz_dev": {"value": 100}, "activation": {"values": ["leaky_relu", "relu"]}, diff --git a/wandb_configs/sweeps/mlp.json b/wandb_configs/sweeps/mlp.json index 3df8746..a5f49fc 100644 --- a/wandb_configs/sweeps/mlp.json +++ b/wandb_configs/sweeps/mlp.json @@ -8,6 +8,7 @@ "hidden_dim": {"values": [512, 1024]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, - "lr_scheduler": {"values": ["plateau", "step"]}, - "activation": {"values": ["leaky_relu", "relu"]} + "lr_scheduler": {"value": "plateau"}, + "activation": {"values": ["leaky_relu", "relu"]}, + "weighted_loss": {"value": true} } From f1a19aa0cd2a78f2c6c4a7dd6846cd2c32d42d24 Mon Sep 17 00:00:00 2001 From: Sriharsha-hatwar Date: Wed, 15 Mar 2023 19:31:14 +0000 Subject: [PATCH 03/17] point feature creation implementation --- e2e_scripts/preprocess_s2and_pointwise.py | 94 ++++++++++++++++++ s2and/data.py | 10 +- s2and/featurizer.py | 111 ++++++++++++++++++++++ 3 files changed, 214 insertions(+), 1 deletion(-) create mode 100644 e2e_scripts/preprocess_s2and_pointwise.py diff --git a/e2e_scripts/preprocess_s2and_pointwise.py b/e2e_scripts/preprocess_s2and_pointwise.py new file mode 100644 index 0000000..6aac36e --- /dev/null +++ b/e2e_scripts/preprocess_s2and_pointwise.py @@ -0,0 +1,94 @@ +""" +Run from command line: + python e2e_scripts/preprocess_s2and_data.py --data_home_dir="./data" --dataset_name="pubmed" +""" +import sys + +from typing import Union, Dict +from typing import Tuple + +from s2and.consts import PREPROCESSED_DATA_DIR +from s2and.featurizer import FeaturizationInfo, store_featurized_pickles, many_pairs_featurize, pointwise_featurize +from os.path import join +from s2and.data import ANDData +import pickle +import numpy as np +from scipy.sparse import csr_matrix, coo_matrix +from utils.parser import Parser + +from s2and.data import ANDData +import logging +from s2and.featurizer import FeaturizationInfo, featurize + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) +logger = logging.getLogger(__name__) + +def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed): + """ + Fetch pointwise feature for dataset and store in a pickle. 
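+    The pickle holds a dict whose 'mention_level_features' key maps to a sparse
+    CSR matrix of shape (num_signatures, num_features), as built below.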
+    """
+    processed_data = {}
+    parent_dir = f"{data_home_dir}/{dataset_name}"
+    """
+    AND_dataset = ANDData(
+        signatures=join(parent_dir, f"{dataset_name}_signatures.json"),
+        papers=join(parent_dir, f"{dataset_name}_papers.json"),
+        mode="inference",
+        clusters=join(parent_dir, f"{dataset_name}_clusters.json"),
+        block_type="s2",
+        train_pairs_size=100000,
+        val_pairs_size=10000,
+        test_pairs_size=10000,
+        name=dataset_name,
+        n_jobs=16,
+        random_seed=random_seed,
+    )
+    """
+    #print("This is for pickling dataset....")
+    #with open(f'preprocess_dataset_{dataset_name}.pkl', 'wb') as f:
+    #    pickle.dump(AND_dataset, f)
+
+    print("getting pickled dataset...")
+    with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f:
+        AND_dataset = pickle.load(f)
+    print("Loaded pickle dataset...")
+
+
+
+    point_features_row, point_features_col, point_features_data, num_feats, num_points = pointwise_featurize(AND_dataset,
+                                                                                                             n_jobs=16,
+                                                                                                             use_cache=False,
+                                                                                                             random_seed=random_seed)
+    logger.info('converting feature indices to csr_matrix')
+    point_features = coo_matrix(
+        (point_features_data, (point_features_row, point_features_col)),
+        shape=(num_points, num_feats)
+    ).tocsr()
+
+    print("Matrix creation done.")
+    processed_data['mention_level_features'] = point_features
+
+    logger.info('Dumping processed data')
+
+    with open(f'{dataset_name}_feature_processed.pkl', 'wb') as f:
+        pickle.dump(processed_data, f)
+
+if __name__ == '__main__':
+    # Creates the pickles that store the preprocessed data
+    # Read cmd line args
+
+    parser = Parser(add_preprocessing_args=True)
+    parser.add_preprocessing_args()
+
+    args = parser.parse_args()
+    print(args)
+
+    params = args.__dict__
+    data_home_dir = params["data_home_dir"]
+    dataset = params["dataset_name"]
+
+    seed = 1211  # Dummy value; not needed and can be removed entirely.
+    print("Preprocessing started for seed value", seed)
+    save_pickled_pointwise_features(data_home_dir, dataset, seed)
+    print("Matrix")
diff --git a/s2and/data.py b/s2and/data.py
index 9d75eb1..ca0f7f1 100644
--- a/s2and/data.py
+++ b/s2and/data.py
@@ -495,7 +495,15 @@ def __init__(
             self.preprocess_signatures(name_counts_loaded)
             logger.info("preprocessed signatures")
-
+    def force_signature_to_cluster_mapping(self):
+        if self.clusters is not None:
+            self.signature_to_cluster_id = {}
+            logger.info("making signature to cluster id")
+            for cluster_id, cluster_info in self.clusters.items():
+                for signature in cluster_info["signature_ids"]:
+                    self.signature_to_cluster_id[signature] = cluster_id
+            logger.info("made signature to cluster id")
+
     def get_signature_objects(self, signature_ids: Dict[str, List[str]]) -> Dict[str, List[Signature]]:
         """
         Returns a dict of blockId with a list of it's Signature objects, useful for qualitative analysis
diff --git a/s2and/featurizer.py b/s2and/featurizer.py
index f306980..743d7e9 100644
--- a/s2and/featurizer.py
+++ b/s2and/featurizer.py
@@ -8,6 +8,9 @@ import functools
 import logging
 from collections import Counter
+from collections.abc import Iterable
+
+from sklearn import preprocessing
 
 from tqdm import tqdm
 
@@ -825,6 +828,114 @@ def featurize(
     logger.info("featurized test")
     return train_features, val_features, test_features
 
+
+def pointwise_featurize(
+    dataset: ANDData,
+    n_jobs: int = 1,
+    use_cache: bool = False,
+    chunk_size: int = DEFAULT_CHUNK_SIZE,
+    nan_value: float = np.nan,
+    delete_training_data: bool = False,
+    random_seed: int = 1,
+):
+    """
+    Featurizes the input dataset into per-signature (pointwise) sparse feature indices.
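+    Each signature contributes one row and each distinct (feature, value) pair one
+    column of the sparse matrix that the caller assembles from the returned indices.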
+ + Parameters + ---------- + dataset: ANDData + the dataset containing the relevant data + n_jobs: int + the number of cpus to use + use_cache: bool + whether or not to use write to/read from the features cache + chunk_size: int + the chunk size for multiprocessing + nan_value: float + the value to replace nans with + delete_training_data: bool + Whether to delete some suspicious training examples + + Returns + ------- + Returns three items: + 1. Row indices of the sparse matrix containing the data + 2. Column indices of the sparse matrix containing the data + 3. The data to be filled in the given row and column combination. + """ + # Do you think OrderedSet and OrderedDict should be used here? + signature_feature_set = set() + signature_dict = {} + + # We dont need to iterate signature per block as we need to create for all the signatures irrespective of the block. + + for signature_key, values in dataset.signatures.items(): + per_signature_features = dataset.signatures[signature_key]._asdict() + signature_dict[signature_key] = [] + for feature_key, value in per_signature_features.items(): + index_key = None + if (value is None + or (isinstance(value, Iterable) and len(value) == 0)): + continue + try: + if np.isnan(value): + print('\n!!!! Found a NaN !!!!\n') + exit() + continue + except: + pass + + # Let us check the type of value for each signatures. + + if isinstance(value, str) or isinstance(value, int): + index_key = (feature_key, value) + signature_feature_set.add(str(index_key)) + signature_dict[signature_key].append(index_key) + elif isinstance(value, Counter): + for val in value.keys(): + index_key = (feature_key, val) + signature_feature_set.add(str(index_key)) + signature_dict[signature_key].append(index_key) + elif isinstance(value, Iterable): + for val in value: + index_key = (feature_key, val) + signature_feature_set.add(str(index_key)) + signature_dict[signature_key].append(index_key) + else: + print('\n!!!! Found another type !!!!\n') + embed() + exit() + logger.info('Label encoding the values') + # Label encoding code --- + + """ + { + "signature_id_one" : [(feat_key_1, val_1), (feat_key_2, val_2) ...], + "signature_id_two" : [(feat_key_1, val_1), (feat_key_3, val_3) ...]
+ + } + """ + le_signature_feature_set = preprocessing.LabelEncoder() + le_signature_feature_set.fit(list(signature_feature_set)) + + le_signature_dict = preprocessing.LabelEncoder() + le_signature_dict.fit(list(signature_dict.keys())) + + point_features_row, point_features_col, point_features_data = [], [], [] + num_points = len(signature_dict.keys()) + num_feats = len(signature_feature_set) + for key, values in signature_dict.items(): + encoded_key_val = le_signature_dict.transform([key])[0] + val_strings = [str(val) for val in values] + encoded_values_val = le_signature_feature_set.transform(val_strings) + for val in encoded_values_val : + point_features_row.append(encoded_key_val) + point_features_col.append(val) + point_features_data.append(1) + + return point_features_row, point_features_col, point_features_data, num_feats, num_points + + def store_featurized_pickles( dataset: ANDData, featurizer_info: FeaturizationInfo, From f1efefb1b292f34433980be81e34907e1077cada Mon Sep 17 00:00:00 2001 From: Sriharsha-hatwar Date: Wed, 15 Mar 2023 19:37:15 +0000 Subject: [PATCH 04/17] minor changes , added comments --- e2e_scripts/preprocess_s2and_pointwise.py | 14 +++++++------- s2and/featurizer.py | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/e2e_scripts/preprocess_s2and_pointwise.py b/e2e_scripts/preprocess_s2and_pointwise.py index 6aac36e..9035372 100644 --- a/e2e_scripts/preprocess_s2and_pointwise.py +++ b/e2e_scripts/preprocess_s2and_pointwise.py @@ -1,6 +1,6 @@ """ Run from command line: - python e2e_scripts/preprocess_s2and_data.py --data_home_dir="./data" --dataset_name="pubmed" + python e2e_scripts/preprocess_s2and_pointwise.py --data_home_dir="./data" --dataset_name="pubmed" """ import sys @@ -30,7 +30,7 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed): """ processed_data = {} parent_dir = f"{data_home_dir}/{dataset_name}" - """ + AND_dataset = ANDData( signatures=join(parent_dir, f"{dataset_name}_signatures.json"), papers=join(parent_dir, f"{dataset_name}_papers.json"), @@ -44,15 +44,15 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed): n_jobs=16, random_seed=random_seed, ) - """ + #print("This is for pickling dataset....") #with open(f'preprocess_dataset_{dataset_name}.pkl', 'wb') as f: # pickle.dump(AND_dataset, f) - print("getting pickled dataset...") - with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f: - AND_dataset = pickle.load(f) - print("Loaded pickle dataset...") + #print("getting pickled dataset...") + #with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f: + # AND_dataset = pickle.load(f) + #print("Loaded pickle dataset...") diff --git a/s2and/featurizer.py b/s2and/featurizer.py index 743d7e9..bf41ed8 100644 --- a/s2and/featurizer.py +++ b/s2and/featurizer.py @@ -864,7 +864,7 @@ def pointwise_featurize( 3. The data to be filled in the given row and column combination. """ # Do you think OrderedSet and OrderedDict should be used here? - signature_feature_set = set() + signature_feature_set = set() # The feature is stored a str and not tuple to facilitate label encoding. signature_dict = {} # We dont need to iterate signature per block as we need to create for all the signatures irrespective of the block. @@ -889,7 +889,7 @@ def pointwise_featurize( if isinstance(value, str) or isinstance(value, int): index_key = (feature_key, value) - signature_feature_set.add(str(index_key)) + signature_feature_set.add(str(index_key)) # Converting to str from tuple. 
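The featurizer above leans on sklearn's LabelEncoder to turn each stringified (feature_key, value) pair into a stable integer column id; classes are sorted before encoding, so the mapping is deterministic. A small sketch with made-up feature strings:

    from sklearn import preprocessing

    feats = ["('venue', 'ACL')", "('year', 1999)", "('year', 2001)"]
    le = preprocessing.LabelEncoder()
    le.fit(feats)
    # Classes sort lexicographically, so the 'venue' string maps to id 0 here
    assert list(le.transform(["('venue', 'ACL')", "('year', 2001)"])) == [0, 2]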
signature_dict[signature_key].append(index_key) elif isinstance(value, Counter): for val in value.keys(): From 548f151c4e059280881a4d7c620cc23ded9ec2da Mon Sep 17 00:00:00 2001 From: Dhruv Agarwal Date: Fri, 17 Mar 2023 02:09:13 -0400 Subject: [PATCH 05/17] Multiprocessing dev, block size stats, eval_all method (#40) --- e2e_pipeline/cc_inference.py | 7 +- e2e_pipeline/hac_inference.py | 6 +- e2e_pipeline/sdp_layer.py | 14 +- e2e_scripts/evaluate.py | 17 +- e2e_scripts/get_block_sizes.py | 116 +++++++++ e2e_scripts/preprocess_s2and_data.py | 4 +- e2e_scripts/train.py | 340 ++++++++++++++++++++------- e2e_scripts/train_utils.py | 25 +- run_sweep.sh | 2 +- utils/parser.py | 8 + 10 files changed, 433 insertions(+), 106 deletions(-) create mode 100644 e2e_scripts/get_block_sizes.py diff --git a/e2e_pipeline/cc_inference.py b/e2e_pipeline/cc_inference.py index e486e0c..75700c2 100644 --- a/e2e_pipeline/cc_inference.py +++ b/e2e_pipeline/cc_inference.py @@ -17,11 +17,12 @@ class CCInference(torch.nn.Module): Correlation clustering inference-only model. Expects edge weights and the number of nodes as input. """ - def __init__(self, sdp_max_iters, sdp_eps): + def __init__(self, sdp_max_iters, sdp_eps, sdp_scale, use_sdp): super().__init__() self.uncompress_layer = UncompressTransformLayer() - self.sdp_layer = SDPLayer(max_iters=sdp_max_iters, eps=sdp_eps) + self.sdp_layer = SDPLayer(max_iters=sdp_max_iters, eps=sdp_eps, scale_input=sdp_scale) self.hac_cut_layer = HACCutLayer() + self.use_sdp = use_sdp def forward(self, edge_weights, N, min_id=0, threshold=None, verbose=False): edge_weights = torch.squeeze(edge_weights) @@ -29,7 +30,7 @@ def forward(self, edge_weights, N, min_id=0, threshold=None, verbose=False): # threshold is used to convert a similarity score (in [0,1]) into edge weights (in R, i.e. 
+ and -) edge_weights = torch.sigmoid(edge_weights) - threshold edge_weights_uncompressed = self.uncompress_layer(edge_weights, N) - output_probs = self.sdp_layer(edge_weights_uncompressed, N) + output_probs = self.sdp_layer(edge_weights_uncompressed, N, use_sdp=self.use_sdp) pred_clustering = self.hac_cut_layer(output_probs, edge_weights_uncompressed) if verbose: diff --git a/e2e_pipeline/hac_inference.py b/e2e_pipeline/hac_inference.py index cf20ad5..30e1b9d 100644 --- a/e2e_pipeline/hac_inference.py +++ b/e2e_pipeline/hac_inference.py @@ -33,7 +33,6 @@ def tune_threshold(self, model, dataloader, device, n_trials=1000): all_gold = [] blockwise_trees = [] all_dists = [] - max_pred_id = -1 # In each iteration, add to all blockwise predicted IDs to distinguish from previous blocks n_features = dataloader.dataset[0][0].shape[1] for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Tuning threshold on dev')): data, _, cluster_ids = batch @@ -46,7 +45,8 @@ def tune_threshold(self, model, dataloader, device, n_trials=1000): # Forward pass through the e2e model data = data.to(device) - tree_and_alts, dists = self.cluster(model(data), block_size, return_tree=True) + edge_weights = model(data, N=len(cluster_ids), warmstart=True) + tree_and_alts, dists = self.cluster(edge_weights, block_size, return_tree=True) blockwise_trees.append(tree_and_alts) all_dists.append(dists) @@ -61,7 +61,7 @@ def tune_threshold(self, model, dataloader, device, n_trials=1000): best_dev_metric = -1 for _thresh in tqdm(thresholds, desc="Finding best cut threshold"): all_pred = [] - max_pred_id = -1 + max_pred_id = -1 # In each iter, add to all blockwise predicted IDs to distinguish from previous blocks for (_hac, _hac_alts) in blockwise_trees: _cut_labels = self.cut_tree(_hac, _hac_alts, _thresh) pred_cluster_ids = _cut_labels + (max_pred_id + 1) diff --git a/e2e_pipeline/sdp_layer.py b/e2e_pipeline/sdp_layer.py index 5a57def..bdc10c0 100644 --- a/e2e_pipeline/sdp_layer.py +++ b/e2e_pipeline/sdp_layer.py @@ -41,26 +41,26 @@ def build_and_solve_sdp(self, W_val, N, verbose=False): Returns a symmetric NxN matrix of fractional, decision values with a 1-diagonal """ # Initialize the cvxpy layer - self.X = cp.Variable((N, N), PSD=True) - self.W = cp.Parameter((N, N)) + X = cp.Variable((N, N), PSD=True) + W = cp.Parameter((N, N)) # build out constraint set constraints = [ - cp.diag(self.X) == np.ones((N,)), - self.X[:N, :] >= 0, + cp.diag(X) == np.ones((N,)), + X[:N, :] >= 0, ] # create problem - self.prob = cp.Problem(cp.Maximize(cp.trace(self.W @ self.X)), constraints) + prob = cp.Problem(cp.Maximize(cp.trace(W @ X)), constraints) # Note: maximizing the trace is equivalent to maximizing the sum_E (w_uv * X_uv) objective # because W is upper-triangular and X is symmetric # Build the SDP cvxpylayer - self.cvxpy_layer = CvxpyLayer(self.prob, parameters=[self.W], variables=[self.X]) + cvxpy_layer = CvxpyLayer(prob, parameters=[W], variables=[X]) # Forward pass through the SDP cvxpylayer try: - pw_prob_matrix = self.cvxpy_layer(W_val, solver_args={ + pw_prob_matrix = cvxpy_layer(W_val, solver_args={ "solve_method": "SCS", "verbose": verbose, "max_iters": self.max_iters, diff --git a/e2e_scripts/evaluate.py b/e2e_scripts/evaluate.py index 67105ae..b7ba424 100644 --- a/e2e_scripts/evaluate.py +++ b/e2e_scripts/evaluate.py @@ -4,6 +4,7 @@ import logging from tqdm import tqdm +from time import time from sklearn.metrics.cluster import v_measure_score from sklearn.metrics import roc_curve, auc from sklearn.metrics import 
precision_recall_fscore_support @@ -25,7 +26,7 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, clustering_threshold=None, val_dataloader=None, tqdm_label='', device=None, verbose=False, debug=False, _errors=None, - run_dir='./'): + run_dir='./', tqdm_position=None): """ clustering_fn, clustering_threshold, val_dataloader: unused when pairwise_mode is False (only added to keep fn signature identical) @@ -42,7 +43,7 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, cluste } max_pred_id = -1 n_exceptions = 0 - for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}')): + for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}', position=tqdm_position)): if overfit_batch_idx > -1: if idx < overfit_batch_idx: continue @@ -63,6 +64,7 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, cluste except CvxpyException as e: logger.info(e) _error_obj = { + 'id': f'e_{int(time())}', 'method': 'eval', 'model_type': 'e2e', 'data_split': tqdm_label, @@ -96,7 +98,8 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, cluste def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", return_pred_only=False, thresh_for_f1=0.5, clustering_fn=None, clustering_threshold=None, val_dataloader=None, - tqdm_label='', device=None, verbose=False, debug=False, _errors=None, run_dir='./'): + tqdm_label='', device=None, verbose=False, debug=False, _errors=None, run_dir='./', + tqdm_position=None): device = device if device is not None else torch.device("cuda" if torch.cuda.is_available() else "cpu") n_features = dataloader.dataset[0][0].shape[1] @@ -114,7 +117,7 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret } max_pred_id = -1 # In each iteration, add to all blockwise predicted IDs to distinguish from previous blocks n_exceptions = 0 - for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}')): + for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}', position=tqdm_position)): if overfit_batch_idx > -1: if idx < overfit_batch_idx: continue @@ -131,11 +134,13 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret # Forward pass through the e2e model data = data.to(device) try: - pred_cluster_ids = clustering_fn(model(data, verbose=verbose), block_size, min_id=(max_pred_id + 1), + edge_weights = model(data, N=block_size, warmstart=True, verbose=verbose) + pred_cluster_ids = clustering_fn(edge_weights, block_size, min_id=(max_pred_id + 1), threshold=clustering_threshold) except CvxpyException as e: logger.info(e) _error_obj = { + 'id': f'e_{int(time())}', 'method': 'eval', 'model_type': 'pairwise_cc', 'data_split': tqdm_label, @@ -167,7 +172,7 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret return (b3_f1, vmeasure, cc_obj_vals) if clustering_fn.__class__ is CCInference else (b3_f1, vmeasure) y_pred, targets = [], [] - for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}')): + for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}', position=tqdm_position)): if overfit_batch_idx > -1: if idx < overfit_batch_idx: continue diff --git a/e2e_scripts/get_block_sizes.py b/e2e_scripts/get_block_sizes.py new file mode 100644 index 0000000..89dfd87 --- /dev/null +++ b/e2e_scripts/get_block_sizes.py @@ -0,0 +1,116 @@ +import argparse +import glob +import json +import logging 
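The tqdm_position argument threaded through the evaluation loops above pins each progress bar to its own console row, so concurrent training and eval bars do not overwrite each other. A minimal standalone illustration of the pattern (not the repo's code):

    from time import sleep
    from tqdm import tqdm

    for epoch in tqdm(range(2), desc="Training", position=1):
        # The inner bar stays on row 0 and is cleared after each pass
        for _ in tqdm(range(5), desc=f"Eval dev {epoch}", position=0, leave=False):
            sleep(0.01)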
+import os +import numpy as np +import pickle +from time import time +from tqdm import tqdm + +from IPython import embed + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) +logger = logging.getLogger(__name__) + + +class NpEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, np.integer): + return int(obj) + if isinstance(obj, np.floating): + return float(obj) + if isinstance(obj, np.ndarray): + return obj.tolist() + return super(NpEncoder, self).default(obj) + + +class Parser(argparse.ArgumentParser): + def __init__(self): + super().__init__() + self.add_argument( + "--src", type=str + ) + self.add_argument( + "--unique", action="store_true", + ) + self.add_argument( + "--silent", action="store_true", + ) + self.add_argument( + "--interactive", action="store_true", + ) + + +if __name__ == '__main__': + parser = Parser() + args = parser.parse_args() + logger.info("Script arguments:") + logger.info(args.__dict__) + + root = "data/preprocessed_data" + save_fpath = f'./data_block_sizes{"_" + int(time()) if args.unique else ""}.json' + ignore = ['pubmed_OLD'] + n_seeds = 5 + splits = ['train', 'val', 'test'] + + result = {} + + for dataset_path in tqdm(glob.glob(os.path.join(root, "*")), disable=args.silent): + dataset = dataset_path.split('/')[-1] + if dataset in ignore: + continue + result[dataset] = {} + _seen_blk_across = set() + for seed in range(1, n_seeds+1): + result[dataset][seed] = {} + _seen_blk = set() + _full_bkl_sizes = [] + for split in splits: + _blk_szs = [] + fpath = os.path.join(dataset_path, f'seed{seed}', f'{split}_features.pkl') + with open(fpath, 'rb') as fh: + block_dict = pickle.load(fh) + for k in block_dict.keys(): + assert k not in _seen_blk + _seen_blk.add(k) + _, _, cluster_ids = block_dict[k] + _blk_szs.append(len(cluster_ids)) + result[dataset][seed][split] = { + 'n_blocks': len(_blk_szs), + 'min': np.min(_blk_szs), + 'max': np.max(_blk_szs), + 'mean': np.mean(_blk_szs), + 'median': np.median(_blk_szs) + } + _full_bkl_sizes += _blk_szs + result[dataset][seed]['full'] = { + 'n_blocks': len(_full_bkl_sizes), + 'min': np.min(_full_bkl_sizes), + 'max': np.max(_full_bkl_sizes), + 'mean': np.mean(_full_bkl_sizes), + 'median': np.median(_full_bkl_sizes) + } + _seen_blk_across = _seen_blk_across.union(_seen_blk) + result[dataset]['mean_across_seeds'] = { + 'n_blocks': np.mean([result[dataset][seed]['full']['n_blocks'] for seed in range(1, n_seeds + 1)]), + 'min': np.mean([result[dataset][seed]['full']['min'] for seed in range(1, n_seeds + 1)]), + 'max': np.mean([result[dataset][seed]['full']['max'] for seed in range(1, n_seeds + 1)]), + 'mean': np.mean([result[dataset][seed]['full']['mean'] for seed in range(1, n_seeds + 1)]), + 'median': np.mean([result[dataset][seed]['full']['median'] for seed in range(1, n_seeds + 1)]) + } + result[dataset]['n_blocks'] = len(_seen_blk_across) + + logger.info(f'Dataset: {dataset}') + logger.info(f' Blocks covered: {result[dataset]["n_blocks"]}') + logger.info(f' Across seed stats (mean):') + for k, v in result[dataset]['mean_across_seeds'].items(): + logger.info(f' {k}: {v}') + + with open(save_fpath, 'w') as fh: + json.dump(result, fh, cls=NpEncoder) + logger.info(f'Saved results to {save_fpath}') + + if args.interactive: + embed() diff --git a/e2e_scripts/preprocess_s2and_data.py b/e2e_scripts/preprocess_s2and_data.py index 1322586..cb46521 100644 --- a/e2e_scripts/preprocess_s2and_data.py +++ 
b/e2e_scripts/preprocess_s2and_data.py @@ -38,10 +38,10 @@ def save_blockwise_featurized_data(dataset_name, random_seed): n_jobs=16, random_seed=random_seed, ) - + logger.info("Loaded ANDData object") # Load the featurizer, which calculates pairwise similarity scores featurization_info = FeaturizationInfo() - # the cache will make it faster to train multiple times - it stores the features on disk for you + logger.info("Loaded featurization info") train_pkl, val_pkl, test_pkl = store_featurized_pickles(AND_dataset, featurization_info, n_jobs=16, diff --git a/e2e_scripts/train.py b/e2e_scripts/train.py index 1148587..23b36d8 100644 --- a/e2e_scripts/train.py +++ b/e2e_scripts/train.py @@ -1,5 +1,7 @@ +import glob import json import os +import sys import time import logging import random @@ -18,9 +20,17 @@ from e2e_pipeline.sdp_layer import CvxpyException from e2e_scripts.evaluate import evaluate, evaluate_pairwise from e2e_scripts.train_utils import DEFAULT_HYPERPARAMS, get_dataloaders, get_matrix_size_from_triu, \ - uncompress_target_tensor, count_parameters, log_cc_objective_values, save_to_wandb_run, FrobeniusLoss + uncompress_target_tensor, count_parameters, log_cc_objective_values, save_to_wandb_run, FrobeniusLoss, \ + copy_and_load_model from utils.parser import Parser +from torch.multiprocessing import Process, set_start_method, Manager + +try: + set_start_method('spawn', force=True) +except RuntimeError: + pass + from IPython import embed @@ -29,10 +39,105 @@ logger = logging.getLogger(__name__) +def _check_process(_proc, _return_dict, logger, run, overfit_batch_idx, use_lr_scheduler, hyp, + scheduler, eval_metric_to_idx, dev_opt_metric, i, best_epoch, best_dev_score, + best_dev_scores, best_dev_state_dict, sync=False): + if _proc is not None: + if _return_dict['_state'] == 'done' or (sync and _return_dict['_state'] == 'start'): + _proc.join() + _return_dict['_state'] = 'finish' + if _return_dict['_method'] == 'init_eval': + logger.info(_return_dict['local']) + run.log(_return_dict['wandb']) + elif _return_dict['_method'] == 'dev_eval': + logger.info(_return_dict['local']) + run.log(_return_dict['wandb']) + if overfit_batch_idx > -1: + if use_lr_scheduler: + if hyp['lr_scheduler'] == 'plateau': + scheduler.step(_return_dict['train_scores'][eval_metric_to_idx[dev_opt_metric]]) + elif hyp['lr_scheduler'] == 'step': + scheduler.step() + else: + dev_scores = _return_dict['dev_scores'] + dev_opt_score = dev_scores[eval_metric_to_idx[dev_opt_metric]] + if dev_opt_score > best_dev_score: + logger.info(f"New best dev {dev_opt_metric} score @ epoch{i+1}: {dev_opt_score}") + best_epoch = i + best_dev_score = dev_opt_score + best_dev_scores = dev_scores + best_dev_state_dict = torch.load(_return_dict['state_dict_path'], device) + if use_lr_scheduler: + if hyp['lr_scheduler'] == 'plateau': + scheduler.step(dev_scores[eval_metric_to_idx[dev_opt_metric]]) + elif hyp['lr_scheduler'] == 'step': + scheduler.step() + return best_epoch, best_dev_score, best_dev_scores, best_dev_state_dict + + +def init_eval(model_class, model_args, state_dict_path, overfit_batch_idx, eval_fn, train_dataloader, device, verbose, + debug, _errors, eval_metric_to_idx, val_dataloader, return_dict): + return_dict['_state'] = 'start' + return_dict['_method'] = 'init_eval' + model = model_class(*model_args) + model.load_state_dict(torch.load(state_dict_path)) + model.to(device) + with torch.no_grad(): + model.eval() + if overfit_batch_idx > -1: + train_scores = eval_fn(model, train_dataloader, 
overfit_batch_idx=overfit_batch_idx, + tqdm_label='train', device=device, verbose=verbose, debug=debug, + _errors=_errors, tqdm_position=0) + return_dict['local'] = f"Initial: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + \ + f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}" + return_dict['wandb'] = {'epoch': 0, f'train_{list(eval_metric_to_idx)[0]}': train_scores[0], + f'train_{list(eval_metric_to_idx)[1]}': train_scores[1]} + else: + dev_scores = eval_fn(model, val_dataloader, tqdm_label='dev 0', device=device, verbose=verbose, + debug=debug, _errors=_errors, tqdm_position=0) + return_dict['local'] = f"Initial: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + \ + f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}" + return_dict['wandb'] = {'epoch': 0, f'dev_{list(eval_metric_to_idx)[0]}': dev_scores[0], + f'dev_{list(eval_metric_to_idx)[1]}': dev_scores[1]} + del model + return_dict['_state'] = 'done' + + +def dev_eval(model_class, model_args, state_dict_path, overfit_batch_idx, eval_fn, train_dataloader, device, verbose, + debug, _errors, eval_metric_to_idx, val_dataloader, return_dict, i, run_dir): + return_dict['_state'] = 'start' + return_dict['_method'] = 'dev_eval' + return_dict['state_dict_path'] = state_dict_path + model = model_class(*model_args) + model.load_state_dict(torch.load(state_dict_path)) + model.to(device) + with torch.no_grad(): + model.eval() + if overfit_batch_idx > -1: + train_scores = eval_fn(model, train_dataloader, overfit_batch_idx=overfit_batch_idx, + tqdm_label='train', device=device, verbose=verbose, debug=debug, + _errors=_errors) + return_dict['local'] = f"Epoch {i + 1}: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + \ + f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}" + return_dict['wandb'] = {f'train_{list(eval_metric_to_idx)[0]}': train_scores[0], + f'train_{list(eval_metric_to_idx)[1]}': train_scores[1]} + return_dict['train_scores'] = train_scores + else: + dev_scores = eval_fn(model, val_dataloader, tqdm_label=f'dev {i+1}', device=device, verbose=verbose, + debug=debug, _errors=_errors) + return_dict['local'] = f"Epoch {i + 1}: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + \ + f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}" + return_dict['wandb'] = {f'dev_{list(eval_metric_to_idx)[0]}': dev_scores[0], + f'dev_{list(eval_metric_to_idx)[1]}': dev_scores[1]} + return_dict['dev_scores'] = dev_scores + del model + return_dict['_state'] = 'done' + + def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, group=None, save_model=False, load_model_from_wandb_run=None, load_model_from_fpath=None, - eval_only_split=None, skip_initial_eval=False, pairwise_eval_clustering=None, - debug=False, track_errors=True, local=False): + eval_only_split=None, eval_all=False, skip_initial_eval=False, pairwise_eval_clustering=None, + debug=False, track_errors=True, local=False, sync_dev=False): init_args = { 'config': DEFAULT_HYPERPARAMS } @@ -48,6 +153,11 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g if local: init_args.update({'mode': 'disabled'}) + # Parallel process for validation runs + _proc = None + _return_dict = Manager().dict() + _return_dict['_state'] = 'initial' + # Start wandb run with wandb.init(**init_args) as run: wandb.config.update(hyperparams, allow_val_change=True) @@ -55,7 +165,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g logger.info("Run hyperparameters:") logger.info(hyp) # Save 
hyperparameters as a json file and store in wandb run - save_to_wandb_run(dict(hyp), 'hyperparameters.json', run.dir, logger) + save_to_wandb_run(dict(hyp), 'hyperparameters.json', run.dir, logger, error_logger=False) # Track errors _errors = [] if track_errors else None @@ -68,6 +178,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g pairwise_mode = hyp['pairwise_mode'] weighted_loss = hyp['weighted_loss'] + use_rounded_loss = hyp["use_rounded_loss"] e2e_loss = hyp['e2e_loss'] batch_size = hyp['batch_size'] if pairwise_mode else 1 # Force clustering runs to operate on 1 block only n_epochs = hyp['n_epochs'] @@ -84,6 +195,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g neumiss_depth = hyp["neumiss_depth"] add_neumiss = not hyp['convert_nan'] negative_slope = hyp["negative_slope"] + use_sdp = hyp["use_sdp"] sdp_max_iters = hyp["sdp_max_iters"] sdp_eps = hyp["sdp_eps"] sdp_scale = hyp["sdp_scale"] @@ -105,11 +217,11 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g # Create model with hyperparams if not pairwise_mode: - model = EntResModel(n_features, neumiss_depth, dropout_p, dropout_only_once, add_neumiss, - neumiss_deq, hidden_dim, n_hidden_layers, add_batchnorm, activation, - negative_slope, hidden_config, sdp_max_iters, sdp_eps, sdp_scale, - use_rounded_loss=hyp["use_rounded_loss"], return_triu_on_train=(e2e_loss == "bce"), - use_sdp=hyp["use_sdp"]) + model_args = (n_features, neumiss_depth, dropout_p, dropout_only_once, add_neumiss, + neumiss_deq, hidden_dim, n_hidden_layers, add_batchnorm, activation, + negative_slope, hidden_config, sdp_max_iters, sdp_eps, sdp_scale, + use_rounded_loss, (e2e_loss == "bce"), use_sdp) + model = EntResModel(*model_args) # Define loss if e2e_loss not in ["frob", "bce"]: raise ValueError("Invalid value for e2e_loss") @@ -141,9 +253,10 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g # Define loss loss_fn_pairwise = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight)) else: - model = PairwiseModel(n_features, neumiss_depth, dropout_p, dropout_only_once, add_neumiss, - neumiss_deq, hidden_dim, n_hidden_layers, add_batchnorm, activation, - negative_slope, hidden_config) + model_args = (n_features, neumiss_depth, dropout_p, dropout_only_once, add_neumiss, + neumiss_deq, hidden_dim, n_hidden_layers, add_batchnorm, activation, + negative_slope, hidden_config) + model = PairwiseModel(*model_args) # Define loss pos_weight = None if weighted_loss: @@ -161,14 +274,14 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g pairwise_clustering_fns = [None] if pairwise_eval_clustering is not None: if pairwise_eval_clustering == 'cc': - pairwise_clustering_fns = [CCInference(sdp_max_iters, sdp_eps, sdp_scale)] + pairwise_clustering_fns = [CCInference(sdp_max_iters, sdp_eps, sdp_scale, use_sdp)] pairwise_clustering_fns[0].eval() pairwise_clustering_fn_labels = ['cc'] elif pairwise_eval_clustering == 'hac': pairwise_clustering_fns = [HACInference()] pairwise_clustering_fn_labels = ['hac'] elif pairwise_eval_clustering == 'both': - cc_inference = CCInference(sdp_max_iters, sdp_eps, sdp_scale) + cc_inference = CCInference(sdp_max_iters, sdp_eps, sdp_scale, use_sdp) pairwise_clustering_fns = [cc_inference, HACInference(), cc_inference] pairwise_clustering_fns[0].eval() pairwise_clustering_fn_labels = ['cc', 'hac', 'cc-fixed'] @@ -181,7 +294,7 @@ def train(hyperparams={}, verbose=False, 
project=None, entity=None, tags=None, g hyp["normalize_data"], hyp["subsample_sz_train"], hyp["subsample_sz_dev"], - False, 1) + pairwise_mode=False, batch_size=1) logger.info(f"Model loaded: {model}", ) # Load stored model, if available @@ -198,8 +311,55 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g logger.info(f'Loaded stored model.') model.to(device) - if eval_only_split is not None: - # Run inference and exit + if eval_all: + # Run all inference variants on the test set and exit + cc_inference_sdp = CCInference(sdp_max_iters, sdp_eps, sdp_scale, use_sdp=True) + cc_inference_nosdp = CCInference(sdp_max_iters, sdp_eps, sdp_scale, use_sdp=False) + inference_fns = [HACInference(), + cc_inference_sdp, cc_inference_sdp, + cc_inference_nosdp, cc_inference_nosdp] + inference_fn_labels = ['hac', + 'cc', 'cc-fixed', + 'cc-nosdp', 'cc-nosdp-fixed'] + cc_inference_sdp.eval() + cc_inference_nosdp.eval() + _, val_dataloader_e2e, test_dataloader_e2e = get_dataloaders(hyp["dataset"], + hyp["dataset_random_seed"], + hyp["convert_nan"], + hyp["nan_value"], + hyp["normalize_data"], + hyp["subsample_sz_train"], + hyp["subsample_sz_dev"], + pairwise_mode=False, batch_size=1) + start_time = time.time() + with torch.no_grad(): + model.eval() + clustering_threshold = None + for i, inference_fn in enumerate(inference_fns): + logger.info(f'Inference method: {inference_fn_labels[i]}') + clustering_scores = evaluate_pairwise(model, test_dataloader_e2e, + clustering_fn=inference_fn, + clustering_threshold=clustering_threshold if i % 2 == 0 else None, + val_dataloader=val_dataloader_e2e, + tqdm_label='test clustering', device=device, verbose=verbose, + debug=debug, _errors=_errors) + if inference_fn.__class__ is HACInference: + clustering_threshold = inference_fn.cut_threshold + logger.info( + f"Eval: test_{list(clustering_metrics)[0]}_{inference_fn_labels[i]}={clustering_scores[0]}, " + + f"test_{list(clustering_metrics)[1]}_{inference_fn_labels[i]}={clustering_scores[1]}") + # Log eval metrics + wandb.log({f'best_test_{list(clustering_metrics)[0]}_{inference_fn_labels[i]}': + clustering_scores[0], + f'best_test_{list(clustering_metrics)[1]}_{inference_fn_labels[i]}': + clustering_scores[1]}) + if len(clustering_scores) == 3: + log_cc_objective_values(scores=clustering_scores, + split_name=f'best_test_{inference_fn_labels[i]}', + log_prefix='Eval', verbose=verbose, logger=logger) + end_time = time.time() + elif eval_only_split is not None: + # Run inference on the specified split and exit dataloaders = { 'train': train_dataloader, 'dev': val_dataloader, @@ -247,7 +407,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g log_prefix='Eval', verbose=verbose, logger=logger) end_time = time.time() else: - # Training + # Train and evaluate wandb.watch(model) optimizer = torch.optim.AdamW(model.parameters(), lr=hyp['lr']) @@ -263,31 +423,22 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=hyp['lr_step_size'], gamma=hyp['lr_gamma'], verbose=True) - best_dev_state_dict = None + best_dev_state_dict = copy.deepcopy(model.state_dict()) best_dev_score = -1 # Stores the score of only the specified optimization metric - best_dev_scores = None # Contains scores of all metrics + best_dev_scores = () # Contains scores of all metrics best_epoch = 0 if not skip_initial_eval: # Get initial model performance on dev (or 'train' for overfitting runs) - with 
torch.no_grad(): - model.eval() - if overfit_batch_idx > -1: - train_scores = eval_fn(model, train_dataloader, overfit_batch_idx=overfit_batch_idx, - tqdm_label='train', device=device, verbose=verbose, debug=debug, - _errors=_errors) - logger.info(f"Initial: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + - f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}") - wandb.log({'epoch': 0, f'train_{list(eval_metric_to_idx)[0]}': train_scores[0], - f'train_{list(eval_metric_to_idx)[1]}': train_scores[1]}) - else: - dev_scores = eval_fn(model, val_dataloader, tqdm_label='dev', device=device, verbose=verbose, - debug=debug, _errors=_errors) - logger.info(f"Initial: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + - f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}") - wandb.log({'epoch': 0, f'dev_{list(eval_metric_to_idx)[0]}': dev_scores[0], - f'dev_{list(eval_metric_to_idx)[1]}': dev_scores[1]}) - + _state_dict_path = copy_and_load_model(model, run.dir, device, store_only=True) + _proc = Process(target=init_eval, + kwargs=dict(model_class=model.__class__, model_args=model_args, + state_dict_path=_state_dict_path, + overfit_batch_idx=overfit_batch_idx, eval_fn=eval_fn, + train_dataloader=train_dataloader, device=device, verbose=verbose, + debug=debug, _errors=_errors, eval_metric_to_idx=eval_metric_to_idx, + val_dataloader=val_dataloader, return_dict=_return_dict)) + _proc.start() if not pairwise_mode and grad_acc > 1: grad_acc_steps = [] _seen_pw = 0 @@ -323,7 +474,21 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g optimizer.zero_grad() for (idx, batch) in enumerate(tqdm(_train_dataloader, - desc=f"{'Warm-starting' if warmstart_mode else 'Training'} {i + 1}")): + desc=f"{'Warm-starting' if warmstart_mode else 'Training'} {i + 1}", + position=1)): + best_epoch, best_dev_score, best_dev_scores, best_dev_state_dict = _check_process(_proc, + _return_dict, + logger, run, + overfit_batch_idx, + use_lr_scheduler, + hyp, scheduler, + eval_metric_to_idx, + dev_opt_metric, + i - 1, best_epoch, + best_dev_score, + best_dev_scores, + best_dev_state_dict, + sync=sync_dev) if overfit_batch_idx > -1: if idx < overfit_batch_idx: continue @@ -354,6 +519,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g except CvxpyException as e: logger.info(e) _error_obj = { + 'id': f'tf_{int(time.time())}', 'method': 'train_forward', 'model_type': 'e2e' if not pairwise_mode else 'pairwise', 'data_split': 'train', @@ -397,6 +563,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g logger.info(e) if isinstance(e, CvxpyException): _error_obj = { + 'id': f'tb_{int(time.time())}', 'method': 'train_backward', 'model_type': 'e2e' if not pairwise_mode else 'pairwise', 'data_split': 'train', @@ -426,47 +593,47 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g running_loss.append(loss.item()) wandb.log({f'train_loss{"_warmstart" if warmstart_mode else ""}': np.mean(running_loss)}) + # Sync to get previous epoch's dev eval + best_epoch, best_dev_score, best_dev_scores, best_dev_state_dict = _check_process(_proc, _return_dict, + logger, run, + overfit_batch_idx, + use_lr_scheduler, + hyp, scheduler, + eval_metric_to_idx, + dev_opt_metric, i - 1, + best_epoch, + best_dev_score, + best_dev_scores, + best_dev_state_dict, + sync=True) + logger.info(f"Epoch loss = {np.mean(running_loss)}") + wandb.log({f'train_epoch_loss': np.mean(running_loss)}) # Get model performance on dev (or 
'train' for overfitting runs) - with torch.no_grad(): - model.eval() - if overfit_batch_idx > -1: - train_scores = eval_fn(model, train_dataloader, overfit_batch_idx=overfit_batch_idx, - tqdm_label='train', device=device, verbose=verbose, debug=debug, - _errors=_errors) - logger.info(f"Epoch {i + 1}: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + - f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}") - wandb.log({f'train_{list(eval_metric_to_idx)[0]}': train_scores[0], - f'train_{list(eval_metric_to_idx)[1]}': train_scores[1]}) - if use_lr_scheduler: - if hyp['lr_scheduler'] == 'plateau': - scheduler.step(train_scores[eval_metric_to_idx[dev_opt_metric]]) - elif hyp['lr_scheduler'] == 'step': - scheduler.step() - else: - dev_scores = eval_fn(model, val_dataloader, tqdm_label='dev', device=device, verbose=verbose, - debug=debug, _errors=_errors) - logger.info(f"Epoch {i + 1}: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + - f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}") - wandb.log({f'dev_{list(eval_metric_to_idx)[0]}': dev_scores[0], - f'dev_{list(eval_metric_to_idx)[1]}': dev_scores[1], - f'train_epoch_loss': np.mean(running_loss)}) - dev_opt_score = dev_scores[eval_metric_to_idx[dev_opt_metric]] - if dev_opt_score > best_dev_score: - logger.info(f"New best dev {dev_opt_metric} score @ epoch{i+1}: {dev_opt_score}") - best_epoch = i - best_dev_score = dev_opt_score - best_dev_scores = dev_scores - best_dev_state_dict = copy.deepcopy(model.state_dict()) - if use_lr_scheduler: - if hyp['lr_scheduler'] == 'plateau': - scheduler.step(dev_scores[eval_metric_to_idx[dev_opt_metric]]) - elif hyp['lr_scheduler'] == 'step': - scheduler.step() - model.train() + _state_dict_path = copy_and_load_model(model, run.dir, device, store_only=True) + _proc = Process(target=dev_eval, + kwargs=dict(model_class=model.__class__, model_args=model_args, + state_dict_path=_state_dict_path, overfit_batch_idx=overfit_batch_idx, + eval_fn=eval_fn, train_dataloader=train_dataloader, device=device, + verbose=verbose, debug=debug, _errors=_errors, + eval_metric_to_idx=eval_metric_to_idx, val_dataloader=val_dataloader, + return_dict=_return_dict, i=i, run_dir=run.dir)) + _proc.start() end_time = time.time() + best_epoch, best_dev_score, best_dev_scores, best_dev_state_dict = _check_process(_proc, _return_dict, + logger, run, + overfit_batch_idx, + use_lr_scheduler, + hyp, scheduler, + eval_metric_to_idx, + dev_opt_metric, i, + best_epoch, + best_dev_score, + best_dev_scores, + best_dev_state_dict, + sync=True) # Save model if save_model: torch.save(best_dev_state_dict, os.path.join(run.dir, 'model_state_dict_best.pt')) @@ -479,7 +646,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g with torch.no_grad(): model.eval() test_scores = eval_fn(model, test_dataloader, tqdm_label='test', device=device, verbose=verbose, - debug=debug, _errors=_errors) + debug=debug, _errors=_errors, tqdm_position=2) logger.info(f"Final: test_{list(eval_metric_to_idx)[0]}={test_scores[0]}, " + f"test_{list(eval_metric_to_idx)[1]}={test_scores[1]}") # Log final metrics @@ -500,7 +667,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g clustering_threshold=clustering_threshold, val_dataloader=val_dataloader_e2e, tqdm_label='test clustering', device=device, verbose=verbose, - debug=debug, _errors=_errors) + debug=debug, _errors=_errors, tqdm_position=2) if pairwise_clustering_fn.__class__ is HACInference: clustering_threshold = 
pairwise_clustering_fn.cut_threshold logger.info(f"Final: test_{list(clustering_metrics)[0]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[0]}, " + @@ -513,14 +680,17 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g split_name=f'best_test_{pairwise_clustering_fn_labels[i]}', log_prefix='Final', verbose=True, logger=logger) - run.summary["z_model_parameters"] = count_parameters(model) run.summary["z_run_time"] = round(end_time - start_time) run.summary["z_run_dir_path"] = run.dir if _errors is not None: - save_to_wandb_run({'errors': _errors}, 'errors.json', run.dir, logger) - + _all_errors = save_to_wandb_run({'errors': _errors}, 'errors.json', run.dir, logger) + if len(_all_errors['errors']) > 0: + logger.warning(f'Errors were encountered during the run. LOGS: {os.path.join(run.dir, "errors.json")}') + # Cleanup + for filename in glob.glob(os.path.join(run.dir, "_temp_state_dict*")): + os.remove(filename) logger.info(f"Run directory: {run.dir}") logger.info("End of train() call") @@ -579,7 +749,8 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g sweep_config.update({ 'early_terminate': { 'type': 'hyperband', - 'min_iter': 5 + 'min_iter': 4, + 'eta': 2 } }) @@ -599,7 +770,8 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g skip_initial_eval=args['skip_initial_eval'], debug=args['debug'], track_errors=not args['no_error_tracking'], - local=args['local']), + local=args['local'], + sync_dev=args['sync_dev']), count=args['wandb_max_runs']) logger.info("End of sweep") @@ -628,9 +800,11 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g load_model_from_wandb_run=args['load_model_from_wandb_run'], load_model_from_fpath=args['load_model_from_fpath'], eval_only_split=args['eval_only_split'], + eval_all=args['eval_all'], skip_initial_eval=args['skip_initial_eval'], pairwise_eval_clustering=args['pairwise_eval_clustering'], debug=args['debug'], track_errors=not args['no_error_tracking'], - local=args['local']) + local=args['local'], + sync_dev=args['sync_dev']) logger.info("End of run") diff --git a/e2e_scripts/train_utils.py b/e2e_scripts/train_utils.py index efeea6e..c86c498 100644 --- a/e2e_scripts/train_utils.py +++ b/e2e_scripts/train_utils.py @@ -1,6 +1,7 @@ """ Helper functions and constants for e2e_scripts/train.py """ +import copy import os import json from collections import defaultdict @@ -8,6 +9,7 @@ from typing import Tuple, Optional import math import pickle +from time import time from torch.utils.data import DataLoader from s2and.consts import PREPROCESSED_DATA_DIR from s2and.data import S2BlocksDataset @@ -160,11 +162,20 @@ def log_cc_objective_values(scores, split_name, log_prefix, verbose, logger, plo # TODO: Implement plotting the approx. 
ratio v/s block sizes -def save_to_wandb_run(file, fname, fpath, logger): +def save_to_wandb_run(file, fname, fpath, logger, error_logger=True): + if error_logger and os.path.exists(os.path.join(fpath, fname)): + with open(os.path.join(fpath, fname), 'r') as fh: + all_errors = json.load(fh)['errors'] + all_ids = set([e['id'] for e in all_errors]) + for new_error in file['errors']: + if new_error['id'] not in all_ids: + all_errors.append(new_error) + file['errors'] = all_errors with open(os.path.join(fpath, fname), 'w') as fh: json.dump(file, fh) wandb.save(fname) logger.info(f"Saved {fname} to {os.path.join(fpath, fname)}") + return file class FrobeniusLoss: @@ -182,3 +193,15 @@ def __call__(self, input: Tensor, target: Tensor) -> Tensor: if self.weight is None: return torch.norm((target - input)) / normalization return torch.norm(self.weight * (target - input)) / normalization + + +def copy_and_load_model(model, run_dir, device, store_only=False): + _model = copy.deepcopy(model) + _PATH = os.path.join(run_dir, f'_temp_state_dict_{int(time())}.pt') + torch.save(model.state_dict(), _PATH) + if store_only: + return _PATH + _STATE_DICT = torch.load(_PATH, device) + _model.load_state_dict(_STATE_DICT) + os.remove(_PATH) + return _model diff --git a/run_sweep.sh b/run_sweep.sh index b61c509..c635c0f 100644 --- a/run_sweep.sh +++ b/run_sweep.sh @@ -17,7 +17,7 @@ for ((i = ${n_seed_start}; i <= ${n_seed_end}; i++)); do --pairwise_eval_clustering="both" \ --skip_initial_eval \ --silent \ - --wandb_sweep_name="${sweep_prefix}_${model}_${dataset}_${i}" \ + --wandb_sweep_name="${sweep_prefix}${model}_${dataset}_${i}" \ --wandb_sweep_params="wandb_configs/sweeps/${model}.json" \ --wandb_tags="${model},${dataset},seed_${i}" echo " Logs: jobs/${JOB_NAME}.err" diff --git a/utils/parser.py b/utils/parser.py index 3291481..f16f61f 100644 --- a/utils/parser.py +++ b/utils/parser.py @@ -125,6 +125,10 @@ def add_training_args(self): "--eval_only_split", type=str, help="Run script in inference-only mode on a particular data split (train / dev / test)", ) + parser.add_argument( + "--eval_all", action='store_true', + help="Evaluate model using all inference methods over the test set and exit", + ) parser.add_argument( "--skip_initial_eval", action='store_true', help="Whether to skip dev evaluation before training starts", @@ -146,3 +150,7 @@ def add_training_args(self): "--local", action="store_true", help="Run script with wandb disabled", ) + parser.add_argument( + "--sync_dev", action="store_true", + help="Whether to force dev evaluations to run synchronously", + ) From 64d3d28f9db45478807904f2a2ad24461ac2738d Mon Sep 17 00:00:00 2001 From: arana_umass_edu Date: Fri, 17 Mar 2023 18:17:51 +0000 Subject: [PATCH 06/17] optimizing pointwise feature code --- e2e_scripts/preprocess_s2and_pointwise.py | 26 +++++------ s2and/featurizer.py | 56 +++++++++++++---------- 2 files changed, 43 insertions(+), 39 deletions(-) diff --git a/e2e_scripts/preprocess_s2and_pointwise.py b/e2e_scripts/preprocess_s2and_pointwise.py index 9035372..002aeae 100644 --- a/e2e_scripts/preprocess_s2and_pointwise.py +++ b/e2e_scripts/preprocess_s2and_pointwise.py @@ -19,12 +19,13 @@ from s2and.data import ANDData import logging from s2and.featurizer import FeaturizationInfo, featurize +from IPython import embed logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) logger = logging.getLogger(__name__) -def save_pickled_pointwise_features(data_home_dir, dataset_name,
random_seed): +def save_pickled_pointwise_features(data_home_dir, dataset_name): """ Fetch pointwise feature for dataset and store in a pickle. """ @@ -45,27 +46,25 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed): random_seed=random_seed, ) - #print("This is for pickling dataset....") - #with open(f'preprocess_dataset_{dataset_name}.pkl', 'wb') as f: - # pickle.dump(AND_dataset, f) + # print("Storing pickled dataset....") + # with open(f'preprocess_dataset_{dataset_name}.pkl', 'wb') as f: + # pickle.dump(AND_dataset, f) - #print("getting pickled dataset...") - #with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f: - # AND_dataset = pickle.load(f) - #print("Loaded pickle dataset...") + # print("Loading pickled dataset...") + # with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f: + # AND_dataset = pickle.load(f) + # print("Loaded pickle dataset...") point_features_row, point_features_col, point_features_data, num_feats, num_points = pointwise_featurize(AND_dataset, n_jobs=16, - use_cache=False, - random_seed=random_seed) + use_cache=False) logger.info('converting feature indices to csr_matrix') point_features = coo_matrix( (point_features_data, (point_features_row, point_features_col)), shape=(num_points, num_feats) ).tocsr() - print("Matrix creation done.") processed_data['mention_level_features'] = point_features @@ -88,7 +87,6 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed): data_home_dir = params["data_home_dir"] dataset = params["dataset_name"] - seed = 1211 # Dummy not needed, can be totally removed. - print("Preprocessing started for seed value", seed) - save_pickled_pointwise_features(data_home_dir, dataset, seed) + print("Preprocessing started") + save_pickled_pointwise_features(data_home_dir, dataset) print("Matrix") diff --git a/s2and/featurizer.py b/s2and/featurizer.py index bf41ed8..022edb8 100644 --- a/s2and/featurizer.py +++ b/s2and/featurizer.py @@ -9,6 +9,7 @@ import logging from collections import Counter from collections.abc import Iterable +from IPython import embed from sklearn import preprocessing @@ -828,15 +829,14 @@ def featurize( logger.info("featurized test") return train_features, val_features, test_features + + def pointwise_featurize( dataset: ANDData, n_jobs: int = 1, use_cache: bool = False, chunk_size: int = DEFAULT_CHUNK_SIZE, - nan_value: float = np.nan, - delete_training_data: bool = False, - random_seed: int = 1, ): """ Featurizes the input dataset and stores as a unified pickle file. @@ -851,10 +851,6 @@ def pointwise_featurize( whether or not to use write to/read from the features cache chunk_size: int the chunk size for multiprocessing - nan_value: float - the value to replace nans with - delete_training_data: bool - Whether to delete some suspicious training examples Returns ------- @@ -874,6 +870,20 @@ def pointwise_featurize( signature_dict[signature_key] = [] for feature_key, value in per_signature_features.items(): index_key = None + + features_to_ignore = [ + 'author_info_name_counts', + 'author_info_position', + 'author_info_block', + 'author_info_given_block', + 'paper_id', + 'author_id', + 'sourced_author_source', + 'sourced_author_ids', + ] + if feature_key in features_to_ignore: + continue + if (value is None or (isinstance(value, Iterable) and len(value) == 0)): continue @@ -888,18 +898,18 @@ def pointwise_featurize( # Let us check the type of value for each signatures. 
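The type dispatch that follows turns each signature attribute into one or more stringified (feature_key, value) pairs: scalars contribute a single pair, while Counters and other iterables contribute one pair per element. A condensed standalone sketch of that logic, assuming the same type conventions:

    from collections import Counter
    from collections.abc import Iterable

    def expand(feature_key, value):
        # Scalars yield one pair; Counters and other iterables one pair per element
        if isinstance(value, (str, int)):
            return [str((feature_key, value))]
        if isinstance(value, Counter):
            return [str((feature_key, v)) for v in value.keys()]
        if isinstance(value, Iterable):
            return [str((feature_key, v)) for v in value]
        raise TypeError(f"unhandled feature type: {type(value)}")

    assert expand("year", 2001) == ["('year', 2001)"]
    assert expand("coauthors", ["a", "b"]) == ["('coauthors', 'a')", "('coauthors', 'b')"]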
if isinstance(value, str) or isinstance(value, int): - index_key = (feature_key, value) - signature_feature_set.add(str(index_key)) # Converting to str from tuple. + index_key = str((feature_key, value)) + signature_feature_set.add(index_key) # Converting to str from tuple. signature_dict[signature_key].append(index_key) elif isinstance(value, Counter): for val in value.keys(): - index_key = (feature_key, val) - signature_feature_set.add(str(index_key)) + index_key = str((feature_key, val)) + signature_feature_set.add(index_key) signature_dict[signature_key].append(index_key) elif isinstance(value, Iterable): for val in value: - index_key = (feature_key, val) - signature_feature_set.add(str(index_key)) + index_key = str((feature_key, val)) + signature_feature_set.add(index_key) signature_dict[signature_key].append(index_key) else: print('\n!!!! Found another type !!!!\n') @@ -918,21 +928,17 @@ def pointwise_featurize( le_signature_feature_set = preprocessing.LabelEncoder() le_signature_feature_set.fit(list(signature_feature_set)) - le_signature_dict = preprocessing.LabelEncoder() - le_signature_dict.fit(list(signature_dict.keys())) - point_features_row, point_features_col, point_features_data = [], [], [] num_points = len(signature_dict.keys()) - num_feats = len(signature_feature_set) - for key, values in signature_dict.items(): - encoded_key_val = le_signature_dict.transform([key])[0] - val_strings = [str(val) for val in values] - encoded_values_val = le_signature_feature_set.transform(val_strings) - for val in encoded_values_val : - point_features_row.append(encoded_key_val) - point_features_col.append(val) - point_features_data.append(1) + num_feats = len(signature_feature_set) + for index, (_, values) in tqdm(enumerate(signature_dict.items()), desc="Converting to sparse matrix"): + encoded_signature_features = le_signature_feature_set.transform(values) + for feature_label in encoded_signature_features: + point_features_row.append(index) + point_features_col.append(feature_label) + point_features_data.append(1) + return point_features_row, point_features_col, point_features_data, num_feats, num_points From c1e13ae21024d441b7a949d9e08baccbb58346c8 Mon Sep 17 00:00:00 2001 From: Dhruv Agarwal Date: Sat, 18 Mar 2023 04:23:50 -0400 Subject: [PATCH 07/17] Inference solver, parallel eval iterations, sweep config changes (#41) --- e2e_debug/solve.py | 41 +-- e2e_pipeline/sdp_layer.py | 41 ++- e2e_pipeline/uncompress_layer.py | 2 +- e2e_scripts/evaluate.py | 140 ++++++++-- e2e_scripts/preprocess_s2and_data.py | 2 +- e2e_scripts/train.py | 326 ++++++++-------------- e2e_scripts/train_utils.py | 183 ++++++++++-- s2and/data.py | 7 +- utils/parser.py | 5 +- wandb_configs/sweeps/e2e-nosdp-warm.json | 6 +- wandb_configs/sweeps/e2e-nosdp.json | 6 +- wandb_configs/sweeps/e2e-warm.json | 6 +- wandb_configs/sweeps/e2e.json | 6 +- wandb_configs/sweeps/frac-nosdp-warm.json | 6 +- wandb_configs/sweeps/frac-nosdp.json | 6 +- wandb_configs/sweeps/frac-warm.json | 6 +- wandb_configs/sweeps/frac.json | 6 +- wandb_configs/sweeps/mlp.json | 1 + 18 files changed, 487 insertions(+), 309 deletions(-) diff --git a/e2e_debug/solve.py b/e2e_debug/solve.py index 6e47ef2..ccefa35 100644 --- a/e2e_debug/solve.py +++ b/e2e_debug/solve.py @@ -50,6 +50,9 @@ def __init__(self): self.add_argument( "--scs_log_csv_filename", type=str, ) + self.add_argument( + "--max_scaling", action="store_true", + ) self.add_argument( "--interactive", action="store_true", ) @@ -63,15 +66,18 @@ def __init__(self): # Read error file
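One detail behind the featurizer simplification in the patch above: the second LabelEncoder over signature keys could be dropped because Python dicts preserve insertion order (3.7+), so enumerate() over signature_dict already yields a stable 0..N-1 row index per signature. A tiny check of that assumption:

    signature_dict = {"sig_a": ["f1", "f2"], "sig_b": ["f2"]}
    rows = {key: idx for idx, key in enumerate(signature_dict)}
    assert rows == {"sig_a": 0, "sig_b": 1}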
logger.info("Reading input data") - with open(args.data_fpath, 'r') as fh: - data = json.load(fh) - assert len(data['errors']) > 0 - # Pick specific error instance to process - error_data = data['errors'][args.data_idx] + if args.data_fpath.endswith('.pt'): + _W_val = torch.load(args.data_fpath, map_location='cpu').numpy() + else: + with open(args.data_fpath, 'r') as fh: + data = json.load(fh) + assert len(data['errors']) > 0 + # Pick specific error instance to process + error_data = data['errors'][args.data_idx] - # Extract input data from the error instance - _raw = np.array(error_data['model_call_args']['data']) - _W_val = np.array(error_data['cvxpy_layer_args']['W_val']) + # Extract input data from the error instance + _raw = np.array(error_data['model_call_args']['data']) + _W_val = np.array(error_data['cvxpy_layer_args']['W_val']) # Construct cvxpy problem logger.info('Constructing optimization problem') @@ -84,7 +90,7 @@ def __init__(self): constraints = [ cp.diag(X) == np.ones((n,)), X[:n, :] >= 0, - X[:n, :] <= 1 + # X[:n, :] <= 1 ] # Setup HAC Cut @@ -94,12 +100,14 @@ def __init__(self): sdp_obj_value = float('inf') result_idxs, results_X, results_clustering = [], [], [] no_solution_scaling_factors = [] - for i in range(1, 10): # n + for i in range(0, 10): # n # Skipping 1; no scaling leads to non-convergence (infinite objective value) - if i == 1: - scaling_factor = np.max(W) + if i == 0: + scaling_factor = np.max(np.abs(W)) else: scaling_factor = i + if args.max_scaling: + continue logger.info(f'Scaling factor={scaling_factor}') # Create problem W_scaled = W / scaling_factor @@ -114,8 +122,7 @@ def __init__(self): alpha=args.scs_alpha, scale=args.scs_scale, use_indirect=args.scs_use_indirect, - use_quad_obj=not args.scs_dont_use_quad_obj, - log_csv_filename=args.scs_log_csv_filename + # use_quad_obj=not args.scs_dont_use_quad_obj ) logger.info(f"@scaling={scaling_factor}, objective value = {sdp_obj_value}, norm={np.linalg.norm(W_scaled)}") if sdp_obj_value != float('inf'): @@ -129,9 +136,9 @@ def __init__(self): logger.info(f"Solution not found = {len(no_solution_scaling_factors)}") logger.info(f"Solution found = {len(results_X)}") - logger.info("Same clustering:") - for i in range(len(results_clustering)-1): - logger.info(np.array_equal(results_clustering[i], results_clustering[i + 1])) + # logger.info("Same clustering:") + # for i in range(len(results_clustering)-1): + # logger.info(np.array_equal(results_clustering[i], results_clustering[i + 1])) # logger.info(f"Solution found with scaling factor = {scaling_factor}") # if args.interactive and sdp_obj_value == float('inf'): # embed() diff --git a/e2e_pipeline/sdp_layer.py b/e2e_pipeline/sdp_layer.py index bdc10c0..be914b8 100644 --- a/e2e_pipeline/sdp_layer.py +++ b/e2e_pipeline/sdp_layer.py @@ -50,22 +50,33 @@ def build_and_solve_sdp(self, W_val, N, verbose=False): X[:N, :] >= 0, ] - # create problem - prob = cp.Problem(cp.Maximize(cp.trace(W @ X)), constraints) - # Note: maximizing the trace is equivalent to maximizing the sum_E (w_uv * X_uv) objective - # because W is upper-triangular and X is symmetric - - # Build the SDP cvxpylayer - cvxpy_layer = CvxpyLayer(prob, parameters=[W], variables=[X]) - - # Forward pass through the SDP cvxpylayer try: - pw_prob_matrix = cvxpy_layer(W_val, solver_args={ - "solve_method": "SCS", - "verbose": verbose, - "max_iters": self.max_iters, - "eps": self.eps - })[0] + if self.training: + # create problem + prob = cp.Problem(cp.Maximize(cp.trace(W @ X)), constraints) + # Note: maximizing 
the trace is equivalent to maximizing the sum_E (w_uv * X_uv) objective + # because W is upper-triangular and X is symmetric + # Build the SDP cvxpylayer + cvxpy_layer = CvxpyLayer(prob, parameters=[W], variables=[X]) + # Forward pass through the SDP cvxpylayer + pw_prob_matrix = cvxpy_layer(W_val, solver_args={ + "solve_method": "SCS", + "verbose": verbose, + "max_iters": self.max_iters, + "eps": self.eps + })[0] + else: + # create problem + prob = cp.Problem(cp.Maximize(cp.trace(W_val.cpu().numpy() @ X)), constraints) + _solve_val = prob.solve( + solver=cp.SCS, + verbose=verbose, + max_iters=self.max_iters, + eps=self.eps + ) + if _solve_val == float('inf'): + raise ValueError() + pw_prob_matrix = torch.tensor(X.value, device=W_val.device) # Fix to prevent invalid solution values close to 0 and 1 but outside the range pw_prob_matrix = torch.clamp(pw_prob_matrix, min=0, max=1) except: diff --git a/e2e_pipeline/uncompress_layer.py b/e2e_pipeline/uncompress_layer.py index 93d99dd..7198b07 100644 --- a/e2e_pipeline/uncompress_layer.py +++ b/e2e_pipeline/uncompress_layer.py @@ -6,7 +6,7 @@ def __init__(self): super().__init__() def forward(self, compressed_matrix, N, make_symmetric=False, ones_diagonal=False): - device = compressed_matrix.get_device() + device = compressed_matrix.device triu_indices = torch.triu_indices(N, N, offset=1, device=device) if make_symmetric: sym_indices = torch.stack((torch.cat((triu_indices[0], triu_indices[1])), diff --git a/e2e_scripts/evaluate.py b/e2e_scripts/evaluate.py index b7ba424..71ec3d6 100644 --- a/e2e_scripts/evaluate.py +++ b/e2e_scripts/evaluate.py @@ -8,13 +8,14 @@ from sklearn.metrics.cluster import v_measure_score from sklearn.metrics import roc_curve, auc from sklearn.metrics import precision_recall_fscore_support +from torch.multiprocessing import Process, Manager import numpy as np import torch from e2e_pipeline.cc_inference import CCInference from e2e_pipeline.hac_inference import HACInference from e2e_pipeline.sdp_layer import CvxpyException -from e2e_scripts.train_utils import compute_b3_f1, save_to_wandb_run +from e2e_scripts.train_utils import compute_b3_f1, save_to_wandb_run, copy_and_load_model from IPython import embed @@ -24,13 +25,50 @@ logger = logging.getLogger(__name__) +def _run_iter(model_class, state_dict_path, _fork_id, _shared_list, eval_fn, **kwargs): + model = model_class(*kwargs['model_args']) + model.load_state_dict(torch.load(state_dict_path)) + model.to('cpu') + model.eval() + with torch.no_grad(): + res = eval_fn(model=model, **kwargs) + _shared_list.append(res) + del model + + +def _fork_iter(batch_idx, _fork_id, _shared_list, eval_fn, **kwargs): + kwargs['model_class'] = kwargs['model'].__class__ + kwargs['state_dict_path'] = copy_and_load_model(kwargs['model'], kwargs['run_dir'], 'cpu', store_only=True) + del kwargs['model'] + kwargs['overfit_batch_idx'] = batch_idx + kwargs['tqdm_label'] = f'{kwargs["tqdm_label"]} (fork{_fork_id})' + kwargs['_fork_id'] = _fork_id + kwargs['tqdm_position'] = (0 if kwargs['tqdm_position'] is None else kwargs['tqdm_position']) + _fork_id + 1 + kwargs['return_iter'] = True + kwargs['fork_size'] = -1 + kwargs['_shared_list'] = _shared_list + kwargs['disable_tqdm'] = True + kwargs['device'] = 'cpu' + kwargs['eval_fn'] = eval_fn + _proc = Process(target=_run_iter, kwargs=kwargs) + _proc.start() + return _proc + + def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, clustering_threshold=None, val_dataloader=None, tqdm_label='', device=None, verbose=False, debug=False, 
diff --git a/e2e_scripts/evaluate.py b/e2e_scripts/evaluate.py
index b7ba424..71ec3d6 100644
--- a/e2e_scripts/evaluate.py
+++ b/e2e_scripts/evaluate.py
@@ -8,13 +8,14 @@
 from sklearn.metrics.cluster import v_measure_score
 from sklearn.metrics import roc_curve, auc
 from sklearn.metrics import precision_recall_fscore_support
+from torch.multiprocessing import Process, Manager
 import numpy as np
 import torch

 from e2e_pipeline.cc_inference import CCInference
 from e2e_pipeline.hac_inference import HACInference
 from e2e_pipeline.sdp_layer import CvxpyException
-from e2e_scripts.train_utils import compute_b3_f1, save_to_wandb_run
+from e2e_scripts.train_utils import compute_b3_f1, save_to_wandb_run, copy_and_load_model

 from IPython import embed

@@ -24,13 +25,50 @@
 logger = logging.getLogger(__name__)


+def _run_iter(model_class, state_dict_path, _fork_id, _shared_list, eval_fn, **kwargs):
+    model = model_class(*kwargs['model_args'])
+    model.load_state_dict(torch.load(state_dict_path))
+    model.to('cpu')
+    model.eval()
+    with torch.no_grad():
+        res = eval_fn(model=model, **kwargs)
+        _shared_list.append(res)
+    del model
+
+
+def _fork_iter(batch_idx, _fork_id, _shared_list, eval_fn, **kwargs):
+    kwargs['model_class'] = kwargs['model'].__class__
+    kwargs['state_dict_path'] = copy_and_load_model(kwargs['model'], kwargs['run_dir'], 'cpu', store_only=True)
+    del kwargs['model']
+    kwargs['overfit_batch_idx'] = batch_idx
+    kwargs['tqdm_label'] = f'{kwargs["tqdm_label"]} (fork{_fork_id})'
+    kwargs['_fork_id'] = _fork_id
+    kwargs['tqdm_position'] = (0 if kwargs['tqdm_position'] is None else kwargs['tqdm_position']) + _fork_id + 1
+    kwargs['return_iter'] = True
+    kwargs['fork_size'] = -1
+    kwargs['_shared_list'] = _shared_list
+    kwargs['disable_tqdm'] = True
+    kwargs['device'] = 'cpu'
+    kwargs['eval_fn'] = eval_fn
+    _proc = Process(target=_run_iter, kwargs=kwargs)
+    _proc.start()
+    return _proc
+
+
 def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None,
              clustering_threshold=None, val_dataloader=None, tqdm_label='',
             device=None, verbose=False, debug=False, _errors=None,
-            run_dir='./', tqdm_position=None):
+            run_dir='./', tqdm_position=None, model_args=None, return_iter=False, fork_size=500,
+            disable_tqdm=False):
     """
     clustering_fn, clustering_threshold, val_dataloader: unused when pairwise_mode is False
     (only added to keep fn signature identical)
     """
+    fn_args = locals()
+    fork_enabled = fork_size > -1 and model_args is not None
+    if fork_enabled:
+        _fork_id = 1
+        _shared_list = Manager().list()
+        _procs = []
     device = device if device is not None else torch.device("cuda" if torch.cuda.is_available() else "cpu")
     n_features = dataloader.dataset[0][0].shape[1]
@@ -43,7 +81,8 @@
     }
     max_pred_id = -1
     n_exceptions = 0
-    for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}', position=tqdm_position)):
+    pbar = tqdm(dataloader, desc=f'Eval {tqdm_label}', position=tqdm_position, disable=disable_tqdm)
+    for (idx, batch) in enumerate(pbar):
         if overfit_batch_idx > -1:
             if idx < overfit_batch_idx:
                 continue
@@ -51,11 +90,16 @@
             break
         data, _, cluster_ids = batch
         block_size = len(cluster_ids)
-        all_gold += list(np.reshape(cluster_ids, (block_size,)))
+        pbar.set_description(f'Eval {tqdm_label} (sz={block_size})')
         data = data.reshape(-1, n_features).float()
         if data.shape[0] == 0:
             # Only one signature in block; manually assign a unique cluster
             pred_cluster_ids = [max_pred_id + 1]
+        elif fork_enabled and block_size >= fork_size:
+            _proc = _fork_iter(idx, _fork_id, _shared_list, evaluate, **fn_args)
+            _fork_id += 1
+            _procs.append((_proc, block_size))
+            continue
         else:
             # Forward pass through the e2e model
             data = data.to(device)
@@ -79,8 +123,6 @@
                 save_to_wandb_run({'errors': _errors}, 'errors.json', run_dir, logger)
                 if not debug:  # if tqdm_label is not 'dev' and not debug:
                     raise CvxpyException(data=_error_obj)
-                # If split is dev, skip batch and continue
-                all_gold = all_gold[:-len(cluster_ids)]
                 n_exceptions += 1
                 logger.info(f'Caught CvxpyException {n_exceptions}: skipping batch')
                 continue
@@ -89,8 +131,34 @@
             cc_obj_vals['sdp'].append(model.sdp_layer.objective_value)
             cc_obj_vals['block_idxs'].append(idx)
             cc_obj_vals['block_sizes'].append(block_size)
+        all_gold += list(np.reshape(cluster_ids, (block_size,)))
         max_pred_id = max(pred_cluster_ids)
         all_pred += list(pred_cluster_ids)
+        if overfit_batch_idx > -1 and return_iter:
+            return {
+                'cluster_labels': model.hac_cut_layer.cluster_labels,
+                'round_objective_value': model.hac_cut_layer.objective_value,
+                'sdp_objective_value': model.sdp_layer.objective_value,
+                'block_idx': idx,
+                'block_size': block_size,
+                'cluster_ids': cluster_ids
+            }
+
+    if fork_enabled and len(_procs) > 0:
+        _procs.sort(key=lambda x: x[1])  # To visualize progress
+        for _proc in tqdm(_procs, desc=f'Eval {tqdm_label} (waiting for forks to join)', position=tqdm_position):
+            _proc[0].join()
+        assert len(_procs) == len(_shared_list), "All forked eval iterations did not return results"
+        for _data in _shared_list:
+            pred_cluster_ids = (_data['cluster_labels'] + (max_pred_id + 1)).tolist()
+            cc_obj_vals['round'].append(_data['round_objective_value'])
+            cc_obj_vals['sdp'].append(_data['sdp_objective_value'])
+            cc_obj_vals['block_idxs'].append(_data['block_idx'])
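# [Aside, not part of the patch] The branch above offloads large blocks to CPU
# worker processes and gathers their results through a Manager list. A minimal,
# runnable sketch of the same fork-and-join pattern (names like slow_eval and
# blocks are ours, standing in for the real eval function and data):
from torch.multiprocessing import Manager, Process, set_start_method

def slow_eval(block, shared):
    shared.append(sum(block))  # stand-in for one block's metrics

if __name__ == '__main__':
    set_start_method('spawn', force=True)  # spawn is safe with CUDA parents
    shared, procs = Manager().list(), []
    for block in ([1, 2, 3], [4, 5], [6]):
        p = Process(target=slow_eval, args=(block, shared))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()
    assert len(shared) == len(procs)  # every fork reported back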
cc_obj_vals['block_sizes'].append(_data['block_size']) + all_gold += list(np.reshape(_data['cluster_ids'], (_data['block_size'],))) + max_pred_id = max(pred_cluster_ids) + all_pred += list(pred_cluster_ids) + vmeasure = v_measure_score(all_gold, all_pred) b3_f1 = compute_b3_f1(all_gold, all_pred)[2] return b3_f1, vmeasure, cc_obj_vals @@ -99,7 +167,13 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, cluste def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", return_pred_only=False, thresh_for_f1=0.5, clustering_fn=None, clustering_threshold=None, val_dataloader=None, tqdm_label='', device=None, verbose=False, debug=False, _errors=None, run_dir='./', - tqdm_position=None): + tqdm_position=None, model_args=None, return_iter=False, fork_size=500, disable_tqdm=False): + fn_args = locals() + fork_enabled = fork_size > -1 and model_args is not None + if fork_enabled: + _fork_id = 1 + _shared_list = Manager().list() + _procs = [] device = device if device is not None else torch.device("cuda" if torch.cuda.is_available() else "cpu") n_features = dataloader.dataset[0][0].shape[1] @@ -117,7 +191,8 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret } max_pred_id = -1 # In each iteration, add to all blockwise predicted IDs to distinguish from previous blocks n_exceptions = 0 - for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}', position=tqdm_position)): + pbar = tqdm(dataloader, desc=f'Eval {tqdm_label}', position=tqdm_position, disable=disable_tqdm) + for (idx, batch) in enumerate(pbar): if overfit_batch_idx > -1: if idx < overfit_batch_idx: continue @@ -125,11 +200,16 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret break data, _, cluster_ids = batch block_size = len(cluster_ids) - all_gold += list(np.reshape(cluster_ids, (block_size,))) + pbar.set_description(f'Eval {tqdm_label} (sz={block_size})') data = data.reshape(-1, n_features).float() if data.shape[0] == 0: # Only one signature in block; manually assign a unique cluster pred_cluster_ids = [max_pred_id + 1] + elif fork_enabled and block_size >= fork_size and clustering_fn.__class__ is CCInference: + _proc = _fork_iter(idx, _fork_id, _shared_list, evaluate_pairwise, **fn_args) + _fork_id += 1 + _procs.append((_proc, block_size)) + continue else: # Forward pass through the e2e model data = data.to(device) @@ -155,24 +235,49 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret save_to_wandb_run({'errors': _errors}, 'errors.json', run_dir, logger) if not debug: # if tqdm_label is not 'dev' and not debug: raise CvxpyException(data=_error_obj) - # If split is dev, skip batch and continue - all_gold = all_gold[:-len(cluster_ids)] n_exceptions += 1 logger.info(f'Caught CvxpyException {n_exceptions}: skipping batch') continue + if clustering_fn.__class__ is CCInference: + cc_obj_vals['round'].append(clustering_fn.hac_cut_layer.objective_value) + cc_obj_vals['sdp'].append(clustering_fn.sdp_layer.objective_value) + cc_obj_vals['block_idxs'].append(idx) + cc_obj_vals['block_sizes'].append(block_size) + all_gold += list(np.reshape(cluster_ids, (block_size,))) max_pred_id = max(pred_cluster_ids) all_pred += list(pred_cluster_ids) - if clustering_fn.__class__ is CCInference: - cc_obj_vals['round'].append(clustering_fn.hac_cut_layer.objective_value) - cc_obj_vals['sdp'].append(clustering_fn.sdp_layer.objective_value) - cc_obj_vals['block_idxs'].append(idx) - 
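# [Aside, not part of the patch] Once the main loop and the forks finish, the
# per-block predictions are merged into one global id space before scoring:
# each block's labels are shifted past max_pred_id so ids never collide, and
# the merged lists feed v_measure_score and the B3 F1. A toy version (gold ids
# are assumed globally unique, as in the dataset):
from sklearn.metrics.cluster import v_measure_score

all_gold, all_pred, max_pred_id = [], [], -1
blocks = [([0, 0, 1], [0, 0, 1]), ([2, 3], [0, 0])]  # (gold, pred) per block
for gold, pred in blocks:
    pred = [p + max_pred_id + 1 for p in pred]  # offset into the global space
    max_pred_id = max(pred)
    all_gold += gold
    all_pred += pred
# all_pred == [0, 0, 1, 2, 2]: the second block's cluster 0 became cluster 2
print(v_measure_score(all_gold, all_pred))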
cc_obj_vals['block_sizes'].append(block_size) + if overfit_batch_idx > -1 and return_iter: + return { + 'cluster_labels': list(np.array(pred_cluster_ids) - (max_pred_id + 1)), + 'round_objective_value': clustering_fn.hac_cut_layer.objective_value, + 'sdp_objective_value': clustering_fn.sdp_layer.objective_value, + 'block_idx': idx, + 'block_size': block_size, + 'cluster_ids': cluster_ids + } + + if fork_enabled and len(_procs) > 0: + _procs.sort(key=lambda x: x[1]) # To visualize progress + for _proc in tqdm(_procs, desc=f'Eval {tqdm_label} (waiting for forks to join)', position=tqdm_position): + _proc[0].join() + assert len(_procs) == len(_shared_list), "All forked eval iterations did not return results" + for _data in _shared_list: + pred_cluster_ids = (_data['cluster_labels'] + (max_pred_id + 1)).tolist() + cc_obj_vals['round'].append(_data['round_objective_value']) + cc_obj_vals['sdp'].append(_data['sdp_objective_value']) + cc_obj_vals['block_idxs'].append(_data['block_idx']) + cc_obj_vals['block_sizes'].append(_data['block_size']) + all_gold += list(np.reshape(_data['cluster_ids'], (_data['block_size'],))) + max_pred_id = max(pred_cluster_ids) + all_pred += list(pred_cluster_ids) + vmeasure = v_measure_score(all_gold, all_pred) b3_f1 = compute_b3_f1(all_gold, all_pred)[2] return (b3_f1, vmeasure, cc_obj_vals) if clustering_fn.__class__ is CCInference else (b3_f1, vmeasure) y_pred, targets = [], [] - for (idx, batch) in enumerate(tqdm(dataloader, desc=f'Evaluating {tqdm_label}', position=tqdm_position)): + pbar = tqdm(dataloader, desc=f'Eval {tqdm_label}', position=tqdm_position, disable=disable_tqdm) + for (idx, batch) in enumerate(pbar): if overfit_batch_idx > -1: if idx < overfit_batch_idx: continue @@ -180,6 +285,7 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret break data, target = batch data = data.reshape(-1, n_features).float() + pbar.set_description(f'Eval {tqdm_label} (sz={len(data)})') assert data.shape[0] != 0 target = target.flatten().float() # Forward pass through the pairwise model diff --git a/e2e_scripts/preprocess_s2and_data.py b/e2e_scripts/preprocess_s2and_data.py index cb46521..d97283b 100644 --- a/e2e_scripts/preprocess_s2and_data.py +++ b/e2e_scripts/preprocess_s2and_data.py @@ -118,7 +118,7 @@ def find_total_num_train_pairs(blockwise_data): DATA_HOME_DIR = params["data_home_dir"] dataset = params["dataset_name"] - random_seeds = {1, 2, 3, 4, 5} + random_seeds = [1, 2, 3, 4, 5] if params["dataset_seed"] is None else [params["dataset_seed"]] for seed in random_seeds: print("Preprocessing started for seed value", seed) save_blockwise_featurized_data(dataset, seed) diff --git a/e2e_scripts/train.py b/e2e_scripts/train.py index 23b36d8..a645257 100644 --- a/e2e_scripts/train.py +++ b/e2e_scripts/train.py @@ -1,7 +1,6 @@ import glob import json import os -import sys import time import logging import random @@ -10,8 +9,8 @@ import wandb import torch import numpy as np - from tqdm import tqdm +from torch.multiprocessing import set_start_method, Manager from e2e_pipeline.cc_inference import CCInference from e2e_pipeline.hac_inference import HACInference @@ -21,119 +20,21 @@ from e2e_scripts.evaluate import evaluate, evaluate_pairwise from e2e_scripts.train_utils import DEFAULT_HYPERPARAMS, get_dataloaders, get_matrix_size_from_triu, \ uncompress_target_tensor, count_parameters, log_cc_objective_values, save_to_wandb_run, FrobeniusLoss, \ - copy_and_load_model + get_feature_count, _check_process, fork_eval, init_eval, dev_eval from 
utils.parser import Parser -from torch.multiprocessing import Process, set_start_method, Manager +from IPython import embed try: set_start_method('spawn', force=True) except RuntimeError: pass -from IPython import embed - - logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) logger = logging.getLogger(__name__) -def _check_process(_proc, _return_dict, logger, run, overfit_batch_idx, use_lr_scheduler, hyp, - scheduler, eval_metric_to_idx, dev_opt_metric, i, best_epoch, best_dev_score, - best_dev_scores, best_dev_state_dict, sync=False): - if _proc is not None: - if _return_dict['_state'] == 'done' or (sync and _return_dict['_state'] == 'start'): - _proc.join() - _return_dict['_state'] = 'finish' - if _return_dict['_method'] == 'init_eval': - logger.info(_return_dict['local']) - run.log(_return_dict['wandb']) - elif _return_dict['_method'] == 'dev_eval': - logger.info(_return_dict['local']) - run.log(_return_dict['wandb']) - if overfit_batch_idx > -1: - if use_lr_scheduler: - if hyp['lr_scheduler'] == 'plateau': - scheduler.step(_return_dict['train_scores'][eval_metric_to_idx[dev_opt_metric]]) - elif hyp['lr_scheduler'] == 'step': - scheduler.step() - else: - dev_scores = _return_dict['dev_scores'] - dev_opt_score = dev_scores[eval_metric_to_idx[dev_opt_metric]] - if dev_opt_score > best_dev_score: - logger.info(f"New best dev {dev_opt_metric} score @ epoch{i+1}: {dev_opt_score}") - best_epoch = i - best_dev_score = dev_opt_score - best_dev_scores = dev_scores - best_dev_state_dict = torch.load(_return_dict['state_dict_path'], device) - if use_lr_scheduler: - if hyp['lr_scheduler'] == 'plateau': - scheduler.step(dev_scores[eval_metric_to_idx[dev_opt_metric]]) - elif hyp['lr_scheduler'] == 'step': - scheduler.step() - return best_epoch, best_dev_score, best_dev_scores, best_dev_state_dict - - -def init_eval(model_class, model_args, state_dict_path, overfit_batch_idx, eval_fn, train_dataloader, device, verbose, - debug, _errors, eval_metric_to_idx, val_dataloader, return_dict): - return_dict['_state'] = 'start' - return_dict['_method'] = 'init_eval' - model = model_class(*model_args) - model.load_state_dict(torch.load(state_dict_path)) - model.to(device) - with torch.no_grad(): - model.eval() - if overfit_batch_idx > -1: - train_scores = eval_fn(model, train_dataloader, overfit_batch_idx=overfit_batch_idx, - tqdm_label='train', device=device, verbose=verbose, debug=debug, - _errors=_errors, tqdm_position=0) - return_dict['local'] = f"Initial: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + \ - f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}" - return_dict['wandb'] = {'epoch': 0, f'train_{list(eval_metric_to_idx)[0]}': train_scores[0], - f'train_{list(eval_metric_to_idx)[1]}': train_scores[1]} - else: - dev_scores = eval_fn(model, val_dataloader, tqdm_label='dev 0', device=device, verbose=verbose, - debug=debug, _errors=_errors, tqdm_position=0) - return_dict['local'] = f"Initial: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + \ - f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}" - return_dict['wandb'] = {'epoch': 0, f'dev_{list(eval_metric_to_idx)[0]}': dev_scores[0], - f'dev_{list(eval_metric_to_idx)[1]}': dev_scores[1]} - del model - return_dict['_state'] = 'done' - - -def dev_eval(model_class, model_args, state_dict_path, overfit_batch_idx, eval_fn, train_dataloader, device, verbose, - debug, _errors, eval_metric_to_idx, val_dataloader, return_dict, i, run_dir): - 
return_dict['_state'] = 'start' - return_dict['_method'] = 'dev_eval' - return_dict['state_dict_path'] = state_dict_path - model = model_class(*model_args) - model.load_state_dict(torch.load(state_dict_path)) - model.to(device) - with torch.no_grad(): - model.eval() - if overfit_batch_idx > -1: - train_scores = eval_fn(model, train_dataloader, overfit_batch_idx=overfit_batch_idx, - tqdm_label='train', device=device, verbose=verbose, debug=debug, - _errors=_errors) - return_dict['local'] = f"Epoch {i + 1}: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + \ - f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}" - return_dict['wandb'] = {f'train_{list(eval_metric_to_idx)[0]}': train_scores[0], - f'train_{list(eval_metric_to_idx)[1]}': train_scores[1]} - return_dict['train_scores'] = train_scores - else: - dev_scores = eval_fn(model, val_dataloader, tqdm_label=f'dev {i+1}', device=device, verbose=verbose, - debug=debug, _errors=_errors) - return_dict['local'] = f"Epoch {i + 1}: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + \ - f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}" - return_dict['wandb'] = {f'dev_{list(eval_metric_to_idx)[0]}': dev_scores[0], - f'dev_{list(eval_metric_to_idx)[1]}': dev_scores[1]} - return_dict['dev_scores'] = dev_scores - del model - return_dict['_state'] = 'done' - - def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, group=None, save_model=False, load_model_from_wandb_run=None, load_model_from_fpath=None, eval_only_split=None, eval_all=False, skip_initial_eval=False, pairwise_eval_clustering=None, @@ -206,14 +107,20 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g eval_metric_to_idx = clustering_metrics if not pairwise_mode else pairwise_metrics dev_opt_metric = hyp['dev_opt_metric'] if hyp['dev_opt_metric'] in eval_metric_to_idx \ else list(eval_metric_to_idx)[0] + training_mode = not eval_all and eval_only_split is None # Get data loaders (optionally with imputation, normalization) - train_dataloader, val_dataloader, test_dataloader = get_dataloaders(hyp["dataset"], hyp["dataset_random_seed"], - hyp["convert_nan"], hyp["nan_value"], - hyp["normalize_data"], hyp["subsample_sz_train"], - hyp["subsample_sz_dev"], pairwise_mode, - batch_size) - n_features = train_dataloader.dataset[0][0].shape[1] + if training_mode: + train_dataloader, val_dataloader, test_dataloader = get_dataloaders(hyp["dataset"], + hyp["dataset_random_seed"], + hyp["convert_nan"], hyp["nan_value"], + hyp["normalize_data"], + hyp["subsample_sz_train"], + hyp["subsample_sz_dev"], pairwise_mode, + batch_size) + n_features = train_dataloader.dataset[0][0].shape[1] + else: + n_features = get_feature_count(hyp["dataset"], hyp["dataset_random_seed"]) # Create model with hyperparams if not pairwise_mode: @@ -222,53 +129,44 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g negative_slope, hidden_config, sdp_max_iters, sdp_eps, sdp_scale, use_rounded_loss, (e2e_loss == "bce"), use_sdp) model = EntResModel(*model_args) - # Define loss - if e2e_loss not in ["frob", "bce"]: - raise ValueError("Invalid value for e2e_loss") - loss_fn_e2e = FrobeniusLoss() if e2e_loss == 'frob' else torch.nn.BCELoss() - - pos_weight = None - if weighted_loss: - if overfit_batch_idx > -1: - n_pos = train_dataloader.dataset[overfit_batch_idx][1].sum() - pos_weight = (len(train_dataloader.dataset[overfit_batch_idx][1]) - n_pos) / n_pos - else: - _n_pos, _n_total = 0., 0. 
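# [Aside, not part of the patch] The surrounding hunk tallies positives across
# the training blocks to build the usual negatives-to-positives ratio for
# BCEWithLogitsLoss, so the rare "same author" pairs are not drowned out by
# negatives. A self-contained sketch with made-up labels:
import torch

labels = torch.tensor([1., 0., 0., 0., 0., 0., 0., 0., 1., 0.])
n_pos = labels.sum()
pos_weight = (len(labels) - n_pos) / n_pos if n_pos > 0 else torch.tensor(1.)
loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)  # pos_weight = 4.0
print(loss_fn(torch.zeros(10), labels))  # each positive now weighs 4 negatives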
- for _i in range(len(train_dataloader.dataset)): - _n_pos += train_dataloader.dataset[_i][1].sum() - _n_total += len(train_dataloader.dataset[_i][1]) - pos_weight = (_n_total - _n_pos) / _n_pos # Define eval eval_fn = evaluate pairwise_clustering_fns = [None] # Unused when pairwise_mode is False - if n_warmstart_epochs > 0: - train_dataloader_pairwise, _, _ = get_dataloaders(hyp["dataset"], - hyp["dataset_random_seed"], - hyp["convert_nan"], - hyp["nan_value"], - hyp["normalize_data"], - hyp["subsample_sz_train"], - hyp["subsample_sz_dev"], - True, hyp['batch_size']) + + if training_mode: # => model will be used for training # Define loss - loss_fn_pairwise = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight)) + if e2e_loss not in ["frob", "bce"]: + raise ValueError("Invalid value for e2e_loss") + loss_fn_e2e = FrobeniusLoss() if e2e_loss == 'frob' else torch.nn.BCELoss() + + pos_weight = None + if weighted_loss: + if overfit_batch_idx > -1: + n_pos = train_dataloader.dataset[overfit_batch_idx][1].sum() + pos_weight = (len(train_dataloader.dataset[overfit_batch_idx][1]) - n_pos) / n_pos + else: + _n_pos, _n_total = 0., 0. + for _i in range(len(train_dataloader.dataset)): + _n_pos += train_dataloader.dataset[_i][1].sum() + _n_total += len(train_dataloader.dataset[_i][1]) + pos_weight = (_n_total - _n_pos) / _n_pos if _n_pos > 0 else 1. + if n_warmstart_epochs > 0: + train_dataloader_pairwise = get_dataloaders(hyp["dataset"], + hyp["dataset_random_seed"], + hyp["convert_nan"], + hyp["nan_value"], + hyp["normalize_data"], + hyp["subsample_sz_train"], + hyp["subsample_sz_dev"], + pairwise_mode=True, batch_size=hyp['batch_size'], + split='train') + # Define loss + loss_fn_pairwise = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight)) else: model_args = (n_features, neumiss_depth, dropout_p, dropout_only_once, add_neumiss, - neumiss_deq, hidden_dim, n_hidden_layers, add_batchnorm, activation, - negative_slope, hidden_config) + neumiss_deq, hidden_dim, n_hidden_layers, add_batchnorm, activation, + negative_slope, hidden_config) model = PairwiseModel(*model_args) - # Define loss - pos_weight = None - if weighted_loss: - if overfit_batch_idx > -1: - n_pos = \ - train_dataloader.dataset[overfit_batch_idx * batch_size:(overfit_batch_idx + 1) * batch_size][ - 1].sum() - pos_weight = torch.tensor((batch_size - n_pos) / n_pos) - else: - n_pos = train_dataloader.dataset[:][1].sum() - pos_weight = torch.tensor((len(train_dataloader.dataset) - n_pos) / n_pos) - loss_fn_pairwise = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight) # Define eval eval_fn = evaluate_pairwise pairwise_clustering_fns = [None] @@ -287,14 +185,28 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g pairwise_clustering_fn_labels = ['cc', 'hac', 'cc-fixed'] else: raise ValueError('Invalid argument passed to --pairwise_eval_clustering') - _, val_dataloader_e2e, test_dataloader_e2e = get_dataloaders(hyp["dataset"], - hyp["dataset_random_seed"], - hyp["convert_nan"], - hyp["nan_value"], - hyp["normalize_data"], - hyp["subsample_sz_train"], - hyp["subsample_sz_dev"], - pairwise_mode=False, batch_size=1) + val_dataloader_e2e, test_dataloader_e2e = get_dataloaders(hyp["dataset"], + hyp["dataset_random_seed"], + hyp["convert_nan"], + hyp["nan_value"], + hyp["normalize_data"], + hyp["subsample_sz_train"], + hyp["subsample_sz_dev"], + pairwise_mode=False, batch_size=1, + split=['dev', 'test']) + if training_mode: # => model will be used for training + # Define loss + pos_weight = None + 
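# [Aside, not part of the patch] Several call sites above now request only the
# loaders they need via get_dataloaders(..., split=...), whose implementation
# lands in train_utils.py further down. A minimal version of that dispatch
# (the string loaders are stand-ins for real DataLoaders):
def get_loaders(split=None):
    build = {'train': lambda: 'train_dl',
             'dev': lambda: 'dev_dl',
             'test': lambda: 'test_dl'}
    if split is None:  # default: the full (train, dev, test) triple
        return build['train'](), build['dev'](), build['test']()
    if isinstance(split, str):  # a single loader
        return build[split]()
    if isinstance(split, list):  # any subset, in order
        return tuple(build[s]() for s in split)
    raise ValueError('Invalid argument to split')

val_dl, test_dl = get_loaders(split=['dev', 'test'])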
if weighted_loss: + if overfit_batch_idx > -1: + n_pos = \ + train_dataloader.dataset[overfit_batch_idx * batch_size:(overfit_batch_idx + 1) * batch_size][ + 1].sum() + pos_weight = torch.tensor((batch_size - n_pos) / n_pos if n_pos > 0 else 1.) + else: + n_pos = train_dataloader.dataset[:][1].sum() + pos_weight = torch.tensor((len(train_dataloader.dataset) - n_pos) / n_pos if n_pos > 0 else 1.) + loss_fn_pairwise = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight) logger.info(f"Model loaded: {model}", ) # Load stored model, if available @@ -323,14 +235,15 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g 'cc-nosdp', 'cc-nosdp-fixed'] cc_inference_sdp.eval() cc_inference_nosdp.eval() - _, val_dataloader_e2e, test_dataloader_e2e = get_dataloaders(hyp["dataset"], - hyp["dataset_random_seed"], - hyp["convert_nan"], - hyp["nan_value"], - hyp["normalize_data"], - hyp["subsample_sz_train"], - hyp["subsample_sz_dev"], - pairwise_mode=False, batch_size=1) + val_dataloader_e2e, test_dataloader_e2e = get_dataloaders(hyp["dataset"], + hyp["dataset_random_seed"], + hyp["convert_nan"], + hyp["nan_value"], + hyp["normalize_data"], + hyp["subsample_sz_train"], + hyp["subsample_sz_dev"], + pairwise_mode=False, batch_size=1, + split=['dev', 'test']) start_time = time.time() with torch.no_grad(): model.eval() @@ -342,7 +255,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g clustering_threshold=clustering_threshold if i % 2 == 0 else None, val_dataloader=val_dataloader_e2e, tqdm_label='test clustering', device=device, verbose=verbose, - debug=debug, _errors=_errors) + debug=debug, _errors=_errors, model_args=model_args) if inference_fn.__class__ is HACInference: clustering_threshold = inference_fn.cut_threshold logger.info( @@ -360,18 +273,16 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g end_time = time.time() elif eval_only_split is not None: # Run inference on the specified split and exit - dataloaders = { - 'train': train_dataloader, - 'dev': val_dataloader, - 'test': test_dataloader - } start_time = time.time() with torch.no_grad(): model.eval() - - eval_dataloader = dataloaders[eval_only_split] + eval_dataloader = get_dataloaders(hyp["dataset"], hyp["dataset_random_seed"], + hyp["convert_nan"], hyp["nan_value"], + hyp["normalize_data"], hyp["subsample_sz_train"], + hyp["subsample_sz_dev"], pairwise_mode, + batch_size, split=eval_only_split) eval_scores = eval_fn(model, eval_dataloader, tqdm_label=eval_only_split, device=device, verbose=verbose, - debug=debug, _errors=_errors) + debug=debug, _errors=_errors, model_args=model_args) logger.info(f"Eval: {eval_only_split}_{list(eval_metric_to_idx)[0]}={eval_scores[0]}, " + f"{eval_only_split}_{list(eval_metric_to_idx)[1]}={eval_scores[1]}") # Log eval metrics @@ -380,7 +291,6 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g if len(eval_scores) == 3: log_cc_objective_values(scores=eval_scores, split_name=eval_only_split, log_prefix='Eval', verbose=verbose, logger=logger) - # For pairwise-mode: if pairwise_clustering_fns[0] is not None: clustering_threshold = None @@ -390,7 +300,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g clustering_threshold=clustering_threshold, val_dataloader=val_dataloader_e2e, tqdm_label='test clustering', device=device, verbose=verbose, - debug=debug, _errors=_errors) + debug=debug, _errors=_errors, model_args=model_args) if 
pairwise_clustering_fn.__class__ is HACInference: clustering_threshold = pairwise_clustering_fn.cut_threshold logger.info( @@ -430,15 +340,14 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g if not skip_initial_eval: # Get initial model performance on dev (or 'train' for overfitting runs) - _state_dict_path = copy_and_load_model(model, run.dir, device, store_only=True) - _proc = Process(target=init_eval, - kwargs=dict(model_class=model.__class__, model_args=model_args, - state_dict_path=_state_dict_path, - overfit_batch_idx=overfit_batch_idx, eval_fn=eval_fn, - train_dataloader=train_dataloader, device=device, verbose=verbose, - debug=debug, _errors=_errors, eval_metric_to_idx=eval_metric_to_idx, - val_dataloader=val_dataloader, return_dict=_return_dict)) - _proc.start() + _proc = fork_eval(target=init_eval, args=dict(model_args=model_args, + overfit_batch_idx=overfit_batch_idx, eval_fn=eval_fn, + train_dataloader=train_dataloader, device=device, + verbose=verbose, + debug=debug, _errors=_errors, + eval_metric_to_idx=eval_metric_to_idx, + val_dataloader=val_dataloader, return_dict=_return_dict), + model=model, run_dir=run.dir, device=device, logger=logger) if not pairwise_mode and grad_acc > 1: grad_acc_steps = [] _seen_pw = 0 @@ -473,9 +382,9 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g grad_acc_idx = 0 optimizer.zero_grad() - for (idx, batch) in enumerate(tqdm(_train_dataloader, - desc=f"{'Warm-starting' if warmstart_mode else 'Training'} {i + 1}", - position=1)): + pbar = tqdm(_train_dataloader, desc=f"{'Warm-starting' if warmstart_mode else 'Training'} {i + 1}", + position=1) + for (idx, batch) in enumerate(pbar): best_epoch, best_dev_score, best_dev_scores, best_dev_state_dict = _check_process(_proc, _return_dict, logger, run, @@ -506,6 +415,8 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g # Block contains only one signature pair; batchnorm throws error continue block_size = get_matrix_size_from_triu(data) + pbar.set_description(f"{'Warm-starting' if warmstart_mode else 'Training'} {i + 1} " + \ + f"(sz={len(data) if (pairwise_mode or warmstart_mode) else block_size})") target = target.flatten().float() if verbose: logger.info(f"Batch shape: {data.shape}") @@ -611,29 +522,29 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g wandb.log({f'train_epoch_loss': np.mean(running_loss)}) # Get model performance on dev (or 'train' for overfitting runs) - _state_dict_path = copy_and_load_model(model, run.dir, device, store_only=True) - _proc = Process(target=dev_eval, - kwargs=dict(model_class=model.__class__, model_args=model_args, - state_dict_path=_state_dict_path, overfit_batch_idx=overfit_batch_idx, - eval_fn=eval_fn, train_dataloader=train_dataloader, device=device, + _proc = fork_eval(target=dev_eval, + args=dict(model_args=model_args, + overfit_batch_idx=overfit_batch_idx, eval_fn=eval_fn, + train_dataloader=train_dataloader, device=device, verbose=verbose, debug=debug, _errors=_errors, eval_metric_to_idx=eval_metric_to_idx, val_dataloader=val_dataloader, - return_dict=_return_dict, i=i, run_dir=run.dir)) - _proc.start() + return_dict=_return_dict, i=i), + model=model, run_dir=run.dir, device=device, logger=logger, + sync=(idx == len(_train_dataloader.dataset) - 1)) end_time = time.time() best_epoch, best_dev_score, best_dev_scores, best_dev_state_dict = _check_process(_proc, _return_dict, - logger, run, - overfit_batch_idx, - use_lr_scheduler, 
- hyp, scheduler, - eval_metric_to_idx, - dev_opt_metric, i, - best_epoch, - best_dev_score, - best_dev_scores, - best_dev_state_dict, - sync=True) + logger, run, + overfit_batch_idx, + use_lr_scheduler, + hyp, scheduler, + eval_metric_to_idx, + dev_opt_metric, i, + best_epoch, + best_dev_score, + best_dev_scores, + best_dev_state_dict, + sync=True) # Save model if save_model: torch.save(best_dev_state_dict, os.path.join(run.dir, 'model_state_dict_best.pt')) @@ -646,7 +557,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g with torch.no_grad(): model.eval() test_scores = eval_fn(model, test_dataloader, tqdm_label='test', device=device, verbose=verbose, - debug=debug, _errors=_errors, tqdm_position=2) + debug=debug, _errors=_errors, tqdm_position=2, model_args=model_args) logger.info(f"Final: test_{list(eval_metric_to_idx)[0]}={test_scores[0]}, " + f"test_{list(eval_metric_to_idx)[1]}={test_scores[1]}") # Log final metrics @@ -667,7 +578,8 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g clustering_threshold=clustering_threshold, val_dataloader=val_dataloader_e2e, tqdm_label='test clustering', device=device, verbose=verbose, - debug=debug, _errors=_errors, tqdm_position=2) + debug=debug, _errors=_errors, tqdm_position=2, + model_args=model_args) if pairwise_clustering_fn.__class__ is HACInference: clustering_threshold = pairwise_clustering_fn.cut_threshold logger.info(f"Final: test_{list(clustering_metrics)[0]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[0]}, " + diff --git a/e2e_scripts/train_utils.py b/e2e_scripts/train_utils.py index c86c498..0fe40c9 100644 --- a/e2e_scripts/train_utils.py +++ b/e2e_scripts/train_utils.py @@ -9,26 +9,27 @@ from typing import Tuple, Optional import math import pickle +import torch +import numpy as np +import wandb from time import time +from sklearn.preprocessing import StandardScaler from torch.utils.data import DataLoader from s2and.consts import PREPROCESSED_DATA_DIR from s2and.data import S2BlocksDataset from s2and.eval import b3_precision_recall_fscore from torch import Tensor -import torch -import numpy as np -import wandb +from torch.multiprocessing import Process from IPython import embed - # Default hyperparameters DEFAULT_HYPERPARAMS = { # Dataset "dataset": "pubmed", "dataset_random_seed": 1, - "subsample_sz_train": 80, - "subsample_sz_dev": 100, + "subsample_sz_train": 60, + "subsample_sz_dev": -1, # Run config "run_random_seed": 17, "pairwise_mode": False, @@ -56,11 +57,11 @@ "sdp_eps": 1e-3, "sdp_scale": True, # Training config - "batch_size": 10000, # pairwise only; used by e2e if gradient_accumulation is true - "lr": 4e-3, + "batch_size": 8000, # pairwise only; used by e2e if gradient_accumulation is true + "lr": 1e-3, "n_epochs": 5, "n_warmstart_epochs": 0, - "weighted_loss": False, + "weighted_loss": True, "use_lr_scheduler": True, "lr_scheduler": "plateau", # "plateau", "step" "lr_factor": 0.4, @@ -69,7 +70,7 @@ "lr_step_size": 2, "lr_gamma": 0.4, "weight_decay": 0.01, - "gradient_accumulation": False, # e2e only; accumulate over pairwise examples + "gradient_accumulation": True, # e2e only; accumulate over pairwise examples "dev_opt_metric": 'b3_f1', # e2e: {'b3_f1', 'vmeasure'}; pairwise: {'auroc', 'f1'} "overfit_batch_idx": -1 } @@ -83,25 +84,42 @@ def read_blockwise_features(pkl): def get_dataloaders(dataset, dataset_seed, convert_nan, nan_value, normalize, subsample_sz_train, subsample_sz_dev, - pairwise_mode, batch_size): - train_pkl = 
f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{dataset_seed}/train_features.pkl" - val_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{dataset_seed}/val_features.pkl" - test_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{dataset_seed}/test_features.pkl" + pairwise_mode, batch_size, shuffle=False, split=None): + pickle_path = { + 'train': f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{dataset_seed}/train_features.pkl", + 'dev': f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{dataset_seed}/val_features.pkl", + 'test': f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{dataset_seed}/test_features.pkl" + } + subsample_sz = { + 'train': subsample_sz_train, + 'dev': subsample_sz_dev, + 'test': -1 + } + train_scaler = StandardScaler() + train_X = np.concatenate(list(map(lambda x: x[0], read_blockwise_features(pickle_path['train']).values()))) + train_scaler.fit(train_X) - train_dataset = S2BlocksDataset(read_blockwise_features(train_pkl), convert_nan=convert_nan, nan_value=nan_value, - scale=normalize, subsample_sz=subsample_sz_train, pairwise_mode=pairwise_mode) - train_dataloader = DataLoader(train_dataset, shuffle=False, batch_size=batch_size) + def _get_dataloader(_split): + dataset = S2BlocksDataset(read_blockwise_features(pickle_path[_split]), convert_nan=convert_nan, + nan_value=nan_value, scale=normalize, scaler=train_scaler, + subsample_sz=subsample_sz[_split], + pairwise_mode=pairwise_mode, sort_desc=(_split in ['dev', 'test'])) + dataloader = DataLoader(dataset, shuffle=shuffle, batch_size=batch_size) + return dataloader - val_dataset = S2BlocksDataset(read_blockwise_features(val_pkl), convert_nan=convert_nan, nan_value=nan_value, - scale=normalize, scaler=train_dataset.scaler, subsample_sz=subsample_sz_dev, - pairwise_mode=pairwise_mode) - val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size) + if split is None: + return _get_dataloader('train'), _get_dataloader('dev'), _get_dataloader('test') + if type(split) is str: + return _get_dataloader(split) + if type(split) is list: + return tuple([_get_dataloader(_split) for _split in split]) + raise ValueError('Invalid argument to split') - test_dataset = S2BlocksDataset(read_blockwise_features(test_pkl), convert_nan=convert_nan, nan_value=nan_value, - scale=normalize, scaler=train_dataset.scaler, pairwise_mode=pairwise_mode) - test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size) - return train_dataloader, val_dataloader, test_dataloader +def get_feature_count(dataset, dataset_seed): + data_fpath = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{dataset_seed}/test_features.pkl" + block_dict = read_blockwise_features(data_fpath) + return next(iter(block_dict.values()))[0].shape[1] def uncompress_target_tensor(compressed_targets, make_symmetric=True, device=None): @@ -205,3 +223,118 @@ def copy_and_load_model(model, run_dir, device, store_only=False): _model.load_state_dict(_STATE_DICT) os.remove(_PATH) return _model + + +def _check_process(_proc, _return_dict, logger, run, overfit_batch_idx, use_lr_scheduler, hyp, + scheduler, eval_metric_to_idx, dev_opt_metric, i, best_epoch, best_dev_score, + best_dev_scores, best_dev_state_dict, sync=False): + if _proc is not None: + if _return_dict['_state'] == 'done' or (sync and _return_dict['_state'] != 'finish'): + _proc.join() + _return_dict['_state'] = 'finish' + if _return_dict['_method'] == 'init_eval': + logger.info(_return_dict['local']) + run.log(_return_dict['wandb']) + if overfit_batch_idx == -1: + best_dev_scores = _return_dict['dev_scores'] + best_dev_score = 
best_dev_scores[eval_metric_to_idx[dev_opt_metric]] + elif _return_dict['_method'] == 'dev_eval': + logger.info(_return_dict['local']) + run.log(_return_dict['wandb']) + if overfit_batch_idx > -1: + if use_lr_scheduler: + if hyp['lr_scheduler'] == 'plateau': + scheduler.step(_return_dict['train_scores'][eval_metric_to_idx[dev_opt_metric]]) + elif hyp['lr_scheduler'] == 'step': + scheduler.step() + else: + dev_scores = _return_dict['dev_scores'] + dev_opt_score = dev_scores[eval_metric_to_idx[dev_opt_metric]] + if dev_opt_score > best_dev_score: + logger.info(f"New best dev {dev_opt_metric} score @ epoch{i + 1}: {dev_opt_score}") + best_epoch = i + best_dev_score = dev_opt_score + best_dev_scores = dev_scores + best_dev_state_dict = torch.load(_return_dict['state_dict_path']) + if use_lr_scheduler: + if hyp['lr_scheduler'] == 'plateau': + scheduler.step(dev_scores[eval_metric_to_idx[dev_opt_metric]]) + elif hyp['lr_scheduler'] == 'step': + scheduler.step() + return best_epoch, best_dev_score, best_dev_scores, best_dev_state_dict + + +def init_eval(model_class, model_args, state_dict_path, overfit_batch_idx, eval_fn, train_dataloader, device, verbose, + debug, _errors, eval_metric_to_idx, val_dataloader, return_dict): + return_dict['_state'] = 'start' + return_dict['_method'] = 'init_eval' + model = model_class(*model_args) + model.load_state_dict(torch.load(state_dict_path)) + model.to(device) + with torch.no_grad(): + model.eval() + if overfit_batch_idx > -1: + train_scores = eval_fn(model, train_dataloader, overfit_batch_idx=overfit_batch_idx, + tqdm_label='train', device=device, verbose=verbose, debug=debug, + _errors=_errors, tqdm_position=0, model_args=model_args) + return_dict['local'] = f"Initial: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + \ + f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}" + return_dict['wandb'] = {'epoch': 0, f'train_{list(eval_metric_to_idx)[0]}': train_scores[0], + f'train_{list(eval_metric_to_idx)[1]}': train_scores[1]} + else: + dev_scores = eval_fn(model, val_dataloader, tqdm_label='dev 0', device=device, verbose=verbose, + debug=debug, _errors=_errors, tqdm_position=0, model_args=model_args) + return_dict['local'] = f"Initial: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + \ + f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}" + return_dict['wandb'] = {'epoch': 0, f'dev_{list(eval_metric_to_idx)[0]}': dev_scores[0], + f'dev_{list(eval_metric_to_idx)[1]}': dev_scores[1]} + return_dict['dev_scores'] = dev_scores + del model + return_dict['_state'] = 'done' + return return_dict + + +def dev_eval(model_class, model_args, state_dict_path, overfit_batch_idx, eval_fn, train_dataloader, device, verbose, + debug, _errors, eval_metric_to_idx, val_dataloader, return_dict, i): + return_dict['_state'] = 'start' + return_dict['_method'] = 'dev_eval' + return_dict['state_dict_path'] = state_dict_path + model = model_class(*model_args) + model.load_state_dict(torch.load(state_dict_path)) + model.to(device) + with torch.no_grad(): + model.eval() + if overfit_batch_idx > -1: + train_scores = eval_fn(model, train_dataloader, overfit_batch_idx=overfit_batch_idx, + tqdm_label='train', device=device, verbose=verbose, debug=debug, + _errors=_errors, model_args=model_args) + return_dict['local'] = f"Epoch {i + 1}: train_{list(eval_metric_to_idx)[0]}={train_scores[0]}, " + \ + f"train_{list(eval_metric_to_idx)[1]}={train_scores[1]}" + return_dict['wandb'] = {f'train_{list(eval_metric_to_idx)[0]}': train_scores[0], + 
f'train_{list(eval_metric_to_idx)[1]}': train_scores[1]} + return_dict['train_scores'] = train_scores + else: + dev_scores = eval_fn(model, val_dataloader, tqdm_label=f'dev {i + 1}', device=device, verbose=verbose, + debug=debug, _errors=_errors, model_args=model_args) + return_dict['local'] = f"Epoch {i + 1}: dev_{list(eval_metric_to_idx)[0]}={dev_scores[0]}, " + \ + f"dev_{list(eval_metric_to_idx)[1]}={dev_scores[1]}" + return_dict['wandb'] = {f'dev_{list(eval_metric_to_idx)[0]}': dev_scores[0], + f'dev_{list(eval_metric_to_idx)[1]}': dev_scores[1]} + return_dict['dev_scores'] = dev_scores + del model + return_dict['_state'] = 'done' + return return_dict + + +def fork_eval(target, args, model, run_dir, device, logger, sync=False): + state_dict_path = copy_and_load_model(model, run_dir, device, store_only=True) + args['model_class'] = model.__class__ + args['state_dict_path'] = state_dict_path + if sync: + target(**args) + proc = Process() + else: + proc = Process(target=target, kwargs=args) + logger.info('Forking eval') + proc.start() + return proc diff --git a/s2and/data.py b/s2and/data.py index 9d75eb1..745251b 100644 --- a/s2and/data.py +++ b/s2and/data.py @@ -128,7 +128,7 @@ class S2BlocksDataset(Dataset): """ def __init__(self, block_dict: Dict[str, Tuple[np.ndarray, np.ndarray, np.ndarray]], convert_nan=True, nan_value=-1, scale=False, scaler=None, subsample_sz=-1, - pairwise_mode=False): + pairwise_mode=False, sort_desc=False): self.pairwise_mode = pairwise_mode self.block_dict = block_dict self.convert_nan = convert_nan @@ -171,6 +171,11 @@ def __init__(self, block_dict: Dict[str, Tuple[np.ndarray, np.ndarray, np.ndarra else: self.blockwise_data.append((X, y, cluster_ids)) self.blockwise_keys.append(dict_key) + if sort_desc: + self.blockwise_keys = list(map(lambda x: x[1], sorted(enumerate(self.blockwise_keys), + key=lambda x: len(self.blockwise_data[x[0]][2]), + reverse=True))) + self.blockwise_data.sort(key=lambda x: -len(x[2])) if self.pairwise_mode: self.pairwise_data = {'X': [], 'y': []} self.cluster_ids = [] diff --git a/utils/parser.py b/utils/parser.py index f16f61f..de21db8 100644 --- a/utils/parser.py +++ b/utils/parser.py @@ -38,6 +38,9 @@ def add_preprocessing_args(self): parser.add_argument( "--dataset_name", type=str, help="name of AND dataset that you want to preprocess" ) + parser.add_argument( + "--dataset_seed", type=int + ) def add_training_args(self): """ @@ -94,7 +97,7 @@ def add_training_args(self): help="Whether to prevent wandb sweep early terminate or not", ) parser.add_argument( - "--wandb_max_runs", type=int, default=600, + "--wandb_max_runs", type=int, default=120, help="Maximum number of runs to try in the sweep", ) parser.add_argument( diff --git a/wandb_configs/sweeps/e2e-nosdp-warm.json b/wandb_configs/sweeps/e2e-nosdp-warm.json index 3846f0d..294b543 100644 --- a/wandb_configs/sweeps/e2e-nosdp-warm.json +++ b/wandb_configs/sweeps/e2e-nosdp-warm.json @@ -8,9 +8,9 @@ "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "subsample_sz_train": {"value": 80}, - "subsample_sz_dev": {"value": 100}, "activation": {"values": ["leaky_relu", "relu"]}, "use_sdp": {"value": false}, - "n_warmstart_epochs": {"value": 2} + "n_warmstart_epochs": {"value": 2}, + "gradient_accumulation": {"values": [true, false]}, + "weighted_loss": {"values": [true, false]} } diff --git a/wandb_configs/sweeps/e2e-nosdp.json b/wandb_configs/sweeps/e2e-nosdp.json index 4e02afe..5b47c39 100644 --- 
a/wandb_configs/sweeps/e2e-nosdp.json +++ b/wandb_configs/sweeps/e2e-nosdp.json @@ -8,8 +8,8 @@ "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "subsample_sz_train": {"value": 80}, - "subsample_sz_dev": {"value": 100}, "activation": {"values": ["leaky_relu", "relu"]}, - "use_sdp": {"value": false} + "use_sdp": {"value": false}, + "gradient_accumulation": {"values": [true, false]}, + "weighted_loss": {"values": [true, false]} } diff --git a/wandb_configs/sweeps/e2e-warm.json b/wandb_configs/sweeps/e2e-warm.json index 77de43c..19e511b 100644 --- a/wandb_configs/sweeps/e2e-warm.json +++ b/wandb_configs/sweeps/e2e-warm.json @@ -8,8 +8,8 @@ "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "subsample_sz_train": {"value": 80}, - "subsample_sz_dev": {"value": 100}, "activation": {"values": ["leaky_relu", "relu"]}, - "n_warmstart_epochs": {"value": 2} + "n_warmstart_epochs": {"value": 2}, + "gradient_accumulation": {"values": [true, false]}, + "weighted_loss": {"values": [true, false]} } diff --git a/wandb_configs/sweeps/e2e.json b/wandb_configs/sweeps/e2e.json index 20991ba..e084f00 100644 --- a/wandb_configs/sweeps/e2e.json +++ b/wandb_configs/sweeps/e2e.json @@ -8,7 +8,7 @@ "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "subsample_sz_train": {"value": 80}, - "subsample_sz_dev": {"value": 100}, - "activation": {"values": ["leaky_relu", "relu"]} + "activation": {"values": ["leaky_relu", "relu"]}, + "gradient_accumulation": {"values": [true, false]}, + "weighted_loss": {"values": [true, false]} } diff --git a/wandb_configs/sweeps/frac-nosdp-warm.json b/wandb_configs/sweeps/frac-nosdp-warm.json index 75503ce..491e04c 100644 --- a/wandb_configs/sweeps/frac-nosdp-warm.json +++ b/wandb_configs/sweeps/frac-nosdp-warm.json @@ -8,10 +8,10 @@ "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "subsample_sz_train": {"value": 80}, - "subsample_sz_dev": {"value": 100}, "activation": {"values": ["leaky_relu", "relu"]}, "use_rounded_loss": {"value": false}, "use_sdp": {"value": false}, - "n_warmstart_epochs": {"value": 2} + "n_warmstart_epochs": {"value": 2}, + "gradient_accumulation": {"values": [true, false]}, + "weighted_loss": {"values": [true, false]} } diff --git a/wandb_configs/sweeps/frac-nosdp.json b/wandb_configs/sweeps/frac-nosdp.json index f27ee08..d9a1e41 100644 --- a/wandb_configs/sweeps/frac-nosdp.json +++ b/wandb_configs/sweeps/frac-nosdp.json @@ -8,9 +8,9 @@ "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "subsample_sz_train": {"value": 80}, - "subsample_sz_dev": {"value": 100}, "activation": {"values": ["leaky_relu", "relu"]}, "use_rounded_loss": {"value": false}, - "use_sdp": {"value": false} + "use_sdp": {"value": false}, + "gradient_accumulation": {"values": [true, false]}, + "weighted_loss": {"values": [true, false]} } diff --git a/wandb_configs/sweeps/frac-warm.json b/wandb_configs/sweeps/frac-warm.json index fa4b935..b13efc5 100644 --- a/wandb_configs/sweeps/frac-warm.json +++ b/wandb_configs/sweeps/frac-warm.json @@ -8,9 +8,9 @@ "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - 
"subsample_sz_train": {"value": 80}, - "subsample_sz_dev": {"value": 100}, "activation": {"values": ["leaky_relu", "relu"]}, "use_rounded_loss": {"value": false}, - "n_warmstart_epochs": {"value": 2} + "n_warmstart_epochs": {"value": 2}, + "gradient_accumulation": {"values": [true, false]}, + "weighted_loss": {"values": [true, false]} } diff --git a/wandb_configs/sweeps/frac.json b/wandb_configs/sweeps/frac.json index 7eb6812..a572b76 100644 --- a/wandb_configs/sweeps/frac.json +++ b/wandb_configs/sweeps/frac.json @@ -8,8 +8,8 @@ "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "subsample_sz_train": {"value": 80}, - "subsample_sz_dev": {"value": 100}, "activation": {"values": ["leaky_relu", "relu"]}, - "use_rounded_loss": {"value": false} + "use_rounded_loss": {"value": false}, + "gradient_accumulation": {"values": [true, false]}, + "weighted_loss": {"values": [true, false]} } diff --git a/wandb_configs/sweeps/mlp.json b/wandb_configs/sweeps/mlp.json index a5f49fc..24274c7 100644 --- a/wandb_configs/sweeps/mlp.json +++ b/wandb_configs/sweeps/mlp.json @@ -10,5 +10,6 @@ "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, "activation": {"values": ["leaky_relu", "relu"]}, + "gradient_accumulation": {"value": false}, "weighted_loss": {"value": true} } From 69617d842773b56e39dfef985ab86f986e6a895c Mon Sep 17 00:00:00 2001 From: Dhruv Agarwal Date: Sat, 18 Mar 2023 14:44:53 -0400 Subject: [PATCH 08/17] Add gradient clipping with norm --- e2e_scripts/train.py | 4 ++++ e2e_scripts/train_utils.py | 1 + 2 files changed, 5 insertions(+) diff --git a/e2e_scripts/train.py b/e2e_scripts/train.py index a645257..eb0b483 100644 --- a/e2e_scripts/train.py +++ b/e2e_scripts/train.py @@ -493,6 +493,10 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g continue if pairwise_mode or ( idx == len(_train_dataloader.dataset) - 1) or grad_acc == 1 or grad_acc_count >= grad_acc: + if hyp["max_grad_norm"] != -1: + torch.nn.utils.clip_grad_norm_( + model.parameters(), hyp["max_grad_norm"] + ) optimizer.step() optimizer.zero_grad() if grad_acc > 1: diff --git a/e2e_scripts/train_utils.py b/e2e_scripts/train_utils.py index 0fe40c9..4083905 100644 --- a/e2e_scripts/train_utils.py +++ b/e2e_scripts/train_utils.py @@ -69,6 +69,7 @@ "lr_scheduler_patience": 2, "lr_step_size": 2, "lr_gamma": 0.4, + "max_grad_norm": 1, # Off if set to -1 "weight_decay": 0.01, "gradient_accumulation": True, # e2e only; accumulate over pairwise examples "dev_opt_metric": 'b3_f1', # e2e: {'b3_f1', 'vmeasure'}; pairwise: {'auroc', 'f1'} From d84ae96ba1536d43fd209c3656440c4eabc1cf05 Mon Sep 17 00:00:00 2001 From: Dhruv Agarwal Date: Sat, 18 Mar 2023 16:31:36 -0400 Subject: [PATCH 09/17] Set limit on parallel forks in parallel iterations --- e2e_scripts/evaluate.py | 145 ++++++++++++++++++++-------------------- 1 file changed, 73 insertions(+), 72 deletions(-) diff --git a/e2e_scripts/evaluate.py b/e2e_scripts/evaluate.py index 71ec3d6..f3d15bc 100644 --- a/e2e_scripts/evaluate.py +++ b/e2e_scripts/evaluate.py @@ -58,7 +58,7 @@ def _fork_iter(batch_idx, _fork_id, _shared_list, eval_fn, **kwargs): def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, clustering_threshold=None, val_dataloader=None, tqdm_label='', device=None, verbose=False, debug=False, _errors=None, run_dir='./', tqdm_position=None, model_args=None, return_iter=False, fork_size=500, - disable_tqdm=False): + 
max_parallel_forks=5, disable_tqdm=False): """ clustering_fn, clustering_threshold, val_dataloader: unused when pairwise_mode is False (only added to keep fn signature identical) @@ -96,41 +96,41 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, cluste # Only one signature in block; manually assign a unique cluster pred_cluster_ids = [max_pred_id + 1] elif fork_enabled and block_size >= fork_size: - _proc = _fork_iter(idx, _fork_id, _shared_list, evaluate, **fn_args) - _fork_id += 1 - _procs.append((_proc, block_size)) - continue - else: - # Forward pass through the e2e model - data = data.to(device) - try: - _ = model(data, block_size, verbose=verbose) - except CvxpyException as e: - logger.info(e) - _error_obj = { - 'id': f'e_{int(time())}', - 'method': 'eval', - 'model_type': 'e2e', - 'data_split': tqdm_label, - 'model_call_args': { - 'data': data.detach().tolist(), - 'block_size': block_size - }, - 'cvxpy_layer_args': e.data - } - if _errors is not None: - _errors.append(_error_obj) - save_to_wandb_run({'errors': _errors}, 'errors.json', run_dir, logger) - if not debug: # if tqdm_label is not 'dev' and not debug: - raise CvxpyException(data=_error_obj) - n_exceptions += 1 - logger.info(f'Caught CvxpyException {n_exceptions}: skipping batch') + if (len(_procs) - len(_shared_list)) < max_parallel_forks: + _proc = _fork_iter(idx, _fork_id, _shared_list, evaluate, **fn_args) + _fork_id += 1 + _procs.append((_proc, block_size)) continue - pred_cluster_ids = (model.hac_cut_layer.cluster_labels + (max_pred_id + 1)).tolist() - cc_obj_vals['round'].append(model.hac_cut_layer.objective_value) - cc_obj_vals['sdp'].append(model.sdp_layer.objective_value) - cc_obj_vals['block_idxs'].append(idx) - cc_obj_vals['block_sizes'].append(block_size) + # Forward pass through the e2e model + data = data.to(device) + try: + _ = model(data, block_size, verbose=verbose) + except CvxpyException as e: + logger.info(e) + _error_obj = { + 'id': f'e_{int(time())}', + 'method': 'eval', + 'model_type': 'e2e', + 'data_split': tqdm_label, + 'model_call_args': { + 'data': data.detach().tolist(), + 'block_size': block_size + }, + 'cvxpy_layer_args': e.data + } + if _errors is not None: + _errors.append(_error_obj) + save_to_wandb_run({'errors': _errors}, 'errors.json', run_dir, logger) + if not debug: # if tqdm_label is not 'dev' and not debug: + raise CvxpyException(data=_error_obj) + n_exceptions += 1 + logger.info(f'Caught CvxpyException {n_exceptions}: skipping batch') + continue + pred_cluster_ids = (model.hac_cut_layer.cluster_labels + (max_pred_id + 1)).tolist() + cc_obj_vals['round'].append(model.hac_cut_layer.objective_value) + cc_obj_vals['sdp'].append(model.sdp_layer.objective_value) + cc_obj_vals['block_idxs'].append(idx) + cc_obj_vals['block_sizes'].append(block_size) all_gold += list(np.reshape(cluster_ids, (block_size,))) max_pred_id = max(pred_cluster_ids) all_pred += list(pred_cluster_ids) @@ -167,7 +167,8 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, cluste def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", return_pred_only=False, thresh_for_f1=0.5, clustering_fn=None, clustering_threshold=None, val_dataloader=None, tqdm_label='', device=None, verbose=False, debug=False, _errors=None, run_dir='./', - tqdm_position=None, model_args=None, return_iter=False, fork_size=500, disable_tqdm=False): + tqdm_position=None, model_args=None, return_iter=False, fork_size=500, max_parallel_forks=5, + disable_tqdm=False): fn_args = 
locals() fork_enabled = fork_size > -1 and model_args is not None if fork_enabled: @@ -206,43 +207,43 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret # Only one signature in block; manually assign a unique cluster pred_cluster_ids = [max_pred_id + 1] elif fork_enabled and block_size >= fork_size and clustering_fn.__class__ is CCInference: - _proc = _fork_iter(idx, _fork_id, _shared_list, evaluate_pairwise, **fn_args) - _fork_id += 1 - _procs.append((_proc, block_size)) - continue - else: - # Forward pass through the e2e model - data = data.to(device) - try: - edge_weights = model(data, N=block_size, warmstart=True, verbose=verbose) - pred_cluster_ids = clustering_fn(edge_weights, block_size, min_id=(max_pred_id + 1), - threshold=clustering_threshold) - except CvxpyException as e: - logger.info(e) - _error_obj = { - 'id': f'e_{int(time())}', - 'method': 'eval', - 'model_type': 'pairwise_cc', - 'data_split': tqdm_label, - 'model_call_args': { - 'data': data.detach().tolist(), - 'block_size': block_size - }, - 'cvxpy_layer_args': e.data - } - if _errors is not None: - _errors.append(_error_obj) - save_to_wandb_run({'errors': _errors}, 'errors.json', run_dir, logger) - if not debug: # if tqdm_label is not 'dev' and not debug: - raise CvxpyException(data=_error_obj) - n_exceptions += 1 - logger.info(f'Caught CvxpyException {n_exceptions}: skipping batch') + if (len(_procs) - len(_shared_list)) < max_parallel_forks: + _proc = _fork_iter(idx, _fork_id, _shared_list, evaluate_pairwise, **fn_args) + _fork_id += 1 + _procs.append((_proc, block_size)) continue - if clustering_fn.__class__ is CCInference: - cc_obj_vals['round'].append(clustering_fn.hac_cut_layer.objective_value) - cc_obj_vals['sdp'].append(clustering_fn.sdp_layer.objective_value) - cc_obj_vals['block_idxs'].append(idx) - cc_obj_vals['block_sizes'].append(block_size) + # Forward pass through the e2e model + data = data.to(device) + try: + edge_weights = model(data, N=block_size, warmstart=True, verbose=verbose) + pred_cluster_ids = clustering_fn(edge_weights, block_size, min_id=(max_pred_id + 1), + threshold=clustering_threshold) + except CvxpyException as e: + logger.info(e) + _error_obj = { + 'id': f'e_{int(time())}', + 'method': 'eval', + 'model_type': 'pairwise_cc', + 'data_split': tqdm_label, + 'model_call_args': { + 'data': data.detach().tolist(), + 'block_size': block_size + }, + 'cvxpy_layer_args': e.data + } + if _errors is not None: + _errors.append(_error_obj) + save_to_wandb_run({'errors': _errors}, 'errors.json', run_dir, logger) + if not debug: # if tqdm_label is not 'dev' and not debug: + raise CvxpyException(data=_error_obj) + n_exceptions += 1 + logger.info(f'Caught CvxpyException {n_exceptions}: skipping batch') + continue + if clustering_fn.__class__ is CCInference: + cc_obj_vals['round'].append(clustering_fn.hac_cut_layer.objective_value) + cc_obj_vals['sdp'].append(clustering_fn.sdp_layer.objective_value) + cc_obj_vals['block_idxs'].append(idx) + cc_obj_vals['block_sizes'].append(block_size) all_gold += list(np.reshape(cluster_ids, (block_size,))) max_pred_id = max(pred_cluster_ids) all_pred += list(pred_cluster_ids) From 5bfa8a2b7257ffb0a876355ce4d517aca5892fef Mon Sep 17 00:00:00 2001 From: Dhruv Agarwal Date: Sat, 18 Mar 2023 16:54:31 -0400 Subject: [PATCH 10/17] Set limit on parallel forks in parallel iterations --- e2e_scripts/evaluate.py | 6 +++--- e2e_scripts/train_utils.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/e2e_scripts/evaluate.py 
b/e2e_scripts/evaluate.py index f3d15bc..340e14e 100644 --- a/e2e_scripts/evaluate.py +++ b/e2e_scripts/evaluate.py @@ -57,8 +57,8 @@ def _fork_iter(batch_idx, _fork_id, _shared_list, eval_fn, **kwargs): def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, clustering_threshold=None, val_dataloader=None, tqdm_label='', device=None, verbose=False, debug=False, _errors=None, - run_dir='./', tqdm_position=None, model_args=None, return_iter=False, fork_size=500, - max_parallel_forks=5, disable_tqdm=False): + run_dir='./', tqdm_position=None, model_args=None, return_iter=False, fork_size=300, + max_parallel_forks=4, disable_tqdm=False): """ clustering_fn, clustering_threshold, val_dataloader: unused when pairwise_mode is False (only added to keep fn signature identical) @@ -167,7 +167,7 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, cluste def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", return_pred_only=False, thresh_for_f1=0.5, clustering_fn=None, clustering_threshold=None, val_dataloader=None, tqdm_label='', device=None, verbose=False, debug=False, _errors=None, run_dir='./', - tqdm_position=None, model_args=None, return_iter=False, fork_size=500, max_parallel_forks=5, + tqdm_position=None, model_args=None, return_iter=False, fork_size=300, max_parallel_forks=4, disable_tqdm=False): fn_args = locals() fork_enabled = fork_size > -1 and model_args is not None diff --git a/e2e_scripts/train_utils.py b/e2e_scripts/train_utils.py index 4083905..617f064 100644 --- a/e2e_scripts/train_utils.py +++ b/e2e_scripts/train_utils.py @@ -29,7 +29,7 @@ "dataset": "pubmed", "dataset_random_seed": 1, "subsample_sz_train": 60, - "subsample_sz_dev": -1, + "subsample_sz_dev": 300, # Run config "run_random_seed": 17, "pairwise_mode": False, From b1bbb49e74c7232c1ceda0fd7ec5e05a06d1cc63 Mon Sep 17 00:00:00 2001 From: Dhruv Agarwal Date: Sat, 18 Mar 2023 17:00:38 -0400 Subject: [PATCH 11/17] Add log message for iteration fork --- e2e_scripts/evaluate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/e2e_scripts/evaluate.py b/e2e_scripts/evaluate.py index 340e14e..bfaf5e9 100644 --- a/e2e_scripts/evaluate.py +++ b/e2e_scripts/evaluate.py @@ -51,6 +51,7 @@ def _fork_iter(batch_idx, _fork_id, _shared_list, eval_fn, **kwargs): kwargs['device'] = 'cpu' kwargs['eval_fn'] = eval_fn _proc = Process(target=_run_iter, kwargs=kwargs) + logger.info('Forking eval iteration') _proc.start() return _proc From 87081d7908e8dc553fad31fab9e4407fa2d8fd84 Mon Sep 17 00:00:00 2001 From: Dhruv Agarwal Date: Mon, 20 Mar 2023 12:30:24 -0400 Subject: [PATCH 12/17] Larger datasets, hyperparameter search space, icml_final_eval (#42) --- add_agent.sh | 2 +- e2e_scripts/evaluate.py | 20 ++- e2e_scripts/train.py | 154 ++++++++++++++++------ e2e_scripts/train_utils.py | 3 +- get_wandb_results.py | 126 ++++++++++++++++++ rerun_batch.sh | 127 ++++++++++++++++++ rerun_best.sh | 19 +++ run_sweep.sh | 6 +- utils/parser.py | 4 + wandb_configs/sweeps/e2e-nosdp-warm.json | 8 +- wandb_configs/sweeps/e2e-nosdp.json | 6 +- wandb_configs/sweeps/e2e-warm.json | 8 +- wandb_configs/sweeps/e2e.json | 6 +- wandb_configs/sweeps/frac-nosdp-warm.json | 6 +- wandb_configs/sweeps/frac-nosdp.json | 6 +- wandb_configs/sweeps/frac-warm.json | 6 +- wandb_configs/sweeps/frac.json | 6 +- wandb_configs/sweeps/mlp.json | 6 +- 18 files changed, 441 insertions(+), 78 deletions(-) create mode 100644 get_wandb_results.py create mode 100644 rerun_batch.sh create mode 100644 rerun_best.sh diff --git 
a/add_agent.sh b/add_agent.sh index 13b93f5..eba0497 100644 --- a/add_agent.sh +++ b/add_agent.sh @@ -10,7 +10,7 @@ gpu_name=${6:-"gypsum-1080ti"} # "gypsum-1080ti" for ((i = 1; i <= ${n_agents}; i++)); do JOB_DESC=${model}_${dataset}_sweep${seed}-${i} && JOB_NAME=${JOB_DESC}_$(date +%s) && \ sbatch -J ${JOB_NAME} -e jobs/${JOB_NAME}.err -o jobs/${JOB_NAME}.log \ - --partition=${gpu_name} --gres=gpu:1 --mem=100G --time=12:00:00 \ + --partition=${gpu_name} --gres=gpu:1 --mem=120G --time=12:00:00 \ run_sbatch.sh e2e_scripts/train.py \ --dataset="${dataset}" \ --dataset_random_seed=${seed} \ diff --git a/e2e_scripts/evaluate.py b/e2e_scripts/evaluate.py index bfaf5e9..8f61c70 100644 --- a/e2e_scripts/evaluate.py +++ b/e2e_scripts/evaluate.py @@ -58,8 +58,8 @@ def _fork_iter(batch_idx, _fork_id, _shared_list, eval_fn, **kwargs): def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, clustering_threshold=None, val_dataloader=None, tqdm_label='', device=None, verbose=False, debug=False, _errors=None, - run_dir='./', tqdm_position=None, model_args=None, return_iter=False, fork_size=300, - max_parallel_forks=4, disable_tqdm=False): + run_dir='./', tqdm_position=None, model_args=None, return_iter=False, fork_size=500, + max_parallel_forks=3, disable_tqdm=False): """ clustering_fn, clustering_threshold, val_dataloader: unused when pairwise_mode is False (only added to keep fn signature identical) @@ -97,6 +97,7 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, cluste # Only one signature in block; manually assign a unique cluster pred_cluster_ids = [max_pred_id + 1] elif fork_enabled and block_size >= fork_size: + logger.info(f"Eval fork info: len(_procs)={len(_procs)}, len(_shared_list)={len(_shared_list)}") if (len(_procs) - len(_shared_list)) < max_parallel_forks: _proc = _fork_iter(idx, _fork_id, _shared_list, evaluate, **fn_args) _fork_id += 1 @@ -149,7 +150,11 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, cluste _procs.sort(key=lambda x: x[1]) # To visualize progress for _proc in tqdm(_procs, desc=f'Eval {tqdm_label} (waiting for forks to join)', position=tqdm_position): _proc[0].join() - assert len(_procs) == len(_shared_list), "All forked eval iterations did not return results" + try: + assert len(_procs) == len(_shared_list) + except: + logger.info("Error: All forked eval iterations did not return results") + raise ValueError("All forked eval iterations did not return results") for _data in _shared_list: pred_cluster_ids = (_data['cluster_labels'] + (max_pred_id + 1)).tolist() cc_obj_vals['round'].append(_data['round_objective_value']) @@ -168,7 +173,7 @@ def evaluate(model, dataloader, overfit_batch_idx=-1, clustering_fn=None, cluste def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", return_pred_only=False, thresh_for_f1=0.5, clustering_fn=None, clustering_threshold=None, val_dataloader=None, tqdm_label='', device=None, verbose=False, debug=False, _errors=None, run_dir='./', - tqdm_position=None, model_args=None, return_iter=False, fork_size=300, max_parallel_forks=4, + tqdm_position=None, model_args=None, return_iter=False, fork_size=500, max_parallel_forks=3, disable_tqdm=False): fn_args = locals() fork_enabled = fork_size > -1 and model_args is not None @@ -208,6 +213,7 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret # Only one signature in block; manually assign a unique cluster pred_cluster_ids = [max_pred_id + 1] elif fork_enabled and block_size 
>= fork_size and clustering_fn.__class__ is CCInference: + logger.info(f"Eval fork info: len(_procs)={len(_procs)}, len(_shared_list)={len(_shared_list)}") if (len(_procs) - len(_shared_list)) < max_parallel_forks: _proc = _fork_iter(idx, _fork_id, _shared_list, evaluate_pairwise, **fn_args) _fork_id += 1 @@ -262,7 +268,11 @@ def evaluate_pairwise(model, dataloader, overfit_batch_idx=-1, mode="macro", ret _procs.sort(key=lambda x: x[1]) # To visualize progress for _proc in tqdm(_procs, desc=f'Eval {tqdm_label} (waiting for forks to join)', position=tqdm_position): _proc[0].join() - assert len(_procs) == len(_shared_list), "All forked eval iterations did not return results" + try: + assert len(_procs) == len(_shared_list) + except: + logger.info("Error: All forked eval iterations did not return results") + raise ValueError("All forked eval iterations did not return results") for _data in _shared_list: pred_cluster_ids = (_data['cluster_labels'] + (max_pred_id + 1)).tolist() cc_obj_vals['round'].append(_data['round_objective_value']) diff --git a/e2e_scripts/train.py b/e2e_scripts/train.py index eb0b483..9bc1bc4 100644 --- a/e2e_scripts/train.py +++ b/e2e_scripts/train.py @@ -38,7 +38,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, group=None, save_model=False, load_model_from_wandb_run=None, load_model_from_fpath=None, eval_only_split=None, eval_all=False, skip_initial_eval=False, pairwise_eval_clustering=None, - debug=False, track_errors=True, local=False, sync_dev=False): + debug=False, track_errors=True, local=False, sync_dev=False, icml_final_eval=False): init_args = { 'config': DEFAULT_HYPERPARAMS } @@ -63,6 +63,27 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g with wandb.init(**init_args) as run: wandb.config.update(hyperparams, allow_val_change=True) hyp = wandb.config + + # Limit training epochs by dataset in e2e mode (for tractability) + max_epochs_by_dataset = { + 'e2e': { + 'aminer': 3, + 'kisti': 3, + 'arnetminer': 5 + }, + 'nosdp': { + 'aminer': 3, + 'kisti': 3, + 'arnetminer': 5 + } + } + n_epochs_override = None + if not hyp['pairwise_mode']: + _training_method = 'e2e' if hyp['use_sdp'] else 'nosdp' + if hyp['dataset'] in max_epochs_by_dataset[_training_method]: + n_epochs_override = max_epochs_by_dataset[_training_method][hyp['dataset']] + logger.info(f'Limiting number of epochs from {hyp["n_epochs"]} to {n_epochs_override}') + logger.info("Run hyperparameters:") logger.info(hyp) # Save hyperparameters as a json file and store in wandb run @@ -82,7 +103,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g use_rounded_loss = hyp["use_rounded_loss"] e2e_loss = hyp['e2e_loss'] batch_size = hyp['batch_size'] if pairwise_mode else 1 # Force clustering runs to operate on 1 block only - n_epochs = hyp['n_epochs'] + n_epochs = n_epochs_override if n_epochs_override is not None else hyp['n_epochs'] n_warmstart_epochs = hyp['n_warmstart_epochs'] use_lr_scheduler = hyp['use_lr_scheduler'] hidden_dim = hyp["hidden_dim"] @@ -282,7 +303,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g hyp["subsample_sz_dev"], pairwise_mode, batch_size, split=eval_only_split) eval_scores = eval_fn(model, eval_dataloader, tqdm_label=eval_only_split, device=device, verbose=verbose, - debug=debug, _errors=_errors, model_args=model_args) + debug=debug, _errors=_errors, model_args=model_args, run_dir=run.dir) logger.info(f"Eval: 
{eval_only_split}_{list(eval_metric_to_idx)[0]}={eval_scores[0]}, " + f"{eval_only_split}_{list(eval_metric_to_idx)[1]}={eval_scores[1]}") # Log eval metrics @@ -300,7 +321,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g clustering_threshold=clustering_threshold, val_dataloader=val_dataloader_e2e, tqdm_label='test clustering', device=device, verbose=verbose, - debug=debug, _errors=_errors, model_args=model_args) + debug=debug, _errors=_errors, model_args=model_args, run_dir=run.dir) if pairwise_clustering_fn.__class__ is HACInference: clustering_threshold = pairwise_clustering_fn.cut_threshold logger.info( @@ -558,43 +579,96 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g # Evaluate the best dev model on test if overfit_batch_idx == -1: model.load_state_dict(best_dev_state_dict) - with torch.no_grad(): - model.eval() - test_scores = eval_fn(model, test_dataloader, tqdm_label='test', device=device, verbose=verbose, - debug=debug, _errors=_errors, tqdm_position=2, model_args=model_args) - logger.info(f"Final: test_{list(eval_metric_to_idx)[0]}={test_scores[0]}, " + - f"test_{list(eval_metric_to_idx)[1]}={test_scores[1]}") - # Log final metrics - wandb.log({'best_dev_epoch': best_epoch + 1, - f'best_dev_{list(eval_metric_to_idx)[0]}': best_dev_scores[0], - f'best_dev_{list(eval_metric_to_idx)[1]}': best_dev_scores[1], - f'best_test_{list(eval_metric_to_idx)[0]}': test_scores[0], - f'best_test_{list(eval_metric_to_idx)[1]}': test_scores[1]}) - if len(test_scores) == 3: - log_cc_objective_values(scores=test_scores, split_name='best_test', log_prefix='Final', - verbose=True, logger=logger) - # For pairwise-mode: - if pairwise_clustering_fns[0] is not None: + + if icml_final_eval: + # Run all inference variants on the test set and exit + cc_inference_sdp = CCInference(sdp_max_iters, sdp_eps, sdp_scale, use_sdp=True) + cc_inference_nosdp = CCInference(sdp_max_iters, sdp_eps, sdp_scale, use_sdp=False) + inference_fns = [HACInference(), + cc_inference_sdp, cc_inference_sdp, + cc_inference_nosdp, cc_inference_nosdp] + inference_fn_labels = ['hac', + 'cc', 'cc-fixed', + 'cc-nosdp', 'cc-nosdp-fixed'] + cc_inference_sdp.eval() + cc_inference_nosdp.eval() + val_dataloader_e2e, test_dataloader_e2e = get_dataloaders(hyp["dataset"], + hyp["dataset_random_seed"], + hyp["convert_nan"], + hyp["nan_value"], + hyp["normalize_data"], + hyp["subsample_sz_train"], + hyp["subsample_sz_dev"], + pairwise_mode=False, batch_size=1, + split=['dev', 'test']) + inf_start_time = time.time() + with torch.no_grad(): + model.eval() clustering_threshold = None - for i, pairwise_clustering_fn in enumerate(pairwise_clustering_fns): - clustering_scores = eval_fn(model, test_dataloader_e2e, - clustering_fn=pairwise_clustering_fn, - clustering_threshold=clustering_threshold, - val_dataloader=val_dataloader_e2e, - tqdm_label='test clustering', device=device, verbose=verbose, - debug=debug, _errors=_errors, tqdm_position=2, - model_args=model_args) - if pairwise_clustering_fn.__class__ is HACInference: - clustering_threshold = pairwise_clustering_fn.cut_threshold - logger.info(f"Final: test_{list(clustering_metrics)[0]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[0]}, " + - f"test_{list(clustering_metrics)[1]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[1]}") - # Log final metrics - wandb.log({f'best_test_{list(clustering_metrics)[0]}_{pairwise_clustering_fn_labels[i]}': clustering_scores[0], - 
f'best_test_{list(clustering_metrics)[1]}_{pairwise_clustering_fn_labels[i]}': clustering_scores[1]}) + for i, inference_fn in enumerate(inference_fns): + logger.info(f'Inference method: {inference_fn_labels[i]}') + clustering_scores = evaluate_pairwise(model, test_dataloader_e2e, + clustering_fn=inference_fn, + clustering_threshold=clustering_threshold if i % 2 == 0 else None, + val_dataloader=val_dataloader_e2e, + tqdm_label='test clustering', device=device, + verbose=verbose, + debug=debug, _errors=_errors, model_args=model_args) + if inference_fn.__class__ is HACInference: + clustering_threshold = inference_fn.cut_threshold + logger.info( + f"Eval: test_{list(clustering_metrics)[0]}_{inference_fn_labels[i]}={clustering_scores[0]}, " + + f"test_{list(clustering_metrics)[1]}_{inference_fn_labels[i]}={clustering_scores[1]}") + # Log eval metrics + wandb.log({f'best_test_{list(clustering_metrics)[0]}_{inference_fn_labels[i]}': + clustering_scores[0], + f'best_test_{list(clustering_metrics)[1]}_{inference_fn_labels[i]}': + clustering_scores[1]}) if len(clustering_scores) == 3: log_cc_objective_values(scores=clustering_scores, - split_name=f'best_test_{pairwise_clustering_fn_labels[i]}', - log_prefix='Final', verbose=True, logger=logger) + split_name=f'best_test_{inference_fn_labels[i]}', + log_prefix='Eval', verbose=verbose, logger=logger) + inf_end_time = time.time() + run.summary["z_inf_time"] = round(inf_end_time - inf_start_time) + else: + with torch.no_grad(): + model.eval() + test_scores = eval_fn(model, test_dataloader, tqdm_label='test', device=device, verbose=verbose, + debug=debug, _errors=_errors, tqdm_position=2, model_args=model_args, + run_dir=run.dir) + logger.info(f"Final: test_{list(eval_metric_to_idx)[0]}={test_scores[0]}, " + + f"test_{list(eval_metric_to_idx)[1]}={test_scores[1]}") + # Log final metrics + wandb.log({'best_dev_epoch': best_epoch + 1, + f'best_dev_{list(eval_metric_to_idx)[0]}': best_dev_scores[0], + f'best_dev_{list(eval_metric_to_idx)[1]}': best_dev_scores[1], + f'best_test_{list(eval_metric_to_idx)[0]}': test_scores[0], + f'best_test_{list(eval_metric_to_idx)[1]}': test_scores[1]}) + if len(test_scores) == 3: + log_cc_objective_values(scores=test_scores, split_name='best_test', log_prefix='Final', + verbose=True, logger=logger) + # For pairwise-mode: + if pairwise_clustering_fns[0] is not None: + clustering_threshold = None + for i, pairwise_clustering_fn in enumerate(pairwise_clustering_fns): + clustering_scores = eval_fn(model, test_dataloader_e2e, + clustering_fn=pairwise_clustering_fn, + clustering_threshold=clustering_threshold, + val_dataloader=val_dataloader_e2e, + tqdm_label='test clustering', device=device, verbose=verbose, + debug=debug, _errors=_errors, tqdm_position=2, + model_args=model_args, run_dir=run.dir) + if pairwise_clustering_fn.__class__ is HACInference: + clustering_threshold = pairwise_clustering_fn.cut_threshold + logger.info(f"Final: test_{list(clustering_metrics)[0]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[0]}, " + + f"test_{list(clustering_metrics)[1]}_{pairwise_clustering_fn_labels[i]}={clustering_scores[1]}") + # Log final metrics + wandb.log({f'best_test_{list(clustering_metrics)[0]}_{pairwise_clustering_fn_labels[i]}': clustering_scores[0], + f'best_test_{list(clustering_metrics)[1]}_{pairwise_clustering_fn_labels[i]}': clustering_scores[1]}) + if len(clustering_scores) == 3: + log_cc_objective_values(scores=clustering_scores, + split_name=f'best_test_{pairwise_clustering_fn_labels[i]}', + 
log_prefix='Final', verbose=True, logger=logger) run.summary["z_model_parameters"] = count_parameters(model) run.summary["z_run_time"] = round(end_time - start_time) @@ -676,6 +750,7 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g sweep_id = wandb.sweep(sweep=sweep_config, project=args['wandb_project'], entity=args['wandb_entity']) + logger.info(f"SWEEP_ID={sweep_id}") # Start sweep job wandb.agent(sweep_id, @@ -722,5 +797,6 @@ def train(hyperparams={}, verbose=False, project=None, entity=None, tags=None, g debug=args['debug'], track_errors=not args['no_error_tracking'], local=args['local'], - sync_dev=args['sync_dev']) + sync_dev=args['sync_dev'], + icml_final_eval=args['icml_final_eval']) logger.info("End of run") diff --git a/e2e_scripts/train_utils.py b/e2e_scripts/train_utils.py index 617f064..d94701a 100644 --- a/e2e_scripts/train_utils.py +++ b/e2e_scripts/train_utils.py @@ -4,6 +4,7 @@ import copy import os import json +import random from collections import defaultdict from typing import Dict from typing import Tuple, Optional @@ -216,7 +217,7 @@ def __call__(self, input: Tensor, target: Tensor) -> Tensor: def copy_and_load_model(model, run_dir, device, store_only=False): _model = copy.deepcopy(model) - _PATH = os.path.join(run_dir, f'_temp_state_dict_{int(time())}.pt') + _PATH = os.path.join(run_dir, f'_temp_state_dict_{int(time())}-{random.randint(0, 100)}.pt') torch.save(model.state_dict(), _PATH) if store_only: return _PATH diff --git a/get_wandb_results.py b/get_wandb_results.py new file mode 100644 index 0000000..eda0eb2 --- /dev/null +++ b/get_wandb_results.py @@ -0,0 +1,126 @@ +import argparse +import json +import logging +import csv +from copy import deepcopy +import numpy as np +import pandas as pd + +from IPython import embed + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) +logger = logging.getLogger(__name__) + + +class Parser(argparse.ArgumentParser): + def __init__(self): + super().__init__() + self.add_argument( + "--data_fpath", type=str + ) + self.add_argument( + "--interactive", action="store_true", + ) + self.add_argument( + "--get_b3_f1_across", action="store_true", + ) + + +def get_df_by_dataset(res, dataset): + new_res = {} + for _r in res: + if dataset in _r: + new_res[_r.replace(f"{dataset}_", '')] = res[_r] + return pd.DataFrame(new_res).T + +if __name__ == '__main__': + parser = Parser() + args = parser.parse_args() + logger.info("Script arguments:") + logger.info(args.__dict__) + + if args.data_fpath is not None: + fpath = args.data_fpath + else: + # hardcoded during dev + fpath = 'wandb_export_2023-03-19T14_30_08.659-04_00.csv' + + results = [] + with open(fpath, mode='r') as csv_file: + csv_reader = csv.DictReader(csv_file) + line_count = 0 + for row in csv_reader: + if line_count == 0: + print(f'Column names are {", ".join(row)}') + else: + results.append(deepcopy(row)) + line_count += 1 + print(f'Processed {line_count} lines.') + + final = {} + out_keys = { + 'train_time': 'z_run_time', + 'inf_time': 'z_inf_time', + 'b3_f1_hac': 'best_test_b3_f1_hac', + 'b3_f1_cc': 'best_test_b3_f1_cc', + 'b3_f1_cc-fixed': 'best_test_b3_f1_cc-fixed', + 'b3_f1_cc-nosdp': 'best_test_b3_f1_cc-nosdp', + 'b3_f1_cc-nosdp-fixed': 'best_test_b3_f1_cc-nosdp-fixed', + 'vmeasure_hac': 'best_test_vmeasure_hac', + 'vmeasure_cc': 'best_test_vmeasure_cc', + 'vmeasure_cc-fixed': 'best_test_vmeasure_cc-fixed', + 'vmeasure_cc-nosdp': 
'best_test_vmeasure_cc-nosdp', + 'vmeasure_cc-nosdp-fixed': 'best_test_vmeasure_cc-nosdp-fixed' + } + + for r in results: + try: + method = f"{'mlp' if r['pairwise_mode']=='true' else 'e2e'}" + if r['pairwise_mode'] == 'false': + method += f"{'_nosdp' if r['use_sdp']=='false' else ''}" + method += f"{'_round' if r['use_rounded_loss'] == 'true' else '_frac'}" + key = f"{r['dataset']}_{method}" + + if key not in final: + final[key] = {o: [] for o in out_keys.keys()} + + for _key in out_keys: + final[key][_key].append(float(r[out_keys[_key]])) + except: + continue + + means, stds, comb = {}, {}, {} + for k in final: + if k is not means: + means[k] = {} + stds[k] = {} + comb[k] = {} + for _k in final[k]: + means[k][_k] = round(np.mean(final[k][_k])*(1 if 'time' in _k else 100), 2) + stds[k][_k] = round(np.std(final[k][_k])*(1 if 'time' in _k else 100), 2) + comb[k][_k] = f"{means[k][_k]}±{stds[k][_k]}" + + with open('results-mean.json', 'w') as fh: + json.dump(means, fh) + with open('results-std.json', 'w') as fh: + json.dump(stds, fh) + with open('results.json', 'w') as fh: + json.dump(comb, fh) + + res_df = pd.DataFrame(comb) + + if args.get_b3_f1_across: + # Average b3_f1 numbers of each training method over all inference methods + print() + print() + mean_dfs = {} + for d in ['pubmed', 'qian', 'zbmath', 'arnetminer', 'kisti']: + print(f'Dataset: {d}') + mean_dfs[d] = get_df_by_dataset(means, d).T[ + ['b3_f1_hac', 'b3_f1_cc', 'b3_f1_cc-fixed', 'b3_f1_cc-nosdp', 'b3_f1_cc-nosdp-fixed']].T.mean() + print(mean_dfs[d]) + print() + + if args.interactive: + embed() diff --git a/rerun_batch.sh b/rerun_batch.sh new file mode 100644 index 0000000..0d91868 --- /dev/null +++ b/rerun_batch.sh @@ -0,0 +1,127 @@ +#!/bin/bash -e + +sh rerun_best.sh gffw3aq7 gypsum-1080ti +sh rerun_best.sh 88qaxovr gypsum-1080ti +sh rerun_best.sh c4f4u06r gypsum-1080ti +sh rerun_best.sh tuojhxl9 gypsum-1080ti +sh rerun_best.sh 896wsqzi gypsum-1080ti +sh rerun_best.sh ugfvzuu3 gypsum-1080ti +sh rerun_best.sh xgovhwp2 gypsum-1080ti +sh rerun_best.sh xwei54ka gypsum-1080ti +sh rerun_best.sh ehg8oouh gypsum-1080ti +sh rerun_best.sh hbbp5yk5 gypsum-1080ti +sh rerun_best.sh 8r0o10am gypsum-1080ti +sh rerun_best.sh jlrho35c gypsum-1080ti +sh rerun_best.sh 7fq9ubkl gypsum-1080ti +sh rerun_best.sh 9li0p2xf gypsum-1080ti +sh rerun_best.sh 3v83ldl4 gypsum-1080ti +sh rerun_best.sh 8gmw28xf gypsum-1080ti +sh rerun_best.sh 5mobobvf gypsum-1080ti +sh rerun_best.sh w4lo7mic gypsum-1080ti +sh rerun_best.sh inlrt56m gypsum-1080ti +sh rerun_best.sh 5841jp68 gypsum-1080ti +sh rerun_best.sh 44ghc7aa gypsum-titanx +sh rerun_best.sh c36kghyo gypsum-titanx +sh rerun_best.sh 6w7t2y5m gypsum-titanx +sh rerun_best.sh bo5ww9oj gypsum-titanx +sh rerun_best.sh i1g1bwuz gypsum-titanx +sh rerun_best.sh vdxqpisp gypsum-titanx +sh rerun_best.sh sc4xc4lq gypsum-titanx +sh rerun_best.sh 41uylhgc gypsum-titanx +sh rerun_best.sh ellbgtzj gypsum-titanx +sh rerun_best.sh 20j5pp3p gypsum-titanx +sh rerun_best.sh mqwfys78 gypsum-titanx +sh rerun_best.sh 4cl8lvl5 gypsum-titanx +sh rerun_best.sh jhnlrb9b gypsum-titanx +sh rerun_best.sh d8gybu3j gypsum-titanx +sh rerun_best.sh i13k9nhb gypsum-titanx +sh rerun_best.sh yfc5xfq6 gypsum-titanx +sh rerun_best.sh by24aayn gypsum-titanx +sh rerun_best.sh ubiwtwso gypsum-titanx +sh rerun_best.sh o0y4csbo gypsum-titanx +sh rerun_best.sh wntemai3 gypsum-titanx +sh rerun_best.sh nmtlv76s gypsum-2080ti +sh rerun_best.sh prz43ogk gypsum-2080ti +sh rerun_best.sh 2edwecpz gypsum-2080ti +sh rerun_best.sh th5hl878 gypsum-2080ti +sh 
rerun_best.sh wtrrazuk gypsum-2080ti +sh rerun_best.sh oa404d8x gypsum-2080ti +sh rerun_best.sh 1ke4vxc7 gypsum-2080ti +sh rerun_best.sh s6rozj6y gypsum-2080ti +sh rerun_best.sh mz0gjtgm gypsum-2080ti +sh rerun_best.sh hodwzmv2 gypsum-2080ti +sh rerun_best.sh cxhocbc9 gypsum-2080ti +sh rerun_best.sh grajc9xd gypsum-2080ti +sh rerun_best.sh 8yin7z6k gypsum-2080ti +sh rerun_best.sh ykpeo4zt gypsum-2080ti +sh rerun_best.sh xpbybund gypsum-2080ti +sh rerun_best.sh j55f9ppp gypsum-2080ti +sh rerun_best.sh wuu45zhi gypsum-2080ti +sh rerun_best.sh f5t9ge27 gypsum-2080ti +sh rerun_best.sh kboyitfu gypsum-2080ti +sh rerun_best.sh s8v1grpa gypsum-2080ti +sh rerun_best.sh 2l8mjiei gypsum-m40 +sh rerun_best.sh ztng9hxr gypsum-m40 +sh rerun_best.sh 44dgz6e7 gypsum-m40 +sh rerun_best.sh 4uza846x gypsum-m40 +sh rerun_best.sh p7q0x2x4 gypsum-m40 +sh rerun_best.sh xqts82x9 gypsum-m40 +sh rerun_best.sh 85coxdiq gypsum-m40 +sh rerun_best.sh 4zlgu03n gypsum-m40 +sh rerun_best.sh 5y0yeyil gypsum-m40 +sh rerun_best.sh 33v38tro gypsum-m40 +sh rerun_best.sh 3cphu97j gypsum-m40 +sh rerun_best.sh rseqi816 gypsum-m40 +sh rerun_best.sh a4xdafqa gypsum-m40 +sh rerun_best.sh kxv70u0z gypsum-m40 +sh rerun_best.sh vjdcqev1 gypsum-m40 +sh rerun_best.sh fmx2rqe7 gypsum-m40 +sh rerun_best.sh ehrfeu8f gypsum-m40 +sh rerun_best.sh 3qja957g gypsum-m40 +sh rerun_best.sh y5lbmu6d gypsum-m40 +sh rerun_best.sh s21w56en gypsum-m40 +sh rerun_best.sh g40syomc gypsum-titanx +sh rerun_best.sh nucn0flw gypsum-titanx +sh rerun_best.sh j8211otn gypsum-titanx +sh rerun_best.sh 3ylnj3zg gypsum-titanx +sh rerun_best.sh 63pa6vn8 gypsum-titanx +sh rerun_best.sh ukvsewnh gypsum-titanx +sh rerun_best.sh hc3f7qsd gypsum-titanx +sh rerun_best.sh 37e6x0rx gypsum-titanx +sh rerun_best.sh nu08k76t gypsum-titanx +sh rerun_best.sh 8h4rjiok gypsum-titanx +sh rerun_best.sh z81s4dat gypsum-titanx +sh rerun_best.sh c81jlxii gypsum-titanx +sh rerun_best.sh kmidsylz gypsum-titanx +sh rerun_best.sh 8t6zp873 gypsum-titanx +sh rerun_best.sh rsve7a2h gypsum-titanx +sh rerun_best.sh sefe99yi gypsum-titanx +sh rerun_best.sh 6r1frbt4 gypsum-titanx +sh rerun_best.sh 0gs1obh1 gypsum-titanx +sh rerun_best.sh cspdnl7j gypsum-titanx +sh rerun_best.sh 8hcrk3n9 gypsum-titanx +sh rerun_best.sh c6drhs6a gypsum-m40 +sh rerun_best.sh ym7mdlep gypsum-m40 +sh rerun_best.sh wioahicm gypsum-m40 +sh rerun_best.sh z9k6elm0 gypsum-m40 +sh rerun_best.sh t74hzmfa gypsum-m40 +sh rerun_best.sh 7sgzno3w gypsum-m40 +sh rerun_best.sh y8ckivlk gypsum-m40 +sh rerun_best.sh 5qwq07l6 gypsum-m40 +sh rerun_best.sh uc4jgv30 gypsum-m40 +sh rerun_best.sh tziwf98r gypsum-m40 +sh rerun_best.sh 0zof37l4 gypsum-m40 +sh rerun_best.sh gn8osqi9 gypsum-m40 +sh rerun_best.sh 3j1zpwd2 gypsum-m40 +sh rerun_best.sh mcw4dk3x gypsum-m40 +sh rerun_best.sh d4kudukt gypsum-m40 +sh rerun_best.sh mi55qs26 gypsum-m40 +sh rerun_best.sh 1j7867tv gypsum-m40 +sh rerun_best.sh wb3de1t8 gypsum-m40 +sh rerun_best.sh 3w093bm5 gypsum-m40 +sh rerun_best.sh 1daqvigq gypsum-m40 +sh rerun_best.sh ymt0dxxd gypsum-m40 +sh rerun_best.sh ue2i5chg gypsum-m40 +sh rerun_best.sh 2vj2gvkc gypsum-m40 +sh rerun_best.sh 08uafgmw gypsum-m40 +sh rerun_best.sh 2ptqopi7 gypsum-m40 diff --git a/rerun_best.sh b/rerun_best.sh new file mode 100644 index 0000000..6549e38 --- /dev/null +++ b/rerun_best.sh @@ -0,0 +1,19 @@ +#!/bin/bash -e + +entity="dhdhagar" +project="prob-ent-resolution" +run_id=${1} +gpu_name=${2:-"gypsum-1080ti"} +run_tag=${3:-"icml_rebut_best"} + +JOB_DESC=rerun_${run_id} && JOB_NAME=${JOB_DESC}_$(date +%s) && \ + sbatch -J 
${JOB_NAME} -e jobs/${JOB_NAME}.err -o jobs/${JOB_NAME}.log \ + --partition=${gpu_name} --gres=gpu:1 --mem=120G --time=4:00:00 \ + run_sbatch.sh e2e_scripts/train.py \ + --load_hyp_from_wandb_run="${entity}/${project}/${run_id}" \ + --icml_final_eval \ + --skip_initial_eval \ + --silent \ + --wandb_tags="${run_tag},${run_id}" \ + --save_model + echo " Logs: jobs/${JOB_NAME}.err" diff --git a/run_sweep.sh b/run_sweep.sh index c635c0f..926e3ea 100644 --- a/run_sweep.sh +++ b/run_sweep.sh @@ -10,15 +10,15 @@ sweep_prefix=${6:-""} for ((i = ${n_seed_start}; i <= ${n_seed_end}; i++)); do JOB_DESC=${model}_${dataset}_sweep${i} && JOB_NAME=${JOB_DESC}_$(date +%s) && \ sbatch -J ${JOB_NAME} -e jobs/${JOB_NAME}.err -o jobs/${JOB_NAME}.log \ - --partition=${gpu_name} --gres=gpu:1 --mem=100G --time=12:00:00 \ + --partition=${gpu_name} --gres=gpu:1 --mem=120G --time=12:00:00 \ run_sbatch.sh e2e_scripts/train.py \ --dataset="${dataset}" \ --dataset_random_seed=${i} \ --pairwise_eval_clustering="both" \ --skip_initial_eval \ --silent \ - --wandb_sweep_name="${sweep_prefix}${model}_${dataset}_${i}" \ + --wandb_sweep_name="${sweep_prefix}_${model}_${dataset}_${i}" \ --wandb_sweep_params="wandb_configs/sweeps/${model}.json" \ - --wandb_tags="${model},${dataset},seed_${i}" + --wandb_tags="${model},${dataset},seed_${i},${sweep_prefix}" echo " Logs: jobs/${JOB_NAME}.err" done diff --git a/utils/parser.py b/utils/parser.py index de21db8..535f1af 100644 --- a/utils/parser.py +++ b/utils/parser.py @@ -157,3 +157,7 @@ def add_training_args(self): "--sync_dev", action="store_true", help="Whether to force dev evaluations to run synchronously", ) + parser.add_argument( + "--icml_final_eval", action="store_true", + help="ICML REBUTTAL ONLY: Run all eval after training", + ) diff --git a/wandb_configs/sweeps/e2e-nosdp-warm.json b/wandb_configs/sweeps/e2e-nosdp-warm.json index 294b543..c4f22cf 100644 --- a/wandb_configs/sweeps/e2e-nosdp-warm.json +++ b/wandb_configs/sweeps/e2e-nosdp-warm.json @@ -1,16 +1,16 @@ { "n_epochs": {"value": 10}, - "lr": {"max": 2e-1, "min": 1e-5}, + "lr": {"max": 7e-2, "min": 1e-5}, "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, "dev_opt_metric": {"value": "b3_f1"}, "neumiss_depth": {"values": [10, 20]}, - "hidden_dim": {"values": [512, 1024]}, + "hidden_dim": {"values": [256, 512]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "activation": {"values": ["leaky_relu", "relu"]}, + "activation": {"values": ["leaky_relu"]}, "use_sdp": {"value": false}, "n_warmstart_epochs": {"value": 2}, "gradient_accumulation": {"values": [true, false]}, "weighted_loss": {"values": [true, false]} -} +} \ No newline at end of file diff --git a/wandb_configs/sweeps/e2e-nosdp.json b/wandb_configs/sweeps/e2e-nosdp.json index 5b47c39..1c29da8 100644 --- a/wandb_configs/sweeps/e2e-nosdp.json +++ b/wandb_configs/sweeps/e2e-nosdp.json @@ -1,14 +1,14 @@ { "n_epochs": {"value": 10}, - "lr": {"max": 2e-1, "min": 1e-5}, + "lr": {"max": 7e-2, "min": 1e-5}, "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, "dev_opt_metric": {"value": "b3_f1"}, "neumiss_depth": {"values": [10, 20]}, - "hidden_dim": {"values": [512, 1024]}, + "hidden_dim": {"values": [256, 512]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "activation": {"values": ["leaky_relu", "relu"]}, + "activation": {"values": ["leaky_relu"]}, "use_sdp": {"value": false}, 
"gradient_accumulation": {"values": [true, false]}, "weighted_loss": {"values": [true, false]} diff --git a/wandb_configs/sweeps/e2e-warm.json b/wandb_configs/sweeps/e2e-warm.json index 19e511b..8f68ae6 100644 --- a/wandb_configs/sweeps/e2e-warm.json +++ b/wandb_configs/sweeps/e2e-warm.json @@ -1,15 +1,15 @@ { "n_epochs": {"value": 10}, - "lr": {"max": 2e-1, "min": 1e-5}, + "lr": {"max": 7e-2, "min": 1e-5}, "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, "dev_opt_metric": {"value": "b3_f1"}, "neumiss_depth": {"values": [10, 20]}, - "hidden_dim": {"values": [512, 1024]}, + "hidden_dim": {"values": [256, 512]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "activation": {"values": ["leaky_relu", "relu"]}, + "activation": {"values": ["leaky_relu"]}, "n_warmstart_epochs": {"value": 2}, "gradient_accumulation": {"values": [true, false]}, "weighted_loss": {"values": [true, false]} -} +} \ No newline at end of file diff --git a/wandb_configs/sweeps/e2e.json b/wandb_configs/sweeps/e2e.json index e084f00..b7948aa 100644 --- a/wandb_configs/sweeps/e2e.json +++ b/wandb_configs/sweeps/e2e.json @@ -1,14 +1,14 @@ { "n_epochs": {"value": 10}, - "lr": {"max": 2e-1, "min": 1e-5}, + "lr": {"max": 7e-2, "min": 1e-5}, "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, "dev_opt_metric": {"value": "b3_f1"}, "neumiss_depth": {"values": [10, 20]}, - "hidden_dim": {"values": [512, 1024]}, + "hidden_dim": {"values": [256, 512]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "activation": {"values": ["leaky_relu", "relu"]}, + "activation": {"values": ["leaky_relu"]}, "gradient_accumulation": {"values": [true, false]}, "weighted_loss": {"values": [true, false]} } diff --git a/wandb_configs/sweeps/frac-nosdp-warm.json b/wandb_configs/sweeps/frac-nosdp-warm.json index 491e04c..cc105d2 100644 --- a/wandb_configs/sweeps/frac-nosdp-warm.json +++ b/wandb_configs/sweeps/frac-nosdp-warm.json @@ -1,14 +1,14 @@ { "n_epochs": {"value": 10}, - "lr": {"max": 2e-1, "min": 1e-5}, + "lr": {"max": 7e-2, "min": 1e-5}, "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, "dev_opt_metric": {"value": "b3_f1"}, "neumiss_depth": {"values": [10, 20]}, - "hidden_dim": {"values": [512, 1024]}, + "hidden_dim": {"values": [256, 512]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "activation": {"values": ["leaky_relu", "relu"]}, + "activation": {"values": ["leaky_relu"]}, "use_rounded_loss": {"value": false}, "use_sdp": {"value": false}, "n_warmstart_epochs": {"value": 2}, diff --git a/wandb_configs/sweeps/frac-nosdp.json b/wandb_configs/sweeps/frac-nosdp.json index d9a1e41..0c6ad42 100644 --- a/wandb_configs/sweeps/frac-nosdp.json +++ b/wandb_configs/sweeps/frac-nosdp.json @@ -1,14 +1,14 @@ { "n_epochs": {"value": 10}, - "lr": {"max": 2e-1, "min": 1e-5}, + "lr": {"max": 7e-2, "min": 1e-5}, "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, "dev_opt_metric": {"value": "b3_f1"}, "neumiss_depth": {"values": [10, 20]}, - "hidden_dim": {"values": [512, 1024]}, + "hidden_dim": {"values": [256, 512]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "activation": {"values": ["leaky_relu", "relu"]}, + "activation": {"values": ["leaky_relu"]}, "use_rounded_loss": {"value": false}, "use_sdp": {"value": false}, 
"gradient_accumulation": {"values": [true, false]}, diff --git a/wandb_configs/sweeps/frac-warm.json b/wandb_configs/sweeps/frac-warm.json index b13efc5..cac98fd 100644 --- a/wandb_configs/sweeps/frac-warm.json +++ b/wandb_configs/sweeps/frac-warm.json @@ -1,14 +1,14 @@ { "n_epochs": {"value": 10}, - "lr": {"max": 2e-1, "min": 1e-5}, + "lr": {"max": 7e-2, "min": 1e-5}, "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, "dev_opt_metric": {"value": "b3_f1"}, "neumiss_depth": {"values": [10, 20]}, - "hidden_dim": {"values": [512, 1024]}, + "hidden_dim": {"values": [256, 512]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "activation": {"values": ["leaky_relu", "relu"]}, + "activation": {"values": ["leaky_relu"]}, "use_rounded_loss": {"value": false}, "n_warmstart_epochs": {"value": 2}, "gradient_accumulation": {"values": [true, false]}, diff --git a/wandb_configs/sweeps/frac.json b/wandb_configs/sweeps/frac.json index a572b76..3b4b277 100644 --- a/wandb_configs/sweeps/frac.json +++ b/wandb_configs/sweeps/frac.json @@ -1,14 +1,14 @@ { "n_epochs": {"value": 10}, - "lr": {"max": 2e-1, "min": 1e-5}, + "lr": {"max": 7e-2, "min": 1e-5}, "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, "dev_opt_metric": {"value": "b3_f1"}, "neumiss_depth": {"values": [10, 20]}, - "hidden_dim": {"values": [512, 1024]}, + "hidden_dim": {"values": [256, 512]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "activation": {"values": ["leaky_relu", "relu"]}, + "activation": {"values": ["leaky_relu"]}, "use_rounded_loss": {"value": false}, "gradient_accumulation": {"values": [true, false]}, "weighted_loss": {"values": [true, false]} diff --git a/wandb_configs/sweeps/mlp.json b/wandb_configs/sweeps/mlp.json index 24274c7..a0c1e4a 100644 --- a/wandb_configs/sweeps/mlp.json +++ b/wandb_configs/sweeps/mlp.json @@ -1,15 +1,15 @@ { "pairwise_mode": {"value": true}, "n_epochs": {"value": 10}, - "lr": {"max": 2e-1, "min": 1e-5}, + "lr": {"max": 7e-2, "min": 1e-5}, "weight_decay": {"values": [1e-1, 1e-2, 1e-3, 0]}, "dev_opt_metric": {"value": "auroc"}, "neumiss_depth": {"values": [10, 20]}, - "hidden_dim": {"values": [512, 1024]}, + "hidden_dim": {"values": [256, 512]}, "n_hidden_layers": {"values": [1, 2]}, "dropout_p": {"values": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}, "lr_scheduler": {"value": "plateau"}, - "activation": {"values": ["leaky_relu", "relu"]}, + "activation": {"values": ["leaky_relu"]}, "gradient_accumulation": {"value": false}, "weighted_loss": {"value": true} } From b0679fcb54cdbe5066f857fa52d51b9f9e621cdf Mon Sep 17 00:00:00 2001 From: Sriharsha-hatwar Date: Thu, 23 Mar 2023 15:15:13 +0000 Subject: [PATCH 13/17] Initial commit for pickling sparse matrix for different splits --- e2e_scripts/preprocess_s2and_data.py | 18 +++--- e2e_scripts/preprocess_s2and_pointwise.py | 79 +++++++++++++++++------ s2and/featurizer.py | 11 +++- 3 files changed, 78 insertions(+), 30 deletions(-) diff --git a/e2e_scripts/preprocess_s2and_data.py b/e2e_scripts/preprocess_s2and_data.py index 1322586..acd19b6 100644 --- a/e2e_scripts/preprocess_s2and_data.py +++ b/e2e_scripts/preprocess_s2and_data.py @@ -16,14 +16,15 @@ from s2and.data import ANDData import logging from s2and.featurizer import FeaturizationInfo, featurize +from preprocess_s2and_pointwise import save_pickled_pointwise_features logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - 
%(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) logger = logging.getLogger(__name__) -def save_blockwise_featurized_data(dataset_name, random_seed): - parent_dir = f"{DATA_HOME_DIR}/{dataset_name}" +def save_blockwise_featurized_data(data_home_dir, dataset_name, random_seed): + parent_dir = f"{data_home_dir}/{dataset_name}" AND_dataset = ANDData( signatures=join(parent_dir, f"{dataset_name}_signatures.json"), papers=join(parent_dir, f"{dataset_name}_papers.json"), @@ -115,13 +116,17 @@ def find_total_num_train_pairs(blockwise_data): print(args) params = args.__dict__ - DATA_HOME_DIR = params["data_home_dir"] + data_home_dir = params["data_home_dir"] dataset = params["dataset_name"] random_seeds = {1, 2, 3, 4, 5} for seed in random_seeds: print("Preprocessing started for seed value", seed) - save_blockwise_featurized_data(dataset, seed) + # Create the AND Dataset for the particular seed. (write a function let it be in train_utils.py + # Provide the AND Dataset to the functions : save_pickled_pointwise_features and save_blockwise_featurized_data + #save_blockwise_featurized_data(data_home_dir, dataset, seed) + save_pickled_pointwise_features(data_home_dir, dataset, seed) + # Check the pickles are created OK train_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{seed}/train_features.pkl" @@ -129,7 +134,4 @@ def find_total_num_train_pairs(blockwise_data): test_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{seed}/test_features.pkl" blockwise_features = read_blockwise_features(train_pkl) find_total_num_train_pairs(blockwise_features) - #verify_diff_with_s2and(dataset, seed) - - - + #verify_diff_with_s2and(dataset, seed) \ No newline at end of file diff --git a/e2e_scripts/preprocess_s2and_pointwise.py b/e2e_scripts/preprocess_s2and_pointwise.py index 002aeae..5cd8d3e 100644 --- a/e2e_scripts/preprocess_s2and_pointwise.py +++ b/e2e_scripts/preprocess_s2and_pointwise.py @@ -12,6 +12,7 @@ from os.path import join from s2and.data import ANDData import pickle +import os import numpy as np from scipy.sparse import csr_matrix, coo_matrix from utils.parser import Parser @@ -25,13 +26,56 @@ level=logging.INFO) logger = logging.getLogger(__name__) -def save_pickled_pointwise_features(data_home_dir, dataset_name): +def save_pointwise_in_different_splits(AND_dataset, sparse_matrix, label_encoder_signatures, random_seed): + logger.info('extracting signature depending on different split') + + train_block, val_block, test_block = AND_dataset.split_cluster_signatures() + + train_pointwise_features = {} + validation_pointwise_features = {} + test_pointwise_features = {} + + # The above three should have a key-list(val) (where val is a list of signature IDs) under them. + + # Doing for training block : + for block_id, list_of_signatures in train_block.items(): + # Let us transform each of those using label encoder and index them from the sparse matrix. + encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures) + train_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :] + + # Doing for validation block : + for block_id, list_of_signatures in val_block.items(): + # Let us transform each of those using label encoder and index them from the sparse matrix. 
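Note: the encode-and-slice step used in each of these per-block loops reduces to the following self-contained sketch; the signature IDs and feature matrix below are toy values for illustration only, not repo data.

    from scipy.sparse import csr_matrix
    from sklearn import preprocessing

    le = preprocessing.LabelEncoder().fit(["sig_a", "sig_b", "sig_c"])
    features = csr_matrix([[1, 0], [0, 1], [1, 1]])  # one row per encoded signature
    rows = le.transform(["sig_c", "sig_a"])          # block's signature IDs -> row indices
    block_features = features[rows, :]               # sparse row slice for this block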
+ encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures) + validation_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :] + + for block_id, list_of_signatures in test_block.items(): + # Let us transform each of those using label encoder and index them from the sparse matrix. + encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures) + test_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :] + + if(not os.path.exists(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}")): + os.makedirs(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}") + + train_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/train_features.pkl" + val_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/val_features.pkl" + test_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/test_features.pkl" + + with open(train_pkl,"wb") as _pkl_file: + pickle.dump(train_pointwise_features, _pkl_file) + with open(val_pkl,"wb") as _pkl_file: + pickle.dump(validation_pointwise_features, _pkl_file) + with open(test_pkl,"wb") as _pkl_file: + pickle.dump(test_pointwise_features, _pkl_file) + + +def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed): """ Fetch pointwise feature for dataset and store in a pickle. """ processed_data = {} parent_dir = f"{data_home_dir}/{dataset_name}" - + """ AND_dataset = ANDData( signatures=join(parent_dir, f"{dataset_name}_signatures.json"), papers=join(parent_dir, f"{dataset_name}_papers.json"), @@ -43,21 +87,22 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name): test_pairs_size=10000, name=dataset_name, n_jobs=16, - random_seed=random_seed, + random_seed=random_seed ) - # print("Storing pickled dataset....") - # with open(f'preprocess_dataset_{dataset_name}.pkl', 'wb') as f: - # pickle.dump(AND_dataset, f) - - # print("Loading pickled dataset...") - # with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f: - # AND_dataset = pickle.load(f) - # print("Loaded pickle dataset...") + print("Storing pickled dataset....") + with open(f'preprocess_dataset_{dataset_name}.pkl', 'wb') as f: + pickle.dump(AND_dataset, f) + """ + # Use below line carefully. 
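Note: the commented store/load pair being toggled around this point caches the expensive ANDData construction across runs. A minimal load-or-build sketch of the same pattern; the helper name and cache path are illustrative, not part of the repo.

    import os
    import pickle

    def load_or_build(cache_path, build_fn):
        # Reload a previously pickled object if present; otherwise build and cache it.
        if os.path.exists(cache_path):
            with open(cache_path, 'rb') as f:
                return pickle.load(f)
        obj = build_fn()
        with open(cache_path, 'wb') as f:
            pickle.dump(obj, f)
        return obj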
+ print("Loading pickled dataset...") + with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f: + AND_dataset = pickle.load(f) + print("Loaded pickle dataset...") - point_features_row, point_features_col, point_features_data, num_feats, num_points = pointwise_featurize(AND_dataset, + point_features_row, point_features_col, point_features_data, num_feats, num_points, le_feature_dict = pointwise_featurize(AND_dataset, n_jobs=16, use_cache=False) logger.info('converting feature indices to csr_matrix') @@ -66,12 +111,7 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name): shape=(num_points, num_feats) ).tocsr() print("Matrix creation done.") - processed_data['mention_level_features'] = point_features - - logger.info('Dumping processed data') - - with open(f'{dataset_name}_feature_processed.pkl', 'wb') as f: - pickle.dump(processed_data, f) + save_pointwise_in_different_splits(AND_dataset, point_features, le_feature_dict, random_seed) if __name__=='__main__': # Creates the pickles that store the preprocessed data @@ -86,7 +126,8 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name): params = args.__dict__ data_home_dir = params["data_home_dir"] dataset = params["dataset_name"] + random_seed = 1000 print("Preprocessing started") - save_pickled_pointwise_features(data_home_dir, dataset) + save_pickled_pointwise_features(data_home_dir, dataset, random_seed) print("Matrix") diff --git a/s2and/featurizer.py b/s2and/featurizer.py index 022edb8..1ccc62b 100644 --- a/s2and/featurizer.py +++ b/s2and/featurizer.py @@ -927,19 +927,24 @@ def pointwise_featurize( """ le_signature_feature_set = preprocessing.LabelEncoder() le_signature_feature_set.fit(list(signature_feature_set)) + + # I am using this for easy retrieval for training, val and test block retrieval. 
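Note: the (row, col, data) triplets accumulated with these label encoders are later assembled into a sparse matrix. A minimal illustration of that construction, with toy triplets:

    from scipy.sparse import coo_matrix

    rows, cols, data = [0, 0, 1], [2, 5, 0], [1, 1, 1]  # (signature row, feature col, 1) hits
    mat = coo_matrix((data, (rows, cols)), shape=(2, 6)).tocsr()
    # CSR form supports the fast row slicing used when the matrix is split by block.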
+ le_signature_dict = preprocessing.LabelEncoder() + le_signature_dict.fit(list(signature_dict.keys())) point_features_row, point_features_col, point_features_data = [], [], [] num_points = len(signature_dict.keys()) num_feats = len(signature_feature_set) - for index, (_, values) in tqdm(enumerate(signature_dict.items()), desc="Converting to spare matrix"): + for _, (key, values) in tqdm(enumerate(signature_dict.items()), desc="Converting to sparse matrix"): encoded_signature_features = le_signature_feature_set.transform(values) + encoded_key_val = le_signature_dict.transform([key])[0] for feature_label in encoded_signature_features : - point_features_row.append(index) + point_features_row.append(encoded_key_val) point_features_col.append(feature_label) point_features_data.append(1) - return point_features_row, point_features_col, point_features_data, num_feats, num_points + return point_features_row, point_features_col, point_features_data, num_feats, num_points, le_signature_dict def store_featurized_pickles( From 1f2037b0e3c25b3d74d4014d03dddded7f5b11ab Mon Sep 17 00:00:00 2001 From: arana_umass_edu Date: Thu, 23 Mar 2023 21:46:31 +0000 Subject: [PATCH 14/17] create pointwise feature set then shuffle and split --- e2e_scripts/preprocess_s2and_data.py | 17 ++- e2e_scripts/preprocess_s2and_pointwise.py | 146 ++++++++++++---------- s2and/featurizer.py | 29 +++-- 3 files changed, 104 insertions(+), 88 deletions(-) diff --git a/e2e_scripts/preprocess_s2and_data.py b/e2e_scripts/preprocess_s2and_data.py index acd19b6..93fb2d0 100644 --- a/e2e_scripts/preprocess_s2and_data.py +++ b/e2e_scripts/preprocess_s2and_data.py @@ -16,14 +16,14 @@ from s2and.data import ANDData import logging from s2and.featurizer import FeaturizationInfo, featurize -from preprocess_s2and_pointwise import save_pickled_pointwise_features +from preprocess_s2and_pointwise import save_pickled_pointwise_features, create_signature_features_matrix logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) logger = logging.getLogger(__name__) -def save_blockwise_featurized_data(data_home_dir, dataset_name, random_seed): +def save_featurized_data(data_home_dir, dataset_name, random_seed, point_features_mat, le_signatures): parent_dir = f"{data_home_dir}/{dataset_name}" AND_dataset = ANDData( signatures=join(parent_dir, f"{dataset_name}_signatures.json"), @@ -43,6 +43,7 @@ def save_blockwise_featurized_data(data_home_dir, dataset_name, random_seed): # Load the featurizer, which calculates pairwise similarity scores featurization_info = FeaturizationInfo() # the cache will make it faster to train multiple times - it stores the features on disk for you + save_pickled_pointwise_features(AND_dataset, point_features_mat, le_signatures, random_seed) train_pkl, val_pkl, test_pkl = store_featurized_pickles(AND_dataset, featurization_info, n_jobs=16, @@ -65,8 +66,8 @@ def find_total_num_train_pairs(blockwise_data): for block_id in blockwise_data.keys(): count += len(blockwise_data[block_id][0]) - print("Total num of signature pairs", count) - + print("Total num of signature pairs", count) + # def verify_diff_with_s2and(dataset_name, random_seed): # parent_dir = f"{DATA_HOME_DIR}/{dataset_name}" # AND_dataset = ANDData( @@ -105,7 +106,6 @@ def find_total_num_train_pairs(blockwise_data): # # print("VERIFICATION STATUS: ", s2and_set==our_set) - if __name__=='__main__': # Creates the pickles that store the preprocessed data # Read cmd line args @@ -118,14 +118,13 @@ def 
find_total_num_train_pairs(blockwise_data): params = args.__dict__ data_home_dir = params["data_home_dir"] dataset = params["dataset_name"] + + point_features_mat, le_signatures = create_signature_features_matrix(data_home_dir, dataset) random_seeds = {1, 2, 3, 4, 5} for seed in random_seeds: print("Preprocessing started for seed value", seed) - # Create the AND Dataset for the particular seed. (write a function let it be in train_utils.py - # Provide the AND Dataset to the functions : save_pickled_pointwise_features and save_blockwise_featurized_data - #save_blockwise_featurized_data(data_home_dir, dataset, seed) - save_pickled_pointwise_features(data_home_dir, dataset, seed) + save_featurized_data(data_home_dir, dataset, seed, point_features_mat, le_signatures) # Check the pickles are created OK diff --git a/e2e_scripts/preprocess_s2and_pointwise.py b/e2e_scripts/preprocess_s2and_pointwise.py index 5cd8d3e..2918d7c 100644 --- a/e2e_scripts/preprocess_s2and_pointwise.py +++ b/e2e_scripts/preprocess_s2and_pointwise.py @@ -26,56 +26,71 @@ level=logging.INFO) logger = logging.getLogger(__name__) -def save_pointwise_in_different_splits(AND_dataset, sparse_matrix, label_encoder_signatures, random_seed): - logger.info('extracting signature depending on different split') - - train_block, val_block, test_block = AND_dataset.split_cluster_signatures() - - train_pointwise_features = {} - validation_pointwise_features = {} - test_pointwise_features = {} - - # The above three should have a key-list(val) (where val is a list of signature IDs) under them. - - # Doing for training block : - for block_id, list_of_signatures in train_block.items(): - # Let us transform each of those using label encoder and index them from the sparse matrix. - encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures) - train_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :] - - # Doing for validation block : - for block_id, list_of_signatures in val_block.items(): - # Let us transform each of those using label encoder and index them from the sparse matrix. - encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures) - validation_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :] +def save_pickled_pointwise_features(AND_dataset, sparse_matrix, + label_encoder_signatures, + random_seed: int = None): + """ + Fetch pointwise feature for dataset and store in a pickle. + """ - for block_id, list_of_signatures in test_block.items(): - # Let us transform each of those using label encoder and index them from the sparse matrix. 
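Note: the per-split pickles written later in this function can be sanity-checked independently of the pipeline. A hedged consumer-side sketch, assuming one of the pickle files produced by this function is available on disk:

    import pickle

    with open("train_signature_features.pkl", "rb") as f:  # path as written by this function
        block_to_features = pickle.load(f)
    for block_id, feats in block_to_features.items():
        # each value: sparse matrix of (signatures in block) x (pointwise features)
        print(block_id, feats.shape)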
- encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures) - test_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :] - - if(not os.path.exists(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}")): - os.makedirs(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}") - - train_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/train_features.pkl" - val_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/val_features.pkl" - test_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/test_features.pkl" - - with open(train_pkl,"wb") as _pkl_file: - pickle.dump(train_pointwise_features, _pkl_file) - with open(val_pkl,"wb") as _pkl_file: - pickle.dump(validation_pointwise_features, _pkl_file) - with open(test_pkl,"wb") as _pkl_file: - pickle.dump(test_pointwise_features, _pkl_file) + if random_seed: + train_block, val_block, test_block = AND_dataset.split_cluster_signatures() + + train_pointwise_features = {} + validation_pointwise_features = {} + test_pointwise_features = {} + + # The above three should have a key-list(val) (where val is a list of signature IDs) under them. + + # Doing for training block : + for block_id, list_of_signatures in train_block.items(): + # Let us transform each of those using label encoder and index them from the sparse matrix. + encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures) + train_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :] + + # Doing for validation block : + for block_id, list_of_signatures in val_block.items(): + # Let us transform each of those using label encoder and index them from the sparse matrix. + encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures) + validation_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :] + + for block_id, list_of_signatures in test_block.items(): + # Let us transform each of those using label encoder and index them from the sparse matrix. 
+            encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures)
+            test_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :]
+
+        if(not os.path.exists(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}")):
+            os.makedirs(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}")
+
+        train_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}/train_signature_features.pkl"
+        val_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}/val_signature_features.pkl"
+        test_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}/test_signature_features.pkl"
+
+        with open(train_pkl,"wb") as _pkl_file:
+            pickle.dump(train_pointwise_features, _pkl_file)
+        with open(val_pkl,"wb") as _pkl_file:
+            pickle.dump(validation_pointwise_features, _pkl_file)
+        with open(test_pkl,"wb") as _pkl_file:
+            pickle.dump(test_pointwise_features, _pkl_file)
+    else:
+        processed_data = {}
+        # NOTE: use the matrix passed in directly; recomputing it here would need
+        # data_home_dir and dataset_name, which are not in scope in this function.
+        processed_data['mention_level_features'] = sparse_matrix
+
+        logger.info('Dumping processed data')
+        file_name = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/{AND_dataset.name}_all_signature_features.pkl"
+
+        with open(file_name, 'wb') as f:
+            pickle.dump(processed_data, f)
 
-def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed):
+def create_signature_features_matrix(data_home_dir, dataset_name):
     """
-    Fetch pointwise feature for dataset and store in a pickle.
+    Generate the pointwise feature set for the entire dataset and return a sparse
+    matrix representation of each signature and its features.
     """
-    processed_data = {}
+    logger.info("Signature features pre-processing started")
     parent_dir = f"{data_home_dir}/{dataset_name}"
-    """
     AND_dataset = ANDData(
         signatures=join(parent_dir, f"{dataset_name}_signatures.json"),
         papers=join(parent_dir, f"{dataset_name}_papers.json"),
@@ -86,33 +101,27 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed):
         val_pairs_size=10000,
         test_pairs_size=10000,
         name=dataset_name,
-        n_jobs=16,
-        random_seed=random_seed
+        n_jobs=16
    )
 
-    print("Storing pickled dataset....")
-    with open(f'preprocess_dataset_{dataset_name}.pkl', 'wb') as f:
-        pickle.dump(AND_dataset, f)
-    """
-    # Use below line carefully. 
-    print("Loading pickled dataset...")
-    with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f:
-        AND_dataset = pickle.load(f)
-    print("Loaded pickle dataset...")
+# print("Storing pickled dataset....")
+# with open(f'preprocess_dataset_{dataset_name}.pkl', 'wb') as f:
+#     pickle.dump(AND_dataset, f)
+
+# # Use below line carefully. 
+# print("Loading pickled dataset...")
+# with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f:
+#     AND_dataset = pickle.load(f)
+# print("Loaded pickle dataset...")
+
+    point_features_mat, le_signatures = pointwise_featurize(AND_dataset,
+                                                            n_jobs=16,
+                                                            use_cache=False)
+
+    logger.info("Signature features pre-processing completed")
+    return point_features_mat, le_signatures
 
-    point_features_row, point_features_col, point_features_data, num_feats, num_points, le_feature_dict = pointwise_featurize(AND_dataset,
-                                                                                                                              n_jobs=16,
-                                                                                                                              use_cache=False)
-    logger.info('converting feature indices to csr_matrix')
-    point_features = coo_matrix(
-        (point_features_data, (point_features_row, point_features_col)),
-        shape=(num_points, num_feats)
-    ).tocsr()
-    print("Matrix creation done.")
-    save_pointwise_in_different_splits(AND_dataset, point_features, le_feature_dict, random_seed)
-
 if __name__=='__main__':
     # Creates the pickles that store the preprocessed data
     # Read cmd line args
@@ -129,5 +138,4 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed):
     params = args.__dict__
     data_home_dir = params["data_home_dir"]
     dataset = params["dataset_name"]
     random_seed = 1000
 
     print("Preprocessing started")
-    save_pickled_pointwise_features(data_home_dir, dataset, random_seed)
-    print("Matrix")
+    save_pickled_pointwise_features(data_home_dir, dataset)
diff --git a/s2and/featurizer.py b/s2and/featurizer.py
index 1ccc62b..15278e4 100644
--- a/s2and/featurizer.py
+++ b/s2and/featurizer.py
@@ -10,6 +10,7 @@
 from collections import Counter
 from collections.abc import Iterable
 from IPython import embed
+from scipy.sparse import csr_matrix, coo_matrix
 
 from sklearn import preprocessing
 
@@ -839,7 +840,7 @@ def pointwise_featurize(
     chunk_size: int = DEFAULT_CHUNK_SIZE,
 ):
     """
-    Featurizes the input dataset and stores as a unified pickle file.
+    Extract pointwise features from the dataset.
 
     Parameters
     ----------
@@ -855,16 +856,15 @@ def pointwise_featurize(
     Returns
     -------
-    Returns the three items :
-    1. Row indices of the sparse matrix containing the data
-    2. Column indices of the sparse matrix containing the data
-    3. The data to be filled in the given row and column combination.
+    Returns two items:
+    1. A sparse matrix holding the pointwise feature representation of all the signatures in a dataset.
+    2. A label encoder mapping signature IDs to matrix row indices.
     """
     # Do you think OrderedSet and OrderedDict should be used here?
     signature_feature_set = set()
     # The feature is stored a str and not tuple to facilitate label encoding.
     signature_dict = {}
     # We dont need to iterate signature per block as we need to create for all the signatures irrespective of the block.
-
+    logger.info('Creating signatures feature set...')
     for signature_key, values in dataset.signatures.items():
         per_signature_features = dataset.signatures[signature_key]._asdict()
         signature_dict[signature_key] = []
@@ -915,7 +915,9 @@ def pointwise_featurize(
             print('\n!!!! Found another type !!!!\n')
             embed()
             exit()
-    logger.info('Label encoding the values')
+    logger.info('Created signatures feature set...')
+
+    logger.info('Label encoding signature features...')
 
     # Label encoding code ---
     """"
@@ -927,7 +929,7 @@ def pointwise_featurize(
     """
     le_signature_feature_set = preprocessing.LabelEncoder()
     le_signature_feature_set.fit(list(signature_feature_set))
-
+    
     # I am using this for easy retrieval for training, val and test block retrieval. 
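Note: one property of this encoder matters downstream: sklearn's LabelEncoder sorts its classes, so row indices follow the lexicographic order of the signature IDs rather than their insertion order. A quick self-contained illustration:

    from sklearn import preprocessing

    le = preprocessing.LabelEncoder().fit(["s2", "s0", "s1"])
    print(le.classes_)                    # ['s0' 's1' 's2'] -- sorted, not insertion order
    print(le.transform(["s2"]))          # [2]
    print(le.inverse_transform([0, 2]))  # ['s0' 's2']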
     le_signature_dict = preprocessing.LabelEncoder()
     le_signature_dict.fit(list(signature_dict.keys()))
 
@@ -936,15 +938,22 @@
     num_points = len(signature_dict.keys())
     num_feats = len(signature_feature_set)
 
-    for _, (key, values) in tqdm(enumerate(signature_dict.items()), desc="Converting to sparse matrix"):
+    for key, values in tqdm(signature_dict.items(), desc="Converting to spare matrix"):
         encoded_signature_features = le_signature_feature_set.transform(values)
         encoded_key_val = le_signature_dict.transform([key])[0]
         for feature_label in encoded_signature_features :
             point_features_row.append(encoded_key_val)
             point_features_col.append(feature_label)
             point_features_data.append(1)
-
-    return point_features_row, point_features_col, point_features_data, num_feats, num_points, le_signature_dict
+    logger.info('Label encoding completed...')
+
+    logger.info('converting feature indices to csr_matrix')
+    point_features = coo_matrix(
+        (point_features_data, (point_features_row, point_features_col)),
+        shape=(num_points, num_feats)
+    ).tocsr()
+    print("Matrix creation done.")
+    return point_features, le_signature_dict
 
 
 def store_featurized_pickles(
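For reference, a minimal sketch (not part of the patch series, with made-up toy inputs) of the label-encode-and-stack construction that pointwise_featurize performs:

    # Toy illustration of building the per-signature sparse feature matrix.
    from scipy.sparse import coo_matrix
    from sklearn import preprocessing

    sig_features = {                      # signature id -> flattened string features
        "sig_2": ["('year', 2001)", "('venue', 'ACL')"],
        "sig_1": ["('year', 1999)"],
    }
    feat_le = preprocessing.LabelEncoder().fit(
        [f for feats in sig_features.values() for f in feats])
    sig_le = preprocessing.LabelEncoder().fit(list(sig_features.keys()))

    rows, cols, data = [], [], []
    for sig_id, feats in sig_features.items():
        r = sig_le.transform([sig_id])[0]     # row index for this signature
        for c in feat_le.transform(feats):    # one column per distinct feature
            rows.append(r); cols.append(c); data.append(1)

    X = coo_matrix((data, (rows, cols)),
                   shape=(len(sig_le.classes_), len(feat_le.classes_))).tocsr()
    # Note: LabelEncoder assigns indices by *sorted* class order, so row order
    # follows sorted signature ids, not insertion order.

The note in the last comment matters for the ordering checks added later in this series.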
") + indices = list(range(pointwise_matrix.shape[0])) + inverse_transformed_signature_ids = list(le_signature_ids.inverse_transform(indices)) + ordered = True + length = len(dict_obj) + if len(dict_obj) == pointwise_matrix.shape[0]: + print("The lengths are same") + index = 0 + while (ordered and index < length): + if inverse_transformed_signature_ids[index] == file_keys[index]: + index += 1 + else: + print("inverse_transformed_signature_ids[index] :", inverse_transformed_signature_ids[index]) + print("file_keys[index] : ", file_keys[index]) + ordered = False + print("The order is not same") + if ordered: + print("The order is same.") + + else: + print("The lengths are not same..") def save_featurized_data(data_home_dir, dataset_name, random_seed, point_features_mat, le_signatures): parent_dir = f"{data_home_dir}/{dataset_name}" @@ -46,6 +83,8 @@ def save_featurized_data(data_home_dir, dataset_name, random_seed, point_feature save_pickled_pointwise_features(AND_dataset, point_features_mat, le_signatures, random_seed) + validate_pointwise_featurizer(AND_dataset, point_features_mat, le_signatures) + train_pkl, val_pkl, test_pkl = store_featurized_pickles(AND_dataset, featurization_info, n_jobs=16, @@ -123,6 +162,17 @@ def find_total_num_train_pairs(blockwise_data): point_features_mat, le_signatures = create_signature_features_matrix(data_home_dir, dataset) + # Added this for speeding up while testing. + matrix_pickle_file_location = "./matrix_pickle.pkl" + + with open(matrix_pickle_file_location,"wb") as _pkl_file: + pickle.dump((point_features_mat, le_signatures), _pkl_file) + + """ + with open(matrix_pickle_file_location, 'rb') as f: + point_features_mat, le_signatures = pickle.load(f) + """ + random_seeds = [1, 2, 3, 4, 5] if params["dataset_seed"] is None else [params["dataset_seed"]] for seed in random_seeds: print("Preprocessing started for seed value", seed) @@ -135,4 +185,4 @@ def find_total_num_train_pairs(blockwise_data): test_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{seed}/test_features.pkl" blockwise_features = read_blockwise_features(train_pkl) find_total_num_train_pairs(blockwise_features) - #verify_diff_with_s2and(dataset, seed) \ No newline at end of file + #verify_diff_with_s2and(dataset, seed) diff --git a/e2e_scripts/preprocess_s2and_pointwise.py b/e2e_scripts/preprocess_s2and_pointwise.py index 2918d7c..50c482c 100644 --- a/e2e_scripts/preprocess_s2and_pointwise.py +++ b/e2e_scripts/preprocess_s2and_pointwise.py @@ -34,6 +34,7 @@ def save_pickled_pointwise_features(AND_dataset, sparse_matrix, """ if random_seed: + # This splits the signatures per three different blocks train_block, val_block, test_block = AND_dataset.split_cluster_signatures() train_pointwise_features = {} @@ -41,7 +42,8 @@ def save_pickled_pointwise_features(AND_dataset, sparse_matrix, test_pointwise_features = {} # The above three should have a key-list(val) (where val is a list of signature IDs) under them. - + # Below three for loops go through the blocks, gets the corresponding row index of the signature + # from the label encoder, splices the matrix with only those rows and stores per block. # Doing for training block : for block_id, list_of_signatures in train_block.items(): # Let us transform each of those using label encoder and index them from the sparse matrix. 
@@ -78,7 +80,7 @@ def save_pickled_pointwise_features(AND_dataset, sparse_matrix,
         processed_data['mention_level_features'] = point_features_mat
 
         logger.info('Dumping processed data')
-        file_name = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/{dataset_name}_all_signature_features.pkl"
+        file_name = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/{AND_dataset.name}_all_signature_features.pkl"
 
         with open(file_name, 'wb') as f:
             pickle.dump(processed_data, f)
diff --git a/s2and/featurizer.py b/s2and/featurizer.py
index 15278e4..e4c7a4b 100644
--- a/s2and/featurizer.py
+++ b/s2and/featurizer.py
@@ -871,6 +871,7 @@
 
         for feature_key, value in per_signature_features.items():
             index_key = None
+            # TODO : Why do we ignore these?
             features_to_ignore = [
                 'author_info_name_counts',
                 'author_info_position',
@@ -896,6 +897,8 @@
                 pass
 
         # Let us check the type of value for each signatures.
+        # This goes through each signature and, depending on the type of the key-val
+        # pair, flattens it so it can be used as a single feature in the sparse matrix.
         if isinstance(value, str) or isinstance(value, int):
             index_key = str((feature_key, value))
@@ -938,7 +941,7 @@
     num_points = len(signature_dict.keys())
     num_feats = len(signature_feature_set)
 
-    for key, values in tqdm(signature_dict.items(), desc="Converting to spare matrix"):
+    for key, values in tqdm(signature_dict.items(), desc="Converting to sparse matrix"):
         encoded_signature_features = le_signature_feature_set.transform(values)
         encoded_key_val = le_signature_dict.transform([key])[0]
         for feature_label in encoded_signature_features :
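A minimal sketch (outside the patch, with illustrative names) of the per-block slicing that save_pickled_pointwise_features performs above, assuming a fitted signature label encoder `sig_le` and a CSR matrix `X` with one row per signature:

    def split_matrix_by_block(blocks, X, sig_le):
        # blocks: block id -> list of signature ids
        per_block = {}
        for block_id, sig_ids in blocks.items():
            row_idx = sig_le.transform(sig_ids)   # signature ids -> row indices
            per_block[block_id] = X[row_idx, :]   # fancy indexing keeps sig_ids order
        return per_block

The row slice preserves the order of `sig_ids`, which is what the later validation commits rely on.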
From 2fc77e81a19fa24d8b03c899a9c907987c8b50d7 Mon Sep 17 00:00:00 2001
From: Sriharsha-hatwar
Date: Tue, 4 Apr 2023 23:27:50 +0000
Subject: [PATCH 16/17] Adding the folder for storing the pointwise matrix

---
 e2e_scripts/preprocess_s2and_pointwise.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/e2e_scripts/preprocess_s2and_pointwise.py b/e2e_scripts/preprocess_s2and_pointwise.py
index 50c482c..27242cd 100644
--- a/e2e_scripts/preprocess_s2and_pointwise.py
+++ b/e2e_scripts/preprocess_s2and_pointwise.py
@@ -61,12 +61,12 @@ def save_pickled_pointwise_features(AND_dataset, sparse_matrix,
             encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures)
             test_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :]
 
-        if(not os.path.exists(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}")):
-            os.makedirs(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}")
+        if(not os.path.exists(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}")):
+            os.makedirs(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}")
 
-        train_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}/train_signature_features.pkl"
-        val_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}/val_signature_features.pkl"
-        test_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}/test_signature_features.pkl"
+        train_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/train_signature_features.pkl"
+        val_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/val_signature_features.pkl"
+        test_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/test_signature_features.pkl"
 
         with open(train_pkl,"wb") as _pkl_file:
             pickle.dump(train_pointwise_features, _pkl_file)
@@ -120,6 +120,16 @@ def create_signature_features_matrix(data_home_dir, dataset_name):
                                                             n_jobs=16,
                                                             use_cache=False)
 
+    matrix_pickle_file_location = f'preprocess_matrix_{dataset_name}.pkl'
+    print("Storing pickled matrix ....")
+    with open(matrix_pickle_file_location, 'wb') as f:
+        pickle.dump((point_features_mat, le_signatures), f)
+
+    print("### loading from pickle")
+    with open(matrix_pickle_file_location, 'rb') as f:
+        point_features_mat, le_signatures = pickle.load(f)
+
+
     logger.info("Signature features pre-processing completed")
     return point_features_mat, le_signatures
 
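The dump-then-reload above always rebuilds the matrix; a guarded cache is the natural next step. A minimal sketch, outside the patch, where `build_fn` and the path are illustrative:

    import os
    import pickle

    def load_or_build(cache_path, build_fn):
        # Return the cached object if present, otherwise build and cache it.
        if os.path.exists(cache_path):
            with open(cache_path, "rb") as f:
                return pickle.load(f)
        result = build_fn()
        with open(cache_path, "wb") as f:
            pickle.dump(result, f)
        return result

    # e.g. point_features_mat, le_signatures = load_or_build(
    #     f"preprocess_matrix_{dataset_name}.pkl",
    #     lambda: pointwise_featurize(AND_dataset, n_jobs=16, use_cache=False))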
") - indices = list(range(pointwise_matrix.shape[0])) - inverse_transformed_signature_ids = list(le_signature_ids.inverse_transform(indices)) - ordered = True - length = len(dict_obj) - if len(dict_obj) == pointwise_matrix.shape[0]: - print("The lengths are same") +def validate_order_all_block(pointwise_block, features_block): + # First, check the blockwise similarity + keys_pointwise = list(pointwise_block.keys()) + keys_features = list(features_block.keys()) + + if len(keys_pointwise) == len(keys_features): + print("The number of blocks are same across the features") + ordered = True index = 0 - while (ordered and index < length): - if inverse_transformed_signature_ids[index] == file_keys[index]: - index += 1 + len_of_blocks = len(keys_pointwise) + while ordered and index < len_of_blocks: + if keys_pointwise[index] == keys_features[index]: + index+=1 else: - print("inverse_transformed_signature_ids[index] :", inverse_transformed_signature_ids[index]) - print("file_keys[index] : ", file_keys[index]) ordered = False - print("The order is not same") - if ordered: - print("The order is same.") + if not ordered: + print("The blocks are not in order.") + else: + print("The blocks are in order.") + return ordered + else: + print("The number of blocks in seed : ", seed, "are not the same across features") + return False + +def validate_order_inside_block(pointwise_block, features_block): + pointwise_signature_list = [] + signature_id_list = [] + + for block, val in pointwise_block.items(): + list_of_sig = val[0] + pointwise_signature_list.extend(list_of_sig) + + #print("pointwise_signature_list : ", pointwise_signature_list) + + for block, val in features_block.items(): + list_of_sig = [sigs.signature_id for sigs in val] + signature_id_list.extend(list_of_sig) + + #print("signature_id_list : ", signature_id_list) + + # Now for validation part. + ordered = True + index = 0 + len_of_sigs = len(pointwise_signature_list) + while ordered and index < len_of_sigs: + if pointwise_signature_list[index] == signature_id_list[index]: + index += 1 + else: + ordered = False + if not ordered: + print("The Signatures are not in order.") else: - print("The lengths are not same..") + print("The Signatures are in order.") + return ordered + +def validate_pointwise_featurizer(dataset): + print("### --- Validating the pointwise matrix creation") + # Need to go through each pickle file in all the seeds. + seeds = [1] + + for seed in seeds: + train_point_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/pointwise/seed{seed}/train_signature_features.pkl" + val_point_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/pointwise/seed{seed}/val_signature_features.pkl" + test_point_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/pointwise/seed{seed}/test_signature_features.pkl" + + + train_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/seed{seed}/train_signatures.pkl" + val_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/seed{seed}/val_signatures.pkl" + test_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/seed{seed}/test_signatures.pkl" + + # This is what the pointwise has created. + with open(train_point_loc, 'rb') as f: + train_block_pointwise_data = pickle.load(f) + + with open(train_loc, 'rb') as f: + train_block_data = pickle.load(f) + + + # For training block. 
+def validate_pointwise_featurizer(dataset):
+    print("### --- Validating the pointwise matrix creation")
+    # Need to go through each pickle file in all the seeds.
+    seeds = [1]
+
+    for seed in seeds:
+        train_point_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/pointwise/seed{seed}/train_signature_features.pkl"
+        val_point_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/pointwise/seed{seed}/val_signature_features.pkl"
+        test_point_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/pointwise/seed{seed}/test_signature_features.pkl"
+
+
+        train_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/seed{seed}/train_signatures.pkl"
+        val_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/seed{seed}/val_signatures.pkl"
+        test_loc = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/seed{seed}/test_signatures.pkl"
+
+        # This is what the pointwise featurizer created.
+        with open(train_point_loc, 'rb') as f:
+            train_block_pointwise_data = pickle.load(f)
+
+        with open(train_loc, 'rb') as f:
+            train_block_data = pickle.load(f)
+
+
+        # For the training block.
+        train_blocks_in_order = validate_order_all_block(train_block_pointwise_data, train_block_data)
+
+        if train_blocks_in_order:
+            print("Training blocks of seed", seed, "are in order")
+        else:
+            print("Training blocks of seed", seed, "are not in order")
+
+        is_signature_train_in_order = validate_order_inside_block(train_block_pointwise_data, train_block_data)
+
+        if is_signature_train_in_order:
+            print("Training signatures of seed", seed, "are in order")
+        else:
+            print("Training signatures of seed", seed, "are not in order")
+
+
+
+        # For the validation split.
+        with open(val_point_loc, 'rb') as f:
+            val_block_pointwise_data = pickle.load(f)
+        with open(val_loc, 'rb') as f:
+            val_block_data = pickle.load(f)
+
+        val_blocks_in_order = validate_order_all_block(val_block_pointwise_data, val_block_data)
+
+        if val_blocks_in_order:
+            print("Validation blocks of seed", seed, "are in order")
+        else:
+            print("Validation blocks of seed", seed, "are not in order")
+
+        is_signature_val_in_order = validate_order_inside_block(val_block_pointwise_data, val_block_data)
+
+        if is_signature_val_in_order:
+            print("Validation signatures of seed", seed, "are in order")
+        else:
+            print("Validation signatures of seed", seed, "are not in order")
+
+
+        # For the test split.
+        with open(test_point_loc, 'rb') as f:
+            test_block_pointwise_data = pickle.load(f)
+        with open(test_loc, 'rb') as f:
+            test_block_data = pickle.load(f)
+
+        test_blocks_in_order = validate_order_all_block(test_block_pointwise_data, test_block_data)
+
+        if test_blocks_in_order:
+            print("Test blocks of seed", seed, "are in order")
+        else:
+            print("Test blocks of seed", seed, "are not in order")
+
+        is_signature_test_in_order = validate_order_inside_block(test_block_pointwise_data, test_block_data)
+
+        if is_signature_test_in_order:
+            print("Test signatures of seed", seed, "are in order")
+        else:
+            print("Test signatures of seed", seed, "are not in order")
+
 
 def save_featurized_data(data_home_dir, dataset_name, random_seed, point_features_mat, le_signatures):
     parent_dir = f"{data_home_dir}/{dataset_name}"
@@ -81,15 +183,20 @@ def save_featurized_data(data_home_dir, dataset_name, random_seed, point_feature
     featurization_info = FeaturizationInfo()
     logger.info("Loaded featurization info")
 
-    save_pickled_pointwise_features(AND_dataset, point_features_mat, le_signatures, random_seed)
+    #save_pickled_pointwise_features(AND_dataset, point_features_mat, le_signatures, random_seed)
 
-    validate_pointwise_featurizer(AND_dataset, point_features_mat, le_signatures)
+
     train_pkl, val_pkl, test_pkl = store_featurized_pickles(AND_dataset,
                                                             featurization_info,
                                                             n_jobs=16,
                                                             use_cache=False,
-                                                            random_seed=random_seed)
+                                                            random_seed=random_seed,
+                                                            pointwise_matrix=point_features_mat,
+                                                            le_signatures=le_signatures)
+
+    validate_pointwise_featurizer(AND_dataset)
+    print(" ## Validation and save process completed.")
 
     return train_pkl, val_pkl, test_pkl
 
@@ -173,7 +280,7 @@ def find_total_num_train_pairs(blockwise_data):
         point_features_mat, le_signatures = pickle.load(f)
     """
 
-    random_seeds = [1, 2, 3, 4, 5] if params["dataset_seed"] is None else [params["dataset_seed"]]
+    random_seeds = [1] if params["dataset_seed"] is None else [params["dataset_seed"]]
     for seed in random_seeds:
         print("Preprocessing started for seed value", seed)
         save_featurized_data(data_home_dir, dataset, seed, point_features_mat, le_signatures)
@@ -183,6 +290,6 @@
         train_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{seed}/train_features.pkl"
         val_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{seed}/val_features.pkl"
f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{seed}/val_features.pkl" test_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{seed}/test_features.pkl" - blockwise_features = read_blockwise_features(train_pkl) - find_total_num_train_pairs(blockwise_features) + #blockwise_features = read_blockwise_features(train_pkl) + #find_total_num_train_pairs(blockwise_features) #verify_diff_with_s2and(dataset, seed) diff --git a/e2e_scripts/preprocess_s2and_pointwise.py b/e2e_scripts/preprocess_s2and_pointwise.py index 27242cd..dfe8f6e 100644 --- a/e2e_scripts/preprocess_s2and_pointwise.py +++ b/e2e_scripts/preprocess_s2and_pointwise.py @@ -48,18 +48,18 @@ def save_pickled_pointwise_features(AND_dataset, sparse_matrix, for block_id, list_of_signatures in train_block.items(): # Let us transform each of those using label encoder and index them from the sparse matrix. encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures) - train_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :] + train_pointwise_features[block_id] = (list_of_signatures, sparse_matrix[encoded_signature_id_list, :]) # Doing for validation block : for block_id, list_of_signatures in val_block.items(): # Let us transform each of those using label encoder and index them from the sparse matrix. encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures) - validation_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :] + validation_pointwise_features[block_id] = (list_of_signatures, sparse_matrix[encoded_signature_id_list, :]) for block_id, list_of_signatures in test_block.items(): # Let us transform each of those using label encoder and index them from the sparse matrix. encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures) - test_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :] + test_pointwise_features[block_id] = (list_of_signatures, sparse_matrix[encoded_signature_id_list, :]) if(not os.path.exists(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}")): os.makedirs(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}") diff --git a/s2and/featurizer.py b/s2and/featurizer.py index e4c7a4b..a72b5d7 100644 --- a/s2and/featurizer.py +++ b/s2and/featurizer.py @@ -9,6 +9,7 @@ import logging from collections import Counter from collections.abc import Iterable +from collections import OrderedDict from IPython import embed from scipy.sparse import csr_matrix, coo_matrix @@ -856,12 +857,16 @@ def pointwise_featurize( Returns ------- Returns the three items : - 1. Sparse matrix poitwise feature representation of all the signatures in a dataset. + 1. Sparse matrix pointwise feature representation of all the signatures in a dataset. 2. Label encoder to index signature according to their ids """ # Do you think OrderedSet and OrderedDict should be used here? - signature_feature_set = set() # The feature is stored a str and not tuple to facilitate label encoding. - signature_dict = {} + # I am using this to facilitate the order to be maintained. + # signature_feature_dict is facilitating an ordered set storage by using an ordered dict + # and is used to store the feature + # signature_dict - Is an ordered storage of Signature IDs. + signature_feature_dict = OrderedDict() # The feature is stored a str and not tuple to facilitate label encoding. 
diff --git a/s2and/featurizer.py b/s2and/featurizer.py
index e4c7a4b..a72b5d7 100644
--- a/s2and/featurizer.py
+++ b/s2and/featurizer.py
@@ -9,6 +9,7 @@
 import logging
 from collections import Counter
 from collections.abc import Iterable
+from collections import OrderedDict
 from IPython import embed
 
 from scipy.sparse import csr_matrix, coo_matrix
@@ -856,12 +857,16 @@
     Returns
     -------
     Returns two items:
-    1. Sparse matrix poitwise feature representation of all the signatures in a dataset.
+    1. Sparse matrix pointwise feature representation of all the signatures in a dataset.
     2. Label encoder for indexing signatures by their IDs.
     """
     # Do you think OrderedSet and OrderedDict should be used here?
-    signature_feature_set = set()  # The feature is stored as a str, not a tuple, to facilitate label encoding.
-    signature_dict = {}
+    # Using OrderedDicts so that insertion order is maintained.
+    # signature_feature_dict acts as an ordered set (an OrderedDict with unused values)
+    # and stores the features.
+    # signature_dict is an ordered mapping from signature IDs to their features.
+    signature_feature_dict = OrderedDict()  # The feature is stored as a str, not a tuple, to facilitate label encoding.
+    signature_dict = OrderedDict()
     # We don't need to iterate signatures per block, since we create features for all signatures irrespective of the block.
     logger.info('Creating signatures feature set...')
@@ -902,17 +907,23 @@
 
         if isinstance(value, str) or isinstance(value, int):
             index_key = str((feature_key, value))
-            signature_feature_set.add(index_key) # Converting to str from tuple.
+            #signature_feature_set.add(index_key) # Converting to str from tuple.
+            if index_key not in signature_feature_dict:
+                signature_feature_dict[index_key] = None
             signature_dict[signature_key].append(index_key)
         elif isinstance(value, Counter):
             for val in value.keys():
                 index_key = str((feature_key, val))
-                signature_feature_set.add(index_key)
+                #signature_feature_set.add(index_key)
+                if index_key not in signature_feature_dict:
+                    signature_feature_dict[index_key] = None
                 signature_dict[signature_key].append(index_key)
         elif isinstance(value, Iterable):
             for val in value:
                 index_key = str((feature_key, val))
-                signature_feature_set.add(index_key)
+                #signature_feature_set.add(index_key)
+                if index_key not in signature_feature_dict:
+                    signature_feature_dict[index_key] = None
                 signature_dict[signature_key].append(index_key)
         else:
             print('\n!!!! Found another type !!!!\n')
@@ -930,8 +941,12 @@
     }
     """
 
+    # Before label encoding, convert signature_feature_dict (an ordered dict)
+    # into a list of features by just taking its keys.
+    signature_feature_set = list(signature_feature_dict.keys())
+
     le_signature_feature_set = preprocessing.LabelEncoder()
-    le_signature_feature_set.fit(list(signature_feature_set))
+    le_signature_feature_set.fit(signature_feature_set)
 
     # Using this for easy retrieval of the training, val, and test blocks.
@@ -958,6 +973,21 @@
     print("Matrix creation done.")
     return point_features, le_signature_dict
 
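The OrderedDict-with-None-values pattern used above is the standard ordered-set idiom in Python. A minimal illustration, outside the patch (plain dicts also preserve insertion order on Python 3.7+):

    from collections import OrderedDict

    seen = OrderedDict()
    for feat in ["b", "a", "b", "c"]:
        if feat not in seen:
            seen[feat] = None
    print(list(seen.keys()))   # ['b', 'a', 'c'] -- duplicates dropped, order kept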
+def create_pointwise_block(object_list, pointwise_matrix, le_signatures):
+    # object_list maps block ids to lists of signature objects.
+    # For each block, collect the signature IDs, label-encode them to row indices,
+    # and slice those rows out of the pointwise matrix.
+
+    pointwise_features = {}
+    for key, val in object_list.items():
+        # Here val is a list of signature objects; get the ID from each of them.
+        signatures_ids = [sig.signature_id for sig in val]
+        encoded_signature_id_list = le_signatures.transform(signatures_ids)
+        pointwise_features[key] = (signatures_ids, pointwise_matrix[encoded_signature_id_list, :])
+
+    return pointwise_features
+
 
 def store_featurized_pickles(
     dataset: ANDData,
@@ -969,6 +999,8 @@
     nan_value: float = np.nan,
     delete_training_data: bool = False,
     random_seed: int = 1,
+    pointwise_matrix = None,
+    le_signatures = None,
 ) -> Union[Tuple[TupleOfArrays, TupleOfArrays, TupleOfArrays], TupleOfArrays]:
     """
     Featurizes the input dataset and stores as preprocessed data in pickle files
@@ -1119,18 +1151,41 @@
     val_signatures_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/seed{random_seed}/val_signatures.pkl"
     test_signatures_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/seed{random_seed}/test_signatures.pkl"
 
+    train_object_list: Dict[str, List[Signature]] = dataset.get_signature_objects(train_signatures)
+    val_object_list: Dict[str, List[Signature]] = dataset.get_signature_objects(val_signatures)
+    test_object_list: Dict[str, List[Signature]] = dataset.get_signature_objects(test_signatures)
+
     if(not os.path.isfile(train_signatures_pkl)):
-        train_object_list: Dict[str, List[Signature]] = dataset.get_signature_objects(train_signatures)
+        #train_object_list = dataset.get_signature_objects(train_signatures)
         with open(train_signatures_pkl, "wb") as _pkl_file:
             pickle.dump(train_object_list, _pkl_file)
 
-        val_object_list: Dict[str, List[Signature]] = dataset.get_signature_objects(val_signatures)
+        #val_object_list = dataset.get_signature_objects(val_signatures)
         with open(val_signatures_pkl, "wb") as _pkl_file:
             pickle.dump(val_object_list, _pkl_file)
 
-        test_object_list: Dict[str, List[Signature]] = dataset.get_signature_objects(test_signatures)
+        #test_object_list = dataset.get_signature_objects(test_signatures)
         with open(test_signatures_pkl, "wb") as _pkl_file:
             pickle.dump(test_object_list, _pkl_file)
 
+    # Now utilize this to create the train, val, and test pointwise features per block.
+
+    train_pointwise_features = create_pointwise_block(train_object_list, pointwise_matrix, le_signatures)
+    validation_pointwise_features = create_pointwise_block(val_object_list, pointwise_matrix, le_signatures)
+    test_pointwise_features = create_pointwise_block(test_object_list, pointwise_matrix, le_signatures)
+
+    if(not os.path.exists(f"{PREPROCESSED_DATA_DIR}/{dataset.name}/pointwise/seed{random_seed}")):
+        os.makedirs(f"{PREPROCESSED_DATA_DIR}/{dataset.name}/pointwise/seed{random_seed}")
+
+    train_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/pointwise/seed{random_seed}/train_signature_features.pkl"
+    val_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/pointwise/seed{random_seed}/val_signature_features.pkl"
+    test_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset.name}/pointwise/seed{random_seed}/test_signature_features.pkl"
+
+    with open(train_pkl,"wb") as _pkl_file:
+        pickle.dump(train_pointwise_features, _pkl_file)
+    with open(val_pkl,"wb") as _pkl_file:
+        pickle.dump(validation_pointwise_features, _pkl_file)
+    with open(test_pkl,"wb") as _pkl_file:
+        pickle.dump(test_pointwise_features, _pkl_file)
 
     return train_pkl, val_pkl, test_pkl
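A sketch of the end-to-end call order implied by this series; the names mirror the scripts above, but the wiring shown here is illustrative rather than a verbatim excerpt:

    # Build the global signature feature matrix and its signature label encoder once.
    point_features_mat, le_signatures = create_signature_features_matrix(data_home_dir, dataset)

    # Then, per seed: split, featurize pairs, write per-block pointwise pickles,
    # and validate that block and signature order match the signature pickles.
    for seed in random_seeds:
        train_pkl, val_pkl, test_pkl = store_featurized_pickles(
            AND_dataset, featurization_info, n_jobs=16, use_cache=False,
            random_seed=seed, pointwise_matrix=point_features_mat,
            le_signatures=le_signatures)
        validate_pointwise_featurizer(AND_dataset)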