Merge pull request #32 from aimclub/link_predict
Link predict
ShikovEgor authored Jan 11, 2024
2 parents f355128 + ec9867f commit 7af183b
Showing 22 changed files with 1,027,437 additions and 955 deletions.
37 changes: 30 additions & 7 deletions stable_gnn/embedding/embedding_factory.py
@@ -1,4 +1,4 @@
from typing import Any, Dict
from typing import Any, Dict, List

import torch
from numpy.typing import NDArray
@@ -13,9 +13,13 @@ class EmbeddingFactory:
"""Producing unsupervised embeddings for a given dataset"""

@staticmethod
def _build_embeddings(loss: Dict[str, Any], data: Graph, conv: str, device: device) -> NDArray:
optuna_training = OptunaTrainEmbeddings(data=data, conv=conv, device=device, loss_function=loss)
best_values = optuna_training.run(number_of_trials=50)
def _build_embeddings(
loss: Dict[str, Any], data: Graph, conv: str, device: device, number_of_trials: int, tune_out: bool = False
) -> NDArray:
optuna_training = OptunaTrainEmbeddings(
data=data, conv=conv, device=device, loss_function=loss, tune_out=tune_out
)
best_values = optuna_training.run(number_of_trials=number_of_trials)

loss_trgt = dict()
for par in loss:
@@ -28,7 +32,9 @@ def _build_embeddings(loss: Dict[str, Any], data: Graph, conv: str, device: devi
if "lmbda" in loss_trgt:
loss_trgt["lmbda"] = best_values["lmbda"]

model_training = ModelTrainEmbeddings(data=data, conv=conv, device=device, loss_function=loss_trgt)
model_training = ModelTrainEmbeddings(
data=data, conv=conv, device=device, loss_function=loss_trgt, tune_out=tune_out
)
out = model_training.run(best_values)
torch.cuda.empty_cache()
return out.detach().cpu().numpy()
@@ -77,15 +83,32 @@ def _get_emb_settings(loss_name: str) -> Dict[str, Any]:
else:
raise NameError

def build_embeddings(self, loss_name: str, conv: str, data: Graph, device: device) -> NDArray:
def build_embeddings(
self,
loss_name: str,
conv: str,
data: List[Graph],
device: device,
number_of_trials: int,
tune_out: bool = False,
) -> NDArray:
"""Build embeddings based on passed dataset and settings
:param loss_name: (str): Name of loss function for embedding learning in GeomGCN layer
:param conv: (str) Name of convolution used in unsupervied embeddings
:param data: (Graph): Input Graph
:param device: (device): Device 'cuda' or 'cpu'
:param number_of_trials (int): Number of trials for optuna tuning embeddings
:param tune_out (bool): Flag if you want tune out layer of embeddings
:returns: (NDArray) embeddings NumPy array of (N_nodes) x (N_emb_dim)
"""
loss_params = self._get_emb_settings(loss_name)
emb = self._build_embeddings(loss=loss_params, data=data[0], conv=conv, device=device)
emb = self._build_embeddings(
loss=loss_params,
data=data[0],
conv=conv,
device=device,
number_of_trials=number_of_trials,
tune_out=tune_out,
)
return emb
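For context, a minimal sketch of how the updated factory API might be called after this change; the dataset variable and the loss name are assumptions, not part of this PR:

```python
import torch

from stable_gnn.embedding.embedding_factory import EmbeddingFactory

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# `graphs` is assumed to be a List[Graph] built elsewhere with stable_gnn's
# dataset utilities; only graphs[0] is used inside build_embeddings.
factory = EmbeddingFactory()
embeddings = factory.build_embeddings(
    loss_name="APP",         # hypothetical loss name; must be one known to _get_emb_settings
    conv="GCN",
    data=graphs,
    device=device,
    number_of_trials=50,     # previously hard-coded to 50, now exposed as a parameter
    tune_out=True,           # let Optuna also search the output embedding dimension
)
print(embeddings.shape)      # (N_nodes, N_emb_dim)
```

With tune_out=False the output dimension stays at the previous fixed value of 2, so existing callers keep their behaviour as long as they pass an explicit number_of_trials.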
20 changes: 16 additions & 4 deletions stable_gnn/embedding/model_train_embeddings.py
@@ -21,21 +21,25 @@ class ModelTrainEmbeddings:
:param loss_function: (dict): Dict of parameters of unsupervised loss function
:param conv: (str): Name of convolution (default:'GCN')
:param device: (device): Either 'cuda' or 'cpu' (default:'cuda')
:param tune_out: (bool): Whether to tune the size of the output layer; if False it is fixed to 2 (as for GeomGCN)
"""

def __init__(self, data: Graph, loss_function: Dict, device: device, conv: str = "GCN") -> None:
def __init__(
self, data: Graph, loss_function: Dict, device: device, conv: str = "GCN", tune_out: bool = False
) -> None:
self.conv = conv
self.device = device
self.x = data.x
self.y = data.y.squeeze()
self.data = data.to(device)
self.train_mask = torch.Tensor([True] * data.num_nodes)
self.loss = loss_function
self.tune_out = tune_out
super(ModelTrainEmbeddings, self).__init__()

def _sampling(self, sampler: BaseSampler, epoch: int, nodes: Tensor) -> None:
if epoch == 0:
self.samples = sampler.sample(nodes)
self.samples = sampler.sample(nodes.to(self.device))

def _train(
self,
@@ -76,6 +80,10 @@ def run(self, params: Dict) -> Tensor:
:return: (Tensor): The output embeddings
"""
hidden_layer = params["hidden_layer"]
if self.tune_out:
out_layer = params["out_layer"]
else:
out_layer = 2
dropout = params["dropout"]
size = params["size of network, number of convs"]
learning_rate = params["lr"]
Expand All @@ -90,7 +98,7 @@ def run(self, params: Dict) -> Tensor:
loss_function=self.loss,
device=self.device,
hidden_layer=hidden_layer,
out_layer=2,
out_layer=out_layer,
num_layers=size,
dropout=dropout,
)
@@ -120,6 +128,10 @@ def _objective(self, trial: Trial) -> Tensor:
dropout = trial.suggest_float("dropout", 0.0, 0.5, step=0.1)
size = trial.suggest_categorical("size of network, number of convs", [1, 2, 3])
learning_rate = trial.suggest_float("lr", 5e-3, 1e-2)
if self.tune_out:
out_layer = trial.suggest_categorical("out_layer", [32, 64, 128])
else:
out_layer = 2

loss_to_train = {}
for name in self.loss:
@@ -158,7 +170,7 @@ def _objective(self, trial: Trial) -> Tensor:
loss_function=loss_to_train,
device=self.device,
hidden_layer=hidden_layer,
out_layer=2,
out_layer=out_layer,
num_layers=size,
dropout=dropout,
)
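The tune_out flag follows a standard Optuna pattern: a hyperparameter is suggested only when the flag enables it and otherwise falls back to a fixed default (2 here). A self-contained sketch of that pattern with a toy objective, not the stable_gnn trainer itself:

```python
import optuna


def make_objective(tune_out: bool):
    def objective(trial: optuna.Trial) -> float:
        hidden_layer = trial.suggest_categorical("hidden_layer", [64, 128, 256])
        # Search the output size only when requested; otherwise keep the fixed
        # value of 2 that the original code always used.
        out_layer = trial.suggest_categorical("out_layer", [32, 64, 128]) if tune_out else 2
        # Toy score standing in for the training loss of the embedding model.
        return abs(hidden_layer / out_layer - 4.0)

    return objective


study = optuna.create_study(direction="minimize")
study.optimize(make_objective(tune_out=True), n_trials=10)
print(study.best_params)
```

When the flag is off, best_params contains no "out_layer" key, which is why run() only reads params["out_layer"] behind the same tune_out check.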
2 changes: 1 addition & 1 deletion stable_gnn/embedding/sampling/abstract_samplers.py
@@ -99,7 +99,7 @@ def _sample_negative(self, batch: Tensor, num_negative_samples: int) -> Tensor:
:param num_negative_samples: (int): number of negative samples for each edge
:return: (Tensor): Negative samples
"""
a, _ = subgraph(batch, self.data.edge_index)
a, _ = subgraph(batch, self.data.edge_index.to(self.device))
adj = self._adj_list(a)
g = dict()
batch = batch.tolist()
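The change above moves edge_index onto the sampler's device before calling subgraph, avoiding a device-mismatch error when batches live on CUDA. A small standalone illustration of the same call with consistent placement (toy graph, CPU fallback if CUDA is unavailable):

```python
import torch
from torch_geometric.utils import subgraph

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Toy graph: 4 nodes, undirected chain 0-1-2-3 stored as directed pairs.
edge_index = torch.tensor([[0, 1, 1, 2, 2, 3],
                           [1, 0, 2, 1, 3, 2]], dtype=torch.long)
batch = torch.tensor([1, 2, 3], device=device)  # nodes kept in the induced subgraph

# Mirror the fix: bring edge_index to the batch's device before slicing.
sub_edge_index, _ = subgraph(batch, edge_index.to(device))
print(sub_edge_index)  # only edges with both endpoints in {1, 2, 3}
```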
36 changes: 34 additions & 2 deletions stable_gnn/embedding/sampling/samplers.py
@@ -22,6 +22,39 @@
from stable_gnn.embedding.sampling.abstract_samplers import BaseSampler, BaseSamplerWithNegative


class NegativeSampler(BaseSamplerWithNegative):
"""
Sampler for negative edges for a batch of nodes
:param data: (Graph): Input dataset
:param device: (device): Either 'cuda' or 'cpu'
:param loss_info: (dict): Dict of parameters of unsupervised loss function
"""

def __init__(self, data: Graph, device: device, loss_info: Dict) -> None:
super().__init__(data, device, loss_info)

def _neg_sample(self, batch: Tensor) -> Tensor:
a, _ = subgraph(batch.tolist(), self.data.edge_index)
batch = batch.repeat(self.num_negative_samples)
neg_batch = self._sample_negative(batch, num_negative_samples=self.num_negative_samples)
return neg_batch

def negative_sample(self, batch: Batch) -> Tensor:
"""
Sample negative edges for the batch nodes
:param batch: (Batch): Nodes to draw negative samples for
:return: (Tensor): negative samples
"""
if not isinstance(batch, torch.Tensor):
batch = torch.tensor(batch, dtype=torch.long).to(self.device)
return self._neg_sample(batch)

def _pos_sample(self, batch: Tensor) -> Tensor:
pass


class SamplerRandomWalk(BaseSamplerWithNegative):
"""
Sampler for positive and negative edges using random walk based methods
@@ -274,8 +307,7 @@ def _pos_sample(self, batch: Tensor) -> Tensor:
len_batch = len(batch)
mask = torch.tensor([False] * len(self.data.x))
mask[batch.tolist()] = True

a, _ = subgraph(batch, self.data.edge_index)
a, _ = subgraph(batch, self.data.edge_index.to(self.device))
row, col = a
row = row.to(self.device)
col = col.to(self.device)
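Finally, a hedged sketch of driving the new NegativeSampler; the graph variable and the loss_info keys are assumptions (the base sampler is expected to read the number of negative samples from that dict), and only the constructor and negative_sample signatures come from this diff:

```python
import torch

from stable_gnn.embedding.sampling.samplers import NegativeSampler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# `graph` is assumed to be a stable_gnn Graph already moved to `device`;
# the key below is a guess at how BaseSamplerWithNegative reads its settings.
loss_info = {"num_negative_samples": 5}

sampler = NegativeSampler(data=graph, device=device, loss_info=loss_info)

batch = torch.arange(32, device=device)     # node ids of the current mini-batch
negatives = sampler.negative_sample(batch)  # negative samples for these nodes
print(negatives.shape)
```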
