From f1508be590d1919d71113ac0f11d1aff6da152b1 Mon Sep 17 00:00:00 2001 From: CodeLionX Date: Fri, 30 Jun 2023 15:46:59 +0200 Subject: [PATCH 1/6] feat: prepare skeleton for GDN algorithm --- gdn/Dockerfile | 12 +++++ gdn/LICENSE | 21 +++++++++ gdn/README.md | 32 ++++++++++++++ gdn/algorithm.py | 101 +++++++++++++++++++++++++++++++++++++++++++ gdn/manifest.json | 98 +++++++++++++++++++++++++++++++++++++++++ gdn/requirements.txt | 4 ++ 6 files changed, 268 insertions(+) create mode 100644 gdn/Dockerfile create mode 100644 gdn/LICENSE create mode 100644 gdn/README.md create mode 100644 gdn/algorithm.py create mode 100644 gdn/manifest.json create mode 100644 gdn/requirements.txt diff --git a/gdn/Dockerfile b/gdn/Dockerfile new file mode 100644 index 0000000..8decce8 --- /dev/null +++ b/gdn/Dockerfile @@ -0,0 +1,12 @@ +FROM registry.gitlab.hpi.de/akita/i/python3-base:0.2.5 + +LABEL maintainer="sebastian.schmidl@hpi.de" + +ENV ALGORITHM_MAIN="/app/algorithm.py" + +# install algorithm dependencies +COPY requirements.txt /app/ +RUN pip install -r /app/requirements.txt +RUN pip install --no-index torch-scatter torch-sparse torch-cluster torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.5.0+cu102.html + +COPY algorithm.py /app/ diff --git a/gdn/LICENSE b/gdn/LICENSE new file mode 100644 index 0000000..89d9369 --- /dev/null +++ b/gdn/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021-2023 d-ailin and Sebastian Schmidl + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/gdn/README.md b/gdn/README.md new file mode 100644 index 0000000..b193011 --- /dev/null +++ b/gdn/README.md @@ -0,0 +1,32 @@ +# DeepNAP + +||| +| :--- | :--- | +| Citekey | KimEtAl2018DeepNAP | +| Source Code | `own` | +| Learning type | semi-supervised | +| Input dimensionality | multivariate | +||| + +## Dependencies + +- python 3 +- pytorch + +## Notes + +DeepNAP outputs anomaly scores for windows. +The results require post-processing. +The scores for each point can be assigned by aggregating the anomaly scores for each window the point is included in. 
+ +You can use the following code snippet for the post-processing step in TimeEval (default parameters directly filled in from the source code): + + +```python +from timeeval.utils.window import ReverseWindowing +# post-processing for DeepNAP +def post_deepnap(scores: np.ndarray, args: dict) -> np.ndarray: + window_size = args.get("hyper_params", {}).get("anomaly_window_size", 15) + return ReverseWindowing(window_size=window_size * 2).fit_transform(scores) +``` + diff --git a/gdn/algorithm.py b/gdn/algorithm.py new file mode 100644 index 0000000..50ca65e --- /dev/null +++ b/gdn/algorithm.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +import argparse +import json +import sys +import numpy as np +import pandas as pd +import torch + +from dataclasses import dataclass + + +@dataclass +class CustomParameters: + window_size: int = 15 + stride: int = 5 + latent_size: int = 64 + n_out_layers: int = 1 + out_layer_dimensionality: int = 1 + epochs: int = 1 + batch_size: int = 128 + split: float = 0.9 + learning_rate_decay: float = 0.001 + random_state: int = 42 + + +class AlgorithmArgs(argparse.Namespace): + @property + def ts(self) -> np.ndarray: + return self.df.iloc[:, 1:-1].values + + @property + def df(self) -> pd.DataFrame: + return pd.read_csv(self.dataInput) + + @staticmethod + def from_sys_args() -> 'AlgorithmArgs': + args: dict = json.loads(sys.argv[1]) + custom_parameter_keys = dir(CustomParameters()) + filtered_parameters = dict( + filter(lambda x: x[0] in custom_parameter_keys, args.get("customParameters", {}).items())) + args["customParameters"] = CustomParameters(**filtered_parameters) + return AlgorithmArgs(**args) + + +def train(args: AlgorithmArgs): + ts = args.ts + + train_config = { + "batch": args.customParameters.batch_size, + "epoch": args.customParameters.epochs, + "slide_win": args.customParameters.window_size, + "dim": args.customParameters.latent_size, + "slide_stride": args.customParameters.stride, + "comment": "TimeEval execution", + "seed": args.customParameters.random_state, + "out_layer_num": args.customParameters.n_out_layers, + "out_layer_inter_dim": args.customParameters.out_layer_dimensionality, + "decay": args.customParameters.learning_rate_decay, + "val_ratio": args.customParameters.split, + "topk": 20, + } + + raise NotImplementedError("GDN is not implemented yet!") + + +def execute(args: AlgorithmArgs): + ts = args.ts + + env_config={ + "save_path": ..., + "dataset": ..., + "report": ..., + "device": ..., + "load_model_path": ... 
+ } + + raise NotImplementedError("GDN is not implemented yet!") + +def set_random_state(config: AlgorithmArgs) -> None: + seed = config.customParameters.random_state + import random + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Wrong number of arguments specified; expected a single json-string!") + exit(1) + + args = AlgorithmArgs.from_sys_args() + set_random_state(args) + print(f"AlgorithmArgs: {args}") + + if args.executionType == "train": + train(args) + elif args.executionType == "execute": + execute(args) + else: + raise ValueError(f"Unknown execution type '{args.executionType}'; expected either 'train' or 'execute'!") diff --git a/gdn/manifest.json b/gdn/manifest.json new file mode 100644 index 0000000..d061cec --- /dev/null +++ b/gdn/manifest.json @@ -0,0 +1,98 @@ +{ + "title": "GDN", + "description": "Implementation of https://doi.org/10.1609/aaai.v35i5.16523", + "inputDimensionality": "multivariate", + "version": "0.1", + "authors": "Ailin Deng, Bryan Hooi", + "language": "Python", + "type": "Detector", + "mainFile": "algorithm.py", + "learningType": "semi-supervised", + "trainingStep": { + "parameters": [ + { + "name": "window_size", + "type": "int", + "defaultValue": 15, + "optional": "true", + "description": "Size of the sliding windows" + }, + { + "name": "stride", + "type": "int", + "defaultValue": 5, + "optional": "true", + "description": "Stride for the sliding windows" + }, + { + "name": "latent_size", + "type": "int", + "defaultValue": 64, + "optional": "true", + "description": "Dimensionality of the latent embedding space" + }, + { + "name": "n_out_layers", + "type": "int", + "defaultValue": 1, + "optional": "true", + "description": "Number of output layers" + }, + { + "name": "out_layer_dimensionality", + "type": "int", + "defaultValue": 1, + "optional": "true", + "description": "Interim dimensionality of the output layers." 
+ }, + { + "name": "epochs", + "type": "int", + "defaultValue": 1, + "optional": "true", + "description": "Number of training iterations over entire dataset; recommended value: 100" + }, + { + "name": "batch_size", + "type": "int", + "defaultValue": 128, + "optional": "true", + "description": "Number of instances trained at the same time" + }, + { + "name": "split", + "type": "float", + "defaultValue": 0.9, + "optional": "true", + "description": "Train-validation split" + }, + { + "name": "learning_rate_decay", + "type": "float", + "defaultValue": 0.001, + "optional": "true", + "description": "Learning rate decay for Adam optimizer" + }, + { + "name": "random_state", + "type": "int", + "defaultValue": 42, + "optional": "true", + "description": "Seed for the random number generator" + } + ], + "modelInput": "none" + }, + "executionStep": { + "parameters": [ + { + "name": "random_state", + "type": "int", + "defaultValue": 42, + "optional": "true", + "description": "Seed for the random number generator" + } + ], + "modelInput": "required" + } +} diff --git a/gdn/requirements.txt b/gdn/requirements.txt new file mode 100644 index 0000000..8db9a89 --- /dev/null +++ b/gdn/requirements.txt @@ -0,0 +1,4 @@ +numpy>=1.19.5 +pandas>=1.2.1 +torch==1.5.1 +torch-geometric==1.5.0 From 3dc40063a063c265d126476ae5dfd954f7b9c95c Mon Sep 17 00:00:00 2001 From: CodeLionX Date: Fri, 30 Jun 2023 15:52:18 +0200 Subject: [PATCH 2/6] doc(gdn): add GDN to global README and describe the basic in its README --- README.md | 1 + gdn/README.md | 26 +++++--------------------- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 092bb8a..7ade96f 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,7 @@ The namespace prefix (repository) for the built Docker images is `registry.gitla | [fft](./fft) | `registry.gitlab.hpi.de/akita/i/fft` | python 3.7 | [`registry.gitlab.hpi.de/akita/i/python3-base`](./0-base-images/python3-base) | unsupervised | univariate | | [generic_rf](./generic_rf) | `registry.gitlab.hpi.de/akita/i/generic_rf` | python 3.7 | [`registry.gitlab.hpi.de/akita/i/python3-base`](./0-base-images/python3-base) | semi-supervised | univariate | | [generic_xgb](./generic_xgb) | `registry.gitlab.hpi.de/akita/i/generic_xgb` | python 3.7 | [`registry.gitlab.hpi.de/akita/i/python3-base`](./0-base-images/python3-base) | semi-supervised | univariate | +| [gdn](./gdn) | `registry.gitlab.hpi.de/akita/i/gdn` | python 3.7 | [`registry.gitlab.hpi.de/akita/i/python3-base`](./0-base-images/python3-base) | semi-supervised | multivariate | | [grammarviz3](./grammarviz3) | `registry.gitlab.hpi.de/akita/i/grammarviz3` | Java| [`registry.gitlab.hpi.de/akita/i/java-base`](./0-base-images/java-base) | unsupervised | univariate | | [grammarviz3_multi](./grammarviz3_multi) | `registry.gitlab.hpi.de/akita/i/grammarviz3_multi` | Java| [`registry.gitlab.hpi.de/akita/i/java-base`](./0-base-images/java-base) | unsupervised | multivariate | | [hbos](./hbos) | `registry.gitlab.hpi.de/akita/i/hbos` | python 3.7 | [`registry.gitlab.hpi.de/akita/i/pyod`](./0-base-images/pyod) -> [`registry.gitlab.hpi.de/akita/i/python3-base`](./0-base-images/python3-base) | unsupervised | multivariate | diff --git a/gdn/README.md b/gdn/README.md index b193011..4efe63d 100644 --- a/gdn/README.md +++ b/gdn/README.md @@ -1,9 +1,9 @@ -# DeepNAP +# GDN ||| | :--- | :--- | -| Citekey | KimEtAl2018DeepNAP | -| Source Code | `own` | +| Citekey | DengEtAl2021Graph | +| Source Code | 
[https://github.com/d-ailin/GDN](https://github.com/d-ailin/GDN) | | Learning type | semi-supervised | | Input dimensionality | multivariate | ||| @@ -12,21 +12,5 @@ - python 3 - pytorch - -## Notes - -DeepNAP outputs anomaly scores for windows. -The results require post-processing. -The scores for each point can be assigned by aggregating the anomaly scores for each window the point is included in. - -You can use the following code snippet for the post-processing step in TimeEval (default parameters directly filled in from the source code): - - -```python -from timeeval.utils.window import ReverseWindowing -# post-processing for DeepNAP -def post_deepnap(scores: np.ndarray, args: dict) -> np.ndarray: - window_size = args.get("hyper_params", {}).get("anomaly_window_size", 15) - return ReverseWindowing(window_size=window_size * 2).fit_transform(scores) -``` - +- pytorch-geometric +- torch-scatter, torch-sparse, torch-cluster, torch-spline-conv From 91c0fb1b8d946a114f758be0042c61e1981a64e9 Mon Sep 17 00:00:00 2001 From: 2er0 <2er0@dbaumi.at> Date: Tue, 4 Jul 2023 23:05:13 +0200 Subject: [PATCH 3/6] WIP not tested draft of GDN integration --- gdn/Dockerfile | 1 + gdn/GDN/LICENSE | 21 ++ gdn/GDN/README.md | 79 +++++++ gdn/GDN/__init__.py | 0 gdn/GDN/datasets/TimeDataset.py | 78 +++++++ gdn/GDN/evaluate.py | 159 +++++++++++++ gdn/GDN/install.sh | 5 + gdn/GDN/main.py | 383 ++++++++++++++++++++++++++++++++ gdn/GDN/models/GDN.py | 188 ++++++++++++++++ gdn/GDN/models/graph_layer.py | 124 +++++++++++ gdn/GDN/run.sh | 59 +++++ gdn/GDN/test.py | 83 +++++++ gdn/GDN/train.py | 112 ++++++++++ gdn/GDN/util/data.py | 126 +++++++++++ gdn/GDN/util/env.py | 15 ++ gdn/GDN/util/iostream.py | 106 +++++++++ gdn/GDN/util/net_struct.py | 58 +++++ gdn/GDN/util/preprocess.py | 116 ++++++++++ gdn/GDN/util/time.py | 28 +++ gdn/algorithm.py | 44 +++- 20 files changed, 1775 insertions(+), 10 deletions(-) create mode 100644 gdn/GDN/LICENSE create mode 100644 gdn/GDN/README.md create mode 100644 gdn/GDN/__init__.py create mode 100644 gdn/GDN/datasets/TimeDataset.py create mode 100644 gdn/GDN/evaluate.py create mode 100644 gdn/GDN/install.sh create mode 100644 gdn/GDN/main.py create mode 100644 gdn/GDN/models/GDN.py create mode 100644 gdn/GDN/models/graph_layer.py create mode 100644 gdn/GDN/run.sh create mode 100644 gdn/GDN/test.py create mode 100644 gdn/GDN/train.py create mode 100644 gdn/GDN/util/data.py create mode 100644 gdn/GDN/util/env.py create mode 100644 gdn/GDN/util/iostream.py create mode 100644 gdn/GDN/util/net_struct.py create mode 100644 gdn/GDN/util/preprocess.py create mode 100644 gdn/GDN/util/time.py diff --git a/gdn/Dockerfile b/gdn/Dockerfile index 8decce8..202770f 100644 --- a/gdn/Dockerfile +++ b/gdn/Dockerfile @@ -10,3 +10,4 @@ RUN pip install -r /app/requirements.txt RUN pip install --no-index torch-scatter torch-sparse torch-cluster torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.5.0+cu102.html COPY algorithm.py /app/ +COPY GDN /app/GDN diff --git a/gdn/GDN/LICENSE b/gdn/GDN/LICENSE new file mode 100644 index 0000000..956d782 --- /dev/null +++ b/gdn/GDN/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 d-ailin + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the 
Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/gdn/GDN/README.md b/gdn/GDN/README.md new file mode 100644 index 0000000..7465300 --- /dev/null +++ b/gdn/GDN/README.md @@ -0,0 +1,79 @@ +# GDN + +Code implementation for : [Graph Neural Network-Based Anomaly Detection in Multivariate Time Series(AAAI'21)](https://arxiv.org/pdf/2106.06947.pdf) + + +# Installation +### Requirements +* Python >= 3.6 +* cuda == 10.2 +* [Pytorch==1.5.1](https://pytorch.org/) +* [PyG: torch-geometric==1.5.0](https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html) + +### Install packages +``` + # run after installing correct Pytorch package + bash install.sh +``` + +### Quick Start +Run to check if the environment is ready +``` + bash run.sh cpu msl + # or with gpu + bash run.sh msl # e.g. bash run.sh 1 msl +``` + + +# Usage +We use part of msl dataset(refer to [telemanom](https://github.com/khundman/telemanom)) as demo example. + +## Data Preparation +``` +# put your dataset under data/ directory with the same structure shown in the data/msl/ + +data + |-msl + | |-list.txt # the feature names, one feature per line + | |-train.csv # training data + | |-test.csv # test data + |-your_dataset + | |-list.txt + | |-train.csv + | |-test.csv + | ... + +``` + +### Notices: +* The first column in .csv will be regarded as index column. +* The column sequence in .csv don't need to match the sequence in list.txt, we will rearrange the data columns according to the sequence in list.txt. +* test.csv should have a column named "attack" which contains ground truth label(0/1) of being attacked or not(0: normal, 1: attacked) + +## Run +``` + # using gpu + bash run.sh + + # or using cpu + bash run.sh cpu +``` +You can change running parameters in the run.sh. 
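To make the layout described under "Data Preparation" above concrete, here is a minimal sketch that writes a toy dataset in that structure. It is illustrative only: the sensor names, the random data, and the `make_gdn_dataset` helper are assumptions, not part of this repository.

```python
# Sketch: generate the data/<name>/ layout expected by main.py
# (list.txt with one feature per line, train.csv, and test.csv with an "attack" label column).
from pathlib import Path

import numpy as np
import pandas as pd


def make_gdn_dataset(root: str, name: str, n_train: int = 200, n_test: int = 100) -> None:
    out = Path(root) / name
    out.mkdir(parents=True, exist_ok=True)

    features = ["sensor_1", "sensor_2"]  # illustrative feature names
    # list.txt lists one feature per line; the CSV columns are rearranged to this order
    (out / "list.txt").write_text("\n".join(features) + "\n")

    rng = np.random.default_rng(42)
    train = pd.DataFrame(rng.normal(size=(n_train, len(features))), columns=features)
    test = pd.DataFrame(rng.normal(size=(n_test, len(features))), columns=features)

    # test.csv must contain an "attack" column with 0/1 ground-truth labels (0: normal, 1: attacked)
    test["attack"] = 0
    test.loc[n_test // 2: n_test // 2 + 5, "attack"] = 1

    # the first column of each CSV is treated as the index column
    train.to_csv(out / "train.csv", index=True)
    test.to_csv(out / "test.csv", index=True)


if __name__ == "__main__":
    make_gdn_dataset("data", "your_dataset")
```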
+ +# Others +SWaT and WADI datasets can be requested from [iTrust](https://itrust.sutd.edu.sg/) + + +# Citation +If you find this repo or our work useful for your research, please consider citing the paper +``` +@inproceedings{deng2021graph, + title={Graph neural network-based anomaly detection in multivariate time series}, + author={Deng, Ailin and Hooi, Bryan}, + booktitle={Proceedings of the AAAI Conference on Artificial Intelligence}, + volume={35}, + number={5}, + pages={4027--4035}, + year={2021} +} +``` diff --git a/gdn/GDN/__init__.py b/gdn/GDN/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/gdn/GDN/datasets/TimeDataset.py b/gdn/GDN/datasets/TimeDataset.py new file mode 100644 index 0000000..8eb0b4c --- /dev/null +++ b/gdn/GDN/datasets/TimeDataset.py @@ -0,0 +1,78 @@ +import torch +from torch.utils.data import Dataset, DataLoader + +import torch.nn.functional as F +from sklearn.preprocessing import MinMaxScaler, StandardScaler +import numpy as np + + +class TimeDataset(Dataset): + def __init__(self, raw_data, edge_index, mode='train', config = None): + self.raw_data = raw_data + + self.config = config + self.edge_index = edge_index + self.mode = mode + + x_data = raw_data[:-1] + labels = raw_data[-1] + + + data = x_data + + # to tensor + data = torch.tensor(data).double() + labels = torch.tensor(labels).double() + + self.x, self.y, self.labels = self.process(data, labels) + + def __len__(self): + return len(self.x) + + + def process(self, data, labels): + x_arr, y_arr = [], [] + labels_arr = [] + + slide_win, slide_stride = [self.config[k] for k + in ['slide_win', 'slide_stride'] + ] + is_train = self.mode == 'train' + + node_num, total_time_len = data.shape + + rang = range(slide_win, total_time_len, slide_stride) if is_train else range(slide_win, total_time_len) + + for i in rang: + + ft = data[:, i-slide_win:i] + tar = data[:, i] + + x_arr.append(ft) + y_arr.append(tar) + + labels_arr.append(labels[i]) + + + x = torch.stack(x_arr).contiguous() + y = torch.stack(y_arr).contiguous() + + labels = torch.Tensor(labels_arr).contiguous() + + return x, y, labels + + def __getitem__(self, idx): + + feature = self.x[idx].double() + y = self.y[idx].double() + + edge_index = self.edge_index.long() + + label = self.labels[idx].double() + + return feature, y, label, edge_index + + + + + diff --git a/gdn/GDN/evaluate.py b/gdn/GDN/evaluate.py new file mode 100644 index 0000000..ae4110d --- /dev/null +++ b/gdn/GDN/evaluate.py @@ -0,0 +1,159 @@ +from util.data import * +import numpy as np +from sklearn.metrics import precision_score, recall_score, roc_auc_score, f1_score + + +def get_full_err_scores(test_result, val_result): + np_test_result = np.array(test_result) + np_val_result = np.array(val_result) + + all_scores = None + all_normals = None + feature_num = np_test_result.shape[-1] + + labels = np_test_result[2, :, 0].tolist() + + for i in range(feature_num): + test_re_list = np_test_result[:2,:,i] + val_re_list = np_val_result[:2,:,i] + + scores = get_err_scores(test_re_list, val_re_list) + normal_dist = get_err_scores(val_re_list, val_re_list) + + if all_scores is None: + all_scores = scores + all_normals = normal_dist + else: + all_scores = np.vstack(( + all_scores, + scores + )) + all_normals = np.vstack(( + all_normals, + normal_dist + )) + + return all_scores, all_normals + + +def get_final_err_scores(test_result, val_result): + full_scores, all_normals = get_full_err_scores(test_result, val_result, return_normal_scores=True) + + all_scores = np.max(full_scores, 
axis=0) + + return all_scores + + + +def get_err_scores(test_res, val_res): + test_predict, test_gt = test_res + val_predict, val_gt = val_res + + n_err_mid, n_err_iqr = get_err_median_and_iqr(test_predict, test_gt) + + test_delta = np.abs(np.subtract( + np.array(test_predict).astype(np.float64), + np.array(test_gt).astype(np.float64) + )) + epsilon=1e-2 + + err_scores = (test_delta - n_err_mid) / ( np.abs(n_err_iqr) +epsilon) + + smoothed_err_scores = np.zeros(err_scores.shape) + before_num = 3 + for i in range(before_num, len(err_scores)): + smoothed_err_scores[i] = np.mean(err_scores[i-before_num:i+1]) + + + return smoothed_err_scores + + + +def get_loss(predict, gt): + return eval_mseloss(predict, gt) + +def get_f1_scores(total_err_scores, gt_labels, topk=1): + print('total_err_scores', total_err_scores.shape) + # remove the highest and lowest score at each timestep + total_features = total_err_scores.shape[0] + + # topk_indices = np.argpartition(total_err_scores, range(total_features-1-topk, total_features-1), axis=0)[-topk-1:-1] + topk_indices = np.argpartition(total_err_scores, range(total_features-topk-1, total_features), axis=0)[-topk:] + + topk_indices = np.transpose(topk_indices) + + total_topk_err_scores = [] + topk_err_score_map=[] + # topk_anomaly_sensors = [] + + for i, indexs in enumerate(topk_indices): + + sum_score = sum( score for k, score in enumerate(sorted([total_err_scores[index, i] for j, index in enumerate(indexs)])) ) + + total_topk_err_scores.append(sum_score) + + final_topk_fmeas = eval_scores(total_topk_err_scores, gt_labels, 400) + + return final_topk_fmeas + +def get_val_performance_data(total_err_scores, normal_scores, gt_labels, topk=1): + total_features = total_err_scores.shape[0] + + topk_indices = np.argpartition(total_err_scores, range(total_features-topk-1, total_features), axis=0)[-topk:] + + total_topk_err_scores = [] + topk_err_score_map=[] + + total_topk_err_scores = np.sum(np.take_along_axis(total_err_scores, topk_indices, axis=0), axis=0) + + thresold = np.max(normal_scores) + + pred_labels = np.zeros(len(total_topk_err_scores)) + pred_labels[total_topk_err_scores > thresold] = 1 + + for i in range(len(pred_labels)): + pred_labels[i] = int(pred_labels[i]) + gt_labels[i] = int(gt_labels[i]) + + pre = precision_score(gt_labels, pred_labels) + rec = recall_score(gt_labels, pred_labels) + + f1 = f1_score(gt_labels, pred_labels) + + + auc_score = roc_auc_score(gt_labels, total_topk_err_scores) + + return f1, pre, rec, auc_score, thresold + + +def get_best_performance_data(total_err_scores, gt_labels, topk=1): + + total_features = total_err_scores.shape[0] + + # topk_indices = np.argpartition(total_err_scores, range(total_features-1-topk, total_features-1), axis=0)[-topk-1:-1] + topk_indices = np.argpartition(total_err_scores, range(total_features-topk-1, total_features), axis=0)[-topk:] + + total_topk_err_scores = [] + topk_err_score_map=[] + + total_topk_err_scores = np.sum(np.take_along_axis(total_err_scores, topk_indices, axis=0), axis=0) + + final_topk_fmeas ,thresolds = eval_scores(total_topk_err_scores, gt_labels, 400, return_thresold=True) + + th_i = final_topk_fmeas.index(max(final_topk_fmeas)) + thresold = thresolds[th_i] + + pred_labels = np.zeros(len(total_topk_err_scores)) + pred_labels[total_topk_err_scores > thresold] = 1 + + for i in range(len(pred_labels)): + pred_labels[i] = int(pred_labels[i]) + gt_labels[i] = int(gt_labels[i]) + + pre = precision_score(gt_labels, pred_labels) + rec = recall_score(gt_labels, pred_labels) + + 
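+    # note: ROC-AUC is threshold-free here; it uses the continuous summed top-k error scores directly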
auc_score = roc_auc_score(gt_labels, total_topk_err_scores) + + return max(final_topk_fmeas), pre, rec, auc_score, thresold + diff --git a/gdn/GDN/install.sh b/gdn/GDN/install.sh new file mode 100644 index 0000000..5b4f4e7 --- /dev/null +++ b/gdn/GDN/install.sh @@ -0,0 +1,5 @@ +pip install --no-index torch-scatter -f https://pytorch-geometric.com/whl/torch-1.5.0+cu102.html +pip install --no-index torch-sparse -f https://pytorch-geometric.com/whl/torch-1.5.0+cu102.html +pip install --no-index torch-cluster -f https://pytorch-geometric.com/whl/torch-1.5.0+cu102.html +pip install --no-index torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.5.0+cu102.html +pip install torch-geometric==1.5.0 \ No newline at end of file diff --git a/gdn/GDN/main.py b/gdn/GDN/main.py new file mode 100644 index 0000000..a57ffa9 --- /dev/null +++ b/gdn/GDN/main.py @@ -0,0 +1,383 @@ +# -*- coding: utf-8 -*- +import pickle as pkl +from typing import List, Any + +import pandas as pd +import numpy as np +import torch +import matplotlib.pyplot as plt +from torch.utils.data import DataLoader, random_split, Subset + +from sklearn.preprocessing import MinMaxScaler + +from GDN.util.env import get_device, set_device +from GDN.util.preprocess import build_loc_net, construct_data +from GDN.util.net_struct import get_feature_map, get_fc_graph_struc +from GDN.util.iostream import printsep + +from GDN.datasets.TimeDataset import TimeDataset + +from GDN.models.GDN import GDN + +from GDN.train import train +from GDN.test import test +from GDN.evaluate import get_err_scores, get_best_performance_data, get_val_performance_data, get_full_err_scores + +import sys +from datetime import datetime + +import os +import argparse +from pathlib import Path + +import matplotlib.pyplot as plt + +import json +import random + + +def GDNtrain(train_config: dict, env_config: dict) -> None: + feature_map = get_feature_map(env_config["dataset"]) + fc_struc = get_fc_graph_struc(env_config["dataset"]) + + set_device(env_config["device"] + if "device" in env_config else + "cpu") + device = get_device() + + fc_edge_index = build_loc_net(fc_struc, list(env_config["dataset"].columns), feature_map=feature_map) + fc_edge_index = torch.tensor(fc_edge_index, dtype=torch.long) + + train_dataset_indata = construct_data(train, feature_map, labels=0) + + cfg = { + 'slide_win': train_config['slide_win'], + 'slide_stride': train_config['slide_stride'], + } + + train_dataset = TimeDataset(train_dataset_indata, fc_edge_index, mode='train', config=cfg) + train_dataloader, val_dataloader = get_loaders(train_dataset, train_config['seed'], train_config['batch'], + val_ratio=train_config['val_ratio']) + full_train_dataloader = DataLoader(train_dataset, batch_size=train_config['batch'], + shuffle=False, num_workers=0) + + edge_index_sets = [] + edge_index_sets.append(fc_edge_index) + + model = create_gdn_model(train_config, edge_index_sets, + feature_map, device) + + if "save_model_path" in env_config and len(env_config["save_model_path"]) > 0: + model_save_path = env_config["save_model_path"] + else: + model_save_path = get_save_path()[0] + + train_log = train(model, model_save_path, + config=train_config, + train_dataloader=train_dataloader, + val_dataloader=val_dataloader, + feature_map=feature_map, + test_dataloader=None, + test_dataset=None, + train_dataset=train_dataset, + dataset_name=env_config['dataset'] + ) + + _, train_result = test(model, full_train_dataloader) + save_result_output(train_result, env_config["dataOutput"]) + + 
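+    # persist train_config, feature_map and the fully-connected edge index so GDNtest() can rebuild the same model and graph later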
save_config_element(train_config, feature_map, fc_edge_index) + + +def GDNtest(env_config: dict) -> None: + elements = load_config_element() + train_config, feature_map, fc_edge_index = elements[0], elements[1], elements[2] + + set_device(env_config["device"] + if "device" in env_config else + "cpu") + device = get_device() + + edge_index_sets = [] + edge_index_sets.append(fc_edge_index) + + test = env_config["dataset"] + + test_dataset_indata = construct_data(test, feature_map, + labels=test["is_anomaly"].tolist()) + + cfg = { + 'slide_win': train_config['slide_win'], + 'slide_stride': train_config['slide_stride'], + } + + test_dataset = TimeDataset(test_dataset_indata, fc_edge_index, mode='test', config=cfg) + + test_dataloader = DataLoader(test_dataset, batch_size=train_config['batch'], + shuffle=False, num_workers=0) + + if "load_model_path" in env_config and len(env_config["load_model_path"]) > 0: + model_load_path = env_config["load_model_path"] + else: + model_load_path = get_save_path()[0] + + model = create_gdn_model(train_config, edge_index_sets, + feature_map, device) + + model.load_state_dict(torch.load(model_load_path)) + best_model = model.to(device) + + _, test_result = test(best_model, test_dataloader) + save_result_output(test_result, env_config["dataOutput"]) + + +def create_gdn_model(train_config, + edge_index_sets, feature_map, + device) -> GDN: + return GDN(edge_index_sets, len(feature_map), + dim=train_config['dim'], + input_dim=train_config['slide_win'], + out_layer_num=train_config['out_layer_num'], + out_layer_inter_dim=train_config['out_layer_inter_dim'], + topk=train_config['topk'] + ).to(device) + + +def get_loaders(train_dataset, seed, batch, val_ratio=0.1): + dataset_len = int(len(train_dataset)) + train_use_len = int(dataset_len * (1 - val_ratio)) + val_use_len = int(dataset_len * val_ratio) + val_start_index = random.randrange(train_use_len) + indices = torch.arange(dataset_len) + + train_sub_indices = torch.cat([indices[:val_start_index], indices[val_start_index + val_use_len:]]) + train_subset = Subset(train_dataset, train_sub_indices) + + val_sub_indices = indices[val_start_index:val_start_index + val_use_len] + val_subset = Subset(train_dataset, val_sub_indices) + + train_dataloader = DataLoader(train_subset, batch_size=batch, + shuffle=True) + + val_dataloader = DataLoader(val_subset, batch_size=batch, + shuffle=False) + + return train_dataloader, val_dataloader + + +def get_save_path(): + now = datetime.now() + datestr = now.strftime('%m|%d-%H:%M:%S') + + paths = [ + f'./pretrained/best_{datestr}.pt', + f'./results/results.tmp', + f'./pretrained/train_config.pkl', + f'./pretrained/feature_map.pkl', + f'./pretrained/fc_edge_index.pkl' + ] + + for path in paths: + dirname = os.path.dirname(path) + Path(dirname).mkdir(parents=True, exist_ok=True) + + return paths + + +def save_config_element(train_config, feature_map, fc_edge_index) -> None: + paths = get_save_path() + for p, e in zip(paths[-3:], + [train_config, feature_map, fc_edge_index]): + with open(p, 'wb') as file: + pkl.dump(e, file, protocol=pkl.HIGHEST_PROTOCOL) + + +def load_config_element() -> List[Any]: + paths = get_save_path() + elements = [] + for p in paths[-3:]: + if not Path(p).exists(): + raise FileNotFoundError("Base element not found in required path." 
+ "Run training first", p) + with open(p, 'rb') as file: + elements.append(pkl.load(file)) + return elements + + +def save_result_output(result, name) -> None: + path = get_save_path()[1].replace("results.tmp", name) + np_result = np.array(result) + np.savetxt(path, np_result, delimiter=",") + + +# TODO remove this +class GDNMain(): + def __init__(self, train_config, env_config, debug=False): + + self.train_config = train_config + self.env_config = env_config + self.datestr = None + + dataset = self.env_config['dataset'] + train_orig = pd.read_csv(f'./data/{dataset}/train.csv', sep=',', index_col=0) + test_orig = pd.read_csv(f'./data/{dataset}/test.csv', sep=',', index_col=0) + + train, test = train_orig, test_orig + + if 'attack' in train.columns: + train = train.drop(columns=['attack']) + + feature_map = get_feature_map(dataset) + fc_struc = get_fc_graph_struc(dataset) + + set_device(env_config['device']) + self.device = get_device() + + fc_edge_index = build_loc_net(fc_struc, list(train.columns), feature_map=feature_map) + fc_edge_index = torch.tensor(fc_edge_index, dtype=torch.long) + + self.feature_map = feature_map + + train_dataset_indata = construct_data(train, feature_map, labels=0) + test_dataset_indata = construct_data(test, feature_map, labels=test.attack.tolist()) + + cfg = { + 'slide_win': train_config['slide_win'], + 'slide_stride': train_config['slide_stride'], + } + + train_dataset = TimeDataset(train_dataset_indata, fc_edge_index, mode='train', config=cfg) + test_dataset = TimeDataset(test_dataset_indata, fc_edge_index, mode='test', config=cfg) + + train_dataloader, val_dataloader = get_loaders(train_dataset, train_config['seed'], train_config['batch'], + val_ratio=train_config['val_ratio']) + + self.train_dataset = train_dataset + self.test_dataset = test_dataset + + self.train_dataloader = train_dataloader + self.val_dataloader = val_dataloader + self.test_dataloader = DataLoader(test_dataset, batch_size=train_config['batch'], + shuffle=False, num_workers=0) + + edge_index_sets = [] + edge_index_sets.append(fc_edge_index) + + self.model = GDN(edge_index_sets, len(feature_map), + dim=train_config['dim'], + input_dim=train_config['slide_win'], + out_layer_num=train_config['out_layer_num'], + out_layer_inter_dim=train_config['out_layer_inter_dim'], + topk=train_config['topk'] + ).to(self.device) + + def run(self): + + if len(self.env_config['load_model_path']) > 0: + model_save_path = self.env_config['load_model_path'] + else: + model_save_path = self.get_save_path()[0] + + self.train_log = train(self.model, model_save_path, + config=train_config, + train_dataloader=self.train_dataloader, + val_dataloader=self.val_dataloader, + feature_map=self.feature_map, + test_dataloader=self.test_dataloader, + test_dataset=self.test_dataset, + train_dataset=self.train_dataset, + dataset_name=self.env_config['dataset'] + ) + + # test + self.model.load_state_dict(torch.load(model_save_path)) + best_model = self.model.to(self.device) + + _, self.test_result = test(best_model, self.test_dataloader) + _, self.val_result = test(best_model, self.val_dataloader) + + self.get_score(self.test_result, self.val_result) + + def get_score(self, test_result, val_result): + + feature_num = len(test_result[0][0]) + np_test_result = np.array(test_result) + np_val_result = np.array(val_result) + + test_labels = np_test_result[2, :, 0].tolist() + + test_scores, normal_scores = get_full_err_scores(test_result, val_result) + + top1_best_info = get_best_performance_data(test_scores, test_labels, topk=1) + 
top1_val_info = get_val_performance_data(test_scores, normal_scores, test_labels, topk=1) + + print('=========================** Result **============================\n') + + info = None + if self.env_config['report'] == 'best': + info = top1_best_info + elif self.env_config['report'] == 'val': + info = top1_val_info + + print(f'F1 score: {info[0]}') + print(f'precision: {info[1]}') + print(f'recall: {info[2]}\n') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument('-batch', help='batch size', type=int, default=128) + parser.add_argument('-epoch', help='train epoch', type=int, default=100) + parser.add_argument('-slide_win', help='slide_win', type=int, default=15) + parser.add_argument('-dim', help='dimension', type=int, default=64) + parser.add_argument('-slide_stride', help='slide_stride', type=int, default=5) + parser.add_argument('-save_path_pattern', help='save path pattern', type=str, default='') + parser.add_argument('-dataset', help='wadi / swat', type=str, default='wadi') + parser.add_argument('-device', help='cuda / cpu', type=str, default='cuda') + parser.add_argument('-random_seed', help='random seed', type=int, default=0) + parser.add_argument('-comment', help='experiment comment', type=str, default='') + parser.add_argument('-out_layer_num', help='outlayer num', type=int, default=1) + parser.add_argument('-out_layer_inter_dim', help='out_layer_inter_dim', type=int, default=256) + parser.add_argument('-decay', help='decay', type=float, default=0) + parser.add_argument('-val_ratio', help='val ratio', type=float, default=0.1) + parser.add_argument('-topk', help='topk num', type=int, default=20) + parser.add_argument('-report', help='best / val', type=str, default='best') + parser.add_argument('-load_model_path', help='trained model path', type=str, default='') + + args = parser.parse_args() + + random.seed(args.random_seed) + np.random.seed(args.random_seed) + torch.manual_seed(args.random_seed) + torch.cuda.manual_seed(args.random_seed) + torch.cuda.manual_seed_all(args.random_seed) + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + os.environ['PYTHONHASHSEED'] = str(args.random_seed) + + train_config = { + 'batch': args.batch, + 'epoch': args.epoch, + 'slide_win': args.slide_win, + 'dim': args.dim, + 'slide_stride': args.slide_stride, + 'comment': args.comment, + 'seed': args.random_seed, + 'out_layer_num': args.out_layer_num, + 'out_layer_inter_dim': args.out_layer_inter_dim, + 'decay': args.decay, + 'val_ratio': args.val_ratio, + 'topk': args.topk, + } + + env_config = { + 'save_path': args.save_path_pattern, + 'dataset': args.dataset, + 'report': args.report, + 'device': args.device, + 'load_model_path': args.load_model_path + } + + main = GDNMain(train_config, env_config, debug=False) + main.run() diff --git a/gdn/GDN/models/GDN.py b/gdn/GDN/models/GDN.py new file mode 100644 index 0000000..e967790 --- /dev/null +++ b/gdn/GDN/models/GDN.py @@ -0,0 +1,188 @@ +import numpy as np +import torch +import matplotlib.pyplot as plt +import torch.nn as nn +import time +from util.time import * +from util.env import * +from torch_geometric.nn import GCNConv, GATConv, EdgeConv +import math +import torch.nn.functional as F + +from .graph_layer import GraphLayer + + +def get_batch_edge_index(org_edge_index, batch_num, node_num): + # org_edge_index:(2, edge_num) + edge_index = org_edge_index.clone().detach() + edge_num = org_edge_index.shape[1] + batch_edge_index = edge_index.repeat(1,batch_num).contiguous() + + 
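+    # shift the node ids of each repeated copy by i*node_num so every graph in the batch occupies a disjoint index range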
for i in range(batch_num): + batch_edge_index[:, i*edge_num:(i+1)*edge_num] += i*node_num + + return batch_edge_index.long() + + +class OutLayer(nn.Module): + def __init__(self, in_num, node_num, layer_num, inter_num = 512): + super(OutLayer, self).__init__() + + modules = [] + + for i in range(layer_num): + # last layer, output shape:1 + if i == layer_num-1: + modules.append(nn.Linear( in_num if layer_num == 1 else inter_num, 1)) + else: + layer_in_num = in_num if i == 0 else inter_num + modules.append(nn.Linear( layer_in_num, inter_num )) + modules.append(nn.BatchNorm1d(inter_num)) + modules.append(nn.ReLU()) + + self.mlp = nn.ModuleList(modules) + + def forward(self, x): + out = x + + for mod in self.mlp: + if isinstance(mod, nn.BatchNorm1d): + out = out.permute(0,2,1) + out = mod(out) + out = out.permute(0,2,1) + else: + out = mod(out) + + return out + + + +class GNNLayer(nn.Module): + def __init__(self, in_channel, out_channel, inter_dim=0, heads=1, node_num=100): + super(GNNLayer, self).__init__() + + + self.gnn = GraphLayer(in_channel, out_channel, inter_dim=inter_dim, heads=heads, concat=False) + + self.bn = nn.BatchNorm1d(out_channel) + self.relu = nn.ReLU() + self.leaky_relu = nn.LeakyReLU() + + def forward(self, x, edge_index, embedding=None, node_num=0): + + out, (new_edge_index, att_weight) = self.gnn(x, edge_index, embedding, return_attention_weights=True) + self.att_weight_1 = att_weight + self.edge_index_1 = new_edge_index + + out = self.bn(out) + + return self.relu(out) + + +class GDN(nn.Module): + def __init__(self, edge_index_sets, node_num, dim=64, out_layer_inter_dim=256, input_dim=10, out_layer_num=1, topk=20): + + super(GDN, self).__init__() + + self.edge_index_sets = edge_index_sets + + device = get_device() + + edge_index = edge_index_sets[0] + + + embed_dim = dim + self.embedding = nn.Embedding(node_num, embed_dim) + self.bn_outlayer_in = nn.BatchNorm1d(embed_dim) + + + edge_set_num = len(edge_index_sets) + self.gnn_layers = nn.ModuleList([ + GNNLayer(input_dim, dim, inter_dim=dim+embed_dim, heads=1) for i in range(edge_set_num) + ]) + + + self.node_embedding = None + self.topk = topk + self.learned_graph = None + + self.out_layer = OutLayer(dim*edge_set_num, node_num, out_layer_num, inter_num = out_layer_inter_dim) + + self.cache_edge_index_sets = [None] * edge_set_num + self.cache_embed_index = None + + self.dp = nn.Dropout(0.2) + + self.init_params() + + def init_params(self): + nn.init.kaiming_uniform_(self.embedding.weight, a=math.sqrt(5)) + + + def forward(self, data, org_edge_index): + + x = data.clone().detach() + edge_index_sets = self.edge_index_sets + + device = data.device + + batch_num, node_num, all_feature = x.shape + x = x.view(-1, all_feature).contiguous() + + + gcn_outs = [] + for i, edge_index in enumerate(edge_index_sets): + edge_num = edge_index.shape[1] + cache_edge_index = self.cache_edge_index_sets[i] + + if cache_edge_index is None or cache_edge_index.shape[1] != edge_num*batch_num: + self.cache_edge_index_sets[i] = get_batch_edge_index(edge_index, batch_num, node_num).to(device) + + batch_edge_index = self.cache_edge_index_sets[i] + + all_embeddings = self.embedding(torch.arange(node_num).to(device)) + + weights_arr = all_embeddings.detach().clone() + all_embeddings = all_embeddings.repeat(batch_num, 1) + + weights = weights_arr.view(node_num, -1) + + cos_ji_mat = torch.matmul(weights, weights.T) + normed_mat = torch.matmul(weights.norm(dim=-1).view(-1,1), weights.norm(dim=-1).view(1,-1)) + cos_ji_mat = cos_ji_mat / normed_mat + + dim = 
weights.shape[-1] + topk_num = self.topk + + topk_indices_ji = torch.topk(cos_ji_mat, topk_num, dim=-1)[1] + + self.learned_graph = topk_indices_ji + + gated_i = torch.arange(0, node_num).T.unsqueeze(1).repeat(1, topk_num).flatten().to(device).unsqueeze(0) + gated_j = topk_indices_ji.flatten().unsqueeze(0) + gated_edge_index = torch.cat((gated_j, gated_i), dim=0) + + batch_gated_edge_index = get_batch_edge_index(gated_edge_index, batch_num, node_num).to(device) + gcn_out = self.gnn_layers[i](x, batch_gated_edge_index, node_num=node_num*batch_num, embedding=all_embeddings) + + + gcn_outs.append(gcn_out) + + x = torch.cat(gcn_outs, dim=1) + x = x.view(batch_num, node_num, -1) + + + indexes = torch.arange(0,node_num).to(device) + out = torch.mul(x, self.embedding(indexes)) + + out = out.permute(0,2,1) + out = F.relu(self.bn_outlayer_in(out)) + out = out.permute(0,2,1) + + out = self.dp(out) + out = self.out_layer(out) + out = out.view(-1, node_num) + + + return out + \ No newline at end of file diff --git a/gdn/GDN/models/graph_layer.py b/gdn/GDN/models/graph_layer.py new file mode 100644 index 0000000..77d9db2 --- /dev/null +++ b/gdn/GDN/models/graph_layer.py @@ -0,0 +1,124 @@ +import torch +from torch.nn import Parameter, Linear, Sequential, BatchNorm1d, ReLU +import torch.nn.functional as F +from torch_geometric.nn.conv import MessagePassing +from torch_geometric.utils import remove_self_loops, add_self_loops, softmax + +from torch_geometric.nn.inits import glorot, zeros +import time +import math + +class GraphLayer(MessagePassing): + def __init__(self, in_channels, out_channels, heads=1, concat=True, + negative_slope=0.2, dropout=0, bias=True, inter_dim=-1,**kwargs): + super(GraphLayer, self).__init__(aggr='add', **kwargs) + + self.in_channels = in_channels + self.out_channels = out_channels + self.heads = heads + self.concat = concat + self.negative_slope = negative_slope + self.dropout = dropout + + self.__alpha__ = None + + self.lin = Linear(in_channels, heads * out_channels, bias=False) + + self.att_i = Parameter(torch.Tensor(1, heads, out_channels)) + self.att_j = Parameter(torch.Tensor(1, heads, out_channels)) + self.att_em_i = Parameter(torch.Tensor(1, heads, out_channels)) + self.att_em_j = Parameter(torch.Tensor(1, heads, out_channels)) + + if bias and concat: + self.bias = Parameter(torch.Tensor(heads * out_channels)) + elif bias and not concat: + self.bias = Parameter(torch.Tensor(out_channels)) + else: + self.register_parameter('bias', None) + + self.reset_parameters() + + def reset_parameters(self): + glorot(self.lin.weight) + glorot(self.att_i) + glorot(self.att_j) + + zeros(self.att_em_i) + zeros(self.att_em_j) + + zeros(self.bias) + + + + def forward(self, x, edge_index, embedding, return_attention_weights=False): + """""" + if torch.is_tensor(x): + x = self.lin(x) + x = (x, x) + else: + x = (self.lin(x[0]), self.lin(x[1])) + + edge_index, _ = remove_self_loops(edge_index) + edge_index, _ = add_self_loops(edge_index, + num_nodes=x[1].size(self.node_dim)) + + out = self.propagate(edge_index, x=x, embedding=embedding, edges=edge_index, + return_attention_weights=return_attention_weights) + + if self.concat: + out = out.view(-1, self.heads * self.out_channels) + else: + out = out.mean(dim=1) + + if self.bias is not None: + out = out + self.bias + + if return_attention_weights: + alpha, self.__alpha__ = self.__alpha__, None + return out, (edge_index, alpha) + else: + return out + + def message(self, x_i, x_j, edge_index_i, size_i, + embedding, + edges, + 
return_attention_weights): + + x_i = x_i.view(-1, self.heads, self.out_channels) + x_j = x_j.view(-1, self.heads, self.out_channels) + + if embedding is not None: + embedding_i, embedding_j = embedding[edge_index_i], embedding[edges[0]] + embedding_i = embedding_i.unsqueeze(1).repeat(1,self.heads,1) + embedding_j = embedding_j.unsqueeze(1).repeat(1,self.heads,1) + + key_i = torch.cat((x_i, embedding_i), dim=-1) + key_j = torch.cat((x_j, embedding_j), dim=-1) + + + + cat_att_i = torch.cat((self.att_i, self.att_em_i), dim=-1) + cat_att_j = torch.cat((self.att_j, self.att_em_j), dim=-1) + + alpha = (key_i * cat_att_i).sum(-1) + (key_j * cat_att_j).sum(-1) + + + alpha = alpha.view(-1, self.heads, 1) + + + alpha = F.leaky_relu(alpha, self.negative_slope) + alpha = softmax(alpha, edge_index_i, size_i) + + if return_attention_weights: + self.__alpha__ = alpha + + alpha = F.dropout(alpha, p=self.dropout, training=self.training) + + return x_j * alpha.view(-1, self.heads, 1) + + + + def __repr__(self): + return '{}({}, {}, heads={})'.format(self.__class__.__name__, + self.in_channels, + self.out_channels, self.heads) diff --git a/gdn/GDN/run.sh b/gdn/GDN/run.sh new file mode 100644 index 0000000..ca6ba59 --- /dev/null +++ b/gdn/GDN/run.sh @@ -0,0 +1,59 @@ +gpu_n=$1 +DATASET=$2 + +seed=5 +BATCH_SIZE=32 +SLIDE_WIN=5 +dim=64 +out_layer_num=1 +SLIDE_STRIDE=1 +topk=5 +out_layer_inter_dim=128 +val_ratio=0.2 +decay=0 + + +path_pattern="${DATASET}" +COMMENT="${DATASET}" + +EPOCH=30 +report='best' + +if [[ "$gpu_n" == "cpu" ]]; then + python main.py \ + -dataset $DATASET \ + -save_path_pattern $path_pattern \ + -slide_stride $SLIDE_STRIDE \ + -slide_win $SLIDE_WIN \ + -batch $BATCH_SIZE \ + -epoch $EPOCH \ + -comment $COMMENT \ + -random_seed $seed \ + -decay $decay \ + -dim $dim \ + -out_layer_num $out_layer_num \ + -out_layer_inter_dim $out_layer_inter_dim \ + -decay $decay \ + -val_ratio $val_ratio \ + -report $report \ + -topk $topk \ + -device 'cpu' +else + CUDA_VISIBLE_DEVICES=$gpu_n python main.py \ + -dataset $DATASET \ + -save_path_pattern $path_pattern \ + -slide_stride $SLIDE_STRIDE \ + -slide_win $SLIDE_WIN \ + -batch $BATCH_SIZE \ + -epoch $EPOCH \ + -comment $COMMENT \ + -random_seed $seed \ + -decay $decay \ + -dim $dim \ + -out_layer_num $out_layer_num \ + -out_layer_inter_dim $out_layer_inter_dim \ + -decay $decay \ + -val_ratio $val_ratio \ + -report $report \ + -topk $topk +fi \ No newline at end of file diff --git a/gdn/GDN/test.py b/gdn/GDN/test.py new file mode 100644 index 0000000..58ae625 --- /dev/null +++ b/gdn/GDN/test.py @@ -0,0 +1,83 @@ +import numpy as np +import torch +import matplotlib.pyplot as plt +import torch.nn as nn +import time +from util.time import * +from util.env import * + +import argparse +import matplotlib.pyplot as plt +from torch.utils.data import Dataset, DataLoader +import pandas as pd +import torch.nn.functional as F + + +from util.data import * +from util.preprocess import * + + + +def test(model, dataloader): + # test + loss_func = nn.MSELoss(reduction='mean') + device = get_device() + + test_loss_list = [] + now = time.time() + + test_predicted_list = [] + test_ground_list = [] + test_labels_list = [] + + t_test_predicted_list = [] + t_test_ground_list = [] + t_test_labels_list = [] + + test_len = len(dataloader) + + model.eval() + + i = 0 + acu_loss = 0 + for x, y, labels, edge_index in dataloader: + x, y, labels, edge_index = [item.to(device).float() for item in [x, y, labels, edge_index]] + + with torch.no_grad(): + predicted = model(x, 
edge_index).float().to(device) + + + loss = loss_func(predicted, y) + + + labels = labels.unsqueeze(1).repeat(1, predicted.shape[1]) + + if len(t_test_predicted_list) <= 0: + t_test_predicted_list = predicted + t_test_ground_list = y + t_test_labels_list = labels + else: + t_test_predicted_list = torch.cat((t_test_predicted_list, predicted), dim=0) + t_test_ground_list = torch.cat((t_test_ground_list, y), dim=0) + t_test_labels_list = torch.cat((t_test_labels_list, labels), dim=0) + + test_loss_list.append(loss.item()) + acu_loss += loss.item() + + i += 1 + + if i % 10000 == 1 and i > 1: + print(timeSincePlus(now, i / test_len)) + + + test_predicted_list = t_test_predicted_list.tolist() + test_ground_list = t_test_ground_list.tolist() + test_labels_list = t_test_labels_list.tolist() + + avg_loss = sum(test_loss_list)/len(test_loss_list) + + return avg_loss, [test_predicted_list, test_ground_list, test_labels_list] + + + + diff --git a/gdn/GDN/train.py b/gdn/GDN/train.py new file mode 100644 index 0000000..934bd50 --- /dev/null +++ b/gdn/GDN/train.py @@ -0,0 +1,112 @@ +import numpy as np +import torch +import matplotlib.pyplot as plt +import torch.nn as nn +import time +from util.time import * +from util.env import * +from sklearn.metrics import mean_squared_error +from test import * +import torch.nn.functional as F +import numpy as np +from evaluate import get_best_performance_data, get_val_performance_data, get_full_err_scores +from sklearn.metrics import precision_score, recall_score, roc_auc_score, f1_score +from torch.utils.data import DataLoader, random_split, Subset +from scipy.stats import iqr + + + + +def loss_func(y_pred, y_true): + loss = F.mse_loss(y_pred, y_true, reduction='mean') + + return loss + + + +def train(model = None, save_path = '', config={}, train_dataloader=None, val_dataloader=None, feature_map={}, test_dataloader=None, test_dataset=None, dataset_name='swat', train_dataset=None): + + seed = config['seed'] + + optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=config['decay']) + + now = time.time() + + train_loss_list = [] + cmp_loss_list = [] + + device = get_device() + + + acu_loss = 0 + min_loss = 1e+8 + min_f1 = 0 + min_pre = 0 + best_prec = 0 + + i = 0 + epoch = config['epoch'] + early_stop_win = 15 + + model.train() + + log_interval = 1000 + stop_improve_count = 0 + + dataloader = train_dataloader + + for i_epoch in range(epoch): + + acu_loss = 0 + model.train() + + for x, labels, attack_labels, edge_index in dataloader: + _start = time.time() + + x, labels, edge_index = [item.float().to(device) for item in [x, labels, edge_index]] + + optimizer.zero_grad() + out = model(x, edge_index).float().to(device) + loss = loss_func(out, labels) + + loss.backward() + optimizer.step() + + + train_loss_list.append(loss.item()) + acu_loss += loss.item() + + i += 1 + + + # each epoch + print('epoch ({} / {}) (Loss:{:.8f}, ACU_loss:{:.8f})'.format( + i_epoch, epoch, + acu_loss/len(dataloader), acu_loss), flush=True + ) + + # use val dataset to judge + if val_dataloader is not None: + + val_loss, val_result = test(model, val_dataloader) + + if val_loss < min_loss: + torch.save(model.state_dict(), save_path) + + min_loss = val_loss + stop_improve_count = 0 + else: + stop_improve_count += 1 + + + if stop_improve_count >= early_stop_win: + break + + else: + if acu_loss < min_loss : + torch.save(model.state_dict(), save_path) + min_loss = acu_loss + + + + return train_loss_list diff --git a/gdn/GDN/util/data.py b/gdn/GDN/util/data.py new file mode 100644 
index 0000000..3e3e8c8 --- /dev/null +++ b/gdn/GDN/util/data.py @@ -0,0 +1,126 @@ +# util functions about data + +from scipy.stats import rankdata, iqr, trim_mean +from sklearn.metrics import f1_score, mean_squared_error +import numpy as np +from numpy import percentile + + +def get_attack_interval(attack): + heads = [] + tails = [] + for i in range(len(attack)): + if attack[i] == 1: + if attack[i-1] == 0: + heads.append(i) + + if i < len(attack)-1 and attack[i+1] == 0: + tails.append(i) + elif i == len(attack)-1: + tails.append(i) + res = [] + for i in range(len(heads)): + res.append((heads[i], tails[i])) + # print(heads, tails) + return res + +# calculate F1 scores +def eval_scores(scores, true_scores, th_steps, return_thresold=False): + padding_list = [0]*(len(true_scores) - len(scores)) + # print(padding_list) + + if len(padding_list) > 0: + scores = padding_list + scores + + scores_sorted = rankdata(scores, method='ordinal') + th_steps = th_steps + # th_steps = 500 + th_vals = np.array(range(th_steps)) * 1.0 / th_steps + fmeas = [None] * th_steps + thresholds = [None] * th_steps + for i in range(th_steps): + cur_pred = scores_sorted > th_vals[i] * len(scores) + + fmeas[i] = f1_score(true_scores, cur_pred) + + score_index = scores_sorted.tolist().index(int(th_vals[i] * len(scores)+1)) + thresholds[i] = scores[score_index] + + if return_thresold: + return fmeas, thresholds + return fmeas + +def eval_mseloss(predicted, ground_truth): + + ground_truth_list = np.array(ground_truth) + predicted_list = np.array(predicted) + + + # mask = (ground_truth_list == 0) | (predicted_list == 0) + + # ground_truth_list = ground_truth_list[~mask] + # predicted_list = predicted_list[~mask] + + # neg_mask = predicted_list < 0 + # predicted_list[neg_mask] = 0 + + # err = np.abs(predicted_list / ground_truth_list - 1) + # acc = (1 - np.mean(err)) + + # return loss + loss = mean_squared_error(predicted_list, ground_truth_list) + + return loss + +def get_err_median_and_iqr(predicted, groundtruth): + + np_arr = np.abs(np.subtract(np.array(predicted), np.array(groundtruth))) + + err_median = np.median(np_arr) + err_iqr = iqr(np_arr) + + return err_median, err_iqr + +def get_err_median_and_quantile(predicted, groundtruth, percentage): + + np_arr = np.abs(np.subtract(np.array(predicted), np.array(groundtruth))) + + err_median = np.median(np_arr) + # err_iqr = iqr(np_arr) + err_delta = percentile(np_arr, int(percentage*100)) - percentile(np_arr, int((1-percentage)*100)) + + return err_median, err_delta + +def get_err_mean_and_quantile(predicted, groundtruth, percentage): + + np_arr = np.abs(np.subtract(np.array(predicted), np.array(groundtruth))) + + err_median = trim_mean(np_arr, percentage) + # err_iqr = iqr(np_arr) + err_delta = percentile(np_arr, int(percentage*100)) - percentile(np_arr, int((1-percentage)*100)) + + return err_median, err_delta + +def get_err_mean_and_std(predicted, groundtruth): + + np_arr = np.abs(np.subtract(np.array(predicted), np.array(groundtruth))) + + err_mean = np.mean(np_arr) + err_std = np.std(np_arr) + + return err_mean, err_std + + +def get_f1_score(scores, gt, contamination): + + padding_list = [0]*(len(gt) - len(scores)) + # print(padding_list) + + threshold = percentile(scores, 100 * (1 - contamination)) + + if len(padding_list) > 0: + scores = padding_list + scores + + pred_labels = (scores > threshold).astype('int').ravel() + + return f1_score(gt, pred_labels) \ No newline at end of file diff --git a/gdn/GDN/util/env.py b/gdn/GDN/util/env.py new file mode 100644 index 
0000000..a89d37d --- /dev/null +++ b/gdn/GDN/util/env.py @@ -0,0 +1,15 @@ +import torch +import numpy as np + +_device = None + +def get_device(): + # return torch.device('cuda' if torch.cuda.is_available() else 'cpu') + return _device + +def set_device(dev): + global _device + _device = dev + +def init_work(worker_id, seed): + np.random.seed(seed + worker_id) diff --git a/gdn/GDN/util/iostream.py b/gdn/GDN/util/iostream.py new file mode 100644 index 0000000..e072789 --- /dev/null +++ b/gdn/GDN/util/iostream.py @@ -0,0 +1,106 @@ +from util.data import get_attack_interval +import time +from datetime import datetime +from pytz import utc, timezone +from util.time import timestamp2str +import json +import argparse +import numpy as np + +def printsep(): + print('='*40+'\n') + +def save_attack_infos(f1_scores, total_err_scores, labels, names, save_path, dataset, config): + slide_win=config['slide_win'] + down_len=config['down_len'] + + + if dataset == 'wadi' or dataset == 'wadi2': + s = '09/10/2017 18:00:00' + elif dataset == 'swat': + s = '28/12/2015 10:00:00' + start_s = int(time.mktime(datetime.strptime(s, "%d/%m/%Y %H:%M:%S").timetuple())) + cst8 = timezone('Asia/Shanghai') + fmt = '%m/%d %H:%M:%S' + + attack_inters = get_attack_interval(labels) + + save_infos = { + 'total_best_f1_score': f1_scores[0], + 'total_best_f1_score_topk': f1_scores[1], + 'total_best_f1_score_all': f1_scores[2], + 'attacks': [] + } + + indices_map = names + + indices = np.argmax(total_err_scores, axis=0).tolist() + anomaly_sensors = [ indices_map[index] for index in indices ] + + topk = 5 + topk_indices = np.argpartition(total_err_scores, -topk, axis=0)[-topk:] + topk_indices = np.transpose(topk_indices) + + topk_anomaly_sensors = [] + topk_err_score_map=[] + for i, indexs in enumerate(topk_indices): + # print(indexs) + topk_anomaly_sensors.append([indices_map[index] for index in indexs]) + + item = {} + for sensor, index in zip(topk_anomaly_sensors[i],indexs): + if sensor not in item: + item[sensor] = total_err_scores[index, i] + + topk_err_score_map.append(item) + + + for head, end in attack_inters: + attack_infos = {} + topk_attack_infos = {} + + head_t = timestamp2str(start_s+(head+slide_win)*down_len, fmt, cst8) + end_t = timestamp2str(start_s+(end+slide_win)*down_len, fmt, cst8) + # head_t = datetime.fromtimestamp(start_s+head).astimezone(cst8).strftime(fmt) + # end_t = datetime.fromtimestamp(start_s+end).astimezone(cst8).strftime(fmt) + + # print(f'\nattack from {head_t} to {end_t}:') + + for i in range(head, end): + # t = datetime.fromtimestamp(start_s+i).astimezone(cst8).strftime(fmt) + t = timestamp2str(start_s+(i+slide_win)*down_len, fmt, cst8) + max_sensor = anomaly_sensors[i] + topk_sensors = topk_anomaly_sensors[i] + + if max_sensor not in attack_infos: + attack_infos[max_sensor] = 0 + attack_infos[max_sensor] += 1 + + # for anomaly_sensor in topk_sensors: + # if anomaly_sensor not in topk_attack_infos: + # topk_attack_infos[anomaly_sensor] = 0 + # topk_attack_infos[anomaly_sensor] += 1 + + for anomaly_sensor in topk_sensors: + if anomaly_sensor not in topk_attack_infos: + topk_attack_infos[anomaly_sensor] = 0 + topk_attack_infos[anomaly_sensor] += topk_err_score_map[i][anomaly_sensor] + + + # print('-------------------------------') + # print(f'total top 5 attack sensors from {head_t} to {end_t}:') + sorted_attack_infos = {k: v for k, v in sorted(attack_infos.items(), reverse=True, key=lambda item: item[1])} + sorted_topk_attack_infos = {k: v for k, v in sorted(topk_attack_infos.items(), 
reverse=True, key=lambda item: item[1])} + # for key, count in sorted_attack_infos.items()[:5]: + # print(key, count) + + save_infos['attacks'].append({ + 'start': head_t, + 'end': end_t, + 'sensors': list(sorted_attack_infos), + 'topk_sensors': list(sorted_topk_attack_infos), + 'topk_scores': list(sorted_topk_attack_infos.values()) + }) + + with open(save_path, 'w+') as outfile: + json.dump(save_infos, outfile, indent=4) diff --git a/gdn/GDN/util/net_struct.py b/gdn/GDN/util/net_struct.py new file mode 100644 index 0000000..398b4c8 --- /dev/null +++ b/gdn/GDN/util/net_struct.py @@ -0,0 +1,58 @@ +import glob + +import pandas as pd + + +def get_feature_map(dataset: pd.DataFrame): + feature_list = [] + for ft in dataset.filter(like="value").columns.values.tolist(): + feature_list.append(ft.strip()) + + return feature_list + + +# graph is 'fully-connect' +def get_fc_graph_struc(dataset: pd.DataFrame): + struc_map = {} + feature_list = [] + for ft in dataset.filter(like="value").columns.values.tolist(): + feature_list.append(ft.strip()) + + for ft in feature_list: + if ft not in struc_map: + struc_map[ft] = [] + + for other_ft in feature_list: + if other_ft is not ft: + struc_map[ft].append(other_ft) + + return struc_map + +def get_prior_graph_struc(dataset): + feature_file = open(f'./data/{dataset}/features.txt', 'r') + + struc_map = {} + feature_list = [] + for ft in feature_file: + feature_list.append(ft.strip()) + + for ft in feature_list: + if ft not in struc_map: + struc_map[ft] = [] + for other_ft in feature_list: + if dataset == 'wadi' or dataset == 'wadi2': + # same group, 1_xxx, 2A_xxx, 2_xxx + if other_ft is not ft and other_ft[0] == ft[0]: + struc_map[ft].append(other_ft) + elif dataset == 'swat': + # FIT101, PV101 + if other_ft is not ft and other_ft[-3] == ft[-3]: + struc_map[ft].append(other_ft) + + + return struc_map + + +if __name__ == '__main__': + get_graph_struc() + \ No newline at end of file diff --git a/gdn/GDN/util/preprocess.py b/gdn/GDN/util/preprocess.py new file mode 100644 index 0000000..2d43f5e --- /dev/null +++ b/gdn/GDN/util/preprocess.py @@ -0,0 +1,116 @@ +# preprocess data +import numpy as np +import re + + +def get_most_common_features(target, all_features, max = 3, min = 3): + res = [] + main_keys = target.split('_') + + for feature in all_features: + if target == feature: + continue + + f_keys = feature.split('_') + common_key_num = len(list(set(f_keys) & set(main_keys))) + + if common_key_num >= min and common_key_num <= max: + res.append(feature) + + return res + +def build_net(target, all_features): + # get edge_indexes, and index_feature_map + main_keys = target.split('_') + edge_indexes = [ + [], + [] + ] + index_feature_map = [target] + + # find closest features(nodes): + parent_list = [target] + graph_map = {} + depth = 2 + + for i in range(depth): + for feature in parent_list: + children = get_most_common_features(feature, all_features) + + if feature not in graph_map: + graph_map[feature] = [] + + # exclude parent + pure_children = [] + for child in children: + if child not in graph_map: + pure_children.append(child) + + graph_map[feature] = pure_children + + if feature not in index_feature_map: + index_feature_map.append(feature) + p_index = index_feature_map.index(feature) + for child in pure_children: + if child not in index_feature_map: + index_feature_map.append(child) + c_index = index_feature_map.index(child) + + edge_indexes[1].append(p_index) + edge_indexes[0].append(c_index) + + parent_list = pure_children + + return edge_indexes, 
index_feature_map + + +def construct_data(data, feature_map, labels=0): + res = [] + + for feature in feature_map: + if feature in data.columns: + res.append(data.loc[:, feature].values.tolist()) + else: + print(feature, 'not exist in data') + # append labels as last + sample_n = len(res[0]) + + if type(labels) == int: + res.append([labels]*sample_n) + elif len(labels) == sample_n: + res.append(labels) + + return res + +def build_loc_net(struc, all_features, feature_map=[]): + + index_feature_map = feature_map + edge_indexes = [ + [], + [] + ] + for node_name, node_list in struc.items(): + if node_name not in all_features: + continue + + if node_name not in index_feature_map: + index_feature_map.append(node_name) + + p_index = index_feature_map.index(node_name) + for child in node_list: + if child not in all_features: + continue + + if child not in index_feature_map: + print(f'error: {child} not in index_feature_map') + # index_feature_map.append(child) + + c_index = index_feature_map.index(child) + # edge_indexes[0].append(p_index) + # edge_indexes[1].append(c_index) + edge_indexes[0].append(c_index) + edge_indexes[1].append(p_index) + + + + return edge_indexes \ No newline at end of file diff --git a/gdn/GDN/util/time.py b/gdn/GDN/util/time.py new file mode 100644 index 0000000..cafe271 --- /dev/null +++ b/gdn/GDN/util/time.py @@ -0,0 +1,28 @@ +import time +import math +from datetime import datetime +from pytz import utc, timezone + +def asMinutes(s): + m = math.floor(s / 60) + s -= m * 60 + return '%dm %ds' % (m, s) + + +def timeSincePlus(since, percent): + now = time.time() + s = now - since + es = s / (percent) + rs = es - s + return '%s (- %s)' % (asMinutes(s), asMinutes(rs)) + + +def timeSince(since): + now = time.time() + s = now - since + m = math.floor(s / 60) + s -= m * 60 + return '%dm %ds' % (m, s) + +def timestamp2str(sec, fmt, tz): + return datetime.fromtimestamp(sec).astimezone(tz).strftime(fmt) diff --git a/gdn/algorithm.py b/gdn/algorithm.py index 50ca65e..f90efdd 100644 --- a/gdn/algorithm.py +++ b/gdn/algorithm.py @@ -8,6 +8,8 @@ from dataclasses import dataclass +from GDN.main import GDNtrain, GDNtest + @dataclass class CustomParameters: @@ -25,8 +27,12 @@ class CustomParameters: class AlgorithmArgs(argparse.Namespace): @property - def ts(self) -> np.ndarray: - return self.df.iloc[:, 1:-1].values + def ts(self) -> pd.DataFrame: + return self.df.iloc[:, 1:-1] + + @property + def tsa(self) -> pd.Dataframe: + return self.df.iloc[:, 1:] @property def df(self) -> pd.DataFrame: @@ -44,7 +50,7 @@ def from_sys_args() -> 'AlgorithmArgs': def train(args: AlgorithmArgs): ts = args.ts - + train_config = { "batch": args.customParameters.batch_size, "epoch": args.customParameters.epochs, @@ -59,29 +65,47 @@ def train(args: AlgorithmArgs): "val_ratio": args.customParameters.split, "topk": 20, } - + + # load data + env_config = { + "dataset": ts, + "dataOutput": args.dataOutput, + "save_model_path": args.modelOutput, + "device": args.customParameters.device + } + + GDNtrain(train_config, env_config) + + # TODO remove raise NotImplementedError("GDN is not implemented yet!") def execute(args: AlgorithmArgs): ts = args.ts - env_config={ - "save_path": ..., - "dataset": ..., - "report": ..., - "device": ..., - "load_model_path": ... 
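To make the graph wiring from `net_struct.py` and `preprocess.py` above concrete, here is a small sketch of how `get_fc_graph_struc` and `build_loc_net` combine into the `edge_index` tensor that the training entrypoint later feeds to the model; the toy DataFrame and its column names are assumptions for illustration only:

```python
# Sketch of the fully-connected graph construction; the "value_*" column naming
# matches what get_feature_map/get_fc_graph_struc filter on.
import pandas as pd
import torch
from GDN.util.net_struct import get_feature_map, get_fc_graph_struc
from GDN.util.preprocess import build_loc_net

df = pd.DataFrame({"value_0": [0.1, 0.2], "value_1": [0.3, 0.4], "value_2": [0.5, 0.6]})

feature_map = get_feature_map(df)       # ['value_0', 'value_1', 'value_2']
fc_struc = get_fc_graph_struc(df)       # every feature linked to all other features
edge_index = build_loc_net(fc_struc, list(df.columns), feature_map=feature_map)
edge_index = torch.tensor(edge_index, dtype=torch.long)
print(edge_index.shape)                 # torch.Size([2, 6]) for 3 fully connected nodes
```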
+ env_config = { + "dataset": ts, + "dataOutput": args.dataOutput, + "load_model_path": args.modelInput, + "device": args.customParameters.device } + GDNtest(env_config) + + # TODO remove raise NotImplementedError("GDN is not implemented yet!") + def set_random_state(config: AlgorithmArgs) -> None: seed = config.customParameters.random_state import random random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) + torch.cuda.manual_seed(args.random_seed) + torch.cuda.manual_seed_all(args.random_seed) + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True if __name__ == "__main__": From daf8e81fb14ff592d040bac34b5b5a5de93d357d Mon Sep 17 00:00:00 2001 From: 2er0 <2er0@dbaumi.at> Date: Tue, 11 Jul 2023 23:01:16 +0200 Subject: [PATCH 4/6] WIP major, minor fixes and adaptions --- gdn/Dockerfile | 9 +- gdn/GDN/dataloader_fix.py | 118 ++++++++++++++ gdn/GDN/main.py | 319 +++++++++++-------------------------- gdn/GDN/models/GDN.py | 8 +- gdn/GDN/test.py | 41 ++--- gdn/GDN/train.py | 44 ++--- gdn/GDN/util/net_struct.py | 2 - gdn/__init__.py | 0 gdn/algorithm.py | 16 +- gdn/requirements.txt | 5 +- 10 files changed, 259 insertions(+), 303 deletions(-) create mode 100644 gdn/GDN/dataloader_fix.py create mode 100644 gdn/__init__.py diff --git a/gdn/Dockerfile b/gdn/Dockerfile index 202770f..4673677 100644 --- a/gdn/Dockerfile +++ b/gdn/Dockerfile @@ -1,13 +1,16 @@ -FROM registry.gitlab.hpi.de/akita/i/python3-base:0.2.5 +FROM registry.gitlab.hpi.de/akita/i/python3-base:0.2.6 -LABEL maintainer="sebastian.schmidl@hpi.de" +LABEL maintainer="2er0@dbaumi.at" ENV ALGORITHM_MAIN="/app/algorithm.py" # install algorithm dependencies COPY requirements.txt /app/ +RUN apt-get update; \ + apt-get install -y gcc python3-dev; \ + apt-get clean; \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* RUN pip install -r /app/requirements.txt -RUN pip install --no-index torch-scatter torch-sparse torch-cluster torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.5.0+cu102.html COPY algorithm.py /app/ COPY GDN /app/GDN diff --git a/gdn/GDN/dataloader_fix.py b/gdn/GDN/dataloader_fix.py new file mode 100644 index 0000000..8dbb43c --- /dev/null +++ b/gdn/GDN/dataloader_fix.py @@ -0,0 +1,118 @@ +# fixed version of original dataloader in torch_geometric/data/dataloader.py +# last import guarantees overload of original version +import torch.utils.data +from torch.utils.data.dataloader import default_collate + +from torch_geometric.data import Data, Batch +from torch._six import string_classes + + +int_classes = (bool, int) + + +class Collater(object): + def __init__(self, follow_batch): + self.follow_batch = follow_batch + + def collate(self, batch): + elem = batch[0] + if isinstance(elem, Data): + return Batch.from_data_list(batch, self.follow_batch) + elif isinstance(elem, torch.Tensor): + return default_collate(batch) + elif isinstance(elem, float): + return torch.tensor(batch, dtype=torch.float) + elif isinstance(elem, int_classes): + return torch.tensor(batch) + elif isinstance(elem, string_classes): + return batch + elif isinstance(elem, container_abcs.Mapping): + return {key: self.collate([d[key] for d in batch]) for key in elem} + elif isinstance(elem, tuple) and hasattr(elem, '_fields'): + return type(elem)(*(self.collate(s) for s in zip(*batch))) + elif isinstance(elem, container_abcs.Sequence): + return [self.collate(s) for s in zip(*batch)] + + raise TypeError('DataLoader found invalid type: {}'.format(type(elem))) + + def __call__(self, batch): + return self.collate(batch) + 
+ +class DataLoader(torch.utils.data.DataLoader): + r"""Data loader which merges data objects from a + :class:`torch_geometric.data.dataset` to a mini-batch. + + Args: + dataset (Dataset): The dataset from which to load the data. + batch_size (int, optional): How many samples per batch to load. + (default: :obj:`1`) + shuffle (bool, optional): If set to :obj:`True`, the data will be + reshuffled at every epoch. (default: :obj:`False`) + follow_batch (list or tuple, optional): Creates assignment batch + vectors for each key in the list. (default: :obj:`[]`) + """ + + def __init__(self, dataset, batch_size=1, shuffle=False, follow_batch=[], + **kwargs): + super(DataLoader, + self).__init__(dataset, batch_size, shuffle, + collate_fn=Collater(follow_batch), **kwargs) + + +class DataListLoader(torch.utils.data.DataLoader): + r"""Data loader which merges data objects from a + :class:`torch_geometric.data.dataset` to a python list. + + .. note:: + + This data loader should be used for multi-gpu support via + :class:`torch_geometric.nn.DataParallel`. + + Args: + dataset (Dataset): The dataset from which to load the data. + batch_size (int, optional): How many samples per batch to load. + (default: :obj:`1`) + shuffle (bool, optional): If set to :obj:`True`, the data will be + reshuffled at every epoch (default: :obj:`False`) + """ + + def __init__(self, dataset, batch_size=1, shuffle=False, **kwargs): + super(DataListLoader, self).__init__( + dataset, batch_size, shuffle, + collate_fn=lambda data_list: data_list, **kwargs) + + +class DenseCollater(object): + def collate(self, data_list): + batch = Batch() + for key in data_list[0].keys: + batch[key] = default_collate([d[key] for d in data_list]) + return batch + + def __call__(self, batch): + return self.collate(batch) + + +class DenseDataLoader(torch.utils.data.DataLoader): + r"""Data loader which merges data objects from a + :class:`torch_geometric.data.dataset` to a mini-batch. + + .. note:: + + To make use of this data loader, all graphs in the dataset needs to + have the same shape for each its attributes. + Therefore, this data loader should only be used when working with + *dense* adjacency matrices. + + Args: + dataset (Dataset): The dataset from which to load the data. + batch_size (int, optional): How many samples per batch to load. 
+ (default: :obj:`1`) + shuffle (bool, optional): If set to :obj:`True`, the data will be + reshuffled at every epoch (default: :obj:`False`) + """ + + def __init__(self, dataset, batch_size=1, shuffle=False, **kwargs): + super(DenseDataLoader, self).__init__( + dataset, batch_size, shuffle, collate_fn=DenseCollater(), **kwargs) diff --git a/gdn/GDN/main.py b/gdn/GDN/main.py index a57ffa9..e5d15b7 100644 --- a/gdn/GDN/main.py +++ b/gdn/GDN/main.py @@ -2,38 +2,27 @@ import pickle as pkl from typing import List, Any -import pandas as pd +import os +from pathlib import Path +import random +from datetime import datetime + import numpy as np import torch -import matplotlib.pyplot as plt -from torch.utils.data import DataLoader, random_split, Subset - -from sklearn.preprocessing import MinMaxScaler +from torch.utils.data import DataLoader, Subset from GDN.util.env import get_device, set_device from GDN.util.preprocess import build_loc_net, construct_data from GDN.util.net_struct import get_feature_map, get_fc_graph_struc -from GDN.util.iostream import printsep from GDN.datasets.TimeDataset import TimeDataset -from GDN.models.GDN import GDN +from GDN.models.GDN import GDNModule from GDN.train import train from GDN.test import test -from GDN.evaluate import get_err_scores, get_best_performance_data, get_val_performance_data, get_full_err_scores - -import sys -from datetime import datetime - -import os -import argparse -from pathlib import Path - -import matplotlib.pyplot as plt -import json -import random +import GDN.dataloader_fix def GDNtrain(train_config: dict, env_config: dict) -> None: @@ -48,7 +37,8 @@ def GDNtrain(train_config: dict, env_config: dict) -> None: fc_edge_index = build_loc_net(fc_struc, list(env_config["dataset"].columns), feature_map=feature_map) fc_edge_index = torch.tensor(fc_edge_index, dtype=torch.long) - train_dataset_indata = construct_data(train, feature_map, labels=0) + train_ts = env_config["dataset"] + train_dataset_indata = construct_data(train_ts, feature_map, labels=0) cfg = { 'slide_win': train_config['slide_win'], @@ -67,10 +57,7 @@ def GDNtrain(train_config: dict, env_config: dict) -> None: model = create_gdn_model(train_config, edge_index_sets, feature_map, device) - if "save_model_path" in env_config and len(env_config["save_model_path"]) > 0: - model_save_path = env_config["save_model_path"] - else: - model_save_path = get_save_path()[0] + model_save_path = get_save_path(env_config["save_model_path"])[0] train_log = train(model, model_save_path, config=train_config, @@ -84,9 +71,10 @@ def GDNtrain(train_config: dict, env_config: dict) -> None: ) _, train_result = test(model, full_train_dataloader) - save_result_output(train_result, env_config["dataOutput"]) + save_result_output(train_result, env_config) - save_config_element(train_config, feature_map, fc_edge_index) + save_config_element(env_config["save_model_path"], + train_config, feature_map, fc_edge_index) def GDNtest(env_config: dict) -> None: @@ -101,9 +89,8 @@ def GDNtest(env_config: dict) -> None: edge_index_sets = [] edge_index_sets.append(fc_edge_index) - test = env_config["dataset"] - - test_dataset_indata = construct_data(test, feature_map, + test_ts = env_config["dataset"] + test_dataset_indata = construct_data(test_ts, feature_map, labels=test["is_anomaly"].tolist()) cfg = { @@ -116,10 +103,7 @@ def GDNtest(env_config: dict) -> None: test_dataloader = DataLoader(test_dataset, batch_size=train_config['batch'], shuffle=False, num_workers=0) - if "load_model_path" in env_config and 
len(env_config["load_model_path"]) > 0: - model_load_path = env_config["load_model_path"] - else: - model_load_path = get_save_path()[0] + model_load_path = get_save_path(env_config["load_model_path"])[0] model = create_gdn_model(train_config, edge_index_sets, feature_map, device) @@ -134,13 +118,13 @@ def GDNtest(env_config: dict) -> None: def create_gdn_model(train_config, edge_index_sets, feature_map, device) -> GDN: - return GDN(edge_index_sets, len(feature_map), - dim=train_config['dim'], - input_dim=train_config['slide_win'], - out_layer_num=train_config['out_layer_num'], - out_layer_inter_dim=train_config['out_layer_inter_dim'], - topk=train_config['topk'] - ).to(device) + return GDNModule(edge_index_sets, len(feature_map), + dim=train_config['dim'], + input_dim=train_config['slide_win'], + out_layer_num=train_config['out_layer_num'], + out_layer_inter_dim=train_config['out_layer_inter_dim'], + topk=train_config['topk'] + ).to(device) def get_loaders(train_dataset, seed, batch, val_ratio=0.1): @@ -165,16 +149,22 @@ def get_loaders(train_dataset, seed, batch, val_ratio=0.1): return train_dataloader, val_dataloader -def get_save_path(): +def get_save_path(model_path: str): + # TODO change to TimeEval format of given locations + if len(model_path) > 0: + base_dir = os.path.dirname(model_path) + else: + base_dir = "results" + now = datetime.now() datestr = now.strftime('%m|%d-%H:%M:%S') paths = [ - f'./pretrained/best_{datestr}.pt', - f'./results/results.tmp', - f'./pretrained/train_config.pkl', - f'./pretrained/feature_map.pkl', - f'./pretrained/fc_edge_index.pkl' + f'{base_dir}/best_{datestr}.pt', + f'{base_dir}/results.tmp', + f'{base_dir}/train_config.pkl', + f'{base_dir}/feature_map.pkl', + f'{base_dir}/fc_edge_index.pkl' ] for path in paths: @@ -184,8 +174,8 @@ def get_save_path(): return paths -def save_config_element(train_config, feature_map, fc_edge_index) -> None: - paths = get_save_path() +def save_config_element(model_path, train_config, feature_map, fc_edge_index) -> None: + paths = get_save_path(model_path) for p, e in zip(paths[-3:], [train_config, feature_map, fc_edge_index]): with open(p, 'wb') as file: @@ -204,180 +194,65 @@ def load_config_element() -> List[Any]: return elements -def save_result_output(result, name) -> None: - path = get_save_path()[1].replace("results.tmp", name) +def save_result_output(result, env_config) -> None: + path = get_save_path(env_config["save_model_path"])[1].replace("results.tmp", env_config["dataOutput"]) np_result = np.array(result) np.savetxt(path, np_result, delimiter=",") - -# TODO remove this -class GDNMain(): - def __init__(self, train_config, env_config, debug=False): - - self.train_config = train_config - self.env_config = env_config - self.datestr = None - - dataset = self.env_config['dataset'] - train_orig = pd.read_csv(f'./data/{dataset}/train.csv', sep=',', index_col=0) - test_orig = pd.read_csv(f'./data/{dataset}/test.csv', sep=',', index_col=0) - - train, test = train_orig, test_orig - - if 'attack' in train.columns: - train = train.drop(columns=['attack']) - - feature_map = get_feature_map(dataset) - fc_struc = get_fc_graph_struc(dataset) - - set_device(env_config['device']) - self.device = get_device() - - fc_edge_index = build_loc_net(fc_struc, list(train.columns), feature_map=feature_map) - fc_edge_index = torch.tensor(fc_edge_index, dtype=torch.long) - - self.feature_map = feature_map - - train_dataset_indata = construct_data(train, feature_map, labels=0) - test_dataset_indata = construct_data(test, feature_map, 
labels=test.attack.tolist()) - - cfg = { - 'slide_win': train_config['slide_win'], - 'slide_stride': train_config['slide_stride'], - } - - train_dataset = TimeDataset(train_dataset_indata, fc_edge_index, mode='train', config=cfg) - test_dataset = TimeDataset(test_dataset_indata, fc_edge_index, mode='test', config=cfg) - - train_dataloader, val_dataloader = get_loaders(train_dataset, train_config['seed'], train_config['batch'], - val_ratio=train_config['val_ratio']) - - self.train_dataset = train_dataset - self.test_dataset = test_dataset - - self.train_dataloader = train_dataloader - self.val_dataloader = val_dataloader - self.test_dataloader = DataLoader(test_dataset, batch_size=train_config['batch'], - shuffle=False, num_workers=0) - - edge_index_sets = [] - edge_index_sets.append(fc_edge_index) - - self.model = GDN(edge_index_sets, len(feature_map), - dim=train_config['dim'], - input_dim=train_config['slide_win'], - out_layer_num=train_config['out_layer_num'], - out_layer_inter_dim=train_config['out_layer_inter_dim'], - topk=train_config['topk'] - ).to(self.device) - - def run(self): - - if len(self.env_config['load_model_path']) > 0: - model_save_path = self.env_config['load_model_path'] - else: - model_save_path = self.get_save_path()[0] - - self.train_log = train(self.model, model_save_path, - config=train_config, - train_dataloader=self.train_dataloader, - val_dataloader=self.val_dataloader, - feature_map=self.feature_map, - test_dataloader=self.test_dataloader, - test_dataset=self.test_dataset, - train_dataset=self.train_dataset, - dataset_name=self.env_config['dataset'] - ) - - # test - self.model.load_state_dict(torch.load(model_save_path)) - best_model = self.model.to(self.device) - - _, self.test_result = test(best_model, self.test_dataloader) - _, self.val_result = test(best_model, self.val_dataloader) - - self.get_score(self.test_result, self.val_result) - - def get_score(self, test_result, val_result): - - feature_num = len(test_result[0][0]) - np_test_result = np.array(test_result) - np_val_result = np.array(val_result) - - test_labels = np_test_result[2, :, 0].tolist() - - test_scores, normal_scores = get_full_err_scores(test_result, val_result) - - top1_best_info = get_best_performance_data(test_scores, test_labels, topk=1) - top1_val_info = get_val_performance_data(test_scores, normal_scores, test_labels, topk=1) - - print('=========================** Result **============================\n') - - info = None - if self.env_config['report'] == 'best': - info = top1_best_info - elif self.env_config['report'] == 'val': - info = top1_val_info - - print(f'F1 score: {info[0]}') - print(f'precision: {info[1]}') - print(f'recall: {info[2]}\n') - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument('-batch', help='batch size', type=int, default=128) - parser.add_argument('-epoch', help='train epoch', type=int, default=100) - parser.add_argument('-slide_win', help='slide_win', type=int, default=15) - parser.add_argument('-dim', help='dimension', type=int, default=64) - parser.add_argument('-slide_stride', help='slide_stride', type=int, default=5) - parser.add_argument('-save_path_pattern', help='save path pattern', type=str, default='') - parser.add_argument('-dataset', help='wadi / swat', type=str, default='wadi') - parser.add_argument('-device', help='cuda / cpu', type=str, default='cuda') - parser.add_argument('-random_seed', help='random seed', type=int, default=0) - parser.add_argument('-comment', help='experiment comment', type=str, 
default='') - parser.add_argument('-out_layer_num', help='outlayer num', type=int, default=1) - parser.add_argument('-out_layer_inter_dim', help='out_layer_inter_dim', type=int, default=256) - parser.add_argument('-decay', help='decay', type=float, default=0) - parser.add_argument('-val_ratio', help='val ratio', type=float, default=0.1) - parser.add_argument('-topk', help='topk num', type=int, default=20) - parser.add_argument('-report', help='best / val', type=str, default='best') - parser.add_argument('-load_model_path', help='trained model path', type=str, default='') - - args = parser.parse_args() - - random.seed(args.random_seed) - np.random.seed(args.random_seed) - torch.manual_seed(args.random_seed) - torch.cuda.manual_seed(args.random_seed) - torch.cuda.manual_seed_all(args.random_seed) - torch.backends.cudnn.benchmark = False - torch.backends.cudnn.deterministic = True - os.environ['PYTHONHASHSEED'] = str(args.random_seed) - - train_config = { - 'batch': args.batch, - 'epoch': args.epoch, - 'slide_win': args.slide_win, - 'dim': args.dim, - 'slide_stride': args.slide_stride, - 'comment': args.comment, - 'seed': args.random_seed, - 'out_layer_num': args.out_layer_num, - 'out_layer_inter_dim': args.out_layer_inter_dim, - 'decay': args.decay, - 'val_ratio': args.val_ratio, - 'topk': args.topk, - } - - env_config = { - 'save_path': args.save_path_pattern, - 'dataset': args.dataset, - 'report': args.report, - 'device': args.device, - 'load_model_path': args.load_model_path - } - - main = GDNMain(train_config, env_config, debug=False) - main.run() +# if __name__ == "__main__": +# parser = argparse.ArgumentParser() +# +# parser.add_argument('-batch', help='batch size', type=int, default=128) +# parser.add_argument('-epoch', help='train epoch', type=int, default=100) +# parser.add_argument('-slide_win', help='slide_win', type=int, default=15) +# parser.add_argument('-dim', help='dimension', type=int, default=64) +# parser.add_argument('-slide_stride', help='slide_stride', type=int, default=5) +# parser.add_argument('-save_path_pattern', help='save path pattern', type=str, default='') +# parser.add_argument('-dataset', help='wadi / swat', type=str, default='wadi') +# parser.add_argument('-device', help='cuda / cpu', type=str, default='cuda') +# parser.add_argument('-random_seed', help='random seed', type=int, default=0) +# parser.add_argument('-comment', help='experiment comment', type=str, default='') +# parser.add_argument('-out_layer_num', help='outlayer num', type=int, default=1) +# parser.add_argument('-out_layer_inter_dim', help='out_layer_inter_dim', type=int, default=256) +# parser.add_argument('-decay', help='decay', type=float, default=0) +# parser.add_argument('-val_ratio', help='val ratio', type=float, default=0.1) +# parser.add_argument('-topk', help='topk num', type=int, default=20) +# parser.add_argument('-report', help='best / val', type=str, default='best') +# parser.add_argument('-load_model_path', help='trained model path', type=str, default='') +# +# args = parser.parse_args() +# +# random.seed(args.random_seed) +# np.random.seed(args.random_seed) +# torch.manual_seed(args.random_seed) +# torch.cuda.manual_seed(args.random_seed) +# torch.cuda.manual_seed_all(args.random_seed) +# torch.backends.cudnn.benchmark = False +# torch.backends.cudnn.deterministic = True +# os.environ['PYTHONHASHSEED'] = str(args.random_seed) +# +# train_config = { +# 'batch': args.batch, +# 'epoch': args.epoch, +# 'slide_win': args.slide_win, +# 'dim': args.dim, +# 'slide_stride': 
args.slide_stride, +# 'comment': args.comment, +# 'seed': args.random_seed, +# 'out_layer_num': args.out_layer_num, +# 'out_layer_inter_dim': args.out_layer_inter_dim, +# 'decay': args.decay, +# 'val_ratio': args.val_ratio, +# 'topk': args.topk, +# } +# +# env_config = { +# 'save_path': args.save_path_pattern, +# 'dataset': args.dataset, +# 'report': args.report, +# 'device': args.device, +# 'load_model_path': args.load_model_path +# } +# +# main = GDNMain(train_config, env_config, debug=False) +# main.run() diff --git a/gdn/GDN/models/GDN.py b/gdn/GDN/models/GDN.py index e967790..0f97514 100644 --- a/gdn/GDN/models/GDN.py +++ b/gdn/GDN/models/GDN.py @@ -3,8 +3,8 @@ import matplotlib.pyplot as plt import torch.nn as nn import time -from util.time import * -from util.env import * +from GDN.util.time import * +from GDN.util.env import * from torch_geometric.nn import GCNConv, GATConv, EdgeConv import math import torch.nn.functional as F @@ -79,10 +79,10 @@ def forward(self, x, edge_index, embedding=None, node_num=0): return self.relu(out) -class GDN(nn.Module): +class GDNModule(nn.Module): def __init__(self, edge_index_sets, node_num, dim=64, out_layer_inter_dim=256, input_dim=10, out_layer_num=1, topk=20): - super(GDN, self).__init__() + super(GDNModule, self).__init__() self.edge_index_sets = edge_index_sets diff --git a/gdn/GDN/test.py b/gdn/GDN/test.py index 58ae625..80ee248 100644 --- a/gdn/GDN/test.py +++ b/gdn/GDN/test.py @@ -1,21 +1,7 @@ -import numpy as np -import torch -import matplotlib.pyplot as plt import torch.nn as nn import time -from util.time import * -from util.env import * - -import argparse -import matplotlib.pyplot as plt -from torch.utils.data import Dataset, DataLoader -import pandas as pd -import torch.nn.functional as F - - -from util.data import * -from util.preprocess import * - +from GDN.util.time import * +from GDN.util.env import * def test(model, dataloader): @@ -42,13 +28,11 @@ def test(model, dataloader): acu_loss = 0 for x, y, labels, edge_index in dataloader: x, y, labels, edge_index = [item.to(device).float() for item in [x, y, labels, edge_index]] - + with torch.no_grad(): predicted = model(x, edge_index).float().to(device) - - + loss = loss_func(predicted, y) - labels = labels.unsqueeze(1).repeat(1, predicted.shape[1]) @@ -60,24 +44,19 @@ def test(model, dataloader): t_test_predicted_list = torch.cat((t_test_predicted_list, predicted), dim=0) t_test_ground_list = torch.cat((t_test_ground_list, y), dim=0) t_test_labels_list = torch.cat((t_test_labels_list, labels), dim=0) - + test_loss_list.append(loss.item()) acu_loss += loss.item() - + i += 1 if i % 10000 == 1 and i > 1: print(timeSincePlus(now, i / test_len)) + test_predicted_list = t_test_predicted_list.tolist() + test_ground_list = t_test_ground_list.tolist() + test_labels_list = t_test_labels_list.tolist() - test_predicted_list = t_test_predicted_list.tolist() - test_ground_list = t_test_ground_list.tolist() - test_labels_list = t_test_labels_list.tolist() - - avg_loss = sum(test_loss_list)/len(test_loss_list) + avg_loss = sum(test_loss_list) / len(test_loss_list) return avg_loss, [test_predicted_list, test_ground_list, test_labels_list] - - - - diff --git a/gdn/GDN/train.py b/gdn/GDN/train.py index 934bd50..e47c0c2 100644 --- a/gdn/GDN/train.py +++ b/gdn/GDN/train.py @@ -1,20 +1,11 @@ -import numpy as np -import torch -import matplotlib.pyplot as plt -import torch.nn as nn import time -from util.time import * -from util.env import * -from sklearn.metrics import mean_squared_error -from test import 
* -import torch.nn.functional as F -import numpy as np -from evaluate import get_best_performance_data, get_val_performance_data, get_full_err_scores -from sklearn.metrics import precision_score, recall_score, roc_auc_score, f1_score -from torch.utils.data import DataLoader, random_split, Subset -from scipy.stats import iqr +from GDN.test import test +from GDN.util.time import * +from GDN.util.env import * +import torch +import torch.nn.functional as F def loss_func(y_pred, y_true): @@ -23,21 +14,19 @@ def loss_func(y_pred, y_true): return loss - -def train(model = None, save_path = '', config={}, train_dataloader=None, val_dataloader=None, feature_map={}, test_dataloader=None, test_dataset=None, dataset_name='swat', train_dataset=None): - +def train(model=None, save_path='', config={}, train_dataloader=None, val_dataloader=None, feature_map={}, + test_dataloader=None, test_dataset=None, dataset_name='swat', train_dataset=None): seed = config['seed'] optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=config['decay']) now = time.time() - + train_loss_list = [] cmp_loss_list = [] device = get_device() - acu_loss = 0 min_loss = 1e+8 min_f1 = 0 @@ -68,22 +57,20 @@ def train(model = None, save_path = '', config={}, train_dataloader=None, val_d optimizer.zero_grad() out = model(x, edge_index).float().to(device) loss = loss_func(out, labels) - + loss.backward() optimizer.step() - train_loss_list.append(loss.item()) acu_loss += loss.item() - - i += 1 + i += 1 # each epoch print('epoch ({} / {}) (Loss:{:.8f}, ACU_loss:{:.8f})'.format( - i_epoch, epoch, - acu_loss/len(dataloader), acu_loss), flush=True - ) + i_epoch, epoch, + acu_loss / len(dataloader), acu_loss), flush=True + ) # use val dataset to judge if val_dataloader is not None: @@ -98,15 +85,12 @@ def train(model = None, save_path = '', config={}, train_dataloader=None, val_d else: stop_improve_count += 1 - if stop_improve_count >= early_stop_win: break else: - if acu_loss < min_loss : + if acu_loss < min_loss: torch.save(model.state_dict(), save_path) min_loss = acu_loss - - return train_loss_list diff --git a/gdn/GDN/util/net_struct.py b/gdn/GDN/util/net_struct.py index 398b4c8..8ae2513 100644 --- a/gdn/GDN/util/net_struct.py +++ b/gdn/GDN/util/net_struct.py @@ -1,5 +1,3 @@ -import glob - import pandas as pd diff --git a/gdn/__init__.py b/gdn/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/gdn/algorithm.py b/gdn/algorithm.py index f90efdd..8c03f53 100644 --- a/gdn/algorithm.py +++ b/gdn/algorithm.py @@ -18,11 +18,13 @@ class CustomParameters: latent_size: int = 64 n_out_layers: int = 1 out_layer_dimensionality: int = 1 + topk: int = 20 epochs: int = 1 batch_size: int = 128 split: float = 0.9 learning_rate_decay: float = 0.001 random_state: int = 42 + device: str = 'cpu' class AlgorithmArgs(argparse.Namespace): @@ -31,7 +33,7 @@ def ts(self) -> pd.DataFrame: return self.df.iloc[:, 1:-1] @property - def tsa(self) -> pd.Dataframe: + def tsa(self) -> pd.DataFrame: return self.df.iloc[:, 1:] @property @@ -63,7 +65,7 @@ def train(args: AlgorithmArgs): "out_layer_inter_dim": args.customParameters.out_layer_dimensionality, "decay": args.customParameters.learning_rate_decay, "val_ratio": args.customParameters.split, - "topk": 20, + "topk": ts.shape[1] if args.customParameters.topk > ts.shape[1] else args.customParameters.topk } # load data @@ -76,9 +78,6 @@ def train(args: AlgorithmArgs): GDNtrain(train_config, env_config) - # TODO remove - raise NotImplementedError("GDN is not implemented yet!") - def 
execute(args: AlgorithmArgs): ts = args.ts @@ -92,9 +91,6 @@ def execute(args: AlgorithmArgs): GDNtest(env_config) - # TODO remove - raise NotImplementedError("GDN is not implemented yet!") - def set_random_state(config: AlgorithmArgs) -> None: seed = config.customParameters.random_state @@ -102,8 +98,8 @@ def set_random_state(config: AlgorithmArgs) -> None: random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) - torch.cuda.manual_seed(args.random_seed) - torch.cuda.manual_seed_all(args.random_seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True diff --git a/gdn/requirements.txt b/gdn/requirements.txt index 8db9a89..1c436c7 100644 --- a/gdn/requirements.txt +++ b/gdn/requirements.txt @@ -1,4 +1,7 @@ numpy>=1.19.5 pandas>=1.2.1 -torch==1.5.1 +torch==1.13.1 torch-geometric==1.5.0 +torch-scatter==2.1.1 +torch-sparse==0.6.17 +matplotlib==3.7.2 From 652683b1e539acbf7eecfcd8680ae36a0b97caf2 Mon Sep 17 00:00:00 2001 From: 2er0 <2er0@dbaumi.at> Date: Mon, 21 Aug 2023 22:43:37 +0200 Subject: [PATCH 5/6] align model and asset storing to TimeEval structure --- gdn/GDN/evaluate.py | 64 +++++++++++++++++++++++++---------------- gdn/GDN/main.py | 70 ++++++++++++++++++++++++++------------------- gdn/algorithm.py | 4 +-- 3 files changed, 82 insertions(+), 56 deletions(-) diff --git a/gdn/GDN/evaluate.py b/gdn/GDN/evaluate.py index ae4110d..ffcfdd7 100644 --- a/gdn/GDN/evaluate.py +++ b/gdn/GDN/evaluate.py @@ -1,4 +1,4 @@ -from util.data import * +from GDN.util.data import * import numpy as np from sklearn.metrics import precision_score, recall_score, roc_auc_score, f1_score @@ -7,15 +7,15 @@ def get_full_err_scores(test_result, val_result): np_test_result = np.array(test_result) np_val_result = np.array(val_result) - all_scores = None + all_scores = None all_normals = None feature_num = np_test_result.shape[-1] labels = np_test_result[2, :, 0].tolist() for i in range(feature_num): - test_re_list = np_test_result[:2,:,i] - val_re_list = np_val_result[:2,:,i] + test_re_list = np_test_result[:2, :, i] + val_re_list = np_val_result[:2, :, i] scores = get_err_scores(test_re_list, val_re_list) normal_dist = get_err_scores(val_re_list, val_re_list) @@ -44,7 +44,6 @@ def get_final_err_scores(test_result, val_result): return all_scores - def get_err_scores(test_res, val_res): test_predict, test_gt = test_res val_predict, val_gt = val_res @@ -52,43 +51,42 @@ def get_err_scores(test_res, val_res): n_err_mid, n_err_iqr = get_err_median_and_iqr(test_predict, test_gt) test_delta = np.abs(np.subtract( - np.array(test_predict).astype(np.float64), - np.array(test_gt).astype(np.float64) - )) - epsilon=1e-2 + np.array(test_predict).astype(np.float64), + np.array(test_gt).astype(np.float64) + )) + epsilon = 1e-2 - err_scores = (test_delta - n_err_mid) / ( np.abs(n_err_iqr) +epsilon) + err_scores = (test_delta - n_err_mid) / (np.abs(n_err_iqr) + epsilon) smoothed_err_scores = np.zeros(err_scores.shape) before_num = 3 for i in range(before_num, len(err_scores)): - smoothed_err_scores[i] = np.mean(err_scores[i-before_num:i+1]) + smoothed_err_scores[i] = np.mean(err_scores[i - before_num:i + 1]) - return smoothed_err_scores - def get_loss(predict, gt): return eval_mseloss(predict, gt) + def get_f1_scores(total_err_scores, gt_labels, topk=1): print('total_err_scores', total_err_scores.shape) # remove the highest and lowest score at each timestep total_features = total_err_scores.shape[0] # topk_indices = 
np.argpartition(total_err_scores, range(total_features-1-topk, total_features-1), axis=0)[-topk-1:-1] - topk_indices = np.argpartition(total_err_scores, range(total_features-topk-1, total_features), axis=0)[-topk:] - + topk_indices = np.argpartition(total_err_scores, range(total_features - topk - 1, total_features), axis=0)[-topk:] + topk_indices = np.transpose(topk_indices) total_topk_err_scores = [] - topk_err_score_map=[] + topk_err_score_map = [] # topk_anomaly_sensors = [] for i, indexs in enumerate(topk_indices): - - sum_score = sum( score for k, score in enumerate(sorted([total_err_scores[index, i] for j, index in enumerate(indexs)])) ) + sum_score = sum( + score for k, score in enumerate(sorted([total_err_scores[index, i] for j, index in enumerate(indexs)]))) total_topk_err_scores.append(sum_score) @@ -96,13 +94,14 @@ def get_f1_scores(total_err_scores, gt_labels, topk=1): return final_topk_fmeas + def get_val_performance_data(total_err_scores, normal_scores, gt_labels, topk=1): total_features = total_err_scores.shape[0] - topk_indices = np.argpartition(total_err_scores, range(total_features-topk-1, total_features), axis=0)[-topk:] + topk_indices = np.argpartition(total_err_scores, range(total_features - topk - 1, total_features), axis=0)[-topk:] total_topk_err_scores = [] - topk_err_score_map=[] + topk_err_score_map = [] total_topk_err_scores = np.sum(np.take_along_axis(total_err_scores, topk_indices, axis=0), axis=0) @@ -120,25 +119,23 @@ def get_val_performance_data(total_err_scores, normal_scores, gt_labels, topk=1) f1 = f1_score(gt_labels, pred_labels) - auc_score = roc_auc_score(gt_labels, total_topk_err_scores) return f1, pre, rec, auc_score, thresold def get_best_performance_data(total_err_scores, gt_labels, topk=1): - total_features = total_err_scores.shape[0] # topk_indices = np.argpartition(total_err_scores, range(total_features-1-topk, total_features-1), axis=0)[-topk-1:-1] - topk_indices = np.argpartition(total_err_scores, range(total_features-topk-1, total_features), axis=0)[-topk:] + topk_indices = np.argpartition(total_err_scores, range(total_features - topk - 1, total_features), axis=0)[-topk:] total_topk_err_scores = [] - topk_err_score_map=[] + topk_err_score_map = [] total_topk_err_scores = np.sum(np.take_along_axis(total_err_scores, topk_indices, axis=0), axis=0) - final_topk_fmeas ,thresolds = eval_scores(total_topk_err_scores, gt_labels, 400, return_thresold=True) + final_topk_fmeas, thresolds = eval_scores(total_topk_err_scores, gt_labels, 400, return_thresold=True) th_i = final_topk_fmeas.index(max(final_topk_fmeas)) thresold = thresolds[th_i] @@ -157,3 +154,20 @@ def get_best_performance_data(total_err_scores, gt_labels, topk=1): return max(final_topk_fmeas), pre, rec, auc_score, thresold + +def get_best_performance_data_sequence(total_err_scores, gt_labels, topk=1): + total_features = total_err_scores.shape[0] + + topk_indices = np.argpartition(total_err_scores, range(total_features - topk - 1, total_features), axis=0)[-topk:] + + total_topk_err_scores = np.sum(np.take_along_axis(total_err_scores, topk_indices, axis=0), axis=0) + + final_topk_fmeas, thresolds = eval_scores(total_topk_err_scores, gt_labels, 400, return_thresold=True) + + th_i = final_topk_fmeas.index(max(final_topk_fmeas)) + thresold = thresolds[th_i] + + pred_labels = np.zeros(len(total_topk_err_scores)) + pred_labels[total_topk_err_scores > thresold] = 1 + + return max(final_topk_fmeas), total_topk_err_scores, pred_labels, thresold, gt_labels diff --git a/gdn/GDN/main.py 
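The evaluation helpers above all reduce the per-sensor deviation scores to a single score per timestamp via a top-k aggregation before any thresholding. A minimal numpy sketch of that pattern follows; the input array is random placeholder data with an assumed shape of `[n_sensors, n_timestamps]`, not real model output:

```python
# Top-k aggregation as used by get_best_performance_data(_sequence).
import numpy as np

total_err_scores = np.abs(np.random.randn(5, 100))   # [n_sensors, n_timestamps]
topk = 1
n_sensors = total_err_scores.shape[0]

# indices of the top-k highest-scoring sensors at each timestamp
topk_indices = np.argpartition(
    total_err_scores, range(n_sensors - topk - 1, n_sensors), axis=0
)[-topk:]

# sum the k largest sensor deviations per timestamp -> one anomaly score per point
total_topk_err_scores = np.sum(
    np.take_along_axis(total_err_scores, topk_indices, axis=0), axis=0
)
print(total_topk_err_scores.shape)                    # (100,)
```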
b/gdn/GDN/main.py index e5d15b7..30a0e7b 100644 --- a/gdn/GDN/main.py +++ b/gdn/GDN/main.py @@ -14,6 +14,8 @@ from GDN.util.env import get_device, set_device from GDN.util.preprocess import build_loc_net, construct_data from GDN.util.net_struct import get_feature_map, get_fc_graph_struc +from GDN.evaluate import get_full_err_scores, get_best_performance_data, get_val_performance_data, \ + get_best_performance_data_sequence from GDN.datasets.TimeDataset import TimeDataset @@ -57,9 +59,9 @@ def GDNtrain(train_config: dict, env_config: dict) -> None: model = create_gdn_model(train_config, edge_index_sets, feature_map, device) - model_save_path = get_save_path(env_config["save_model_path"])[0] + save_path = get_save_path(env_config["modelOutput"]) - train_log = train(model, model_save_path, + train_log = train(model, save_path[0], config=train_config, train_dataloader=train_dataloader, val_dataloader=val_dataloader, @@ -71,15 +73,18 @@ def GDNtrain(train_config: dict, env_config: dict) -> None: ) _, train_result = test(model, full_train_dataloader) - save_result_output(train_result, env_config) + _, val_result = test(model, val_dataloader) + top1_best_info = get_score(train_result, val_result) - save_config_element(env_config["save_model_path"], - train_config, feature_map, fc_edge_index) + save_result_output(top1_best_info[4], env_config) + + save_config_element(save_path, + train_config, feature_map, fc_edge_index, val_result) def GDNtest(env_config: dict) -> None: - elements = load_config_element() - train_config, feature_map, fc_edge_index = elements[0], elements[1], elements[2] + elements = load_config_element(env_config["modelInput"]) + train_config, feature_map, fc_edge_index, val_result = elements[0], elements[1], elements[2], elements[3] set_device(env_config["device"] if "device" in env_config else @@ -90,8 +95,9 @@ def GDNtest(env_config: dict) -> None: edge_index_sets.append(fc_edge_index) test_ts = env_config["dataset"] + # TODO check how to work around the need for the label during the test phase test_dataset_indata = construct_data(test_ts, feature_map, - labels=test["is_anomaly"].tolist()) + labels=0) # test_ts["is_anomaly"].tolist()) cfg = { 'slide_win': train_config['slide_win'], @@ -103,7 +109,7 @@ def GDNtest(env_config: dict) -> None: test_dataloader = DataLoader(test_dataset, batch_size=train_config['batch'], shuffle=False, num_workers=0) - model_load_path = get_save_path(env_config["load_model_path"])[0] + model_load_path = get_save_path(env_config["modelInput"])[0] model = create_gdn_model(train_config, edge_index_sets, feature_map, device) @@ -112,7 +118,9 @@ def GDNtest(env_config: dict) -> None: best_model = model.to(device) _, test_result = test(best_model, test_dataloader) - save_result_output(test_result, env_config["dataOutput"]) + top1_best_info = get_score(test_result, val_result) + + save_result_output(top1_best_info[4], env_config) def create_gdn_model(train_config, @@ -149,22 +157,27 @@ def get_loaders(train_dataset, seed, batch, val_ratio=0.1): return train_dataloader, val_dataloader -def get_save_path(model_path: str): - # TODO change to TimeEval format of given locations - if len(model_path) > 0: - base_dir = os.path.dirname(model_path) - else: - base_dir = "results" +def get_score(full_result, val_result): + np_result = np.array(full_result) + + labels = np_result[2, :, 0].tolist() + + scores, _ = get_full_err_scores(full_result, val_result) + + top1_info = get_best_performance_data_sequence(scores, labels, topk=1) - now = datetime.now() - datestr = 
now.strftime('%m|%d-%H:%M:%S') + return top1_info + + +def get_save_path(model_path: str): + base_dir = os.path.dirname(model_path) paths = [ - f'{base_dir}/best_{datestr}.pt', - f'{base_dir}/results.tmp', + f'{model_path}', f'{base_dir}/train_config.pkl', f'{base_dir}/feature_map.pkl', - f'{base_dir}/fc_edge_index.pkl' + f'{base_dir}/fc_edge_index.pkl', + f'{base_dir}/val_result.pkl' ] for path in paths: @@ -174,18 +187,17 @@ def get_save_path(model_path: str): return paths -def save_config_element(model_path, train_config, feature_map, fc_edge_index) -> None: - paths = get_save_path(model_path) - for p, e in zip(paths[-3:], - [train_config, feature_map, fc_edge_index]): +def save_config_element(paths, train_config, feature_map, fc_edge_index, val_result) -> None: + for p, e in zip(paths[1:], + [train_config, feature_map, fc_edge_index, val_result]): with open(p, 'wb') as file: pkl.dump(e, file, protocol=pkl.HIGHEST_PROTOCOL) -def load_config_element() -> List[Any]: - paths = get_save_path() +def load_config_element(model_path) -> List[Any]: + paths = get_save_path(model_path) elements = [] - for p in paths[-3:]: + for p in paths[1:]: if not Path(p).exists(): raise FileNotFoundError("Base element not found in required path." "Run training first", p) @@ -195,7 +207,7 @@ def load_config_element() -> List[Any]: def save_result_output(result, env_config) -> None: - path = get_save_path(env_config["save_model_path"])[1].replace("results.tmp", env_config["dataOutput"]) + path = env_config["dataOutput"] np_result = np.array(result) np.savetxt(path, np_result, delimiter=",") diff --git a/gdn/algorithm.py b/gdn/algorithm.py index 8c03f53..c45a812 100644 --- a/gdn/algorithm.py +++ b/gdn/algorithm.py @@ -72,7 +72,7 @@ def train(args: AlgorithmArgs): env_config = { "dataset": ts, "dataOutput": args.dataOutput, - "save_model_path": args.modelOutput, + "modelOutput": args.modelOutput, "device": args.customParameters.device } @@ -85,7 +85,7 @@ def execute(args: AlgorithmArgs): env_config = { "dataset": ts, "dataOutput": args.dataOutput, - "load_model_path": args.modelInput, + "modelInput": args.modelInput, "device": args.customParameters.device } From d704a3d9f978d93679e1d0285c36cfa15b8c33ac Mon Sep 17 00:00:00 2001 From: 2er0 <2er0@dbaumi.at> Date: Thu, 21 Sep 2023 11:50:36 +0200 Subject: [PATCH 6/6] first full draft of GDN with python-base update to python version 3.10 --- 0-base-images/python3-base/Dockerfile | 2 +- 0-base-images/python3-base/requirements.txt | 10 +++--- 0-base-images/python3-torch/Dockerfile | 4 +-- README.md | 6 ++-- gdn/Dockerfile | 6 ++-- gdn/GDN/dataloader_fix.py | 2 +- gdn/GDN/evaluate.py | 2 +- gdn/GDN/main.py | 1 - gdn/GDN/util/data.py | 39 ++++++++++----------- gdn/README.md | 5 ++- gdn/algorithm.py | 7 ++-- gdn/requirements.txt | 2 -- 12 files changed, 44 insertions(+), 42 deletions(-) diff --git a/0-base-images/python3-base/Dockerfile b/0-base-images/python3-base/Dockerfile index 92624e4..4f7111f 100644 --- a/0-base-images/python3-base/Dockerfile +++ b/0-base-images/python3-base/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.7.9-slim-buster +FROM python:3.10-slim LABEL maintainer="sebastian.schmidl@hpi.de" diff --git a/0-base-images/python3-base/requirements.txt b/0-base-images/python3-base/requirements.txt index ee4661f..182d331 100644 --- a/0-base-images/python3-base/requirements.txt +++ b/0-base-images/python3-base/requirements.txt @@ -1,5 +1,5 @@ -numpy==1.20.0 -pandas==1.2.1 -matplotlib==3.3.4 -scipy==1.6.0 -scikit-learn==0.24.1 +numpy>=1.20.0 +pandas>=1.2.1 
+matplotlib>=3.3.4 +scipy>=1.6.0 +scikit-learn>=0.24.1 diff --git a/0-base-images/python3-torch/Dockerfile b/0-base-images/python3-torch/Dockerfile index 1d412be..56f6a8a 100644 --- a/0-base-images/python3-torch/Dockerfile +++ b/0-base-images/python3-torch/Dockerfile @@ -1,5 +1,5 @@ -FROM registry.gitlab.hpi.de/akita/i/python3-base +FROM registry.gitlab.hpi.de/akita/i/python3-base:0.2.6 LABEL maintainer="phillip.wenig@hpi.de" -RUN pip install --no-cache-dir torch==1.7.1 +RUN pip install --no-cache-dir torch==1.13.1 diff --git a/README.md b/README.md index 7ade96f..78c48bf 100644 --- a/README.md +++ b/README.md @@ -179,11 +179,9 @@ Follow the below steps to test your algorithm using Docker (examples assume that docker run --rm \ -v $(pwd)/1-data:/data:ro \ -v $(pwd)/2-results:/results:rw \ - # -e LOCAL_UID= \ - # -e LOCAL_GID= \ - registry.gitlab.hpi.de/akita/i/:latest execute-algorithm '{ + registry.gitlab.hpi.de/akita/i/gdn:0.2.6 execute-algorithm '{ "executionType": "train", - "dataInput": "/data/dataset.csv", + "dataInput": "/data/multi-dataset.csv", "dataOutput": "/results/anomaly_scores.ts", "modelInput": "/results/model.pkl", "modelOutput": "/results/model.pkl", diff --git a/gdn/Dockerfile b/gdn/Dockerfile index 4673677..d77ac9a 100644 --- a/gdn/Dockerfile +++ b/gdn/Dockerfile @@ -1,4 +1,4 @@ -FROM registry.gitlab.hpi.de/akita/i/python3-base:0.2.6 +FROM registry.gitlab.hpi.de/akita/i/python3-torch:0.2.6 LABEL maintainer="2er0@dbaumi.at" @@ -7,10 +7,12 @@ ENV ALGORITHM_MAIN="/app/algorithm.py" # install algorithm dependencies COPY requirements.txt /app/ RUN apt-get update; \ - apt-get install -y gcc python3-dev; \ + apt-get install -y gcc g++ python3-dev; \ apt-get clean; \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* RUN pip install -r /app/requirements.txt COPY algorithm.py /app/ COPY GDN /app/GDN +# fixing six.py dataloader issue +COPY GDN/dataloader_fix.py /usr/local/lib/python3.10/site-packages/torch_geometric/data/dataloader.py diff --git a/gdn/GDN/dataloader_fix.py b/gdn/GDN/dataloader_fix.py index 8dbb43c..6fad4a1 100644 --- a/gdn/GDN/dataloader_fix.py +++ b/gdn/GDN/dataloader_fix.py @@ -9,7 +9,7 @@ int_classes = (bool, int) - +# NOTE: This overrides the default dataloader from torch_geometric to fix an issue class Collater(object): def __init__(self, follow_batch): self.follow_batch = follow_batch diff --git a/gdn/GDN/evaluate.py b/gdn/GDN/evaluate.py index ffcfdd7..ac71928 100644 --- a/gdn/GDN/evaluate.py +++ b/gdn/GDN/evaluate.py @@ -162,7 +162,7 @@ def get_best_performance_data_sequence(total_err_scores, gt_labels, topk=1): total_topk_err_scores = np.sum(np.take_along_axis(total_err_scores, topk_indices, axis=0), axis=0) - final_topk_fmeas, thresolds = eval_scores(total_topk_err_scores, gt_labels, 400, return_thresold=True) + final_topk_fmeas, thresolds = eval_scores(total_topk_err_scores, gt_labels, 400, return_threshold=True) th_i = final_topk_fmeas.index(max(final_topk_fmeas)) thresold = thresolds[th_i] diff --git a/gdn/GDN/main.py b/gdn/GDN/main.py index 30a0e7b..e234d4a 100644 --- a/gdn/GDN/main.py +++ b/gdn/GDN/main.py @@ -95,7 +95,6 @@ def GDNtest(env_config: dict) -> None: edge_index_sets.append(fc_edge_index) test_ts = env_config["dataset"] - # TODO check how to work around the need for the label during the test phase test_dataset_indata = construct_data(test_ts, feature_map, labels=0) # test_ts["is_anomaly"].tolist()) diff --git a/gdn/GDN/util/data.py b/gdn/GDN/util/data.py index 3e3e8c8..b6518d9 100644 --- a/gdn/GDN/util/data.py +++ b/gdn/GDN/util/data.py @@ 
-6,17 +6,17 @@ from numpy import percentile -def get_attack_interval(attack): +def get_attack_interval(attack): heads = [] tails = [] for i in range(len(attack)): if attack[i] == 1: - if attack[i-1] == 0: + if attack[i - 1] == 0: heads.append(i) - - if i < len(attack)-1 and attack[i+1] == 0: + + if i < len(attack) - 1 and attack[i + 1] == 0: tails.append(i) - elif i == len(attack)-1: + elif i == len(attack) - 1: tails.append(i) res = [] for i in range(len(heads)): @@ -24,9 +24,10 @@ def get_attack_interval(attack): # print(heads, tails) return res + # calculate F1 scores -def eval_scores(scores, true_scores, th_steps, return_thresold=False): - padding_list = [0]*(len(true_scores) - len(scores)) +def eval_scores(scores, true_scores, th_steps, return_threshold=False): + padding_list = [0] * (len(true_scores) - len(scores)) # print(padding_list) if len(padding_list) > 0: @@ -43,19 +44,18 @@ def eval_scores(scores, true_scores, th_steps, return_thresold=False): fmeas[i] = f1_score(true_scores, cur_pred) - score_index = scores_sorted.tolist().index(int(th_vals[i] * len(scores)+1)) + score_index = scores_sorted.tolist().index(int(th_vals[i] * len(scores) + 1)) thresholds[i] = scores[score_index] - if return_thresold: + if return_threshold: return fmeas, thresholds return fmeas -def eval_mseloss(predicted, ground_truth): +def eval_mseloss(predicted, ground_truth): ground_truth_list = np.array(ground_truth) predicted_list = np.array(predicted) - # mask = (ground_truth_list == 0) | (predicted_list == 0) # ground_truth_list = ground_truth_list[~mask] @@ -72,8 +72,8 @@ def eval_mseloss(predicted, ground_truth): return loss -def get_err_median_and_iqr(predicted, groundtruth): +def get_err_median_and_iqr(predicted, groundtruth): np_arr = np.abs(np.subtract(np.array(predicted), np.array(groundtruth))) err_median = np.median(np_arr) @@ -81,28 +81,28 @@ def get_err_median_and_iqr(predicted, groundtruth): return err_median, err_iqr -def get_err_median_and_quantile(predicted, groundtruth, percentage): +def get_err_median_and_quantile(predicted, groundtruth, percentage): np_arr = np.abs(np.subtract(np.array(predicted), np.array(groundtruth))) err_median = np.median(np_arr) # err_iqr = iqr(np_arr) - err_delta = percentile(np_arr, int(percentage*100)) - percentile(np_arr, int((1-percentage)*100)) + err_delta = percentile(np_arr, int(percentage * 100)) - percentile(np_arr, int((1 - percentage) * 100)) return err_median, err_delta -def get_err_mean_and_quantile(predicted, groundtruth, percentage): +def get_err_mean_and_quantile(predicted, groundtruth, percentage): np_arr = np.abs(np.subtract(np.array(predicted), np.array(groundtruth))) err_median = trim_mean(np_arr, percentage) # err_iqr = iqr(np_arr) - err_delta = percentile(np_arr, int(percentage*100)) - percentile(np_arr, int((1-percentage)*100)) + err_delta = percentile(np_arr, int(percentage * 100)) - percentile(np_arr, int((1 - percentage) * 100)) return err_median, err_delta -def get_err_mean_and_std(predicted, groundtruth): +def get_err_mean_and_std(predicted, groundtruth): np_arr = np.abs(np.subtract(np.array(predicted), np.array(groundtruth))) err_mean = np.mean(np_arr) @@ -112,8 +112,7 @@ def get_err_mean_and_std(predicted, groundtruth): def get_f1_score(scores, gt, contamination): - - padding_list = [0]*(len(gt) - len(scores)) + padding_list = [0] * (len(gt) - len(scores)) # print(padding_list) threshold = percentile(scores, 100 * (1 - contamination)) @@ -123,4 +122,4 @@ def get_f1_score(scores, gt, contamination): pred_labels = (scores > 
threshold).astype('int').ravel() - return f1_score(gt, pred_labels) \ No newline at end of file + return f1_score(gt, pred_labels) diff --git a/gdn/README.md b/gdn/README.md index 4efe63d..c037090 100644 --- a/gdn/README.md +++ b/gdn/README.md @@ -11,6 +11,9 @@ ## Dependencies - python 3 +- numpy +- pandas - pytorch - pytorch-geometric -- torch-scatter, torch-sparse, torch-cluster, torch-spline-conv +- torch-scatter, +- torch-sparse, \ No newline at end of file diff --git a/gdn/algorithm.py b/gdn/algorithm.py index c45a812..19b15fb 100644 --- a/gdn/algorithm.py +++ b/gdn/algorithm.py @@ -19,7 +19,7 @@ class CustomParameters: n_out_layers: int = 1 out_layer_dimensionality: int = 1 topk: int = 20 - epochs: int = 1 + epochs: int = 10 batch_size: int = 128 split: float = 0.9 learning_rate_decay: float = 0.001 @@ -30,7 +30,10 @@ class CustomParameters: class AlgorithmArgs(argparse.Namespace): @property def ts(self) -> pd.DataFrame: - return self.df.iloc[:, 1:-1] + ds = self.df.iloc[:, 1:-1] + if ds.shape[1] == 1: + raise RuntimeError("No multivariate time series provided as dataset") + return ds @property def tsa(self) -> pd.DataFrame: diff --git a/gdn/requirements.txt b/gdn/requirements.txt index 1c436c7..66fc7d7 100644 --- a/gdn/requirements.txt +++ b/gdn/requirements.txt @@ -1,7 +1,5 @@ numpy>=1.19.5 pandas>=1.2.1 -torch==1.13.1 torch-geometric==1.5.0 torch-scatter==2.1.1 torch-sparse==0.6.17 -matplotlib==3.7.2
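Putting the pieces together, a hypothetical local smoke test of the entrypoint; it mirrors the JSON call convention shown in the README's docker example, and all paths and parameter values below are placeholders rather than part of the patch:

```python
# Hypothetical local invocation of gdn/algorithm.py: the wrapper reads a single
# JSON argument (see AlgorithmArgs.from_sys_args); in the TimeEval convention,
# executionType selects between the train and execute phases.
import json
import subprocess

call_args = {
    "executionType": "train",
    "dataInput": "/data/multi-dataset.csv",
    "dataOutput": "/results/anomaly_scores.ts",
    "modelInput": "/results/model.pkl",
    "modelOutput": "/results/model.pkl",
    "customParameters": {"epochs": 1, "window_size": 15, "device": "cpu"},
}
subprocess.run(["python", "algorithm.py", json.dumps(call_args)], check=True)
```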