From ee8067d7d05e579a8336bf8238941ed070e54295 Mon Sep 17 00:00:00 2001 From: Dennis Lau Date: Fri, 22 Oct 2021 20:39:01 +0800 Subject: [PATCH] feat: tried 2 novelty detection models (One-Class SVM & Local Outlier Factor) in sklearn Closes #4 --- notebooks/sklearn/sklearn.ipynb | 570 ++++++++++++++++++++++++++++++++ 1 file changed, 570 insertions(+) create mode 100644 notebooks/sklearn/sklearn.ipynb diff --git a/notebooks/sklearn/sklearn.ipynb b/notebooks/sklearn/sklearn.ipynb new file mode 100644 index 0000000..5ebf788 --- /dev/null +++ b/notebooks/sklearn/sklearn.ipynb @@ -0,0 +1,570 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# sklearn novelty detection" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Docs: https://scikit-learn.org/stable/modules/outlier_detection.html#novelty-detection\n", + "\n", + "As defined in the above link, the difference between outlier detection and novelty detection is that outlier detection is unsupervised, while novelty detection is semi-supervised.\n", + "\n", + "In our case, since the first part of each data file has no abnormal data points, it can be used for novelty detection.\n", + "\n", + "The difference from anomalies is that novelties are considered normal after being detected once. 
[1](https://arxiv.org/pdf/2004.00433.pdf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Matrix Profile Benchmark" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'002_UCR_Anomaly_35000.txt': 56598,\n", + " '004_UCR_Anomaly_2500.txt': 5484,\n", + " '005_UCR_Anomaly_4000.txt': 5299,\n", + " '006_UCR_Anomaly_4000.txt': 5672,\n", + " '007_UCR_Anomaly_4000.txt': 6447,\n", + " '008_UCR_Anomaly_4000.txt': 7171,\n", + " '010_UCR_Anomaly_4000.txt': 5974,\n", + " '012_UCR_Anomaly_15000.txt': 25849,\n", + " '013_UCR_Anomaly_15000.txt': 16000,\n", + " '014_UCR_Anomaly_8000.txt': 16972,\n", + " '016_UCR_Anomaly_5000.txt': 16949,\n", + " '017_UCR_Anomaly_5000.txt': 16948,\n", + " '018_UCR_Anomaly_8000.txt': 16920,\n", + " '019_UCR_Anomaly_5000.txt': 5340,\n", + " '021_UCR_Anomaly_5000.txt': 11029,\n", + " '022_UCR_Anomaly_4000.txt': 8394,\n", + " '024_UCR_Anomaly_3200.txt': 4476,\n", + " '025_UCR_Anomaly_2800.txt': 5556,\n", + " '026_UCR_Anomaly_1700.txt': 5710,\n", + " '027_UCR_Anomaly_1200.txt': 5670,\n", + " '028_UCR_Anomaly_1600.txt': 3114,\n", + " '031_UCR_Anomaly_2700.txt': 3471,\n", + " '032_UCR_Anomaly_1000.txt': 4699,\n", + " '033_UCR_Anomaly_4000.txt': 6147,\n", + " '034_UCR_Anomaly_1500.txt': 3694,\n", + " '035_UCR_Anomaly_2500.txt': 5818,\n", + " '036_UCR_Anomaly_4200.txt': 5215,\n", + " '037_UCR_Anomaly_5000.txt': 29801,\n", + " '041_UCR_Anomaly_7000.txt': 29626,\n", + " '042_UCR_Anomaly_7000.txt': 29706,\n", + " '043_UCR_Anomaly_10000.txt': 14903,\n", + " '044_UCR_Anomaly_9000.txt': 18753,\n", + " '045_UCR_Anomaly_14000.txt': 14214,\n", + " '047_UCR_Anomaly_18000.txt': 23996,\n", + " '048_UCR_Anomaly_3500.txt': 5988,\n", + " '049_UCR_Anomaly_3500.txt': 4424,\n", + " '050_UCR_Anomaly_3500.txt': 5225,\n", + " '051_UCR_Anomaly_3500.txt': 9323,\n", + " '052_UCR_Anomaly_3500.txt': 4720,\n", + " '053_UCR_Anomaly_1500.txt': 5735,\n", + " 
'054_UCR_Anomaly_2700.txt': 5736,\n", + " '055_UCR_Anomaly_10000.txt': 20926,\n", + " '056_UCR_Anomaly_5000.txt': 11126,\n", + " '058_UCR_Anomaly_10000.txt': 12200,\n", + " '059_UCR_Anomaly_20000.txt': 59849,\n", + " '061_UCR_Anomaly_24500.txt': 56689,\n", + " '062_UCR_Anomaly_18500.txt': 57874,\n", + " '063_UCR_Anomaly_18500.txt': 62468,\n", + " '065_UCR_Anomaly_3000.txt': 8741,\n", + " '066_UCR_Anomaly_3700.txt': 7975,\n", + " '069_UCR_Anomaly_3200.txt': 8464,\n", + " '070_UCR_Anomaly_17555.txt': 53442,\n", + " '072_UCR_Anomaly_20000.txt': 24028,\n", + " '074_UCR_Anomaly_4000.txt': 12664,\n", + " '075_UCR_Anomaly_4000.txt': 13649,\n", + " '076_UCR_Anomaly_48000.txt': 98333,\n", + " '084_UCR_Anomaly_12000.txt': 35742,\n", + " '086_UCR_Anomaly_20000.txt': 43457,\n", + " '087_UCR_Anomaly_17000.txt': 42341,\n", + " '088_UCR_Anomaly_20000.txt': 53440,\n", + " '090_UCR_Anomaly_50000.txt': 55937,\n", + " '094_UCR_Anomaly_2500.txt': 5483,\n", + " '096_UCR_Anomaly_5000.txt': 16953,\n", + " '097_UCR_Anomaly_5000.txt': 10804,\n", + " '098_UCR_Anomaly_1200.txt': 2193,\n", + " '099_UCR_Anomaly_1500.txt': 3522,\n", + " '101_UCR_Anomaly_6000.txt': 29912,\n", + " '103_UCR_Anomaly_3500.txt': 4721,\n", + " '104_UCR_Anomaly_6000.txt': 37038,\n", + " '107_UCR_Anomaly_5200.txt': 8740,\n", + " '111_UCR_Anomaly_35000.txt': 46662,\n", + " '112_UCR_Anomaly_2500.txt': 5483,\n", + " '113_UCR_Anomaly_4000.txt': 7171,\n", + " '114_UCR_Anomaly_4000.txt': 5684,\n", + " '115_UCR_Anomaly_4000.txt': 6508,\n", + " '116_UCR_Anomaly_4000.txt': 7171,\n", + " '117_UCR_Anomaly_4000.txt': 4866,\n", + " '118_UCR_Anomaly_4000.txt': 5994,\n", + " '119_UCR_Anomaly_10000.txt': 11931,\n", + " '120_UCR_Anomaly_15000.txt': 25849,\n", + " '122_UCR_Anomaly_8000.txt': 16986,\n", + " '123_UCR_Anomaly_5000.txt': 17098,\n", + " '124_UCR_Anomaly_5000.txt': 16953,\n", + " '125_UCR_Anomaly_5000.txt': 16951,\n", + " '126_UCR_Anomaly_8000.txt': 17071,\n", + " '127_UCR_Anomaly_5000.txt': 11456,\n", + " 
'129_UCR_Anomaly_5000.txt': 11456,\n", + " '131_UCR_Anomaly_5000.txt': 11456,\n", + " '132_UCR_Anomaly_3200.txt': 4475,\n", + " '133_UCR_Anomaly_2800.txt': 5556,\n", + " '134_UCR_Anomaly_1700.txt': 5804,\n", + " '135_UCR_Anomaly_1200.txt': 4189,\n", + " '136_UCR_Anomaly_1600.txt': 3292,\n", + " '137_UCR_Anomaly_2300.txt': 4569,\n", + " '138_UCR_Anomaly_3000.txt': 4183,\n", + " '139_UCR_Anomaly_2700.txt': 5791,\n", + " '140_UCR_Anomaly_1000.txt': 4697,\n", + " '141_UCR_Anomaly_4000.txt': 6160,\n", + " '142_UCR_Anomaly_1500.txt': 3523,\n", + " '143_UCR_Anomaly_2500.txt': 5819,\n", + " '144_UCR_Anomaly_4200.txt': 6601,\n", + " '145_UCR_Anomaly_5000.txt': 29847,\n", + " '146_UCR_Anomaly_5000.txt': 29796,\n", + " '147_UCR_Anomaly_5000.txt': 29796,\n", + " '148_UCR_Anomaly_6000.txt': 29912,\n", + " '150_UCR_Anomaly_7000.txt': 29705,\n", + " '152_UCR_Anomaly_9000.txt': 18453,\n", + " '153_UCR_Anomaly_14000.txt': 14212,\n", + " '155_UCR_Anomaly_18000.txt': 23997,\n", + " '156_UCR_Anomaly_3500.txt': 5988,\n", + " '157_UCR_Anomaly_3500.txt': 5326,\n", + " '158_UCR_Anomaly_3500.txt': 5232,\n", + " '159_UCR_Anomaly_3500.txt': 9312,\n", + " '160_UCR_Anomaly_3500.txt': 4720,\n", + " '161_UCR_Anomaly_1500.txt': 5732,\n", + " '162_UCR_Anomaly_2700.txt': 5732,\n", + " '163_UCR_Anomaly_10000.txt': 20927,\n", + " '164_UCR_Anomaly_5000.txt': 11126,\n", + " '166_UCR_Anomaly_10000.txt': 12200,\n", + " '170_UCR_Anomaly_18500.txt': 33102,\n", + " '171_UCR_Anomaly_18500.txt': 31441,\n", + " '172_UCR_Anomaly_23400.txt': 38723,\n", + " '173_UCR_Anomaly_3000.txt': 8741,\n", + " '174_UCR_Anomaly_3700.txt': 8002,\n", + " '175_UCR_Anomaly_5200.txt': 8741,\n", + " '176_UCR_Anomaly_1300.txt': 6483,\n", + " '178_UCR_Anomaly_17555.txt': 53442,\n", + " '179_UCR_Anomaly_23000.txt': 34304,\n", + " '182_UCR_Anomaly_4000.txt': 12653,\n", + " '183_UCR_Anomaly_4000.txt': 13627,\n", + " '184_UCR_Anomaly_48000.txt': 68306,\n", + " '185_UCR_Anomaly_58000.txt': 68306,\n", + " '187_UCR_Anomaly_30000.txt': 
68306,\n", + " '188_UCR_Anomaly_30000.txt': 68306,\n", + " '189_UCR_Anomaly_45000.txt': 158252,\n", + " '190_UCR_Anomaly_70000.txt': 128577,\n", + " '191_UCR_Anomaly_38000.txt': 68306,\n", + " '192_UCR_Anomaly_12000.txt': 35742,\n", + " '193_UCR_Anomaly_10000.txt': 35756,\n", + " '195_UCR_Anomaly_17000.txt': 43463,\n", + " '197_UCR_Anomaly_100000.txt': 114231,\n", + " '198_UCR_Anomaly_50000.txt': 124074,\n", + " '199_UCR_Anomaly_40000.txt': 113943,\n", + " '200_UCR_Anomaly_20000.txt': 67993,\n", + " '201_UCR_Anomaly_10000.txt': 16946,\n", + " '204_UCR_Anomaly_12412.txt': 14909,\n", + " '206_UCR_Anomaly_25130.txt': 25422,\n", + " '207_UCR_Anomaly_3165.txt': 26914,\n", + " '208_UCR_Anomaly_5130.txt': 27930,\n", + " '212_UCR_Anomaly_8913.txt': 14212,\n", + " '222_UCR_Anomaly_56123.txt': 91642,\n", + " '223_UCR_Anomaly_74123.txt': 133152,\n", + " '224_UCR_Anomaly_76123.txt': 92911,\n", + " '225_UCR_Anomaly_81214.txt': 143130,\n", + " '226_UCR_Anomaly_96123.txt': 123157,\n", + " '228_UCR_Anomaly_11361.txt': 30436,\n", + " '230_UCR_Anomaly_19363.txt': 39559,\n", + " '231_UCR_Anomaly_8763.txt': 47601,\n", + " '232_UCR_Anomaly_8763.txt': 57522,\n", + " '233_UCR_Anomaly_18913.txt': 27452,\n", + " '235_UCR_Anomaly_18913.txt': 96998,\n", + " '237_UCR_Anomaly_19313.txt': 32700,\n", + " '238_UCR_Anomaly_21311.txt': 72538,\n", + " '242_UCR_Anomaly_100000.txt': 188421,\n", + " '243_UCR_Anomaly_100000.txt': 169331,\n", + " '247_UCR_Anomaly_50211.txt': 55436,\n", + " '249_UCR_Anomaly_2753.txt': 4843}" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import json\n", + "\n", + "matrixprofile_file = '../matrixprofile.json'\n", + "with open(matrixprofile_file) as file:\n", + " matrixprofile = json.load(file)\n", + "\n", + "# only pick out records with merged_discord\n", + "mp = {k: v['merged_discord'] for (k,v) in matrixprofile.items() if v['merged_discord'] is not None}\n", + "mp" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "# Load Data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "BASE_DIR = '../../data-sets/KDD-Cup/data/'" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "threshold: 35000\n" + ] + } + ], + "source": [ + "import os\n", + "import pandas as pd\n", + "import re\n", + "import numpy as np\n", + "\n", + "filenames = sorted([i for i in os.listdir(BASE_DIR) if 'txt' in i])\n", + "\n", + "filename = filenames[1]\n", + "df = pd.read_csv(BASE_DIR + filename, names=['series'])\n", + "regex = re.compile(r'^\\d{3}_UCR_Anomaly_(?P\\d+)\\.txt$')\n", + "result = regex.search(filename)\n", + "threshold = int(result.group('pos'))\n", + "print(f'threshold: {threshold}')" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "X_train = df.series[df.index <= threshold].to_numpy().reshape(-1, 1)\n", + "# no abnormal data points before threshold, so all 0\n", + "y_train = np.zeros(X_train.shape[0])\n", + "X_inf = df.series[df.index > threshold].to_numpy().reshape(-1, 1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# One Class SVM" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Docs: https://scikit-learn.org/stable/auto_examples/svm/plot_oneclass.html#sphx-glr-auto-examples-svm-plot-oneclass-py" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "OneClassSVM(gamma=0.1, nu=0.1)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.svm import OneClassSVM\n", + "\n", + "clf = OneClassSVM(nu=0.1, gamma=0.1)\n", + "clf.fit(X_train)\n", + "# print(clf.predict(X_train))\n", + "# print(clf.predict(X_inf))" + ] + }, + { + 
"cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([-1, 1]), array([11644, 23357]))" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_hat_train = clf.predict(X_train)\n", + "np.unique(y_hat_train, return_counts=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([-1, 1]), array([14941, 29853]))" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_hat_test = clf.predict(X_inf)\n", + "np.unique(y_hat_test, return_counts=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Local Outlier Factor" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Docs: https://scikit-learn.org/stable/modules/outlier_detection.html#novelty-detection-with-local-outlier-factor" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LocalOutlierFactor(n_jobs=-1, novelty=True)" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.neighbors import LocalOutlierFactor\n", + "\n", + "lof = LocalOutlierFactor(novelty=True, n_jobs=-1)\n", + "lof.fit(X_train)\n", + "# Note: do not call predict on training data!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([-1, 1]), array([ 16, 44984]))" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_hat_test = lof.predict(X_inf)\n", + "np.unique(y_hat_test, return_counts=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "lof_score_df = pd.DataFrame.from_dict({'score': lof.score_samples(X_inf)})" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
score
count45000.000000
mean-1.026708
std0.056667
min-3.257690
25%-1.046834
50%-1.014971
75%-0.991647
max-0.912334
\n", + "
" + ], + "text/plain": [ + " score\n", + "count 45000.000000\n", + "mean -1.026708\n", + "std 0.056667\n", + "min -3.257690\n", + "25% -1.046834\n", + "50% -1.014971\n", + "75% -0.991647\n", + "max -0.912334" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lof_score_df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "67095\n" + ] + } + ], + "source": [ + "# score the lower, the more abnormal\n", + "\n", + "inf_id = lof_score_df['score'].idxmin()\n", + "id = threshold + inf_id\n", + "print(id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "interpreter": { + "hash": "86fe9e53bc0e86f805aec5f2bbd6ea332f33b57c8652d8bb41887c9297450cae" + }, + "kernelspec": { + "display_name": "Python 3.9.4 64-bit ('5002-project': conda)", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.4" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}