From ee8067d7d05e579a8336bf8238941ed070e54295 Mon Sep 17 00:00:00 2001
From: Dennis Lau <dennishylau@gmail.com>
Date: Fri, 22 Oct 2021 20:39:01 +0800
Subject: [PATCH] feat: tried 2 novelty detection models (One-Class SVM & Local
 Outlier Factor) in sklearn

Closes #4
---
 notebooks/sklearn/sklearn.ipynb | 570 ++++++++++++++++++++++++++++++++
 1 file changed, 570 insertions(+)
 create mode 100644 notebooks/sklearn/sklearn.ipynb

diff --git a/notebooks/sklearn/sklearn.ipynb b/notebooks/sklearn/sklearn.ipynb
new file mode 100644
index 0000000..5ebf788
--- /dev/null
+++ b/notebooks/sklearn/sklearn.ipynb
@@ -0,0 +1,570 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# sklearn novelty detection"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Docs: https://scikit-learn.org/stable/modules/outlier_detection.html#novelty-detection\n",
+    "\n",
+    "As defined in the above link, the difference between outlier detection and novelty detection is that outlier detection is unsupervised, while novelty detection is semi-supervised.\n",
+    "\n",
+    "In our case, the first part of each data file contains no abnormal data points, so it can be used as clean training data for novelty detection.\n",
+    "\n",
+    "Unlike anomalies, novelties are considered normal after they have been detected once. [1](https://arxiv.org/pdf/2004.00433.pdf)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Matrix Profile Benchmark"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'002_UCR_Anomaly_35000.txt': 56598,\n",
+       " '004_UCR_Anomaly_2500.txt': 5484,\n",
+       " '005_UCR_Anomaly_4000.txt': 5299,\n",
+       " '006_UCR_Anomaly_4000.txt': 5672,\n",
+       " '007_UCR_Anomaly_4000.txt': 6447,\n",
+       " '008_UCR_Anomaly_4000.txt': 7171,\n",
+       " '010_UCR_Anomaly_4000.txt': 5974,\n",
+       " '012_UCR_Anomaly_15000.txt': 25849,\n",
+       " '013_UCR_Anomaly_15000.txt': 16000,\n",
+       " '014_UCR_Anomaly_8000.txt': 16972,\n",
+       " '016_UCR_Anomaly_5000.txt': 16949,\n",
+       " '017_UCR_Anomaly_5000.txt': 16948,\n",
+       " '018_UCR_Anomaly_8000.txt': 16920,\n",
+       " '019_UCR_Anomaly_5000.txt': 5340,\n",
+       " '021_UCR_Anomaly_5000.txt': 11029,\n",
+       " '022_UCR_Anomaly_4000.txt': 8394,\n",
+       " '024_UCR_Anomaly_3200.txt': 4476,\n",
+       " '025_UCR_Anomaly_2800.txt': 5556,\n",
+       " '026_UCR_Anomaly_1700.txt': 5710,\n",
+       " '027_UCR_Anomaly_1200.txt': 5670,\n",
+       " '028_UCR_Anomaly_1600.txt': 3114,\n",
+       " '031_UCR_Anomaly_2700.txt': 3471,\n",
+       " '032_UCR_Anomaly_1000.txt': 4699,\n",
+       " '033_UCR_Anomaly_4000.txt': 6147,\n",
+       " '034_UCR_Anomaly_1500.txt': 3694,\n",
+       " '035_UCR_Anomaly_2500.txt': 5818,\n",
+       " '036_UCR_Anomaly_4200.txt': 5215,\n",
+       " '037_UCR_Anomaly_5000.txt': 29801,\n",
+       " '041_UCR_Anomaly_7000.txt': 29626,\n",
+       " '042_UCR_Anomaly_7000.txt': 29706,\n",
+       " '043_UCR_Anomaly_10000.txt': 14903,\n",
+       " '044_UCR_Anomaly_9000.txt': 18753,\n",
+       " '045_UCR_Anomaly_14000.txt': 14214,\n",
+       " '047_UCR_Anomaly_18000.txt': 23996,\n",
+       " '048_UCR_Anomaly_3500.txt': 5988,\n",
+       " '049_UCR_Anomaly_3500.txt': 4424,\n",
+       " '050_UCR_Anomaly_3500.txt': 5225,\n",
+       " '051_UCR_Anomaly_3500.txt': 9323,\n",
+       " '052_UCR_Anomaly_3500.txt': 4720,\n",
+       " '053_UCR_Anomaly_1500.txt': 5735,\n",
+       " '054_UCR_Anomaly_2700.txt': 5736,\n",
+       " '055_UCR_Anomaly_10000.txt': 20926,\n",
+       " '056_UCR_Anomaly_5000.txt': 11126,\n",
+       " '058_UCR_Anomaly_10000.txt': 12200,\n",
+       " '059_UCR_Anomaly_20000.txt': 59849,\n",
+       " '061_UCR_Anomaly_24500.txt': 56689,\n",
+       " '062_UCR_Anomaly_18500.txt': 57874,\n",
+       " '063_UCR_Anomaly_18500.txt': 62468,\n",
+       " '065_UCR_Anomaly_3000.txt': 8741,\n",
+       " '066_UCR_Anomaly_3700.txt': 7975,\n",
+       " '069_UCR_Anomaly_3200.txt': 8464,\n",
+       " '070_UCR_Anomaly_17555.txt': 53442,\n",
+       " '072_UCR_Anomaly_20000.txt': 24028,\n",
+       " '074_UCR_Anomaly_4000.txt': 12664,\n",
+       " '075_UCR_Anomaly_4000.txt': 13649,\n",
+       " '076_UCR_Anomaly_48000.txt': 98333,\n",
+       " '084_UCR_Anomaly_12000.txt': 35742,\n",
+       " '086_UCR_Anomaly_20000.txt': 43457,\n",
+       " '087_UCR_Anomaly_17000.txt': 42341,\n",
+       " '088_UCR_Anomaly_20000.txt': 53440,\n",
+       " '090_UCR_Anomaly_50000.txt': 55937,\n",
+       " '094_UCR_Anomaly_2500.txt': 5483,\n",
+       " '096_UCR_Anomaly_5000.txt': 16953,\n",
+       " '097_UCR_Anomaly_5000.txt': 10804,\n",
+       " '098_UCR_Anomaly_1200.txt': 2193,\n",
+       " '099_UCR_Anomaly_1500.txt': 3522,\n",
+       " '101_UCR_Anomaly_6000.txt': 29912,\n",
+       " '103_UCR_Anomaly_3500.txt': 4721,\n",
+       " '104_UCR_Anomaly_6000.txt': 37038,\n",
+       " '107_UCR_Anomaly_5200.txt': 8740,\n",
+       " '111_UCR_Anomaly_35000.txt': 46662,\n",
+       " '112_UCR_Anomaly_2500.txt': 5483,\n",
+       " '113_UCR_Anomaly_4000.txt': 7171,\n",
+       " '114_UCR_Anomaly_4000.txt': 5684,\n",
+       " '115_UCR_Anomaly_4000.txt': 6508,\n",
+       " '116_UCR_Anomaly_4000.txt': 7171,\n",
+       " '117_UCR_Anomaly_4000.txt': 4866,\n",
+       " '118_UCR_Anomaly_4000.txt': 5994,\n",
+       " '119_UCR_Anomaly_10000.txt': 11931,\n",
+       " '120_UCR_Anomaly_15000.txt': 25849,\n",
+       " '122_UCR_Anomaly_8000.txt': 16986,\n",
+       " '123_UCR_Anomaly_5000.txt': 17098,\n",
+       " '124_UCR_Anomaly_5000.txt': 16953,\n",
+       " '125_UCR_Anomaly_5000.txt': 16951,\n",
+       " '126_UCR_Anomaly_8000.txt': 17071,\n",
+       " '127_UCR_Anomaly_5000.txt': 11456,\n",
+       " '129_UCR_Anomaly_5000.txt': 11456,\n",
+       " '131_UCR_Anomaly_5000.txt': 11456,\n",
+       " '132_UCR_Anomaly_3200.txt': 4475,\n",
+       " '133_UCR_Anomaly_2800.txt': 5556,\n",
+       " '134_UCR_Anomaly_1700.txt': 5804,\n",
+       " '135_UCR_Anomaly_1200.txt': 4189,\n",
+       " '136_UCR_Anomaly_1600.txt': 3292,\n",
+       " '137_UCR_Anomaly_2300.txt': 4569,\n",
+       " '138_UCR_Anomaly_3000.txt': 4183,\n",
+       " '139_UCR_Anomaly_2700.txt': 5791,\n",
+       " '140_UCR_Anomaly_1000.txt': 4697,\n",
+       " '141_UCR_Anomaly_4000.txt': 6160,\n",
+       " '142_UCR_Anomaly_1500.txt': 3523,\n",
+       " '143_UCR_Anomaly_2500.txt': 5819,\n",
+       " '144_UCR_Anomaly_4200.txt': 6601,\n",
+       " '145_UCR_Anomaly_5000.txt': 29847,\n",
+       " '146_UCR_Anomaly_5000.txt': 29796,\n",
+       " '147_UCR_Anomaly_5000.txt': 29796,\n",
+       " '148_UCR_Anomaly_6000.txt': 29912,\n",
+       " '150_UCR_Anomaly_7000.txt': 29705,\n",
+       " '152_UCR_Anomaly_9000.txt': 18453,\n",
+       " '153_UCR_Anomaly_14000.txt': 14212,\n",
+       " '155_UCR_Anomaly_18000.txt': 23997,\n",
+       " '156_UCR_Anomaly_3500.txt': 5988,\n",
+       " '157_UCR_Anomaly_3500.txt': 5326,\n",
+       " '158_UCR_Anomaly_3500.txt': 5232,\n",
+       " '159_UCR_Anomaly_3500.txt': 9312,\n",
+       " '160_UCR_Anomaly_3500.txt': 4720,\n",
+       " '161_UCR_Anomaly_1500.txt': 5732,\n",
+       " '162_UCR_Anomaly_2700.txt': 5732,\n",
+       " '163_UCR_Anomaly_10000.txt': 20927,\n",
+       " '164_UCR_Anomaly_5000.txt': 11126,\n",
+       " '166_UCR_Anomaly_10000.txt': 12200,\n",
+       " '170_UCR_Anomaly_18500.txt': 33102,\n",
+       " '171_UCR_Anomaly_18500.txt': 31441,\n",
+       " '172_UCR_Anomaly_23400.txt': 38723,\n",
+       " '173_UCR_Anomaly_3000.txt': 8741,\n",
+       " '174_UCR_Anomaly_3700.txt': 8002,\n",
+       " '175_UCR_Anomaly_5200.txt': 8741,\n",
+       " '176_UCR_Anomaly_1300.txt': 6483,\n",
+       " '178_UCR_Anomaly_17555.txt': 53442,\n",
+       " '179_UCR_Anomaly_23000.txt': 34304,\n",
+       " '182_UCR_Anomaly_4000.txt': 12653,\n",
+       " '183_UCR_Anomaly_4000.txt': 13627,\n",
+       " '184_UCR_Anomaly_48000.txt': 68306,\n",
+       " '185_UCR_Anomaly_58000.txt': 68306,\n",
+       " '187_UCR_Anomaly_30000.txt': 68306,\n",
+       " '188_UCR_Anomaly_30000.txt': 68306,\n",
+       " '189_UCR_Anomaly_45000.txt': 158252,\n",
+       " '190_UCR_Anomaly_70000.txt': 128577,\n",
+       " '191_UCR_Anomaly_38000.txt': 68306,\n",
+       " '192_UCR_Anomaly_12000.txt': 35742,\n",
+       " '193_UCR_Anomaly_10000.txt': 35756,\n",
+       " '195_UCR_Anomaly_17000.txt': 43463,\n",
+       " '197_UCR_Anomaly_100000.txt': 114231,\n",
+       " '198_UCR_Anomaly_50000.txt': 124074,\n",
+       " '199_UCR_Anomaly_40000.txt': 113943,\n",
+       " '200_UCR_Anomaly_20000.txt': 67993,\n",
+       " '201_UCR_Anomaly_10000.txt': 16946,\n",
+       " '204_UCR_Anomaly_12412.txt': 14909,\n",
+       " '206_UCR_Anomaly_25130.txt': 25422,\n",
+       " '207_UCR_Anomaly_3165.txt': 26914,\n",
+       " '208_UCR_Anomaly_5130.txt': 27930,\n",
+       " '212_UCR_Anomaly_8913.txt': 14212,\n",
+       " '222_UCR_Anomaly_56123.txt': 91642,\n",
+       " '223_UCR_Anomaly_74123.txt': 133152,\n",
+       " '224_UCR_Anomaly_76123.txt': 92911,\n",
+       " '225_UCR_Anomaly_81214.txt': 143130,\n",
+       " '226_UCR_Anomaly_96123.txt': 123157,\n",
+       " '228_UCR_Anomaly_11361.txt': 30436,\n",
+       " '230_UCR_Anomaly_19363.txt': 39559,\n",
+       " '231_UCR_Anomaly_8763.txt': 47601,\n",
+       " '232_UCR_Anomaly_8763.txt': 57522,\n",
+       " '233_UCR_Anomaly_18913.txt': 27452,\n",
+       " '235_UCR_Anomaly_18913.txt': 96998,\n",
+       " '237_UCR_Anomaly_19313.txt': 32700,\n",
+       " '238_UCR_Anomaly_21311.txt': 72538,\n",
+       " '242_UCR_Anomaly_100000.txt': 188421,\n",
+       " '243_UCR_Anomaly_100000.txt': 169331,\n",
+       " '247_UCR_Anomaly_50211.txt': 55436,\n",
+       " '249_UCR_Anomaly_2753.txt': 4843}"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import json\n",
+    "\n",
+    "matrixprofile_file = '../matrixprofile.json'\n",
+    "with open(matrixprofile_file) as file:\n",
+    "    matrixprofile = json.load(file)\n",
+    "\n",
+    "# keep only the files whose record has a non-None merged_discord value\n",
+    "mp = {name: rec['merged_discord'] for name, rec in matrixprofile.items() if rec['merged_discord'] is not None}\n",
+    "mp"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Load Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "BASE_DIR = '../../data-sets/KDD-Cup/data/'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "threshold: 35000\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import pandas as pd\n",
+    "import re\n",
+    "import numpy as np\n",
+    "\n",
+    "# data files are named <id>_UCR_Anomaly_<pos>.txt; ignore anything else found in BASE_DIR\n",
+    "regex = re.compile(r'^\\d{3}_UCR_Anomaly_(?P<pos>\\d+)\\.txt$')\n",
+    "filenames = sorted(i for i in os.listdir(BASE_DIR) if regex.search(i))\n",
+    "filename = filenames[1]\n",
+    "df = pd.read_csv(os.path.join(BASE_DIR, filename), names=['series'])\n",
+    "result = regex.search(filename)  # cannot be None: filenames was filtered with the same regex\n",
+    "threshold = int(result.group('pos'))  # <pos> from the file name, used as the train/test split row\n",
+    "print(f'threshold: {threshold}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# rows before `threshold` (index 0..threshold-1) hold no abnormal points -> training set\n",
+    "X_train = df.series[df.index < threshold].to_numpy().reshape(-1, 1)\n",
+    "y_train = np.zeros(X_train.shape[0])  # all-normal labels (0) for the training rows\n",
+    "X_inf = df.series[df.index >= threshold].to_numpy().reshape(-1, 1)  # X_inf[0] is row `threshold`"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# One Class SVM"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Docs: https://scikit-learn.org/stable/auto_examples/svm/plot_oneclass.html#sphx-glr-auto-examples-svm-plot-oneclass-py"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "OneClassSVM(gamma=0.1, nu=0.1)"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn.svm import OneClassSVM\n",
+    "\n",
+    "# nu=0.1 and gamma=0.1 are passed explicitly; all other parameters keep their sklearn defaults\n",
+    "clf = OneClassSVM(nu=0.1, gamma=0.1)\n",
+    "# train on X_train only; predictions on X_train / X_inf are computed in the cells below\n",
+    "clf.fit(X_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(array([-1,  1]), array([11644, 23357]))"
+      ]
+     },
+     "execution_count": 37,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "y_hat_train = clf.predict(X_train)  # labels: +1 = inlier, -1 = outlier\n",
+    "np.unique(y_hat_train, return_counts=True)  # (distinct labels, how often each occurs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(array([-1,  1]), array([14941, 29853]))"
+      ]
+     },
+     "execution_count": 38,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "y_hat_test = clf.predict(X_inf)  # labels: +1 = inlier, -1 = outlier\n",
+    "np.unique(y_hat_test, return_counts=True)  # (distinct labels, how often each occurs)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Local Outlier Factor"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Docs: https://scikit-learn.org/stable/modules/outlier_detection.html#novelty-detection-with-local-outlier-factor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "LocalOutlierFactor(n_jobs=-1, novelty=True)"
+      ]
+     },
+     "execution_count": 62,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn.neighbors import LocalOutlierFactor\n",
+    "\n",
+    "# Note: with novelty=True, predict / score_samples are for unseen data only - do not call them on the training data!\n",
+    "lof = LocalOutlierFactor(novelty=True, n_jobs=-1)\n",
+    "lof.fit(X_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(array([-1,  1]), array([   16, 44984]))"
+      ]
+     },
+     "execution_count": 63,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "y_hat_test = lof.predict(X_inf)  # labels: +1 = inlier, -1 = outlier\n",
+    "np.unique(y_hat_test, return_counts=True)  # (distinct labels, how often each occurs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "lof_score_df = pd.DataFrame({'score': lof.score_samples(X_inf)})  # one LOF score per row of X_inf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>score</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>45000.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>-1.026708</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>0.056667</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>min</th>\n",
+       "      <td>-3.257690</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25%</th>\n",
+       "      <td>-1.046834</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>50%</th>\n",
+       "      <td>-1.014971</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>75%</th>\n",
+       "      <td>-0.991647</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>max</th>\n",
+       "      <td>-0.912334</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "              score\n",
+       "count  45000.000000\n",
+       "mean      -1.026708\n",
+       "std        0.056667\n",
+       "min       -3.257690\n",
+       "25%       -1.046834\n",
+       "50%       -1.014971\n",
+       "75%       -0.991647\n",
+       "max       -0.912334"
+      ]
+     },
+     "execution_count": 65,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "lof_score_df.describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "67095\n"
+     ]
+    }
+   ],
+   "source": [
+    "# LOF score: the lower, the more abnormal\n",
+    "# X_inf is the tail of df, so test position p corresponds to df row len(df) - len(X_inf) + p\n",
+    "inf_id = lof_score_df['score'].idxmin()\n",
+    "id = len(df) - len(X_inf) + inf_id\n",
+    "print(id)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "86fe9e53bc0e86f805aec5f2bbd6ea332f33b57c8652d8bb41887c9297450cae"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.9.4 64-bit ('5002-project': conda)",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.4"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}