Skip to content

Commit

Permalink
Merge pull request #72 from redBorder/fix_always_anomaly_in_shallow_outliers
Browse files Browse the repository at this point in the history
  • Loading branch information
malvads authored May 31, 2024
2 parents 6c5dc30 + 3ba229b commit 8bed888
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 16 deletions.
74 changes: 59 additions & 15 deletions resources/src/ai/shallow_outliers.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,31 @@

class ShallowOutliers:
"""
Shallow AI model for outliers detection. Used whenever there is no deep learning model defined.
Input data should be 1-dimensional.
Shallow AI model for detecting outliers in 1-dimensional data. Utilized when a deep learning model is not defined.
Args:
(None)
sensitivity (float, optional): A value between 0 and 1 that adjusts the threshold for identifying anomalies.
At 1, at least one anomaly is always identified. At 0, no anomalies are identified. Default is 0.95.
contamination (float, optional): A value between 0 and 1 that indicates the proportion of data points
to be considered anomalous during training. Default is 0.01.
"""

def __init__(self, sensitivity=0.95, contamination=0.01):
"""
Initializes the ShallowOutliers model.
Args:
sensitivity (float, optional): A value between 0 and 1 that adjusts the threshold for identifying anomalies.
At 1, at least one anomaly is always identified. At 0, no anomalies are identified. Default is 0.95.
contamination (float, optional): A value between 0 and 1 that indicates the proportion of data points
to be considered anomalous during training. Default is 0.01.
"""
self.sens = sensitivity
self.cont = contamination


def predict(self, arr):
"""
Given an array of data points, makes a smoothed prediction of the data. To do so,
Expand Down Expand Up @@ -67,16 +86,16 @@ def predict(self, arr):
smooth_arr = np.convolve(padded_arr, kernel, mode='valid')
return smooth_arr

def get_outliers(self, arr, smoothed_arr, other=None):
    """
    Flag outliers in a 1D series by comparing it with a smoothed approximation.

    An isolation forest is fit on a per-point feature matrix built from the
    smoothed value, the absolute difference between the original and smoothed
    values, and the sign of that difference; the `self.cont` most isolated
    fraction of points is treated as anomalous during fitting. The forest's
    decision offset is then rescaled with `self.sens`: at sensitivity 1 the
    contamination-based threshold is kept unchanged, at sensitivity 0 the
    threshold is pushed to -1 so nothing is flagged.

    Args:
        arr (numpy.ndarray): 1D numpy array where the outliers shall be detected.
        smoothed_arr (numpy.ndarray): 1D numpy array with a smoothed approximation of arr.
        other (numpy.ndarray, optional): extra per-point features of shape
            (len(arr), k) appended to the feature matrix (e.g. encoded
            timestamps). Default is None.

    Returns:
        numpy.ndarray: 1D boolean array with the same length as arr, True where
        the data point is an outlier and False otherwise.
    """
    error = arr - smoothed_arr
    sign = np.sign(error)
    data = np.stack((smoothed_arr, np.abs(error), sign), axis=1)
    if other is not None:
        data = np.concatenate([data, other], axis=1)
    # random_state is pinned so repeated runs over the same data agree.
    model = IsolationForest(n_estimators=100, contamination=self.cont, random_state=42)
    model.fit(data)
    # Rescale the decision threshold by sensitivity (see docstring).
    model.offset_ = self.sens * (1 + model.offset_) - 1
    outliers = model.predict(data) == -1
    return outliers

def encode_timestamp(self, timestamp):
    """
    Encode timestamps into periodic features for the outlier model.

    Takes a pandas Series of timestamps and returns a numpy array with a sine
    encoding for the hour of day (daily period) and a cosine encoding for the
    day of the week (weekly period). This encoding helps the model learn
    periodic patterns in the data while maintaining simplicity.

    Args:
        timestamp (pd.Series): A Pandas Series of timestamps (anything
            accepted by pd.to_datetime).

    Returns:
        numpy.ndarray: array of shape (len(timestamp), 2) whose columns are
        [daily_sin, weekly_cos].

    Raises:
        ValueError: if `timestamp` is not a Pandas Series.
    """
    if not isinstance(timestamp, pd.Series):
        raise ValueError("Input must be a Pandas Series")
    timestamp = pd.to_datetime(timestamp)
    # Fractional hour within the day and fractional day within the week.
    hour_of_day = timestamp.dt.hour + timestamp.dt.minute/60
    day_of_week = timestamp.dt.dayofweek + hour_of_day/24
    daily_sin = np.sin(2*np.pi*hour_of_day/24)
    weekly_cos = np.cos(2*np.pi*day_of_week/7)
    encoded = np.stack((daily_sin, weekly_cos), axis=1)
    return encoded

def compute_json(self, raw_json):
"""
Main method used for anomaly detection.
Expand All @@ -111,7 +155,8 @@ def compute_json(self, raw_json):
data = pd.json_normalize(raw_json)
arr = data.iloc[:, 1].values
smoothed_arr = self.predict(arr)
outliers = self.get_outliers(arr, smoothed_arr)
encoded_timestamp = self.encode_timestamp(data["timestamp"])
outliers = self.get_outliers(arr, smoothed_arr, other=encoded_timestamp)
data["smooth"] = smoothed_arr
predicted = data[["timestamp","smooth"]].rename(columns={"smooth":"forecast"})
anomalies = data[["timestamp","smooth"]].rename(columns={"smooth":"expected"}).loc[outliers]
Expand All @@ -121,13 +166,12 @@ def compute_json(self, raw_json):
"status": "success"
}

def execute_prediction_model(self, data):
    """
    Run the shallow anomaly-detection pipeline on `data`, never raising.

    Args:
        data: raw JSON-like records forwarded to compute_json
            (presumably a list of dicts accepted by pd.json_normalize —
            TODO confirm against the REST caller).

    Returns:
        dict: the result of compute_json on success; otherwise the error
        payload built by self.return_error, after logging the failure.
    """
    try:
        return self.compute_json(data)
    except Exception as e:
        # Best-effort endpoint: log and return a structured error instead
        # of propagating, so the REST layer always gets a JSON response.
        logger.logger.error("Could not execute shallow model")
        return self.return_error(e)

@staticmethod
def return_error(error="error"):
Expand Down
4 changes: 4 additions & 0 deletions resources/src/config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ backup_path=./backups/
#target_sensors=FlowSensor
model_names=traffic

[ShallowOutliers]
sensitivity=0.95
contamination=0.01

[Druid]
druid_endpoint=http://x.x.x.x:8080/druid/v2/

Expand Down
6 changes: 5 additions & 1 deletion resources/src/server/rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ def __init__(self):
self.app = Flask(__name__)
self.app.add_url_rule('/api/v1/outliers', view_func=self.calculate, methods=['POST'])
self.exit_code = 0
self.shallow = shallow_outliers.ShallowOutliers(
sensitivity = config.get("ShallowOutliers", "sensitivity"),
contamination = config.get("ShallowOutliers", "contamination")
)
self.ai_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "ai")
self.deep_models={}

Expand Down Expand Up @@ -200,7 +204,7 @@ def execute_model(self, data, metric, model='default'):

try:
if model == 'default':
return jsonify(shallow_outliers.ShallowOutliers.execute_prediction_model(data))
return jsonify(self.shallow.execute_prediction_model(data))
if model not in self.deep_models:
logger.logger.info(f"Creating instance of model {model}")
self.deep_models[model]=outliers.Autoencoder(
Expand Down

0 comments on commit 8bed888

Please sign in to comment.