Skip to content

Commit

Permalink
Merge branch 'master' of github.com:redBorder/rb-aioutliers into add_…
Browse files Browse the repository at this point in the history
…zookeeper_support

Merge done to get up to date before pull request.
  • Loading branch information
Pablo Rodríguez Flores committed Jun 11, 2024
2 parents 4f580f4 + e49f5eb commit 4f50f4f
Show file tree
Hide file tree
Showing 6 changed files with 89 additions and 16 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,11 @@ Author: PRodriguezFlores
release - 202405261349.0.0 - 2024-05-26 13:49:31
* Upgraded Gunicorn version to be compatible with 22.0.0

Author: PRodriguezFlores
release - 202405311717.0.0 - 2024-05-31 17:17:18
* Fixed an issue where there would always be at least 1 anomaly detected, no matter the data, when using the shallow outliers models.

Author: PRodriguezFlores
release - 202406070335.0.0 - 2024-06-07 03:35:54
* Fixed an issue where the shallow outliers wouldn't initialize properly in the REST API because it was not converting some values from the config to float.

74 changes: 59 additions & 15 deletions resources/src/ai/shallow_outliers.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,31 @@

class ShallowOutliers:
"""
Shallow AI model for outliers detection. Used whenever there is no deep learning model defined.
Input data should be 1-dimensional.
Shallow AI model for detecting outliers in 1-dimensional data. Utilized when a deep learning model is not defined.
Args:
(None)
sensitivity (float, optional): A value between 0 and 1 that adjusts the threshold for identifying anomalies.
At 1, at least one anomaly is always identified. At 0, no anomalies are identified. Default is 0.95.
contamination (float, optional): A value between 0 and 1 that indicates the proportion of data points
to be considered anomalous during training. Default is 0.01.
"""

def __init__(self, sensitivity=0.95, contamination=0.01):
"""
Initializes the ShallowOutliers model.
Args:
sensitivity (float, optional): A value between 0 and 1 that adjusts the threshold for identifying anomalies.
At 1, at least one anomaly is always identified. At 0, no anomalies are identified. Default is 0.95.
contamination (float, optional): A value between 0 and 1 that indicates the proportion of data points
to be considered anomalous during training. Default is 0.01.
"""
self.sens = float(sensitivity)
self.cont = float(contamination)


def predict(self, arr):
"""
Given an array of data points, makes a smoothed prediction of the data. To do so,
Expand Down Expand Up @@ -67,16 +86,16 @@ def predict(self, arr):
smooth_arr = np.convolve(padded_arr, kernel, mode='valid')
return smooth_arr

def get_outliers(self, arr, smoothed_arr):
def get_outliers(self, arr, smoothed_arr, other=None):
    """
    Flag outliers in a 1-D data array by comparing it against a smoothed
    approximation of itself.

    An isolation forest is fitted on a feature matrix built from the smoothed
    value, the absolute difference between original and smoothed values, and
    the sign of that difference. The forest isolates the `self.cont`
    (default 1%) most isolated points; its decision offset is then rescaled
    by `self.sens` so that a sensitivity of 1 keeps the fitted threshold
    while a sensitivity of 0 flags nothing.

    Args:
        arr (numpy.ndarray): 1D numpy array where the outliers shall be detected.
        smoothed_arr (numpy.ndarray): 1D numpy array with a smoothed
            approximation of `arr`, same shape as `arr`.
        other (numpy.ndarray, optional): extra feature columns with one row
            per element of `arr` (e.g. timestamp encodings), concatenated to
            the feature matrix. Defaults to None.

    Returns:
        numpy.ndarray: 1D boolean array with the same shape as `arr`, True
            where the data point is an outlier and False otherwise.
    """
    error = arr - smoothed_arr
    sign = np.sign(error)
    data = np.stack((smoothed_arr, np.abs(error), sign), axis=1)
    if other is not None:
        data = np.concatenate([data, other], axis=1)
    # Fixed random_state keeps detections reproducible between runs.
    model = IsolationForest(n_estimators=100, contamination=self.cont, random_state=42)
    model.fit(data)
    # Rescale the decision offset by the sensitivity: sens=1 leaves the
    # fitted threshold unchanged, sens=0 moves it so nothing is flagged.
    model.offset_ = self.sens * (1 + model.offset_) - 1
    outliers = model.predict(data) == -1
    return outliers

def encode_timestamp(self, timestamp):
    """
    Encode timestamps as smooth periodic features.

    Takes a pandas Series of timestamps and returns a numpy array with a sine
    encoding of the hour of day and a cosine encoding of the day of week.
    This helps the model learn daily and weekly periodic patterns while
    maintaining simplicity.

    Args:
        timestamp (pd.Series): Series of timestamps (anything accepted by
            pd.to_datetime).

    Returns:
        numpy.ndarray: array of shape (len(timestamp), 2) whose columns are
            the daily sine and weekly cosine encodings.

    Raises:
        ValueError: if the input is not a pandas Series.
    """
    if not isinstance(timestamp, pd.Series):
        raise ValueError("Input must be a Pandas Series")
    timestamp = pd.to_datetime(timestamp)
    # Fractional hour of day in [0, 24) and fractional day of week in [0, 7).
    hour_of_day = timestamp.dt.hour + timestamp.dt.minute/60
    day_of_week = timestamp.dt.dayofweek + hour_of_day/24
    daily_sin = np.sin(2*np.pi*hour_of_day/24)
    weekly_cos = np.cos(2*np.pi*day_of_week/7)
    encoded = np.stack((daily_sin, weekly_cos), axis=1)
    return encoded

def compute_json(self, raw_json):
"""
Main method used for anomaly detection.
Expand All @@ -111,7 +155,8 @@ def compute_json(self, raw_json):
data = pd.json_normalize(raw_json)
arr = data.iloc[:, 1].values
smoothed_arr = self.predict(arr)
outliers = self.get_outliers(arr, smoothed_arr)
encoded_timestamp = self.encode_timestamp(data["timestamp"])
outliers = self.get_outliers(arr, smoothed_arr, other=encoded_timestamp)
data["smooth"] = smoothed_arr
predicted = data[["timestamp","smooth"]].rename(columns={"smooth":"forecast"})
anomalies = data[["timestamp","smooth"]].rename(columns={"smooth":"expected"}).loc[outliers]
Expand All @@ -121,13 +166,12 @@ def compute_json(self, raw_json):
"status": "success"
}

def execute_prediction_model(self, data):
    """
    Run the shallow anomaly-detection pipeline on the given data.

    Thin wrapper around compute_json that converts any exception into the
    standard error payload instead of propagating it to the caller.

    Args:
        data: json-like data as accepted by compute_json.

    Returns:
        dict: the compute_json result on success, or the return_error
            payload on failure.
    """
    try:
        return self.compute_json(data)
    except Exception as e:
        logger.logger.error("Could not execute shallow model")
        return self.return_error(e)

@staticmethod
def return_error(error="error"):
Expand Down
4 changes: 4 additions & 0 deletions resources/src/config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ epochs=1000
batch_size=32
backup_path=resources/src/ai/backups/

[ShallowOutliers]
sensitivity=0.95
contamination=0.01

[Druid]
druid_endpoint=http://x.x.x.x:8080/druid/v2/

Expand Down
6 changes: 5 additions & 1 deletion resources/src/server/rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ def __init__(self):
self.app = Flask(__name__)
self.app.add_url_rule('/api/v1/outliers', view_func=self.calculate, methods=['POST'])
self.exit_code = 0
self.shallow = shallow_outliers.ShallowOutliers(
sensitivity = config.get("ShallowOutliers", "sensitivity"),
contamination = config.get("ShallowOutliers", "contamination")
)
self.ai_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "ai")
self.deep_models={}

Expand Down Expand Up @@ -200,7 +204,7 @@ def execute_model(self, data, metric, model='default'):

try:
if model == 'default':
return jsonify(shallow_outliers.ShallowOutliers.execute_prediction_model(data))
return jsonify(self.shallow.execute_prediction_model(data))
if model not in self.deep_models:
logger.logger.info(f"Creating instance of model {model}")
self.deep_models[model]=outliers.Autoencoder(
Expand Down
Loading

0 comments on commit 4f50f4f

Please sign in to comment.