-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscrimp.py
456 lines (333 loc) · 14.6 KB
/
scrimp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
# -*- coding: utf-8 -*-
"""
This module consists of all code to implement the SCRIMP++ algorithm. SCRIMP++
is an anytime algorithm that computes the matrix profile for a given time
series (ts) over a given window size (m).
This algorithm was originally created at the University of California
Riverside. For further academic understanding, please review this paper:
Matrix Profile XI: SCRIMP++: Time Series Motif Discovery at Interactive
Speed. Yan Zhu, Chin-Chia Michael Yeh, Zachary Zimmerman, Kaveh Kamgar
Eamonn Keogh, ICDM 2018.
https://www.cs.ucr.edu/~eamonn/SCRIMP_ICDM_camera_ready_updated.pdf
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
range = getattr(__builtins__, 'xrange', range)
# end of py2 compatability boilerplate
import math
import time
import warnings
import numpy as np
def fast_find_nn_pre(ts, m):
n = len(ts)
X = np.fft.fft(ts)
cum_sumx = np.cumsum(ts)
cum_sumx2 = np.cumsum(np.power(ts, 2))
sumx = cum_sumx[m - 1:n] - np.insert(cum_sumx[0:n - m], 0, 0)
sumx2 = cum_sumx2[m - 1:n] - np.insert(cum_sumx2[0:n - m], 0, 0)
meanx = sumx / m
sigmax2 = (sumx2 / m) - np.power(meanx, 2)
sigmax = np.sqrt(sigmax2)
return (X, n, sumx2, sumx, meanx, sigmax2, sigmax)
def calc_distance_profile(X, y, n, m, meanx, sigmax):
# reverse the query
y = np.flip(y, 0)
# make y same size as ts with zero fill
y = np.concatenate([y, np.zeros(n - m)])
# main trick of getting dot product in O(n log n) time
Y = np.fft.fft(y)
Z = X * Y
z = np.fft.ifft(Z)
# compute y stats in O(n)
sumy = np.sum(y)
sumy2 = np.sum(np.power(y, 2))
meany = sumy / m
sigmay2 = sumy2 / m - meany ** 2
sigmay = np.sqrt(sigmay2)
dist = (z[m - 1:n] - m * meanx * meany) / (sigmax * sigmay)
dist = m - dist
dist = np.real(2 * dist)
return np.sqrt(np.absolute(dist))
def calc_exclusion_zone(window_size):
return window_size / 4
def calc_step_size(window_size, step_size):
return int(math.floor(window_size * step_size))
def calc_profile_len(n, window_size):
return n - window_size + 1
def next_subsequence(ts, idx, m):
return ts[idx:idx + m]
def calc_exclusion_start(idx, exclusion_zone):
return int(np.max([0, idx - exclusion_zone]))
def calc_exclusion_stop(idx, exclusion_zone, profile_len):
return int(np.min([profile_len, idx + exclusion_zone]))
def apply_exclusion_zone(idx, exclusion_zone, profile_len, distance_profile):
exc_start = calc_exclusion_start(idx, exclusion_zone)
exc_stop = calc_exclusion_stop(idx, exclusion_zone, profile_len)
distance_profile[exc_start:exc_stop + 1] = np.inf
return distance_profile
def find_and_store_nn(iteration, idx, matrix_profile, mp_index,
distance_profile):
if iteration == 0:
matrix_profile = distance_profile
mp_index[:] = idx
else:
update_pos = distance_profile < matrix_profile
mp_index[update_pos] = idx
matrix_profile[update_pos] = distance_profile[update_pos]
idx_min = np.argmin(distance_profile)
matrix_profile[idx] = distance_profile[idx_min]
mp_index[idx] = idx_min
idx_nn = mp_index[idx]
return (matrix_profile, mp_index, idx_nn)
def calc_idx_diff(idx, idx_nn):
return idx_nn - idx
def calc_dotproduct_idx(dotproduct, m, mp, idx, sigmax, idx_nn, meanx):
dotproduct[idx] = (m - mp[idx] ** 2 / 2) * \
sigmax[idx] * sigmax[idx_nn] + m * meanx[idx] * meanx[idx_nn]
return dotproduct
def calc_end_idx(profile_len, idx, step_size, idx_diff):
return np.min([profile_len - 1, idx + step_size - 1,
profile_len - idx_diff - 1])
def calc_dotproduct_end_idx(ts, dp, idx, m, endidx, idx_nn, idx_diff):
tmp_a = ts[idx + m:endidx + m]
tmp_b = ts[idx_nn + m:endidx + m + idx_diff]
tmp_c = ts[idx:endidx]
tmp_d = ts[idx_nn:endidx + idx_diff]
tmp_f = tmp_a * tmp_b - tmp_c * tmp_d
dp[idx + 1:endidx + 1] = dp[idx] + np.cumsum(tmp_f)
return dp
def calc_refine_distance_end_idx(refine_distance, dp, idx, endidx, meanx,
sigmax, idx_nn, idx_diff, m):
tmp_a = dp[idx + 1:endidx + 1]
tmp_b = meanx[idx + 1:endidx + 1]
tmp_c = meanx[idx_nn + 1:endidx + idx_diff + 1]
tmp_d = sigmax[idx + 1:endidx + 1]
tmp_e = sigmax[idx_nn + 1:endidx + idx_diff + 1]
tmp_f = tmp_b * tmp_c
tmp_g = tmp_d * tmp_e
tmp_h = (m - (tmp_a - m * tmp_f) / (tmp_g))
refine_distance[idx + 1:endidx + 1] = np.sqrt(np.abs(2 * tmp_h))
return refine_distance
def calc_begin_idx(idx, step_size, idx_diff):
return np.max([0, idx - step_size + 1, 2 - idx_diff])
def calc_dotproduct_begin_idx(ts, dp, beginidx, idx, idx_diff, m,
idx_nn):
indices = list(range(idx - 1, beginidx - 1, -1))
if not indices:
return dp
tmp_a = ts[indices]
indices_b = list(range(idx_nn - 1, beginidx + idx_diff - 1, -1))
tmp_b = ts[indices_b]
indices_c = list(range(idx + m - 1, beginidx + m - 1, -1))
tmp_c = ts[indices_c]
indices_d = list(range(idx_nn - 1 + m, beginidx + idx_diff + m - 1, -1))
tmp_d = ts[indices_d]
dp[indices] = dp[idx] + \
np.cumsum((tmp_a * tmp_b) - (tmp_c * tmp_d))
return dp
def calc_refine_distance_begin_idx(refine_distance, dp, beginidx, idx,
idx_diff, idx_nn, sigmax, meanx, m):
if not (beginidx < idx):
return refine_distance
tmp_a = dp[beginidx:idx]
tmp_b = meanx[beginidx:idx]
tmp_c = meanx[beginidx + idx_diff:idx_nn]
tmp_d = sigmax[beginidx:idx]
tmp_e = sigmax[beginidx + idx_diff:idx_nn]
tmp_f = tmp_b * tmp_c
tmp_g = tmp_d * tmp_e
tmp_h = (m - (tmp_a - m * tmp_f) / (tmp_g))
refine_distance[beginidx:idx] = np.sqrt(np.abs(2 * tmp_h))
return refine_distance
def apply_update_positions(matrix_profile, mp_index, refine_distance, beginidx,
endidx, orig_index, idx_diff):
tmp_a = refine_distance[beginidx:endidx + 1]
tmp_b = matrix_profile[beginidx:endidx + 1]
update_pos1 = np.argwhere(tmp_a < tmp_b).flatten()
if len(update_pos1) > 0:
update_pos1 = update_pos1 + beginidx
matrix_profile[update_pos1] = refine_distance[update_pos1]
mp_index[update_pos1] = orig_index[update_pos1] + idx_diff
tmp_a = refine_distance[beginidx:endidx + 1]
tmp_b = matrix_profile[beginidx + idx_diff:endidx + idx_diff + 1]
update_pos2 = np.argwhere(tmp_a < tmp_b).flatten()
if len(update_pos2) > 0:
update_pos2 = update_pos2 + beginidx
matrix_profile[update_pos2 + idx_diff] = refine_distance[update_pos2]
mp_index[update_pos2 + idx_diff] = orig_index[update_pos2]
return (matrix_profile, mp_index)
def calc_curlastz(ts, m, n, idx, profile_len, curlastz):
curlastz[idx] = np.sum(ts[0:m] * ts[idx:idx + m])
tmp_a = ts[m:n - idx]
tmp_b = ts[idx + m:n]
tmp_c = ts[0:profile_len - idx - 1]
tmp_d = ts[idx:profile_len - 1]
tmp_e = tmp_a * tmp_b
tmp_f = tmp_c * tmp_d
curlastz[idx + 1:profile_len] = curlastz[idx] + np.cumsum(tmp_e - tmp_f)
return curlastz
def calc_curdistance(curlastz, meanx, sigmax, idx, profile_len, m,
curdistance):
tmp_a = curlastz[idx:profile_len + 1]
tmp_b = meanx[idx:profile_len]
tmp_c = meanx[0:profile_len - idx]
tmp_d = sigmax[idx:profile_len]
tmp_e = sigmax[0:profile_len - idx]
tmp_f = tmp_b * tmp_c
tmp_g = (m - (tmp_a - m * tmp_f) / (tmp_d * tmp_e))
curdistance[idx:profile_len] = np.sqrt(np.abs(2 * tmp_g))
return curdistance
def time_is_exceeded(start_time, runtime):
"""Helper method to determine if the runtime has exceeded or not.
Returns
-------
bool
Whether or not hte runtime has exceeded.
"""
elapsed = time.time() - start_time
exceeded = runtime is not None and elapsed >= runtime
if exceeded:
warnings.warn(
'Max runtime exceeded. Approximate solution is given.',
RuntimeWarning
)
return exceeded
def scrimp_plus_plus(ts, m, step_size=0.25, runtime=None, random_state=None):
"""SCRIMP++ is an anytime algorithm that computes the matrix profile for a
given time series (ts) over a given window size (m). Essentially, it allows
for an approximate solution to be provided for quicker analysis. In the
case of this implementation, the runtime is measured based on the wall
clock. If the number of seconds exceeds the runtime, then the approximate
solution is returned. If the runtime is None, the exact solution is
returned.
This algorithm was created at the University of California Riverside. For
further academic understanding, please review this paper:
Matrix Profile XI: SCRIMP++: Time Series Motif Discovery at Interactive
Speed. Yan Zhu, Chin-Chia Michael Yeh, Zachary Zimmerman, Kaveh Kamgar
Eamonn Keogh, ICDM 2018.
https://www.cs.ucr.edu/~eamonn/SCRIMP_ICDM_camera_ready_updated.pdf
Parameters
----------
ts : np.ndarray
The time series to compute the matrix profile for.
m : int
The window size.
step_size : float, default 0.25
The sampling interval for the window. The paper suggest 0.25 is the
most practical. It should be a float value between 0 and 1.
runtime : int, default None
The maximum number of seconds based on wall clock time for this
algorithm to run. It computes the exact solution when it is set to
None.
random_state : int, default None
Set the random seed generator for reproducible results.
Returns
-------
(np.array, np.array)
The matrix profile and the matrix profile index respectively.
"""
# start the timer here
start_time = time.time()
# validate step_size
if not isinstance(step_size, float) or step_size > 1 or step_size < 0:
raise ValueError('step_size should be a float between 0 and 1.')
# validate runtime
if runtime is not None and (not isinstance(runtime, int) or runtime < 1):
raise ValueError('runtime should be a valid positive integer.')
# validate random_state
if random_state is not None:
try:
np.random.seed(random_state)
except:
raise ValueError('Invalid random_state value given.')
ts_len = len(ts)
# set the trivial match range
exclusion_zone = calc_exclusion_zone(m)
# value checking
if m > ts_len / 2:
raise ValueError('Time series is too short relative to desired \
subsequence length')
if m < 4:
raise ValueError('Window size must be at least 4')
# initialization
step_size = calc_step_size(m, step_size)
profile_len = calc_profile_len(ts_len, m)
matrix_profile = np.zeros(profile_len)
mp_index = np.zeros(profile_len, dtype='int32')
X, n, sumx2, sumx, meanx, sigmax2, sigmax = fast_find_nn_pre(ts, m)
###########################
# PreSCRIMP
#
# compute distance profile
dotproduct = np.zeros(profile_len)
refine_distance = np.full(profile_len, np.inf)
orig_index = np.arange(profile_len)
compute_order = list(range(0, profile_len, step_size))
np.random.shuffle(compute_order)
for iteration, idx in enumerate(compute_order):
# compute distance profile
subsequence = next_subsequence(ts, idx, m)
distance_profile = calc_distance_profile(X, subsequence, n, m, meanx,
sigmax)
# apply exclusion zone
distance_profile = apply_exclusion_zone(
idx, exclusion_zone, profile_len, distance_profile)
# find and store nearest neighbor
matrix_profile, mp_index, idx_nn = find_and_store_nn(
iteration, idx, matrix_profile, mp_index, distance_profile)
idx_diff = calc_idx_diff(idx, idx_nn)
dotproduct = calc_dotproduct_idx(dotproduct, m, matrix_profile, idx,
sigmax, idx_nn, meanx)
endidx = calc_end_idx(profile_len, idx, step_size, idx_diff)
dotproduct = calc_dotproduct_end_idx(ts, dotproduct, idx, m,
endidx, idx_nn, idx_diff)
refine_distance = calc_refine_distance_end_idx(
refine_distance, dotproduct, idx, endidx, meanx, sigmax, idx_nn,
idx_diff, m)
beginidx = calc_begin_idx(idx, step_size, idx_diff)
dotproduct = calc_dotproduct_begin_idx(
ts, dotproduct, beginidx, idx, idx_diff, m, idx_nn)
refine_distance = calc_refine_distance_begin_idx(
refine_distance, dotproduct, beginidx, idx, idx_diff, idx_nn,
sigmax, meanx, m)
matrix_profile, mp_index = apply_update_positions(matrix_profile,
mp_index,
refine_distance,
beginidx,
endidx,
orig_index, idx_diff)
# check if time is up
if time_is_exceeded(start_time, runtime):
break
if not time_is_exceeded(start_time, runtime):
###########################
# SCRIMP
#
compute_order = orig_index[orig_index > exclusion_zone]
np.random.shuffle(compute_order)
curlastz = np.zeros(profile_len)
curdistance = np.zeros(profile_len)
dist1 = np.full(profile_len, np.inf)
dist2 = np.full(profile_len, np.inf)
for idx in compute_order:
curlastz = calc_curlastz(ts, m, n, idx, profile_len, curlastz)
curdistance = calc_curdistance(curlastz, meanx, sigmax, idx,
profile_len, m, curdistance)
dist1[0:idx - 1] = np.inf
dist1[idx:profile_len] = curdistance[idx:profile_len]
dist2[0:profile_len - idx] = curdistance[idx:profile_len]
dist2[profile_len - idx + 2:profile_len] = np.inf
loc1 = dist1 < matrix_profile
if loc1.any():
matrix_profile[loc1] = dist1[loc1]
mp_index[loc1] = orig_index[loc1] - idx + 1
loc2 = dist2 < matrix_profile
if loc2.any():
matrix_profile[loc2] = dist2[loc2]
mp_index[loc2] = orig_index[loc2] + idx - 1
# check if time is up
if time_is_exceeded(start_time, runtime):
break
return (matrix_profile, mp_index)