Draft: Updates to ML Toolkit to operate with PyKX #105

Open · wants to merge 7 commits into base: master
4 changes: 2 additions & 2 deletions fresh/feat.q
@@ -205,7 +205,7 @@ fresh.feat.firstMin:{[data]
// @param data {number[]} Numerical data points
// @return {dictionary} Spectral centroid, variance, skew and kurtosis
fresh.feat.fftAggreg:{[data]
a:fresh.i.abso[fresh.i.rfft data]`;
a:fresh.i.abso[.p.toraw fresh.i.rfft data]`;
l:"f"$til count a;
mean:1.,(sum each a*/:3(l*)\l)%sum a;
m1:mean 1;m2:mean 2;m3:mean 3;m4:mean 4;
@@ -227,7 +227,7 @@ fresh.feat.fftCoeff:{[data;coeff]
r:(fresh.i.angle[fx;`deg pykw 1b]`;
fresh.i.real[fx]`;
fresh.i.imag[fx]`;
fresh.i.abso[fx:fresh.i.rfft data]`
fresh.i.abso[fx:.p.toraw fresh.i.rfft data]`
);
fftKeys:`$"_"sv'string raze(`coeff,/:til coeff),\:/:`angle`real`imag`abs;
fftVals:raze coeff#'r,\:coeff#0n;
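Both hunks in feat.q make the same change: the output of fresh.i.rfft is passed through .p.toraw before it reaches the remaining Python calls, which, as far as this diff shows, keeps the complex FFT result on the Python side when PyKX is the active interface instead of converting it to q (which has no complex datatype) between calls. A minimal sketch of the pattern, assuming PyKX is loaded and exposing the embedPy-compatible .p namespace; rfft and abso below are local stand-ins for fresh.i.rfft and fresh.i.abso:

rfft:.p.import[`numpy.fft][`:rfft]
abso:.p.import[`numpy][`:abs]
data:100?1f
// .p.toraw keeps the complex FFT output as a raw Python object, so np.abs is
// applied on the Python side; the trailing backtick then converts the
// real-valued result to a q float vector.
a:abso[.p.toraw rfft data]`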
759 changes: 379 additions & 380 deletions fresh/tests/features.t

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions fresh/tests/significancetests.p
@@ -1,7 +1,7 @@
p)import numpy as np
p)from scipy import stats

p)def< binary_feature_binary_test(x, y):
p)def binary_feature_binary_test(x, y):
x0, x1 = np.unique(x)
y0, y1 = np.unique(y)

@@ -17,7 +17,7 @@ p)def< binary_feature_binary_test(x, y):

return p_value

p)def< target_binary_feature_real_test(y, x):
p)def target_binary_feature_real_test(y, x):
y0, y1 = np.unique(y)

x_y1 = x[y == y1]
@@ -26,11 +26,11 @@ p)def< target_binary_feature_real_test(y, x):
KS, p_ks = stats.ks_2samp(x_y1, x_y0,mode='asymp')
return p_ks

p)def< target_real_feature_real_test(x, y):
p)def target_real_feature_real_test(x, y):
tau, p_value = stats.kendalltau(x, y)
return p_value

p)def< benjamini_hochberg_test(df_pvalues, hypotheses_independent, fdr_level):
p)def benjamini_hochberg_test(df_pvalues, hypotheses_independent, fdr_level):
df_pvalues = df_pvalues.sort_values(by="p_value")
m = len(df_pvalues)
K = np.arange(1, m + 1)
17 changes: 9 additions & 8 deletions fresh/tests/sigtests.t
@@ -11,7 +11,6 @@ In each case significance tests implemented within freshq are compared to
equivalent significance tests implemented previously in python.
\

\l p.q
\l ml.q
\l fresh/init.q
\l fresh/tests/significancetests.p
@@ -22,13 +21,13 @@ xb:5000#0101101011b
yb:5000#0101101011b

/ 1a.
.ml.fresh.i.fisher[xb;yb] ~ binary_feature_binary_test[xb;yb]
.ml.fresh.i.fisher[xb;yb] ~ .p.get[`binary_feature_binary_test;<][xb;yb]

/ 1b.
.ml.fresh.i.ks[yb;xf] ~ target_binary_feature_real_test[yb;xf]
.ml.fresh.i.ks[yb;xf] ~ .p.get[`target_binary_feature_real_test;<][yb;xf]

/ 1c.
.ml.fresh.i.kTau[xf;yf] ~ target_real_feature_real_test[xf;yf]
.ml.fresh.i.kTau[xf;yf] ~ .p.get[`target_real_feature_real_test;<][xf;yf]
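All three comparisons follow the same updated pattern: the Python functions defined in significancetests.p are retrieved with .p.get rather than referenced as global q names, and the < argument requests that the return value be converted to q, so the ~ match behaves the same under embedPy and PyKX. A short illustrative sketch (the Python function here is made up for the example, not part of the test suite):

p)def double_sum(x): return 2 * sum(x)
f:.p.get[`double_sum;<]   // < converts the Python return value to q
f[1 2 3]                  // 12
g:.p.get`double_sum       // without <, a wrapped Python object comes back
g[1 2 3]`                 // the trailing backtick converts it to q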

/
2.
@@ -45,14 +44,16 @@ table3:([]desc 1000000?1f;1000000?10f;asc 1000000?1f)
table4:([]1000000?0b;1000000?1f;1000000?1f)
target1:asc 1000000?100f;target2:desc 1000000?1f;target3:target4:1000000?0b
bintest:{2=count distinct x}
pdmatrix:{pddf[benjamini_hochberg_test[y;"FALSE";x]][`:values]}
k:{pdmatrix[x;y]`}
pdmatrix:{pddf[.p.get[`benjamini_hochberg_test][.p.topd y;$[.pykx.loaded;0b;"FALSE"];x]][`:values]`}
k:{t:pdmatrix[x;y];@[{x`};t;{[x;y]x}[t]]}
vec:{k[x;y][;2]}
bhfn:{[table;target]
pdict:.ml.fresh.sigFeat[table;target];
ptable:([]label:key pdict;p_value:value pdict);
dfptable:tab2df[ptable];
("i"$count .ml.fresh.benjhoch[0.05;pdict]) ~ sum vec[0.05;dfptable]=1b
dfptable:$[.pykx.loaded;;tab2df]ptable;
vecret:vec[0.05;dfptable];
vecret:$[11h=type vecret;`True=;0<]vecret;
("i"$count .ml.fresh.benjhoch[0.05;pdict]) ~ sum vecret=1b
}
bhfn[table1;target1]
bhfn[table2;target2]
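The reworked helpers above branch on .pykx.loaded because the two interfaces convert data differently: under PyKX the q table is handed to Python via .p.topd and the embedPy-only tab2df step is skipped, the boolean argument is passed as 0b rather than the string "FALSE", and the returned column may come back as the symbols `True`False instead of a boolean or numeric vector. The final step in bhfn reduces all of those cases to a q boolean vector before counting. A small sketch of that normalisation, assuming the same three possible return shapes:

normBool:{$[11h=type x;`True=x;0<x]}
normBool `True`False`True   // 101b  (symbol column, as it may arrive from pandas via PyKX)
normBool 1 0 1f             // 101b  (numeric column)
normBool 101b               // 101b  (already boolean)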
111 changes: 56 additions & 55 deletions fresh/tests/test.p
@@ -1,61 +1,62 @@
p)import numpy as np
p)import pandas as pd
p)import itertools
p)from scipy.signal import welch, cwt, ricker, find_peaks_cwt
p)from scipy.stats import linregress
p)from statsmodels.tsa.stattools import acf, adfuller, pacf
p)from numpy.linalg import LinAlgError

p)def< _get_length_sequences_where(x):
p)def _get_length_sequences_where(x):
if len(x) == 0:
return [0]
else:
res = [len(list(group)) for value, group in itertools.groupby(x) if value == 1]
return res if len(res) > 0 else [0]
p)def< aggregate_on_chunks(x, f_agg, chunk_len):return [getattr(x[i * chunk_len: (i + 1) * chunk_len], f_agg)() for i in range(int(np.ceil(len(x) / chunk_len)))]

p)def< hasduplicate(x):return len(x) != len(set(x))
p)def< hasduplicatemin(x):return sum(np.asarray(x) == min(x)) >= 2
p)def< hasduplicatemax(x):return sum(np.asarray(x) == max(x)) >= 2
p)def< abs_energy(x):x = np.asarray(x); return sum(x * x)
p)def< mean_change(x):return np.mean(np.diff(x))
p)def< mean_abs_change(x):return np.mean(np.abs(np.diff(x)))
p)def< count_above_mean(x): x = np.asarray(x); m = np.mean(x); return np.where(x > m)[0].shape[0]
p)def< count_below_mean(x): x = np.asarray(x); m = np.mean(x); return np.where(x < m)[0].shape[0]
p)def< first_location_of_maximum(x): x = np.asarray(x); return np.argmax(x) / len(x) if len(x) > 0 else np.NaN
p)def< first_location_of_minimum(x): x = np.asarray(x); return np.argmin(x) / len(x) if len(x) > 0 else np.NaN
p)def< last_location_of_minimum(x): x = np.asarray(x); return 1.0 - (1+np.argmin(x[::-1]))/ len(x) if len(x) > 0 else np.NaN
p)def< last_location_of_maximum(x): x = np.asarray(x); return 1.0 - (1+np.argmax(x[::-1]))/ len(x) if len(x) > 0 else np.NaN
p)def< ratio_val_num_to_t_series(x):return len(set(x))/len(x)
p)def< ratio_beyond_r_sigma(x,r):return sum(abs(x - np.mean(x)) > r * np.std(x))/len(x)
p)def< large_standard_deviation(x,r):x = np.asarray(x);return np.std(x) > (r * (max(x) - min(x)))
p)def< absolute_sum_of_changes(x):return np.sum(abs(np.diff(x)))
p)def< longest_strike_below_mean(x):return max(_get_length_sequences_where(x <= np.mean(x))) if len(x) > 0 else 0
p)def< longest_strike_above_mean(x):return max(_get_length_sequences_where(x >= np.mean(x))) if len(x) > 0 else 0
p)def< skewness_py(x):x = pd.Series(x);return pd.Series.skew(x)
p)def< kurtosis_py(x):x = pd.Series(x);return pd.Series.kurtosis(x)
p)def< range_count(x,min,max):return np.sum((x >= min) & (x < max))
p)def< variance_larger_than_standard_deviation(x):return np.var(x) > np.std(x)
p)def< number_cwt_peaks(x,n):return len(find_peaks_cwt(vector=x, widths=np.array(list(range(1, n + 1))), wavelet=ricker))
p)def< quantile_py(x, q):x = pd.Series(x);return pd.Series.quantile(x, q)
p)def< value_count(x, value):
p)def aggregate_on_chunks(x, f_agg, chunk_len):return [getattr(x[i * chunk_len: (i + 1) * chunk_len], f_agg)() for i in range(int(np.ceil(len(x) / chunk_len)))]

p)def hasduplicate(x):return len(x) != len(set(x))
p)def hasduplicatemin(x):return sum(np.asarray(x) == min(x)) >= 2
p)def hasduplicatemax(x):return sum(np.asarray(x) == max(x)) >= 2
p)def abs_energy(x):x = np.asarray(x); return sum(x * x)
p)def mean_change(x):return np.mean(np.diff(x))
p)def mean_abs_change(x):return np.mean(np.abs(np.diff(x)))
p)def count_above_mean(x): x = np.asarray(x); m = np.mean(x); return np.where(x > m)[0].shape[0]
p)def count_below_mean(x): x = np.asarray(x); m = np.mean(x); return np.where(x < m)[0].shape[0]
p)def first_location_of_maximum(x): x = np.asarray(x); return np.argmax(x) / len(x) if len(x) > 0 else np.NaN
p)def first_location_of_minimum(x): x = np.asarray(x); return np.argmin(x) / len(x) if len(x) > 0 else np.NaN
p)def last_location_of_minimum(x): x = np.asarray(x); return 1.0 - (1+np.argmin(x[::-1]))/ len(x) if len(x) > 0 else np.NaN
p)def last_location_of_maximum(x): x = np.asarray(x); return 1.0 - (1+np.argmax(x[::-1]))/ len(x) if len(x) > 0 else np.NaN
p)def ratio_val_num_to_t_series(x):return len(set(x))/len(x)
p)def ratio_beyond_r_sigma(x,r):return sum(abs(x - np.mean(x)) > r * np.std(x))/len(x)
p)def large_standard_deviation(x,r):x = np.asarray(x);return np.std(x) > (r * (max(x) - min(x)))
p)def absolute_sum_of_changes(x):return np.sum(abs(np.diff(x)))
p)def longest_strike_below_mean(x):return max(_get_length_sequences_where(x <= np.mean(x))) if len(x) > 0 else 0
p)def longest_strike_above_mean(x):return max(_get_length_sequences_where(x >= np.mean(x))) if len(x) > 0 else 0
p)def skewness_py(x):x = pd.Series(x);return pd.Series.skew(x)
p)def kurtosis_py(x):x = pd.Series(x);return pd.Series.kurtosis(x)
p)def range_count(x,min,max):return np.sum((x >= min) & (x < max))
p)def variance_larger_than_standard_deviation(x):return np.var(x) > np.std(x)
p)def number_cwt_peaks(x,n):return len(find_peaks_cwt(vector=x, widths=np.array(list(range(1, n + 1))), wavelet=ricker))
p)def quantile_py(x, q):x = pd.Series(x);return pd.Series.quantile(x, q)
p)def value_count(x, value):
if np.isnan(value):
return np.isnan(x)
else:
return x[x == value].shape[0]

p)def< percentage_recurring_all_data(x):
p)def percentage_recurring_all_data(x):
unique, counts = np.unique(x, return_counts=True)
return np.sum(counts > 1) / float(counts.shape[0])

p)def< percentage_recurring_all_val(x):
p)def percentage_recurring_all_val(x):
x = pd.Series(x)
if len(x) == 0:
return np.nan
x = x.copy()
value_counts = x.value_counts()
return value_counts[value_counts > 1].sum() / len(x)

p)def< number_peaks(x, n):
p)def number_peaks(x, n):
x = np.asarray(x)
x_reduced = x[n:-n]
res = None
@@ -68,7 +69,7 @@ p)def< number_peaks(x, n):
res &= (x_reduced > np.roll(x, -i)[n:-n])
return sum(res)

p)def< cid_ce(x, normalize):
p)def cid_ce(x, normalize):
x = np.asarray(x)
if normalize:
s = np.std(x)
@@ -79,43 +80,43 @@ p)def< cid_ce(x, normalize):
x = np.diff(x)
return np.sqrt(np.sum((x * x)))

p)def< mean_second_derivative_central(x):
p)def mean_second_derivative_central(x):
diff = (np.roll(x, 1) - 2 * np.array(x) + np.roll(x, -1)) / 2.0
return np.mean(diff[1:-1])

p)def< sum_recurring_values(x):
p)def sum_recurring_values(x):
unique, counts = np.unique(x, return_counts=True)
counts[counts < 2] = 0
counts[counts > 1] = 1
return np.sum(counts * unique)

p)def< sum_recurring_data_points(x):
p)def sum_recurring_data_points(x):
unique, counts = np.unique(x, return_counts=True)
counts[counts < 2] = 0
return np.sum(counts * unique)

p)def< c3_py(x, lag):
p)def c3_py(x, lag):
n = len(x)
x = np.asarray(x)
if 2 * lag >= n:
return 0
else:
return np.mean((np.roll(x, 2 * -lag) * np.roll(x, -lag) * x)[0:(n - 2 * lag)])

p)def< number_crossing_m(x, m):
p)def number_crossing_m(x, m):
if not isinstance(x, (np.ndarray, pd.Series)):
x = np.asarray(x)
positive = x > m
return np.where(np.bitwise_xor(positive[1:], positive[:-1]))[0].size

p)def< binned_entropy(x, max_bins):
p)def binned_entropy(x, max_bins):
if not isinstance(x, (np.ndarray, pd.Series)):
x = np.asarray(x)
hist, bin_edges = np.histogram(x, bins=max_bins)
probs = hist / x.size
return - np.sum(p * np.math.log(p) for p in probs if p != 0)

p)def< autocorrelation(x, lag):
p)def autocorrelation(x, lag):
if type(x) is pd.Series:
x = x.values
if len(x) < lag:
@@ -126,7 +127,7 @@ p)def< autocorrelation(x, lag):
sum_product = np.sum((y1-x_mean)*(y2-x_mean))
return sum_product / ((len(x) - lag) * np.var(x))

p)def< energy_ratio_by_chunks(x,y,z):
p)def energy_ratio_by_chunks(x,y,z):
full_series_energy = np.sum(x ** 2)
num_segments = y
segment_focus = z
@@ -137,7 +138,7 @@ p)def< energy_ratio_by_chunks(x,y,z):
res_data=(np.sum(x[start:end]**2.0)/full_series_energy)
return res_data

p)def< change_quantiles(x, ql, qh, isabs, f_agg):
p)def change_quantiles(x, ql, qh, isabs, f_agg):
if ql >= qh:
ValueError("ql={} should be lower than qh={}".format(ql, qh))
div = np.diff(x)
@@ -156,7 +157,7 @@ p)def< change_quantiles(x, ql, qh, isabs, f_agg):
aggregator = getattr(np, f_agg)
return aggregator(div[ind_inside_corridor])

p)def< time_reversal_asymmetry_statistic(x, lag):
p)def time_reversal_asymmetry_statistic(x, lag):
n = len(x)
x = np.asarray(x)
if 2 * lag >= n:
@@ -165,7 +166,7 @@ p)def< time_reversal_asymmetry_statistic(x, lag):
return np.mean((np.roll(x, 2 * -lag) * np.roll(x, 2 * -lag) * np.roll(x, -lag) -
np.roll(x, -lag) * x * x)[0:(n - 2 * lag)])

p)def< index_mass_quantile(x, q):
p)def index_mass_quantile(x, q):

x = np.asarray(x)
abs_x = np.abs(x)
@@ -177,13 +178,13 @@ p)def< index_mass_quantile(x, q):
mass_centralized = np.cumsum(abs_x) / s
return (np.argmax(mass_centralized >= q)+1)/len(x)

p)def< linear_trend(x):
p)def linear_trend(x):
linReg = linregress(range(len(x)), x)
return linReg

p)def< get_moment(y, moment):return y.dot(np.arange(len(y))**moment) / y.sum()
p)def< get_centroid(y):return get_moment(y, 1)
p)def< get_variance(y):return get_moment(y, 2) - get_centroid(y) ** 2
p)def get_moment(y, moment):return y.dot(np.arange(len(y))**moment) / y.sum()
p)def get_centroid(y):return get_moment(y, 1)
p)def get_variance(y):return get_moment(y, 2) - get_centroid(y) ** 2

p)def get_skew(y):
variance = get_variance(y)
@@ -193,7 +194,7 @@ p)def get_skew(y):
return (
get_moment(y, 3) - 3*get_centroid(y)*variance - get_centroid(y)**3
) / get_variance(y)**(1.5)
p)def< get_kurtosis(y):
p)def get_kurtosis(y):
variance = get_variance(y)
if variance < 0.5:
return np.nan
@@ -203,11 +204,11 @@ p)def< get_kurtosis(y):
+ 6*get_moment(y, 2)*get_centroid(y)**2 - 3*get_centroid(y)
) / get_variance(y)**2

p)def< fft_aggregated(x):
p)def fft_aggregated(x):
fft_abs = abs(np.fft.rfft(x))
return get_centroid(fft_abs),get_variance(fft_abs),get_skew(fft_abs),get_kurtosis(fft_abs)

p)def< index_mass_quantile(x, q):
p)def index_mass_quantile(x, q):

x = np.asarray(x)
abs_x = np.abs(x)
@@ -219,7 +220,7 @@ p)def< index_mass_quantile(x, q):
mass_centralized = np.cumsum(abs_x) / s
return (np.argmax(mass_centralized >= q)+1)/len(x)

p)def< agg_autocorrelation(x,y):
p)def agg_autocorrelation(x,y):
var = np.var(x)
n = len(x)
if np.abs(var) < 10**-10 or n == 1:
@@ -228,7 +229,7 @@ p)def< agg_autocorrelation(x,y):
a = acf(x, adjusted=True, fft=n > 1250)[1:]
return getattr(np, y)(a)

p)def< augmented_dickey_fuller(x):
p)def augmented_dickey_fuller(x):
res = None
try:
res = adfuller(x)
@@ -241,11 +242,11 @@ p)def< augmented_dickey_fuller(x):

return res

p)def< spkt_welch_density(x, y):
p)def spkt_welch_density(x, y):
freq, pxx = welch(x)
return pxx[y]

p)def< fft_coefficient(x,y,z):
p)def fft_coefficient(x,y,z):

fft = np.fft.rfft(x)

@@ -263,7 +264,7 @@ p)def< fft_coefficient(x,y,z):

return res

p)def< partial_autocorrelation(x, param):
p)def partial_autocorrelation(x, param):
max_demanded_lag = max(param)
n = len(x)
if n <= 1:
6 changes: 3 additions & 3 deletions fresh/utils.q
@@ -6,12 +6,12 @@
\d .ml

// Python imports
sci_ver :1.5<="F"$3#.p.import[`scipy][`:__version__]`
sci_ver :1.5<="F"$3#$[-11h=type x;string;]x:.p.import[`scipy][`:__version__]`
numpy :.p.import`numpy
pyStats :.p.import`scipy.stats
signal :.p.import`scipy.signal
stattools:.p.import`statsmodels.tsa.stattools
stats_ver:"F"$"." vs (.p.import`statsmodels)[`:__version__]`
stats_ver:"F"$"." vs $[-11h=type x;string;]x:.p.import[`statsmodels][`:__version__]`
stats_break:$[((stats_ver[0]=0)&stats_ver[1]>=12)|stats_ver[0]>0;1b;0b]
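Both version reads are normalised the same way: embedPy returns __version__ as a character vector, while the PyKX conversion can hand back a symbol (type -11h), so string is applied conditionally before the numeric cast. A hedged sketch of the idiom with a hypothetical helper name, mirroring the stats_ver line above:

verNum:{"F"$"." vs $[-11h=type x;string x;x]}
verNum "0.13.5"      // 0 13 5f  (character vector, as embedPy returns it)
verNum `$"0.13.5"    // 0 13 5f  (symbol form, as PyKX may return it)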

// @private
@@ -175,7 +175,7 @@ fresh.i.expandResults:{[results;column]
// @return {float} Kendall’s tau - Close to 1 shows strong agreement, close to
// -1 shows strong disagreement
fresh.i.kTau:{[target;feature]
fresh.i.kendallTau[<;target;feature]1
fresh.i.kendallTau[target;feature][`:pvalue]`
}
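fresh.i.kTau no longer converts kendalltau's whole return value to q and indexes element 1; the call result is kept as a Python object and its pvalue attribute is read, which works the same whether embedPy or PyKX supplies the wrapper. A hedged sketch against scipy.stats directly, with kendallTau standing in for fresh.i.kendallTau:

kendallTau:.p.import[`scipy.stats][`:kendalltau]
x:100?1f; y:100?1f
res:kendallTau[x;y]    // wrapped result object holding the statistic and p-value
res[`:pvalue]`         // read the pvalue attribute and convert to q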

// @private
1 change: 0 additions & 1 deletion graph/tests/graph.t
@@ -2,7 +2,6 @@
// which will fail to produce a valid/operational graph/pipeline in order to ensure that the
// catching mechanism for the creation of such workflows is reliable and fully understood

\l p.q
\l ml.q
\l graph/utils.q
\l graph/graph.q