-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 2d6538a
Showing
12 changed files
with
2,996 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,168 @@ | ||
import json | ||
import numpy as np | ||
import os | ||
import pandas as pd | ||
from sklearn.preprocessing import StandardScaler | ||
import time | ||
|
||
from chapydette import cp_estimation | ||
|
||
|
||
def est_cps_objs(phys_data, max_cp, min_dist=5): | ||
""" | ||
Estimate the locations of 0-max_cp change points in the physical data and return the corresponding objective values. | ||
:param phys_data: Physical data on which to estimate change points. | ||
:param max_cp: Largest number of change points to estimate. | ||
:param min_dist: Minimum allowable distance between change points. | ||
:return: objs_phys: Objective values when setting the number of change points to each of 0, 1, 2,..., max_cp (or the | ||
maximum possible number of changes given that the minimum distance between change points is | ||
min_dist). | ||
""" | ||
phys_features = np.asarray(phys_data[['temp', 'salinity']]) | ||
phys_features = StandardScaler().fit_transform(phys_features) | ||
cps_phys, objs_phys = cp_estimation.mkcpe(X=phys_features, | ||
n_cp=(0, min(max_cp, int((len(phys_features) - 1) / min_dist) - 1)), | ||
kernel_type='linear', min_dist=min_dist, return_obj=True) | ||
for key in objs_phys: | ||
objs_phys[key] = objs_phys[key]/len(phys_features) | ||
|
||
return objs_phys | ||
|
||
|
||
def est_ncp_penalty(objs, n, alpha): | ||
""" | ||
Estimate the number of change points using the penalty \alpha d/n(2log(n/d)+5) of Lebarbier (2005). | ||
:param objs: Dictionary of objective values for each number of change points. | ||
:param n: Length of the sequence. | ||
:param alpha: Value of the parameter alpha. | ||
:return: The estimated number of change points with the given value of alpha. | ||
""" | ||
objs = np.array([objs[i] for i in range(0, len(objs))]).flatten() | ||
d = np.arange(1, len(objs)+1) | ||
|
||
return np.argmin(objs + alpha*d/n*(2*np.log(n/d)+5)) | ||
|
||
|
||
def obj_ratios(objs_phys): | ||
""" | ||
Compute the ratio of successive objective values when going from one number of change points to the next. | ||
:param objs_phys: Dictionary of objective values for each number of change points. | ||
:return: ratios: Array with the ratios of successive objective values. | ||
""" | ||
objs_phys = [objs_phys[i] for i in range(0, len(objs_phys))] | ||
objs_phys = np.array(objs_phys).flatten() | ||
ratios = objs_phys[1:]/objs_phys[:-1] | ||
|
||
return ratios | ||
|
||
|
||
def est_ncp_rule_thumb(ratios, nu): | ||
""" | ||
Estimate the number of change points using the rule of thumb of Harchaoui and Levy-Leduc (2007). | ||
:param ratios: Array with the ratios of successive objective values. | ||
:param nu: Parameter nu for the rule of thumb. | ||
:return: The estimated number of change points with the given value of nu. | ||
""" | ||
return np.argwhere(ratios > 1-nu)[0].item() | ||
|
||
|
||
def estimate_params_loo(cp_results, alphas, nus): | ||
""" | ||
Estimate the values of alpha and nu in the methods for estimating the number of change points. Do this for each | ||
cruise by using the annotations from all cruises except that one. | ||
:param cp_results: Data frame with the cruise names, lengths, number of change points in the annotation files, | ||
objective values from change-point estimation, and ratios of objective values for each cruise. | ||
:param alphas: Parameter values to try for the method of Lebarbier (2005). | ||
:param nus: Parameter values to try for the method of Harchaoui and Levy-Leduc (2007). | ||
:return: cp_results: Updated data frame with the estimated number of change points from both methods and the chosen | ||
parameter values. | ||
""" | ||
ncruises = len(cp_results) | ||
cp_results['n_est_cps_penalty'] = np.zeros(ncruises, dtype=int) | ||
cp_results['n_est_cps_rule_thumb'] = np.zeros(ncruises, dtype=int) | ||
for loo_idx in range(ncruises): | ||
print('Estimating parameters for', cp_results.iloc[loo_idx]['cruise'], '- Cruise ', loo_idx+1, '/', ncruises) | ||
errors_penalty = np.zeros(len(alphas)) | ||
for alpha_num, alpha in enumerate(alphas): | ||
for i in range(ncruises): | ||
if i != loo_idx: | ||
n_est_cps = est_ncp_penalty(cp_results.iloc[i]['objs'], cp_results.iloc[i]['n'], alpha) | ||
errors_penalty[alpha_num] += np.abs(n_est_cps - cp_results.iloc[i]['n_cp']) | ||
best_alpha_idx = np.argmin(errors_penalty) | ||
|
||
errors_rule_thumb = np.zeros(len(nus)) | ||
for nu_num, nu in enumerate(nus): | ||
for i in range(ncruises): | ||
if i != loo_idx: | ||
n_est_cps = est_ncp_rule_thumb(cp_results.iloc[i]['ratios'], nu) | ||
errors_rule_thumb[nu_num] += np.abs(n_est_cps - cp_results.iloc[i]['n_cp']) | ||
best_nu_idx = np.argmin(errors_rule_thumb) | ||
|
||
cp_results.at[loo_idx, 'n_est_cps_penalty'] = int(est_ncp_penalty(cp_results.iloc[loo_idx]['objs'], | ||
cp_results.iloc[loo_idx]['n'], | ||
alphas[best_alpha_idx])) | ||
cp_results.at[loo_idx, 'n_est_cps_rule_thumb'] = int(est_ncp_rule_thumb(cp_results.iloc[loo_idx]['ratios'], | ||
nus[best_nu_idx])) | ||
cp_results.at[loo_idx, 'alpha'] = alphas[best_alpha_idx] | ||
cp_results.at[loo_idx, 'nu'] = nus[best_nu_idx] | ||
|
||
return cp_results | ||
|
||
|
||
def estimate_ncp(data_dir, cruises, max_cp, alphas, nus, output_dir, min_dist=5): | ||
""" | ||
Annotate the change points in the physical data from the directory data_dir. | ||
:param data_dir: Directory containing the files with the cleaned physical data and annotated change points. | ||
:param cruises: List of cruises to use. | ||
:param max_cp: Maximum number of allowable change points. | ||
:param alphas: Parameter values to try for the method of Lebarbier (2005). | ||
:param nus: Parameter values to try for the method of Harchaoui and Levy-Leduc (2007). | ||
:param output_dir: Directory where the annotation results should be stored. The file will be called | ||
estimated_ncp.pickle. | ||
:param min_dist: Minimum allowable distance between change points. | ||
""" | ||
for cruise_num, cruise in enumerate(cruises): | ||
print('Estimating physical change points for', cruise, '- Cruise ', cruise_num+1, '/', len(cruises)) | ||
phys_data = pd.read_parquet(os.path.join(data_dir, cruise + '_phys.parquet')) | ||
n = len(phys_data) | ||
n_cp = len(json.load(open(os.path.join(data_dir, cruise + '_annotated_phys_cps.json'), 'r'))) | ||
objs_phys = est_cps_objs(phys_data, max_cp, min_dist=min_dist) | ||
all_ratios = obj_ratios(objs_phys) | ||
if cruise_num == 0: | ||
cp_results = pd.DataFrame({'cruise': cruise, 'n': n, 'n_cp': n_cp, 'ratios': [all_ratios], | ||
'objs': [objs_phys]}) | ||
else: | ||
cp_results = cp_results.append({'cruise': cruise, 'n': n, 'n_cp': n_cp, 'ratios': all_ratios, | ||
'objs': objs_phys}, ignore_index=True) | ||
cp_results = estimate_params_loo(cp_results, alphas, nus) | ||
|
||
if not os.path.exists(output_dir): | ||
os.makedirs(output_dir) | ||
cp_results.to_pickle(os.path.join(output_dir, 'estimated_ncp.pickle')) | ||
|
||
|
||
if __name__ == '__main__': | ||
t1 = time.time() | ||
# Directory where the data is stored | ||
data_dir = '../data/' | ||
# Directory where the output will be stored | ||
output_dir = '../results/' | ||
# List of cruises to use | ||
cruises = ['DeepDOM', 'KM1712', 'KM1713', 'MGL1704', 'SCOPE_2', 'SCOPE_16', 'Thompson_1', 'Thompson_9', | ||
'Thompson_12', 'Tokyo_1', 'Tokyo_2', 'Tokyo_3'] | ||
# Parameter values to consider when optimizing the criteria for the number of change points | ||
alphas = np.arange(0, 1, 0.01) | ||
nus = np.arange(0.01, 0.1, 0.001) | ||
# Maximum number of possible change points | ||
max_cp = 150 | ||
|
||
estimate_ncp(data_dir, cruises, max_cp, alphas, nus, output_dir) | ||
t2 = time.time() | ||
print('Runtime:', t2-t1) | ||
# Runtime (Intel i9-7960X CPU @ 2.80GHz): 43s |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
import numpy as np | ||
import os | ||
import pandas as pd | ||
import pickle | ||
from sklearn.preprocessing import StandardScaler | ||
import time | ||
|
||
from chapydette import feature_generation | ||
|
||
|
||
def generate_features(cruises, data_dir, features_dir, projection_dims=[128]): | ||
""" | ||
Generate features for the physical and biological data from each cruise. | ||
:param cruises: List of cruises to generate features for. | ||
:param data_dir: Directory where the (cleaned) biological and physical data is stored. | ||
:param features_dir: Directory where the features will be stored. | ||
:param projection_dims: List of dimensions to use for the projection for the biological data. | ||
""" | ||
if not os.path.exists(features_dir): | ||
os.makedirs(features_dir) | ||
|
||
for cruise in cruises: | ||
print('Generating features for', cruise) | ||
# Load the data | ||
bio_data = pd.read_parquet(os.path.join(data_dir, cruise + '_bio.parquet')) | ||
times = np.array(pd.Series(bio_data['date']).astype('category').cat.codes.values + 1) | ||
bio_data = np.asarray(bio_data[['fsc_small', 'chl_small', 'pe']]) | ||
|
||
phys_data = pd.read_parquet(os.path.join(data_dir, cruise + '_phys.parquet')) | ||
|
||
# Generate the features | ||
phys_features = np.asarray(phys_data[['salinity', 'temp']]) | ||
phys_features = StandardScaler().fit_transform(phys_features) | ||
|
||
for projection_dim in projection_dims: | ||
print('Dimension of projection:', projection_dim) | ||
save_file = os.path.join(features_dir, cruise + '_features_' + str(projection_dim) + '.pickle') | ||
bio_features = feature_generation.nystroem_features(bio_data, projection_dim, window_length=1, do_pca=False, | ||
window_overlap=0, times=times, seed=0, | ||
kmeans_iters=100)[0].astype('float64') | ||
pickle.dump({'bio_features': bio_features, 'phys_features': phys_features}, open(save_file, 'wb')) | ||
|
||
|
||
if __name__ == '__main__': | ||
t1 = time.time() | ||
# Directory where the data is stored | ||
data_dir = '../data/' | ||
# Directory where the output will be stored | ||
features_dir = '../features/' | ||
# List of cruises to use | ||
cruises = ['DeepDOM', 'KM1712', 'KM1713', 'MGL1704', 'SCOPE_2', 'SCOPE_16', 'Thompson_1', 'Thompson_9', | ||
'Thompson_12', 'Tokyo_1', 'Tokyo_2', 'Tokyo_3'] | ||
|
||
generate_features(cruises, data_dir, features_dir, projection_dims=[128]) | ||
generate_features(['SCOPE_16'], data_dir, features_dir, projection_dims=[2**i for i in range(1, 11)]) | ||
t2 = time.time() | ||
print('Runtime:', t2-t1) | ||
# Runtimes: | ||
# CPU (Intel i9-7960X CPU @ 2.80GHz): 13h15m33s | ||
# GPU (Titan Xp on same machine): 12m10s |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,160 @@ | ||
import json | ||
import numpy as np | ||
import os | ||
import pickle | ||
import sklearn.metrics | ||
import time | ||
|
||
from chapydette import cp_estimation | ||
|
||
|
||
def load_features(cruise, features_dir, projection_dim): | ||
""" | ||
Load features for a cruise. | ||
:param cruise: Cruise to load features for. | ||
:param features_dir: Directory where the features are stored. | ||
:param projection_dim: Dimension of the features. | ||
:return: Tuple consisting of: | ||
* cruise_features['bio_features']: The features for the biological data | ||
* cruise_features['phys_features']: The features for the physical data | ||
""" | ||
features_path = os.path.join(features_dir, cruise + '_features_' + str(projection_dim) + '.pickle') | ||
cruise_features = pickle.load(open(features_path, 'rb')) | ||
return cruise_features['bio_features'], cruise_features['phys_features'] | ||
|
||
|
||
def get_bw_range(features): | ||
""" | ||
Get the rule-of-thumb bandwidth and a range of bandwidths on a log scale for the Gaussian RBF kernel. | ||
:param features: Features to use to obtain the bandwidths. | ||
:return: Tuple consisting of: | ||
* rule_of_thumb_bw: Computed rule-of-thumb bandwidth. | ||
* bws: List of bandwidths on a log scale. | ||
""" | ||
dists = sklearn.metrics.pairwise.pairwise_distances(features).reshape(-1) | ||
rule_of_thumb_bw = np.median(dists) | ||
gammas = np.logspace(np.log(0.5/np.percentile(dists, 99)**2), np.log(0.5/np.percentile(dists, 1)**2), 10, base=np.e) | ||
bws = np.sqrt(1/(2*gammas)) | ||
|
||
return rule_of_thumb_bw, bws | ||
|
||
|
||
def est_cps(cruise, bio_features, phys_features, max_ncp=150, min_dists=[5], kernel_types=['Gaussian-Euclidean'], | ||
bw_method='rule-of-thumb', save_dir='../results/'): | ||
""" | ||
Estimate the locations of change points in the input biological and physical features for a single cruise. | ||
:param cruise: Name of the cruise the features are from. | ||
:param bio_features: Features for the biological data. | ||
:param phys_features: Features for the physical data. | ||
:param max_ncp: Maximum number of change points in a sequence. | ||
:param min_dists: List of minimum acceptable distances between change points. | ||
:param kernel_types: List containing 'Gaussian-Euclidean' (Gaussian RBF kernel) and/or 'Linear'. | ||
:param bw_method: Method to use for obtaining the bandwidth(s). Either 'rule-of-thumb' or 'list'. | ||
:param save_dir: Top-level directory where the results will be stored. | ||
""" | ||
projection_dim = bio_features.shape[1] | ||
for min_dist in min_dists: | ||
# Perform change-point estimation on the physical data | ||
cps_phys, objs_phys = cp_estimation.mkcpe(X=phys_features, | ||
n_cp=(1, min(max_ncp, int((len(phys_features)-1)/min_dist)-1)), | ||
kernel_type='linear', min_dist=min_dist, return_obj=True) | ||
for key in cps_phys.keys(): | ||
cps_phys[key] = cps_phys[key].flatten().tolist() | ||
|
||
if not os.path.exists(os.path.join(save_dir, cruise)): | ||
os.makedirs(os.path.join(save_dir, cruise)) | ||
save_path = os.path.join(save_dir, cruise, 'cps_phys.json') | ||
json.dump({'cps_phys': cps_phys, 'objs_phys': objs_phys}, open(save_path, 'w')) | ||
|
||
for kernel_type in kernel_types: | ||
# Get the bandwidth(s) (if applicable) | ||
if kernel_type != 'Linear': | ||
rot_bw, bws = get_bw_range(bio_features) | ||
all_bws = [rot_bw] if bw_method == 'rule-of-thumb' else bws | ||
else: | ||
all_bws = [0] | ||
for bw in all_bws: | ||
# Perform change-point estimation on the biological data | ||
cps_bio, objs_bio = cp_estimation.mkcpe(X=bio_features, | ||
n_cp=(1, min(max_ncp, int((len(bio_features)-1)/min_dist)-1)), | ||
kernel_type=kernel_type, bw=bw, min_dist=min_dist, | ||
return_obj=True) | ||
for key in cps_bio.keys(): | ||
cps_bio[key] = cps_bio[key].flatten().tolist() | ||
|
||
bw_short = 'rule-of-thumb_' + str(np.round(bw, 3)) if bw_method == 'rule-of-thumb' else \ | ||
str(np.round(bw, 3)) | ||
save_path = os.path.join(save_dir, cruise, 'cps_bio_' + str(projection_dim) + '_' + | ||
kernel_type + '_' + str(bw_short) + '_' + str(min_dist) + '.json') | ||
json.dump({'cps_bio': cps_bio, 'bw': bw, 'objs_bio': objs_bio}, open(save_path, 'w')) | ||
|
||
|
||
def est_cps_all_cruises(cruises, features_dir, max_ncp=150, min_dist=5, projection_dim=128, save_dir='../results'): | ||
""" | ||
Estimate the biological and physical change points for each cruise and for all desired parameter settings in order | ||
to make the plots. | ||
:param cruises: List of cruises to generate features for. | ||
:param features_dir: Directory where the features are stored. | ||
:param max_ncp: Maximum number of acceptable change points. | ||
:param min_dist: Minimum acceptable distance between change points. | ||
:param projection_dim: Dimension of the features. | ||
:param save_dir: Location where the estimated change points will be stored. | ||
""" | ||
for cruise_num, cruise in enumerate(cruises): | ||
print('Estimating change points for', cruise, '- Cruise ', cruise_num+1, '/', len(cruises)) | ||
bio_features, phys_features = load_features(cruise, features_dir, projection_dim) | ||
est_cps(cruise, bio_features, phys_features, max_ncp=max_ncp, min_dists=[min_dist], save_dir=save_dir) | ||
|
||
|
||
def sensitivity_analysis(cruise, features_dir, max_ncp=150, min_dist=5, projection_dim=128, | ||
save_dir='../results/sensitivity_analysis/'): | ||
""" | ||
Estimate the biological change points for a given cruise when varying the parameter settings. | ||
:param cruise: Name of the cruise to perform the sensitivity analysis on. | ||
:param features_dir: Directory where the features are stored. | ||
:param max_ncp: Maximum number of acceptable change points. | ||
:param min_dist: Baseline minimum acceptable distance between change points. | ||
:param projection_dim: Baseline dimension of the features. | ||
:param save_dir: Location where the estimated change points will be stored. | ||
""" | ||
print('Performing sensitivity analysis') | ||
bio_features, phys_features = load_features(cruise, features_dir, projection_dim) | ||
kernel_types = ['Linear'] | ||
est_cps(cruise, bio_features, phys_features, max_ncp=max_ncp, min_dists=[min_dist], kernel_types=kernel_types, | ||
bw_method='rule-of-thumb', save_dir=save_dir) | ||
bw_method = 'list' | ||
est_cps(cruise, bio_features, phys_features, max_ncp=max_ncp, min_dists=[min_dist], | ||
kernel_types=['Gaussian-Euclidean'], bw_method=bw_method, save_dir=save_dir) | ||
min_dists = [1] + list(range(5, 55, 5)) | ||
est_cps(cruise, bio_features, phys_features, max_ncp=max_ncp, min_dists=min_dists, | ||
kernel_types=['Gaussian-Euclidean'], bw_method='rule-of-thumb', save_dir=save_dir) | ||
for projection_dim in [2 ** i for i in range(2, 11)]: | ||
bio_features, phys_features = load_features(cruise, features_dir, projection_dim) | ||
est_cps(cruise, bio_features, phys_features, max_ncp=max_ncp, min_dists=[min_dist], | ||
kernel_types=['Gaussian-Euclidean'], bw_method='rule-of-thumb', save_dir=save_dir) | ||
|
||
|
||
if __name__ == '__main__': | ||
t1 = time.time() | ||
# Directory where the features are stored | ||
features_dir = '../features/' | ||
# Directory where the output from the main analysis will be stored | ||
save_dir = '../results/' | ||
# Directory where the output from the sensitivity analysis will be stored | ||
save_dir_sensitivity_analysis = '../results/sensitivity_analysis/' | ||
# List of cruises to use | ||
cruises = ['DeepDOM', 'KM1712', 'KM1713', 'MGL1704', 'SCOPE_2', 'SCOPE_16', 'Thompson_1', 'Thompson_9', | ||
'Thompson_12', 'Tokyo_1', 'Tokyo_2', 'Tokyo_3'] | ||
|
||
est_cps_all_cruises(cruises, features_dir, save_dir=save_dir) | ||
sensitivity_analysis('SCOPE_16', features_dir, save_dir=save_dir_sensitivity_analysis) | ||
t2 = time.time() | ||
print('Runtime:', t2-t1) | ||
# Runtime (Intel i9-7960X CPU @ 2.80GHz): 3m25s |
Oops, something went wrong.