Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
cjones6 committed Dec 1, 2020
0 parents commit 2d6538a
Show file tree
Hide file tree
Showing 12 changed files with 2,996 additions and 0 deletions.
Empty file added __init__.py
Empty file.
168 changes: 168 additions & 0 deletions analysis/1_est_num_cps_phys.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
import json
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
import time

from chapydette import cp_estimation


def est_cps_objs(phys_data, max_cp, min_dist=5):
"""
Estimate the locations of 0-max_cp change points in the physical data and return the corresponding objective values.
:param phys_data: Physical data on which to estimate change points.
:param max_cp: Largest number of change points to estimate.
:param min_dist: Minimum allowable distance between change points.
:return: objs_phys: Objective values when setting the number of change points to each of 0, 1, 2,..., max_cp (or the
maximum possible number of changes given that the minimum distance between change points is
min_dist).
"""
phys_features = np.asarray(phys_data[['temp', 'salinity']])
phys_features = StandardScaler().fit_transform(phys_features)
cps_phys, objs_phys = cp_estimation.mkcpe(X=phys_features,
n_cp=(0, min(max_cp, int((len(phys_features) - 1) / min_dist) - 1)),
kernel_type='linear', min_dist=min_dist, return_obj=True)
for key in objs_phys:
objs_phys[key] = objs_phys[key]/len(phys_features)

return objs_phys


def est_ncp_penalty(objs, n, alpha):
"""
Estimate the number of change points using the penalty \alpha d/n(2log(n/d)+5) of Lebarbier (2005).
:param objs: Dictionary of objective values for each number of change points.
:param n: Length of the sequence.
:param alpha: Value of the parameter alpha.
:return: The estimated number of change points with the given value of alpha.
"""
objs = np.array([objs[i] for i in range(0, len(objs))]).flatten()
d = np.arange(1, len(objs)+1)

return np.argmin(objs + alpha*d/n*(2*np.log(n/d)+5))


def obj_ratios(objs_phys):
"""
Compute the ratio of successive objective values when going from one number of change points to the next.
:param objs_phys: Dictionary of objective values for each number of change points.
:return: ratios: Array with the ratios of successive objective values.
"""
objs_phys = [objs_phys[i] for i in range(0, len(objs_phys))]
objs_phys = np.array(objs_phys).flatten()
ratios = objs_phys[1:]/objs_phys[:-1]

return ratios


def est_ncp_rule_thumb(ratios, nu):
"""
Estimate the number of change points using the rule of thumb of Harchaoui and Levy-Leduc (2007).
:param ratios: Array with the ratios of successive objective values.
:param nu: Parameter nu for the rule of thumb.
:return: The estimated number of change points with the given value of nu.
"""
return np.argwhere(ratios > 1-nu)[0].item()


def estimate_params_loo(cp_results, alphas, nus):
"""
Estimate the values of alpha and nu in the methods for estimating the number of change points. Do this for each
cruise by using the annotations from all cruises except that one.
:param cp_results: Data frame with the cruise names, lengths, number of change points in the annotation files,
objective values from change-point estimation, and ratios of objective values for each cruise.
:param alphas: Parameter values to try for the method of Lebarbier (2005).
:param nus: Parameter values to try for the method of Harchaoui and Levy-Leduc (2007).
:return: cp_results: Updated data frame with the estimated number of change points from both methods and the chosen
parameter values.
"""
ncruises = len(cp_results)
cp_results['n_est_cps_penalty'] = np.zeros(ncruises, dtype=int)
cp_results['n_est_cps_rule_thumb'] = np.zeros(ncruises, dtype=int)
for loo_idx in range(ncruises):
print('Estimating parameters for', cp_results.iloc[loo_idx]['cruise'], '- Cruise ', loo_idx+1, '/', ncruises)
errors_penalty = np.zeros(len(alphas))
for alpha_num, alpha in enumerate(alphas):
for i in range(ncruises):
if i != loo_idx:
n_est_cps = est_ncp_penalty(cp_results.iloc[i]['objs'], cp_results.iloc[i]['n'], alpha)
errors_penalty[alpha_num] += np.abs(n_est_cps - cp_results.iloc[i]['n_cp'])
best_alpha_idx = np.argmin(errors_penalty)

errors_rule_thumb = np.zeros(len(nus))
for nu_num, nu in enumerate(nus):
for i in range(ncruises):
if i != loo_idx:
n_est_cps = est_ncp_rule_thumb(cp_results.iloc[i]['ratios'], nu)
errors_rule_thumb[nu_num] += np.abs(n_est_cps - cp_results.iloc[i]['n_cp'])
best_nu_idx = np.argmin(errors_rule_thumb)

cp_results.at[loo_idx, 'n_est_cps_penalty'] = int(est_ncp_penalty(cp_results.iloc[loo_idx]['objs'],
cp_results.iloc[loo_idx]['n'],
alphas[best_alpha_idx]))
cp_results.at[loo_idx, 'n_est_cps_rule_thumb'] = int(est_ncp_rule_thumb(cp_results.iloc[loo_idx]['ratios'],
nus[best_nu_idx]))
cp_results.at[loo_idx, 'alpha'] = alphas[best_alpha_idx]
cp_results.at[loo_idx, 'nu'] = nus[best_nu_idx]

return cp_results


def estimate_ncp(data_dir, cruises, max_cp, alphas, nus, output_dir, min_dist=5):
"""
Annotate the change points in the physical data from the directory data_dir.
:param data_dir: Directory containing the files with the cleaned physical data and annotated change points.
:param cruises: List of cruises to use.
:param max_cp: Maximum number of allowable change points.
:param alphas: Parameter values to try for the method of Lebarbier (2005).
:param nus: Parameter values to try for the method of Harchaoui and Levy-Leduc (2007).
:param output_dir: Directory where the annotation results should be stored. The file will be called
estimated_ncp.pickle.
:param min_dist: Minimum allowable distance between change points.
"""
for cruise_num, cruise in enumerate(cruises):
print('Estimating physical change points for', cruise, '- Cruise ', cruise_num+1, '/', len(cruises))
phys_data = pd.read_parquet(os.path.join(data_dir, cruise + '_phys.parquet'))
n = len(phys_data)
n_cp = len(json.load(open(os.path.join(data_dir, cruise + '_annotated_phys_cps.json'), 'r')))
objs_phys = est_cps_objs(phys_data, max_cp, min_dist=min_dist)
all_ratios = obj_ratios(objs_phys)
if cruise_num == 0:
cp_results = pd.DataFrame({'cruise': cruise, 'n': n, 'n_cp': n_cp, 'ratios': [all_ratios],
'objs': [objs_phys]})
else:
cp_results = cp_results.append({'cruise': cruise, 'n': n, 'n_cp': n_cp, 'ratios': all_ratios,
'objs': objs_phys}, ignore_index=True)
cp_results = estimate_params_loo(cp_results, alphas, nus)

if not os.path.exists(output_dir):
os.makedirs(output_dir)
cp_results.to_pickle(os.path.join(output_dir, 'estimated_ncp.pickle'))


if __name__ == '__main__':
t1 = time.time()
# Directory where the data is stored
data_dir = '../data/'
# Directory where the output will be stored
output_dir = '../results/'
# List of cruises to use
cruises = ['DeepDOM', 'KM1712', 'KM1713', 'MGL1704', 'SCOPE_2', 'SCOPE_16', 'Thompson_1', 'Thompson_9',
'Thompson_12', 'Tokyo_1', 'Tokyo_2', 'Tokyo_3']
# Parameter values to consider when optimizing the criteria for the number of change points
alphas = np.arange(0, 1, 0.01)
nus = np.arange(0.01, 0.1, 0.001)
# Maximum number of possible change points
max_cp = 150

estimate_ncp(data_dir, cruises, max_cp, alphas, nus, output_dir)
t2 = time.time()
print('Runtime:', t2-t1)
# Runtime (Intel i9-7960X CPU @ 2.80GHz): 43s
61 changes: 61 additions & 0 deletions analysis/2_generate_features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import numpy as np
import os
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
import time

from chapydette import feature_generation


def generate_features(cruises, data_dir, features_dir, projection_dims=[128]):
"""
Generate features for the physical and biological data from each cruise.
:param cruises: List of cruises to generate features for.
:param data_dir: Directory where the (cleaned) biological and physical data is stored.
:param features_dir: Directory where the features will be stored.
:param projection_dims: List of dimensions to use for the projection for the biological data.
"""
if not os.path.exists(features_dir):
os.makedirs(features_dir)

for cruise in cruises:
print('Generating features for', cruise)
# Load the data
bio_data = pd.read_parquet(os.path.join(data_dir, cruise + '_bio.parquet'))
times = np.array(pd.Series(bio_data['date']).astype('category').cat.codes.values + 1)
bio_data = np.asarray(bio_data[['fsc_small', 'chl_small', 'pe']])

phys_data = pd.read_parquet(os.path.join(data_dir, cruise + '_phys.parquet'))

# Generate the features
phys_features = np.asarray(phys_data[['salinity', 'temp']])
phys_features = StandardScaler().fit_transform(phys_features)

for projection_dim in projection_dims:
print('Dimension of projection:', projection_dim)
save_file = os.path.join(features_dir, cruise + '_features_' + str(projection_dim) + '.pickle')
bio_features = feature_generation.nystroem_features(bio_data, projection_dim, window_length=1, do_pca=False,
window_overlap=0, times=times, seed=0,
kmeans_iters=100)[0].astype('float64')
pickle.dump({'bio_features': bio_features, 'phys_features': phys_features}, open(save_file, 'wb'))


if __name__ == '__main__':
t1 = time.time()
# Directory where the data is stored
data_dir = '../data/'
# Directory where the output will be stored
features_dir = '../features/'
# List of cruises to use
cruises = ['DeepDOM', 'KM1712', 'KM1713', 'MGL1704', 'SCOPE_2', 'SCOPE_16', 'Thompson_1', 'Thompson_9',
'Thompson_12', 'Tokyo_1', 'Tokyo_2', 'Tokyo_3']

generate_features(cruises, data_dir, features_dir, projection_dims=[128])
generate_features(['SCOPE_16'], data_dir, features_dir, projection_dims=[2**i for i in range(1, 11)])
t2 = time.time()
print('Runtime:', t2-t1)
# Runtimes:
# CPU (Intel i9-7960X CPU @ 2.80GHz): 13h15m33s
# GPU (Titan Xp on same machine): 12m10s
160 changes: 160 additions & 0 deletions analysis/3_estimate_cps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
import json
import numpy as np
import os
import pickle
import sklearn.metrics
import time

from chapydette import cp_estimation


def load_features(cruise, features_dir, projection_dim):
"""
Load features for a cruise.
:param cruise: Cruise to load features for.
:param features_dir: Directory where the features are stored.
:param projection_dim: Dimension of the features.
:return: Tuple consisting of:
* cruise_features['bio_features']: The features for the biological data
* cruise_features['phys_features']: The features for the physical data
"""
features_path = os.path.join(features_dir, cruise + '_features_' + str(projection_dim) + '.pickle')
cruise_features = pickle.load(open(features_path, 'rb'))
return cruise_features['bio_features'], cruise_features['phys_features']


def get_bw_range(features):
"""
Get the rule-of-thumb bandwidth and a range of bandwidths on a log scale for the Gaussian RBF kernel.
:param features: Features to use to obtain the bandwidths.
:return: Tuple consisting of:
* rule_of_thumb_bw: Computed rule-of-thumb bandwidth.
* bws: List of bandwidths on a log scale.
"""
dists = sklearn.metrics.pairwise.pairwise_distances(features).reshape(-1)
rule_of_thumb_bw = np.median(dists)
gammas = np.logspace(np.log(0.5/np.percentile(dists, 99)**2), np.log(0.5/np.percentile(dists, 1)**2), 10, base=np.e)
bws = np.sqrt(1/(2*gammas))

return rule_of_thumb_bw, bws


def est_cps(cruise, bio_features, phys_features, max_ncp=150, min_dists=[5], kernel_types=['Gaussian-Euclidean'],
bw_method='rule-of-thumb', save_dir='../results/'):
"""
Estimate the locations of change points in the input biological and physical features for a single cruise.
:param cruise: Name of the cruise the features are from.
:param bio_features: Features for the biological data.
:param phys_features: Features for the physical data.
:param max_ncp: Maximum number of change points in a sequence.
:param min_dists: List of minimum acceptable distances between change points.
:param kernel_types: List containing 'Gaussian-Euclidean' (Gaussian RBF kernel) and/or 'Linear'.
:param bw_method: Method to use for obtaining the bandwidth(s). Either 'rule-of-thumb' or 'list'.
:param save_dir: Top-level directory where the results will be stored.
"""
projection_dim = bio_features.shape[1]
for min_dist in min_dists:
# Perform change-point estimation on the physical data
cps_phys, objs_phys = cp_estimation.mkcpe(X=phys_features,
n_cp=(1, min(max_ncp, int((len(phys_features)-1)/min_dist)-1)),
kernel_type='linear', min_dist=min_dist, return_obj=True)
for key in cps_phys.keys():
cps_phys[key] = cps_phys[key].flatten().tolist()

if not os.path.exists(os.path.join(save_dir, cruise)):
os.makedirs(os.path.join(save_dir, cruise))
save_path = os.path.join(save_dir, cruise, 'cps_phys.json')
json.dump({'cps_phys': cps_phys, 'objs_phys': objs_phys}, open(save_path, 'w'))

for kernel_type in kernel_types:
# Get the bandwidth(s) (if applicable)
if kernel_type != 'Linear':
rot_bw, bws = get_bw_range(bio_features)
all_bws = [rot_bw] if bw_method == 'rule-of-thumb' else bws
else:
all_bws = [0]
for bw in all_bws:
# Perform change-point estimation on the biological data
cps_bio, objs_bio = cp_estimation.mkcpe(X=bio_features,
n_cp=(1, min(max_ncp, int((len(bio_features)-1)/min_dist)-1)),
kernel_type=kernel_type, bw=bw, min_dist=min_dist,
return_obj=True)
for key in cps_bio.keys():
cps_bio[key] = cps_bio[key].flatten().tolist()

bw_short = 'rule-of-thumb_' + str(np.round(bw, 3)) if bw_method == 'rule-of-thumb' else \
str(np.round(bw, 3))
save_path = os.path.join(save_dir, cruise, 'cps_bio_' + str(projection_dim) + '_' +
kernel_type + '_' + str(bw_short) + '_' + str(min_dist) + '.json')
json.dump({'cps_bio': cps_bio, 'bw': bw, 'objs_bio': objs_bio}, open(save_path, 'w'))


def est_cps_all_cruises(cruises, features_dir, max_ncp=150, min_dist=5, projection_dim=128, save_dir='../results'):
"""
Estimate the biological and physical change points for each cruise and for all desired parameter settings in order
to make the plots.
:param cruises: List of cruises to generate features for.
:param features_dir: Directory where the features are stored.
:param max_ncp: Maximum number of acceptable change points.
:param min_dist: Minimum acceptable distance between change points.
:param projection_dim: Dimension of the features.
:param save_dir: Location where the estimated change points will be stored.
"""
for cruise_num, cruise in enumerate(cruises):
print('Estimating change points for', cruise, '- Cruise ', cruise_num+1, '/', len(cruises))
bio_features, phys_features = load_features(cruise, features_dir, projection_dim)
est_cps(cruise, bio_features, phys_features, max_ncp=max_ncp, min_dists=[min_dist], save_dir=save_dir)


def sensitivity_analysis(cruise, features_dir, max_ncp=150, min_dist=5, projection_dim=128,
save_dir='../results/sensitivity_analysis/'):
"""
Estimate the biological change points for a given cruise when varying the parameter settings.
:param cruise: Name of the cruise to perform the sensitivity analysis on.
:param features_dir: Directory where the features are stored.
:param max_ncp: Maximum number of acceptable change points.
:param min_dist: Baseline minimum acceptable distance between change points.
:param projection_dim: Baseline dimension of the features.
:param save_dir: Location where the estimated change points will be stored.
"""
print('Performing sensitivity analysis')
bio_features, phys_features = load_features(cruise, features_dir, projection_dim)
kernel_types = ['Linear']
est_cps(cruise, bio_features, phys_features, max_ncp=max_ncp, min_dists=[min_dist], kernel_types=kernel_types,
bw_method='rule-of-thumb', save_dir=save_dir)
bw_method = 'list'
est_cps(cruise, bio_features, phys_features, max_ncp=max_ncp, min_dists=[min_dist],
kernel_types=['Gaussian-Euclidean'], bw_method=bw_method, save_dir=save_dir)
min_dists = [1] + list(range(5, 55, 5))
est_cps(cruise, bio_features, phys_features, max_ncp=max_ncp, min_dists=min_dists,
kernel_types=['Gaussian-Euclidean'], bw_method='rule-of-thumb', save_dir=save_dir)
for projection_dim in [2 ** i for i in range(2, 11)]:
bio_features, phys_features = load_features(cruise, features_dir, projection_dim)
est_cps(cruise, bio_features, phys_features, max_ncp=max_ncp, min_dists=[min_dist],
kernel_types=['Gaussian-Euclidean'], bw_method='rule-of-thumb', save_dir=save_dir)


if __name__ == '__main__':
t1 = time.time()
# Directory where the features are stored
features_dir = '../features/'
# Directory where the output from the main analysis will be stored
save_dir = '../results/'
# Directory where the output from the sensitivity analysis will be stored
save_dir_sensitivity_analysis = '../results/sensitivity_analysis/'
# List of cruises to use
cruises = ['DeepDOM', 'KM1712', 'KM1713', 'MGL1704', 'SCOPE_2', 'SCOPE_16', 'Thompson_1', 'Thompson_9',
'Thompson_12', 'Tokyo_1', 'Tokyo_2', 'Tokyo_3']

est_cps_all_cruises(cruises, features_dir, save_dir=save_dir)
sensitivity_analysis('SCOPE_16', features_dir, save_dir=save_dir_sensitivity_analysis)
t2 = time.time()
print('Runtime:', t2-t1)
# Runtime (Intel i9-7960X CPU @ 2.80GHz): 3m25s
Loading

0 comments on commit 2d6538a

Please sign in to comment.