initial commit

cjones6 · Dec 1, 2020 · 2d6538a · 2d6538a
commit 2d6538a
Show file tree

Hide file tree

Showing 12 changed files with 2,996 additions and 0 deletions.
diff --git a/__init__.py b/__init__.py
diff --git a/analysis/1_est_num_cps_phys.py b/analysis/1_est_num_cps_phys.py
@@ -0,0 +1,168 @@
+import json
+import numpy as np
+import os
+import pandas as pd
+from sklearn.preprocessing import StandardScaler
+import time
+
+from chapydette import cp_estimation
+
+
+def est_cps_objs(phys_data, max_cp, min_dist=5):
+    """
+    Estimate the locations of 0-max_cp change points in the physical data and return the corresponding objective values.
+
+    :param phys_data: Physical data on which to estimate change points.
+    :param max_cp: Largest number of change points to estimate.
+    :param min_dist: Minimum allowable distance between change points.
+    :return: objs_phys: Objective values when setting the number of change points to each of 0, 1, 2,..., max_cp (or the
+                        maximum possible number of changes given that the minimum distance between change points is
+                        min_dist).
+    """
+    phys_features = np.asarray(phys_data[['temp', 'salinity']])
+    phys_features = StandardScaler().fit_transform(phys_features)
+    cps_phys, objs_phys = cp_estimation.mkcpe(X=phys_features,
+                                              n_cp=(0, min(max_cp, int((len(phys_features) - 1) / min_dist) - 1)),
+                                              kernel_type='linear', min_dist=min_dist, return_obj=True)
+    for key in objs_phys:
+        objs_phys[key] = objs_phys[key]/len(phys_features)
+
+    return objs_phys
+
+
+def est_ncp_penalty(objs, n, alpha):
+    """
+    Estimate the number of change points using the penalty \alpha d/n(2log(n/d)+5) of Lebarbier (2005).
+
+    :param objs: Dictionary of objective values for each number of change points.
+    :param n: Length of the sequence.
+    :param alpha: Value of the parameter alpha.
+    :return: The estimated number of change points with the given value of alpha.
+    """
+    objs = np.array([objs[i] for i in range(0, len(objs))]).flatten()
+    d = np.arange(1, len(objs)+1)
+
+    return np.argmin(objs + alpha*d/n*(2*np.log(n/d)+5))
+
+
+def obj_ratios(objs_phys):
+    """
+    Compute the ratio of successive objective values when going from one number of change points to the next.
+
+    :param objs_phys: Dictionary of objective values for each number of change points.
+    :return: ratios: Array with the ratios of successive objective values.
+    """
+    objs_phys = [objs_phys[i] for i in range(0, len(objs_phys))]
+    objs_phys = np.array(objs_phys).flatten()
+    ratios = objs_phys[1:]/objs_phys[:-1]
+
+    return ratios
+
+
+def est_ncp_rule_thumb(ratios, nu):
+    """
+    Estimate the number of change points using the rule of thumb of Harchaoui and Levy-Leduc (2007).
+
+    :param ratios: Array with the ratios of successive objective values.
+    :param nu: Parameter nu for the rule of thumb.
+    :return: The estimated number of change points with the given value of nu.
+    """
+    return np.argwhere(ratios > 1-nu)[0].item()
+
+
+def estimate_params_loo(cp_results, alphas, nus):
+    """
+    Estimate the values of alpha and nu in the methods for estimating the number of change points. Do this for each
+    cruise by using the annotations from all cruises except that one.
+
+    :param cp_results: Data frame with the cruise names, lengths, number of change points in the annotation files,
+                       objective values from change-point estimation, and ratios of objective values for each cruise.
+    :param alphas: Parameter values to try for the method of Lebarbier (2005).
+    :param nus: Parameter values to try for the method of Harchaoui and Levy-Leduc (2007).
+    :return: cp_results: Updated data frame with the estimated number of change points from both methods and the chosen
+                         parameter values.
+    """
+    ncruises = len(cp_results)
+    cp_results['n_est_cps_penalty'] = np.zeros(ncruises, dtype=int)
+    cp_results['n_est_cps_rule_thumb'] = np.zeros(ncruises, dtype=int)
+    for loo_idx in range(ncruises):
+        print('Estimating parameters for', cp_results.iloc[loo_idx]['cruise'], '- Cruise ', loo_idx+1, '/', ncruises)
+        errors_penalty = np.zeros(len(alphas))
+        for alpha_num, alpha in enumerate(alphas):
+            for i in range(ncruises):
+                if i != loo_idx:
+                    n_est_cps = est_ncp_penalty(cp_results.iloc[i]['objs'], cp_results.iloc[i]['n'], alpha)
+                    errors_penalty[alpha_num] += np.abs(n_est_cps - cp_results.iloc[i]['n_cp'])
+        best_alpha_idx = np.argmin(errors_penalty)
+
+        errors_rule_thumb = np.zeros(len(nus))
+        for nu_num, nu in enumerate(nus):
+            for i in range(ncruises):
+                if i != loo_idx:
+                    n_est_cps = est_ncp_rule_thumb(cp_results.iloc[i]['ratios'], nu)
+                    errors_rule_thumb[nu_num] += np.abs(n_est_cps - cp_results.iloc[i]['n_cp'])
+        best_nu_idx = np.argmin(errors_rule_thumb)
+
+        cp_results.at[loo_idx, 'n_est_cps_penalty'] = int(est_ncp_penalty(cp_results.iloc[loo_idx]['objs'],
+                                                              cp_results.iloc[loo_idx]['n'],
+                                                              alphas[best_alpha_idx]))
+        cp_results.at[loo_idx, 'n_est_cps_rule_thumb'] = int(est_ncp_rule_thumb(cp_results.iloc[loo_idx]['ratios'],
+                                                                          nus[best_nu_idx]))
+        cp_results.at[loo_idx, 'alpha'] = alphas[best_alpha_idx]
+        cp_results.at[loo_idx, 'nu'] = nus[best_nu_idx]
+
+    return cp_results
+
+
+def estimate_ncp(data_dir, cruises, max_cp, alphas, nus, output_dir, min_dist=5):
+    """
+    Annotate the change points in the physical data from the directory data_dir.
+
+    :param data_dir: Directory containing the files with the cleaned physical data and annotated change points.
+    :param cruises: List of cruises to use.
+    :param max_cp: Maximum number of allowable change points.
+    :param alphas: Parameter values to try for the method of Lebarbier (2005).
+    :param nus: Parameter values to try for the method of Harchaoui and Levy-Leduc (2007).
+    :param output_dir: Directory where the annotation results should be stored. The file will be called
+                       estimated_ncp.pickle.
+    :param min_dist: Minimum allowable distance between change points.
+    """
+    for cruise_num, cruise in enumerate(cruises):
+        print('Estimating physical change points for', cruise, '- Cruise ', cruise_num+1, '/', len(cruises))
+        phys_data = pd.read_parquet(os.path.join(data_dir, cruise + '_phys.parquet'))
+        n = len(phys_data)
+        n_cp = len(json.load(open(os.path.join(data_dir, cruise + '_annotated_phys_cps.json'), 'r')))
+        objs_phys = est_cps_objs(phys_data, max_cp, min_dist=min_dist)
+        all_ratios = obj_ratios(objs_phys)
+        if cruise_num == 0:
+            cp_results = pd.DataFrame({'cruise': cruise, 'n': n, 'n_cp': n_cp, 'ratios': [all_ratios],
+                                       'objs': [objs_phys]})
+        else:
+            cp_results = cp_results.append({'cruise': cruise, 'n': n, 'n_cp': n_cp, 'ratios': all_ratios,
+                                            'objs': objs_phys}, ignore_index=True)
+    cp_results = estimate_params_loo(cp_results, alphas, nus)
+
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    cp_results.to_pickle(os.path.join(output_dir, 'estimated_ncp.pickle'))
+
+
+if __name__ == '__main__':
+    t1 = time.time()
+    # Directory where the data is stored
+    data_dir = '../data/'
+    # Directory where the output will be stored
+    output_dir = '../results/'
+    # List of cruises to use
+    cruises = ['DeepDOM', 'KM1712', 'KM1713', 'MGL1704', 'SCOPE_2', 'SCOPE_16', 'Thompson_1', 'Thompson_9',
+               'Thompson_12', 'Tokyo_1', 'Tokyo_2', 'Tokyo_3']
+    # Parameter values to consider when optimizing the criteria for the number of change points
+    alphas = np.arange(0, 1, 0.01)
+    nus = np.arange(0.01, 0.1, 0.001)
+    # Maximum number of possible change points
+    max_cp = 150
+
+    estimate_ncp(data_dir, cruises, max_cp, alphas, nus, output_dir)
+    t2 = time.time()
+    print('Runtime:', t2-t1)
+    # Runtime (Intel i9-7960X CPU @ 2.80GHz): 43s
diff --git a/analysis/2_generate_features.py b/analysis/2_generate_features.py
@@ -0,0 +1,61 @@
+import numpy as np
+import os
+import pandas as pd
+import pickle
+from sklearn.preprocessing import StandardScaler
+import time
+
+from chapydette import feature_generation
+
+
+def generate_features(cruises, data_dir, features_dir, projection_dims=[128]):
+    """
+    Generate features for the physical and biological data from each cruise.
+
+    :param cruises: List of cruises to generate features for.
+    :param data_dir: Directory where the (cleaned) biological and physical data is stored.
+    :param features_dir: Directory where the features will be stored.
+    :param projection_dims: List of dimensions to use for the projection for the biological data.
+    """
+    if not os.path.exists(features_dir):
+        os.makedirs(features_dir)
+
+    for cruise in cruises:
+        print('Generating features for', cruise)
+        # Load the data
+        bio_data = pd.read_parquet(os.path.join(data_dir, cruise + '_bio.parquet'))
+        times = np.array(pd.Series(bio_data['date']).astype('category').cat.codes.values + 1)
+        bio_data = np.asarray(bio_data[['fsc_small', 'chl_small', 'pe']])
+
+        phys_data = pd.read_parquet(os.path.join(data_dir, cruise + '_phys.parquet'))
+
+        # Generate the features
+        phys_features = np.asarray(phys_data[['salinity', 'temp']])
+        phys_features = StandardScaler().fit_transform(phys_features)
+
+        for projection_dim in projection_dims:
+            print('Dimension of projection:', projection_dim)
+            save_file = os.path.join(features_dir, cruise + '_features_' + str(projection_dim) + '.pickle')
+            bio_features = feature_generation.nystroem_features(bio_data, projection_dim, window_length=1, do_pca=False,
+                                                                window_overlap=0, times=times, seed=0,
+                                                                kmeans_iters=100)[0].astype('float64')
+            pickle.dump({'bio_features': bio_features, 'phys_features': phys_features}, open(save_file, 'wb'))
+
+
+if __name__ == '__main__':
+    t1 = time.time()
+    # Directory where the data is stored
+    data_dir = '../data/'
+    # Directory where the output will be stored
+    features_dir = '../features/'
+    # List of cruises to use
+    cruises = ['DeepDOM', 'KM1712', 'KM1713', 'MGL1704', 'SCOPE_2', 'SCOPE_16', 'Thompson_1', 'Thompson_9',
+               'Thompson_12', 'Tokyo_1', 'Tokyo_2', 'Tokyo_3']
+
+    generate_features(cruises, data_dir, features_dir, projection_dims=[128])
+    generate_features(['SCOPE_16'], data_dir, features_dir, projection_dims=[2**i for i in range(1, 11)])
+    t2 = time.time()
+    print('Runtime:', t2-t1)
+    # Runtimes:
+    # CPU (Intel i9-7960X CPU @ 2.80GHz): 13h15m33s
+    # GPU (Titan Xp on same machine): 12m10s
diff --git a/analysis/3_estimate_cps.py b/analysis/3_estimate_cps.py
@@ -0,0 +1,160 @@
+import json
+import numpy as np
+import os
+import pickle
+import sklearn.metrics
+import time
+
+from chapydette import cp_estimation
+
+
+def load_features(cruise, features_dir, projection_dim):
+    """
+    Load features for a cruise.
+
+    :param cruise: Cruise to load features for.
+    :param features_dir: Directory where the features are stored.
+    :param projection_dim: Dimension of the features.
+    :return: Tuple consisting of:
+
+        * cruise_features['bio_features']: The features for the biological data
+        * cruise_features['phys_features']: The features for the physical data
+    """
+    features_path = os.path.join(features_dir, cruise + '_features_' + str(projection_dim) + '.pickle')
+    cruise_features = pickle.load(open(features_path, 'rb'))
+    return cruise_features['bio_features'], cruise_features['phys_features']
+
+
+def get_bw_range(features):
+    """
+    Get the rule-of-thumb bandwidth and a range of bandwidths on a log scale for the Gaussian RBF kernel.
+
+    :param features: Features to use to obtain the bandwidths.
+    :return: Tuple consisting of:
+
+        * rule_of_thumb_bw: Computed rule-of-thumb bandwidth.
+        * bws: List of bandwidths on a log scale.
+    """
+    dists = sklearn.metrics.pairwise.pairwise_distances(features).reshape(-1)
+    rule_of_thumb_bw = np.median(dists)
+    gammas = np.logspace(np.log(0.5/np.percentile(dists, 99)**2), np.log(0.5/np.percentile(dists, 1)**2), 10, base=np.e)
+    bws = np.sqrt(1/(2*gammas))
+
+    return rule_of_thumb_bw, bws
+
+
+def est_cps(cruise, bio_features, phys_features, max_ncp=150, min_dists=[5], kernel_types=['Gaussian-Euclidean'],
+            bw_method='rule-of-thumb', save_dir='../results/'):
+    """
+    Estimate the locations of change points in the input biological and physical features for a single cruise.
+
+    :param cruise: Name of the cruise the features are from.
+    :param bio_features: Features for the biological data.
+    :param phys_features: Features for the physical data.
+    :param max_ncp: Maximum number of change points in a sequence.
+    :param min_dists: List of minimum acceptable distances between change points.
+    :param kernel_types: List containing 'Gaussian-Euclidean' (Gaussian RBF kernel) and/or 'Linear'.
+    :param bw_method: Method to use for obtaining the bandwidth(s). Either 'rule-of-thumb' or 'list'.
+    :param save_dir: Top-level directory where the results will be stored.
+    """
+    projection_dim = bio_features.shape[1]
+    for min_dist in min_dists:
+        # Perform change-point estimation on the physical data
+        cps_phys, objs_phys = cp_estimation.mkcpe(X=phys_features,
+                                                  n_cp=(1, min(max_ncp, int((len(phys_features)-1)/min_dist)-1)),
+                                                  kernel_type='linear', min_dist=min_dist, return_obj=True)
+        for key in cps_phys.keys():
+            cps_phys[key] = cps_phys[key].flatten().tolist()
+
+        if not os.path.exists(os.path.join(save_dir, cruise)):
+            os.makedirs(os.path.join(save_dir, cruise))
+        save_path = os.path.join(save_dir, cruise, 'cps_phys.json')
+        json.dump({'cps_phys': cps_phys, 'objs_phys': objs_phys}, open(save_path, 'w'))
+
+        for kernel_type in kernel_types:
+            # Get the bandwidth(s) (if applicable)
+            if kernel_type != 'Linear':
+                rot_bw, bws = get_bw_range(bio_features)
+                all_bws = [rot_bw] if bw_method == 'rule-of-thumb' else bws
+            else:
+                all_bws = [0]
+            for bw in all_bws:
+                # Perform change-point estimation on the biological data
+                cps_bio, objs_bio = cp_estimation.mkcpe(X=bio_features,
+                                                        n_cp=(1, min(max_ncp, int((len(bio_features)-1)/min_dist)-1)),
+                                                        kernel_type=kernel_type, bw=bw, min_dist=min_dist,
+                                                        return_obj=True)
+                for key in cps_bio.keys():
+                    cps_bio[key] = cps_bio[key].flatten().tolist()
+
+                bw_short = 'rule-of-thumb_' + str(np.round(bw, 3)) if bw_method == 'rule-of-thumb' else \
+                            str(np.round(bw, 3))
+                save_path = os.path.join(save_dir, cruise, 'cps_bio_' + str(projection_dim) + '_' +
+                                             kernel_type + '_' + str(bw_short) + '_' + str(min_dist) + '.json')
+                json.dump({'cps_bio': cps_bio, 'bw': bw, 'objs_bio': objs_bio}, open(save_path, 'w'))
+
+
+def est_cps_all_cruises(cruises, features_dir, max_ncp=150, min_dist=5, projection_dim=128, save_dir='../results'):
+    """
+    Estimate the biological and physical change points for each cruise and for all desired parameter settings in order
+    to make the plots.
+
+    :param cruises: List of cruises to generate features for.
+    :param features_dir: Directory where the features are stored.
+    :param max_ncp: Maximum number of acceptable change points.
+    :param min_dist: Minimum acceptable distance between change points.
+    :param projection_dim: Dimension of the features.
+    :param save_dir: Location where the estimated change points will be stored.
+    """
+    for cruise_num, cruise in enumerate(cruises):
+        print('Estimating change points for', cruise, '- Cruise ', cruise_num+1, '/', len(cruises))
+        bio_features, phys_features = load_features(cruise, features_dir, projection_dim)
+        est_cps(cruise, bio_features, phys_features, max_ncp=max_ncp, min_dists=[min_dist], save_dir=save_dir)
+
+
+def sensitivity_analysis(cruise, features_dir, max_ncp=150, min_dist=5, projection_dim=128,
+                         save_dir='../results/sensitivity_analysis/'):
+    """
+    Estimate the biological change points for a given cruise when varying the parameter settings.
+
+    :param cruise: Name of the cruise to perform the sensitivity analysis on.
+    :param features_dir: Directory where the features are stored.
+    :param max_ncp: Maximum number of acceptable change points.
+    :param min_dist: Baseline minimum acceptable distance between change points.
+    :param projection_dim: Baseline dimension of the features.
+    :param save_dir: Location where the estimated change points will be stored.
+    """
+    print('Performing sensitivity analysis')
+    bio_features, phys_features = load_features(cruise, features_dir, projection_dim)
+    kernel_types = ['Linear']
+    est_cps(cruise, bio_features, phys_features, max_ncp=max_ncp, min_dists=[min_dist], kernel_types=kernel_types,
+            bw_method='rule-of-thumb', save_dir=save_dir)
+    bw_method = 'list'
+    est_cps(cruise, bio_features, phys_features, max_ncp=max_ncp, min_dists=[min_dist],
+            kernel_types=['Gaussian-Euclidean'], bw_method=bw_method, save_dir=save_dir)
+    min_dists = [1] + list(range(5, 55, 5))
+    est_cps(cruise, bio_features, phys_features, max_ncp=max_ncp, min_dists=min_dists,
+            kernel_types=['Gaussian-Euclidean'], bw_method='rule-of-thumb', save_dir=save_dir)
+    for projection_dim in [2 ** i for i in range(2, 11)]:
+        bio_features, phys_features = load_features(cruise, features_dir, projection_dim)
+        est_cps(cruise, bio_features, phys_features, max_ncp=max_ncp, min_dists=[min_dist],
+                kernel_types=['Gaussian-Euclidean'], bw_method='rule-of-thumb', save_dir=save_dir)
+
+
+if __name__ == '__main__':
+    t1 = time.time()
+    # Directory where the features are stored
+    features_dir = '../features/'
+    # Directory where the output from the main analysis will be stored
+    save_dir = '../results/'
+    # Directory where the output from the sensitivity analysis will be stored
+    save_dir_sensitivity_analysis = '../results/sensitivity_analysis/'
+    # List of cruises to use
+    cruises = ['DeepDOM', 'KM1712', 'KM1713', 'MGL1704', 'SCOPE_2', 'SCOPE_16', 'Thompson_1', 'Thompson_9',
+               'Thompson_12', 'Tokyo_1', 'Tokyo_2', 'Tokyo_3']
+
+    est_cps_all_cruises(cruises, features_dir, save_dir=save_dir)
+    sensitivity_analysis('SCOPE_16', features_dir, save_dir=save_dir_sensitivity_analysis)
+    t2 = time.time()
+    print('Runtime:', t2-t1)
+    # Runtime (Intel i9-7960X CPU @ 2.80GHz): 3m25s