diff --git a/chlamy_impi/data_exploration/initial_clustering.py b/chlamy_impi/data_exploration/initial_clustering.py
new file mode 100644
index 0000000..f27bde9
--- /dev/null
+++ b/chlamy_impi/data_exploration/initial_clustering.py
@@ -0,0 +1,183 @@
+from collections import defaultdict
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from sklearn.mixture import GaussianMixture
+from sklearn.decomposition import PCA
+
+from chlamy_impi.feature_extraction.prepare_feature_df import get_simple_y2_value_features, normalise_features, get_y2_features_continuous_light
+from chlamy_impi.normalisation.y2_normalisation import get_normalised_y2_df
+
+
+def main():
+    df = get_normalised_y2_df()
+
+    # First, as a sanity check, look at c=2 over all data. We expect two clusters, corresponding to HL and ML treatment
+    sanity_check_hl_vs_ml(df)
+
+    # Now create a dataset with 1 row per well, with features extracted from both light treatments
+    feature_df = get_y2_features_continuous_light(df)
+    feature_df = normalise_features(feature_df)
+    feature_df = feature_df.dropna()
+
+    # Get the positional indices of WT wells in feature_df
+    wt_pos_inds = get_wt_pos_inds(df, feature_df)
+
+    feature_df = feature_df.iloc[:, 3:]  # Drop the plate, i, j identifier columns before clustering
+
+    # Make sure we can map back from positional index of feature_df to index of df
+    positional_index_to_df_index = {i: index for i, index in enumerate(feature_df.index)}
+
+    # Try clustering with different methods and parameters
+    # For each one record the likelihood of each data point
+
+    models = [GaussianMixture(n_components=1),
+              GaussianMixture(n_components=2),
+              GaussianMixture(n_components=4),
+              GaussianMixture(n_components=8)]
+
+    fig, axs = plt.subplots(2, 2, figsize=(12, 12), sharey=True, sharex=True)
+
+    for i, model in enumerate(models):
+        pred_labels = model.fit_predict(feature_df)
+        scores = model.score_samples(feature_df)
+        #plot_clusters_pca(feature_df, pred_labels, scores)
+
+        ax = axs.ravel()[i]
+
+        plot_clusters_with_outlying_genes(ax, df, feature_df, positional_index_to_df_index, pred_labels, scores,
+                                          wt_pos_inds)
+
+    plt.show()
+
+
+def get_wt_pos_inds(df, feature_df):
+    # Get the positional indices of WT rows in feature_df
+    # This is somewhat tricky because feature_df has a different index to df
+    def check_wt(feature_df_row):
+        df_subset = df[(df["plate"] == feature_df_row["plate"]) & (df["i"] == feature_df_row["i"]) & (
+                df["j"] == feature_df_row["j"])]
+        assert len(df_subset) <= 2
+        if df_subset["mutant_ID"].iloc[0] == "WT":
+            return True
+        else:
+            return False
+
+    wt_df_inds = feature_df.apply(check_wt, axis=1)
+    wt_pos_inds = [i for i, index in enumerate(feature_df.index) if wt_df_inds.loc[index]]
+    assert len(wt_pos_inds) == 407  # = 384 + 8 * 3, with 1 missing?
+    return wt_pos_inds
+
+
+def plot_clusters_with_outlying_genes(ax, df, feature_df, positional_index_to_df_index, pred_labels, scores,
+                                      wt_pos_inds):
+    # Get the positional indices of the top-10 least likely points, and map them back to dataframe indices
+    least_likely_inds = np.argsort(scores)[:10]
+    scores = scores[least_likely_inds]
+    least_likely_df_inds = [positional_index_to_df_index[i] for i in least_likely_inds]
+    print(
+        f"Least likely genes: {[(score, df.loc[i, 'mutated_genes']) for score, i in zip(scores, least_likely_df_inds)]}")
+
+    # Plot all the wild type points, along with the least likely points, coloured by cluster
+    pca = PCA(n_components=2)
+    pca.fit(feature_df)
+    pca_df = pca.transform(feature_df)
+    ax.scatter(pca_df[wt_pos_inds, 0], pca_df[wt_pos_inds, 1], c=pred_labels[wt_pos_inds], marker="o", alpha=0.2,
+               label="WT")
+    ax.scatter(pca_df[least_likely_inds, 0], pca_df[least_likely_inds, 1], c=pred_labels[least_likely_inds], marker="x",
+               alpha=1, label="Top-10 Outliers")
+
+    # Annotate the gene name of the least likely points (use the df index for the label, the positional index for the coordinates)
+    for pos_ind, df_ind in zip(least_likely_inds, least_likely_df_inds):
+        ax.annotate(df.loc[df_ind, "mutated_genes"].replace("&", r"\&"), (pca_df[pos_ind, 0], pca_df[pos_ind, 1]))
+
+    ax.legend()
+    ax.set_xlabel("PCA 1")
+    ax.set_ylabel("PCA 2")
+    ax.set_title(f"GMM + PCA (explained var={sum(pca.explained_variance_ratio_):.2f}) with N_clusters={len(set(pred_labels))}")
+
+
+def plot_clusters_pca(feature_df, pred_labels, pred_scores, dim=3):
+    # Reduce dimensionality of data using PCA and then scatter plot
+    pca = PCA(n_components=dim)
+    pca.fit(feature_df)
+    pca_df = pca.transform(feature_df)
+
+    if dim == 2:
+        # Scatter plot of PCA data, coloured by cluster label
+        fig, ax = plt.subplots(1, 1, figsize=(6, 6))
+        im = ax.scatter(pca_df[:, 0], pca_df[:, 1], c=pred_labels, marker="o", alpha=1)
+        fig.colorbar(im, ax=ax)
+        ax.set_xlabel("PCA 1")
+        ax.set_ylabel("PCA 2")
+        ax.set_title(f"PCA (explained var={sum(pca.explained_variance_ratio_):.2f}) coloured by cluster")
+        ax.spines["top"].set_visible(False)
+        ax.spines["right"].set_visible(False)
+        plt.show()
+
+    elif dim == 3:
+        # Scatter plot of PCA data, coloured by cluster label
+        fig = plt.figure(figsize=(6, 6))
+        ax = fig.add_subplot(111, projection='3d')
+        im = ax.scatter(pca_df[:, 0], pca_df[:, 1], pca_df[:, 2], c=pred_labels, marker="o", alpha=1)
+        ax.set_xlabel("PCA 1")
+        ax.set_ylabel("PCA 2")
+        ax.set_zlabel("PCA 3")
+        ax.set_title(f"PCA (explained var={sum(pca.explained_variance_ratio_):.2f}) coloured by cluster")
+        plt.show()
+
+
+def get_gene_to_indices(df, feature_df):
+    gene_to_indices = defaultdict(list)
+    for index in feature_df.index:
+        mutated_genes = df.loc[index, "mutated_genes"].split(",")
+        for gene in mutated_genes:
+            gene_to_indices[gene].append(index)
+    return gene_to_indices
+
+
+def sanity_check_hl_vs_ml(df):
+    # First, as a sanity check, look at c=2 over all data. We expect two clusters, corresponding to HL and ML treatment
+    feature_df = get_simple_y2_value_features(df)
+    feature_df = normalise_features(feature_df)
+    feature_df = feature_df.dropna()
+
+    gmm = GaussianMixture(n_components=2, random_state=0)
+    gmm.fit(feature_df)
+    pred_labels = gmm.predict(feature_df)
+
+    ml_mask = df["light_regime"] == "20h_ML"
+    hl_mask = df["light_regime"] == "20h_HL"
+    ml_df_index = df[ml_mask].index
+    hl_df_index = df[hl_mask].index
+    ml_inds = [i for i, index in enumerate(feature_df.index) if index in ml_df_index]
+    hl_inds = [i for i, index in enumerate(feature_df.index) if index in hl_df_index]
+    assert len(ml_inds) > 0
+    assert len(hl_inds) > 0
+
+    # Reduce dimensionality of data to 2D using PCA
+    pca = PCA(n_components=2)
+    pca.fit(feature_df)
+    pca_df = pca.transform(feature_df)
+    print(pca.explained_variance_ratio_)
+    print(pca.singular_values_)
+
+    # Scatter plot of PCA data, coloured by cluster
+    plt.rcParams["text.usetex"] = True
+    plt.rcParams["font.family"] = "Computer Modern"
+    fig, ax = plt.subplots(1, 1, figsize=(6, 6))
+    plt.set_cmap("tab20")
+    ax.scatter(pca_df[ml_inds, 0], pca_df[ml_inds, 1], c=pred_labels[ml_inds], vmin=0, vmax=1, marker="x", alpha=1, label="ML")
+    ax.scatter(pca_df[hl_inds, 0], pca_df[hl_inds, 1], c=pred_labels[hl_inds], marker="o", vmin=0, vmax=1, alpha=1, label="HL")
+    ax.set_xlabel("PCA 1")
+    ax.set_ylabel("PCA 2")
+    ax.set_title("PCA of features, coloured by GMM cluster")
+    ax.spines["top"].set_visible(False)
+    ax.spines["right"].set_visible(False)
+    ax.legend()
+    plt.show()
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/chlamy_impi/feature_extraction/prepare_feature_df.py b/chlamy_impi/feature_extraction/prepare_feature_df.py
index b2dbf58..3e7c0ff 100644
--- a/chlamy_impi/feature_extraction/prepare_feature_df.py
+++ b/chlamy_impi/feature_extraction/prepare_feature_df.py
@@ -4,18 +4,69 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 
-from chlamy_impi.feature_extraction.trend_features import y2_exponential_decay_time
+from chlamy_impi.feature_extraction.trend_features import y2_exponential_decay_time, y2_linear_trend, y2_quadratic_trend
 from chlamy_impi.feature_extraction.simple_features import mean_y2_value, std_y2_value, min_y2_value, max_y2_value
 from chlamy_impi.feature_extraction.rolling_features import rolling_avg_y2_value
 from chlamy_impi.normalisation.y2_normalisation import get_normalised_y2_df
 
 
-def get_y2_value_features(df):
-    """Extract features from the database
+def get_y2_features_continuous_light(df):
+    """Extract features from the database, assuming that we have continuous light treatment.
+
+    This is a combination of simple features, with features that compare the 20h_HL and 20h_ML treatments for each well
+
+    Note: this approach won't scale very nicely to looking at more than two light treatments
+    """
+    assert "fv_fm" in df.columns, "fv_fm column not found in dataframe"
+    assert "y2_1" in df.columns, "y2_1 column not found in dataframe"
+
+    # First restructure the dataframe so that wells with the same light treatment are in the same row
+    with pd.option_context("display.max_rows", None, "display.max_columns", None):
+        #print(df[(df["plate"] == 2) & (df["mutant_ID"] == "WT")].head())
+        pass
+
+    df_hl = df[df["light_regime"] == "20h_HL"]
+    df_ml = df[df["light_regime"] == "20h_ML"]
+
+    feature_cols = ["fv_fm"] + [f"y2_{i}" for i in range(1, 42)]
+    feature_df_hl = get_simple_y2_value_features(df_hl[feature_cols])
+    feature_df_ml = get_simple_y2_value_features(df_ml[feature_cols])
+
+    # Drop the fv/fm column to prevent repetition
+    feature_df_hl = feature_df_hl.drop(columns=["fv_fm"])
+    feature_df_ml = feature_df_ml.drop(columns=["fv_fm"])
+
+    # Add new columns using the features
+    df_hl = pd.concat([df_hl, feature_df_hl], axis=1)
+    df_ml = pd.concat([df_ml, feature_df_ml], axis=1)
+
+    # Merge on index
+    df_merged = pd.merge(df_hl, df_ml, on=["plate", "i", "j"], suffixes=("_hl", "_ml"), validate="1:1")
+
+    # Finally, put all features into one dataframe
+    features = pd.DataFrame()
+    features["plate"] = df_merged["plate"]
+    features["i"] = df_merged["i"]
+    features["j"] = df_merged["j"]
+    for col in feature_df_hl.columns:
+        features[f"{col}_hl"] = df_merged[f"{col}_hl"]
+        features[f"{col}_ml"] = df_merged[f"{col}_ml"]
+
+    print("Constructed features dataframe")
+    with pd.option_context("display.max_rows", None, "display.max_columns", None):
+        print(features.head())
+
+    return features
+
+
+def get_simple_y2_value_features(df):
+    """Extract features from the database - assuming that we just want one feature per row.
+
+    This ignores features which could use more than one row, such as the difference between different light treatments
 
     :param df: A dataframe with one row per well, and one column for each time point
     :returns: A dataframe with one row per well, and one column for each feature
     """
+    assert "fv_fm" in df.columns, "fv_fm column not found in dataframe"
+    assert "y2_1" in df.columns, "y2_1 column not found in dataframe"
     features = pd.DataFrame()
     features["fv_fm"] = df["fv_fm"]
@@ -24,22 +75,50 @@ def get_y2_value_features(df):
     features["mean_y2_value"] = mean_y2_value(df)
     features["std_y2_value"] = std_y2_value(df)
     features["min_y2_value"] = min_y2_value(df)
     features["max_y2_value"] = max_y2_value(df)
     #features["rolling_avg_y2_value"] = rolling_avg_y2_value(df) # TODO: needs more work to ingest since its a dataframe
-    features["y2_exponential_decay_time"] = y2_exponential_decay_time(df)
+    #features["y2_exponential_decay_time"] = y2_exponential_decay_time(df) # TODO: currently doesn't seem reliable
+    features["y2_linear_trend"] = y2_linear_trend(df)
+    features["y2_quadratic_trend"] = y2_quadratic_trend(df)
 
     return features
 
 
+def normalise_features(features: pd.DataFrame) -> pd.DataFrame:
+    """Normalise the features to have zero mean and unit variance
+
+    :param features: A dataframe with one row per well, and one column for each feature
+    :returns: A dataframe with one row per well, and one column for each feature
+    """
+    if ["plate", "i", "j"] == list(features.columns[:3]):
+        # Leave the plate, i, j identifier columns alone and normalise the rest
+        features = features.copy()
+        features[features.columns[3:]] = (features[features.columns[3:]] - features[features.columns[3:]].mean()) / features[features.columns[3:]].std()
+        return features
+    else:
+        return (features - features.mean()) / features.std()
+
+
 def main():
     """Make some plots of features
     """
     df = get_normalised_y2_df()
-    features = get_y2_value_features(df)
+    features = get_simple_y2_value_features(df)
 
     # Plot histograms of each feature
     num_features = len(features.columns)
-    fig, axes = plt.subplots(num_features, 1, figsize=(5, 5 * num_features))
+    plt.rcParams["text.usetex"] = True
+    plt.rcParams["font.family"] = "Computer Modern"
+
     for i, col in enumerate(features.columns):
-        axes[i].hist(features[col])
-        axes[i].set_xlabel(col)
\ No newline at end of file
+        fig, ax = plt.subplots(1, 1, figsize=(6, 8))
+        ax.hist(features[col], bins=100)
+        ax.set_xlabel(col)
+        ax.set_title(col)
+        ax.spines["top"].set_visible(False)
+        ax.spines["right"].set_visible(False)
+        fig.savefig(f"feature_{col}.png")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/chlamy_impi/feature_extraction/trend_features.py b/chlamy_impi/feature_extraction/trend_features.py
index d240a4a..cc97505 100644
--- a/chlamy_impi/feature_extraction/trend_features.py
+++ b/chlamy_impi/feature_extraction/trend_features.py
@@ -9,8 +9,15 @@ def y2_linear_trend(df):
     :returns: A Series containing the linear trend for each well
     """
     y2_cols = [col for col in df.columns if col.startswith("y2_")]
+    df = df.copy()
 
     for i, row in df[y2_cols].iterrows():
+        row = row.dropna()
+        if len(row) == 0:
+            df.loc[i, "y2_linear_trend"] = np.nan
+            continue
+
+        row = row[:-1]  # Skip the final entry since it is an outlier
         df.loc[i, "y2_linear_trend"] = np.polyfit(range(len(row)), row, 1)[0]
 
     return df["y2_linear_trend"]
@@ -22,8 +29,15 @@ def y2_quadratic_trend(df):
     :returns: A Series containing the quadratic trend for each well
     """
     y2_cols = [col for col in df.columns if col.startswith("y2_")]
+    df = df.copy()
 
     for i, row in df[y2_cols].iterrows():
+        row = row.dropna()
+        if len(row) == 0:
+            df.loc[i, "y2_quadratic_trend"] = np.nan
+            continue
+
+        row = row[:-1]  # Skip the final entry since it is an outlier
         df.loc[i, "y2_quadratic_trend"] = np.polyfit(range(len(row)), row, 2)[0]
 
     return df["y2_quadratic_trend"]
@@ -53,7 +67,15 @@ def y2_exponential_decay_time(df):
     y2_cols = [col for col in df.columns if col.startswith("y2_")]
 
     for i, row in df[y2_cols].iterrows():
-        params, covariance = fit_exponential_decay(row.T)
-        df.loc[i, "y2_exponential_decay_time"] = params[1]
+        row = row.dropna()
+        if len(row) == 0:
+            df.loc[i, "y2_exponential_decay_time"] = np.nan
+            continue
+        try:
+            params, covariance = fit_exponential_decay(row.T[:-1])  # Skip final entry since it is an outlier
+            df.loc[i, "y2_exponential_decay_time"] = params[1]
+        except RuntimeError:
+            df.loc[i, "y2_exponential_decay_time"] = np.nan
+            continue
 
     return df["y2_exponential_decay_time"]
\ No newline at end of file
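
Reviewer note on the outlier-scoring step in initial_clustering.py: GaussianMixture.score_samples returns the per-sample log-likelihood under the fitted mixture, and the script flags the lowest-scoring wells as candidate outliers. The snippet below is a minimal, self-contained sketch of that idea on synthetic data; the array names, shapes, and values are illustrative only and are not part of this diff.

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
features = rng.normal(size=(500, 6))   # stand-in for the normalised per-well feature matrix
features[:5] += 4.0                    # a few artificial outliers

gmm = GaussianMixture(n_components=2, random_state=0).fit(features)
log_likelihood = gmm.score_samples(features)          # per-sample log-likelihood under the mixture
least_likely_inds = np.argsort(log_likelihood)[:10]   # positional indices of the 10 least likely rows
print(least_likely_inds)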