From b4df6f451a11c52d4c3ebcd4b9589662f8925cab Mon Sep 17 00:00:00 2001 From: zprobot <1727697083@qq.com> Date: Thu, 12 Dec 2024 20:39:59 +0800 Subject: [PATCH 1/2] update: variable --- ibaqpy/bin/ibaqpy_commons.py | 16 ++++++++----- ibaqpy/bin/peptide_normalization.py | 14 ++++++----- ibaqpy/bin/peptides2protein.py | 36 +++++++++++++++++------------ 3 files changed, 39 insertions(+), 27 deletions(-) diff --git a/ibaqpy/bin/ibaqpy_commons.py b/ibaqpy/bin/ibaqpy_commons.py index 3127395..bf95bb6 100644 --- a/ibaqpy/bin/ibaqpy_commons.py +++ b/ibaqpy/bin/ibaqpy_commons.py @@ -48,6 +48,15 @@ IBAQ_NORMALIZED = "IbaqNorm" IBAQ_LOG = "IbaqLog" IBAQ_PPB = "IbaqPpb" +TPA = "TPA" +MOLECULARWEIGHT = "MolecularWeight" +COPYNUMBER = "CopyNumber" +CONCENTRATION_NM = "Concentration[nM]" +WEIGHT_NG = "Weight[ng]" +MOLES_NMOL = "Moles[nmol]" +GLOBALMEDIAN = "globalMedian" +CONDITIONMEDIAN= "conditionMedian" + parquet_map = { "pg_accessions": PROTEIN_NAME, @@ -171,16 +180,11 @@ def plot_distributions( if log2: normalize[field] = np.log2(normalize[field]) normalize.dropna(subset=[field], inplace=True) - plt.figure(dpi=500, figsize=(12, 8)) + plt.figure(dpi=500, figsize=(width, 8)) fig = sns.kdeplot(data=normalize, x=field, hue=class_field, palette="Paired", linewidth=2) sns.despine(ax=fig, top=True, right=True) plt.title(title) pd.set_option("mode.chained_assignment", "warn") - # data_wide = normalize.pivot(columns=class_field, values=field) - # # plotting multiple density plot - # data_wide.plot.kde(figsize=(width, 8), linewidth=2, legend=False) - # plt.title(title) - # pd.set_option("mode.chained_assignment", "warn") return plt.gcf() diff --git a/ibaqpy/bin/peptide_normalization.py b/ibaqpy/bin/peptide_normalization.py index a817939..f13358d 100644 --- a/ibaqpy/bin/peptide_normalization.py +++ b/ibaqpy/bin/peptide_normalization.py @@ -20,6 +20,8 @@ RUN, SAMPLE_ID, PARQUET_COLUMNS, + GLOBALMEDIAN, + CONDITIONMEDIAN, TMT16plex, TMT11plex, TMT10plex, @@ -94,7 +96,7 @@ def analyse_sdrf(sdrf_path: str) -> tuple: return technical_repetitions, label, sample_names, choice -def get_label(labels: list) -> (str, dict): +def get_label(labels: list): """Return label type and choice dict according to labels list. :param labels: Labels from SDRF. @@ -349,7 +351,7 @@ def csv2parquet(csv): duckdb.read_csv(csv).to_parquet(parquet_path) @staticmethod - def get_label(labels: list) -> (str, dict): + def get_label(labels: list): """Return label type and choice dict according to labels list. :param labels: Labels from SDRF. @@ -543,9 +545,9 @@ def peptide_normalization( if remove_low_frequency_peptides: low_frequency_peptides = feature.low_frequency_peptides header = False - if not skip_normalization and pnmethod == "globalMedian": + if not skip_normalization and pnmethod == GLOBALMEDIAN: med_map = feature.get_median_map() - elif not skip_normalization and pnmethod == "conditionMedian": + elif not skip_normalization and pnmethod == CONDITIONMEDIAN: med_map = feature.get_median_map_to_condition() for samples, df in feature.iter_samples(): df.dropna(subset=["pg_accessions"], inplace=True) @@ -591,11 +593,11 @@ def peptide_normalization( ) # Step9: Normalize the data. if not skip_normalization: - if pnmethod == "globalMedian": + if pnmethod == GLOBALMEDIAN: dataset_df.loc[:, NORM_INTENSITY] = ( dataset_df[NORM_INTENSITY] / med_map[sample] ) - elif pnmethod == "conditionMedian": + elif pnmethod == CONDITIONMEDIAN: con = dataset_df[CONDITION].unique()[0] dataset_df.loc[:, NORM_INTENSITY] = ( dataset_df[NORM_INTENSITY] / med_map[con][sample] diff --git a/ibaqpy/bin/peptides2protein.py b/ibaqpy/bin/peptides2protein.py index 98924a4..f412f12 100644 --- a/ibaqpy/bin/peptides2protein.py +++ b/ibaqpy/bin/peptides2protein.py @@ -16,6 +16,12 @@ NORM_INTENSITY, PROTEIN_NAME, SAMPLE_ID, + TPA, + MOLECULARWEIGHT, + COPYNUMBER, + CONCENTRATION_NM, + MOLES_NMOL, + WEIGHT_NG, plot_box_plot, plot_distributions, get_accession, @@ -109,13 +115,13 @@ def calculate(protein_intensity, histone_intensity, mw): def proteomic_ruler(df): histone_intensity = df[df[PROTEIN_NAME].isin(histones_list)][NORM_INTENSITY].sum() histone_intensity = histone_intensity if histone_intensity > 0 else 1 - df[["Copy", "Moles[nmol]", "Weight[ng]"]] = df.apply( - lambda x: calculate(x[NORM_INTENSITY], histone_intensity, x["MolecularWeight"]), + df[[COPYNUMBER, MOLES_NMOL, WEIGHT_NG]] = df.apply( + lambda x: calculate(x[NORM_INTENSITY], histone_intensity, x[MOLECULARWEIGHT]), axis=1, result_type="expand", ) - volume = df["Weight[ng]"].sum() * 1e-9 / cpc # unit L - df["Concentration[nM]"] = df["Moles[nmol]"] / volume # unit nM + volume = df[WEIGHT_NG].sum() * 1e-9 / cpc # unit L + df[CONCENTRATION_NM] = df[MOLES_NMOL] / volume # unit nM return df res = res.groupby([CONDITION]).apply(proteomic_ruler) @@ -205,10 +211,10 @@ def get_protein_group_mw(group: str) -> float: res = res.reset_index(drop=True) # tpa if tpa: - res["MolecularWeight"] = res.apply(lambda x: get_protein_group_mw(x[PROTEIN_NAME]), axis=1) - res["MolecularWeight"] = res["MolecularWeight"].fillna(1) - res["MolecularWeight"] = res["MolecularWeight"].replace(0, 1) - res["TPA"] = res[NORM_INTENSITY] / res["MolecularWeight"] + res[MOLECULARWEIGHT] = res.apply(lambda x: get_protein_group_mw(x[PROTEIN_NAME]), axis=1) + res[MOLECULARWEIGHT] = res[MOLECULARWEIGHT].fillna(1) + res[MOLECULARWEIGHT] = res[MOLECULARWEIGHT].replace(0, 1) + res[TPA] = res[NORM_INTENSITY] / res[MOLECULARWEIGHT] # calculate protein weight(ng) and concentration(nM) if ruler: if not ploidy or not cpc or not organism or not tpa: @@ -218,7 +224,7 @@ def get_protein_group_mw(group: str) -> float: res = calculate_weight_and_concentration(res, ploidy, cpc, organism, histones) # Print the distribution of the protein IBAQ values if verbose: - plot_width = len(set(res["SampleID"])) * 0.5 + 10 + plot_width = len(set(res[SAMPLE_ID])) * 0.5 + 10 pdf = PdfPages(qc_report) density1 = plot_distributions( res, @@ -241,11 +247,11 @@ def get_protein_group_mw(group: str) -> float: pdf.savefig(box1) if tpa: density2 = plot_distributions( - res, "TPA", SAMPLE_ID, log2=True, width=plot_width, title="TPA Distribution" + res, TPA, SAMPLE_ID, log2=True, width=plot_width, title="TPA Distribution" ) box2 = plot_box_plot( res, - "TPA", + TPA, SAMPLE_ID, log2=True, width=plot_width, @@ -257,7 +263,7 @@ def get_protein_group_mw(group: str) -> float: if ruler: density3 = plot_distributions( res, - "Copy", + COPYNUMBER, SAMPLE_ID, width=plot_width, log2=True, @@ -265,7 +271,7 @@ def get_protein_group_mw(group: str) -> float: ) box3 = plot_box_plot( res, - "Copy", + COPYNUMBER, SAMPLE_ID, width=plot_width, log2=True, @@ -276,7 +282,7 @@ def get_protein_group_mw(group: str) -> float: pdf.savefig(box3) density4 = plot_distributions( res, - "Concentration[nM]", + CONCENTRATION_NM, SAMPLE_ID, width=plot_width, log2=True, @@ -284,7 +290,7 @@ def get_protein_group_mw(group: str) -> float: ) box4 = plot_box_plot( res, - "Concentration[nM]", + CONCENTRATION_NM, SAMPLE_ID, width=plot_width, log2=True, From 371401c247d7c657aebc431d1571e5bf3e5446cc Mon Sep 17 00:00:00 2001 From: zprobot <1727697083@qq.com> Date: Thu, 12 Dec 2024 20:45:26 +0800 Subject: [PATCH 2/2] update: variable --- ibaqpy/bin/ibaqpy_commons.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/ibaqpy/bin/ibaqpy_commons.py b/ibaqpy/bin/ibaqpy_commons.py index bf95bb6..a0e98e9 100644 --- a/ibaqpy/bin/ibaqpy_commons.py +++ b/ibaqpy/bin/ibaqpy_commons.py @@ -24,9 +24,6 @@ PEPTIDE_SEQUENCE = "PeptideSequence" PEPTIDE_CANONICAL = "PeptideCanonical" PEPTIDE_CHARGE = "PrecursorCharge" -FRAGMENT_ION = "FragmentIon" -PRODUCT_CHARGE = "ProductCharge" -ISOTOPE_LABEL_TYPE = "IsotopeLabelType" CHANNEL = "Channel" MIXTRUE = "Mixture" TECHREPMIXTURE = "TechRepMixture" @@ -37,10 +34,8 @@ FRACTION = "Fraction" INTENSITY = "Intensity" NORM_INTENSITY = "NormIntensity" -RT = "Rt" REFERENCE = "Reference" SAMPLE_ID = "SampleID" -STUDY_ID = "StudyID" SEARCH_ENGINE = "searchScore" SCAN = "Scan" MBR = "MatchBetweenRuns" @@ -63,8 +58,6 @@ "peptidoform": PEPTIDE_SEQUENCE, "sequence": PEPTIDE_CANONICAL, "precursor_charge": PEPTIDE_CHARGE, - # "fragment_ion": FRAGMENT_ION, - # "isotope_label_type": ISOTOPE_LABEL_TYPE, "channel": CHANNEL, "condition": CONDITION, "biological_replicate": BIOREPLICATE,