bigbio · ypriverol · Dec 12, 2024 · Dec 12, 2024 · Dec 12, 2024 · Dec 12, 2024
diff --git a/ibaqpy/bin/ibaqpy_commons.py b/ibaqpy/bin/ibaqpy_commons.py
@@ -24,9 +24,6 @@
 PEPTIDE_SEQUENCE = "PeptideSequence"
 PEPTIDE_CANONICAL = "PeptideCanonical"
 PEPTIDE_CHARGE = "PrecursorCharge"
-FRAGMENT_ION = "FragmentIon"
-PRODUCT_CHARGE = "ProductCharge"
-ISOTOPE_LABEL_TYPE = "IsotopeLabelType"
 CHANNEL = "Channel"
 MIXTRUE = "Mixture"
 TECHREPMIXTURE = "TechRepMixture"
@@ -37,25 +34,30 @@
 FRACTION = "Fraction"
 INTENSITY = "Intensity"
 NORM_INTENSITY = "NormIntensity"
-RT = "Rt"
 REFERENCE = "Reference"
 SAMPLE_ID = "SampleID"
-STUDY_ID = "StudyID"
 SEARCH_ENGINE = "searchScore"
 SCAN = "Scan"
 MBR = "MatchBetweenRuns"
 IBAQ = "Ibaq"
 IBAQ_NORMALIZED = "IbaqNorm"
 IBAQ_LOG = "IbaqLog"
 IBAQ_PPB = "IbaqPpb"
+TPA = "TPA"
+MOLECULARWEIGHT = "MolecularWeight"
+COPYNUMBER = "CopyNumber"
+CONCENTRATION_NM = "Concentration[nM]"
+WEIGHT_NG = "Weight[ng]"
+MOLES_NMOL = "Moles[nmol]"
+GLOBALMEDIAN = "globalMedian"
+CONDITIONMEDIAN = "conditionMedian"
+
 
 parquet_map = {
     "pg_accessions": PROTEIN_NAME,
     "peptidoform": PEPTIDE_SEQUENCE,
     "sequence": PEPTIDE_CANONICAL,
     "precursor_charge": PEPTIDE_CHARGE,
-    # "fragment_ion": FRAGMENT_ION,
-    # "isotope_label_type": ISOTOPE_LABEL_TYPE,
     "channel": CHANNEL,
     "condition": CONDITION,
     "biological_replicate": BIOREPLICATE,
@@ -171,16 +173,11 @@ def plot_distributions(
     if log2:
         normalize[field] = np.log2(normalize[field])
     normalize.dropna(subset=[field], inplace=True)
-    plt.figure(dpi=500, figsize=(12, 8))
+    plt.figure(dpi=500, figsize=(width, 8))
     fig = sns.kdeplot(data=normalize, x=field, hue=class_field, palette="Paired", linewidth=2)
     sns.despine(ax=fig, top=True, right=True)
     plt.title(title)
     pd.set_option("mode.chained_assignment", "warn")
-    # data_wide = normalize.pivot(columns=class_field, values=field)
-    # # plotting multiple density plot
-    # data_wide.plot.kde(figsize=(width, 8), linewidth=2, legend=False)
-    # plt.title(title)
-    # pd.set_option("mode.chained_assignment", "warn")
 
     return plt.gcf()
 

diff --git a/ibaqpy/bin/peptide_normalization.py b/ibaqpy/bin/peptide_normalization.py
@@ -20,6 +20,8 @@
     RUN,
     SAMPLE_ID,
     PARQUET_COLUMNS,
+    GLOBALMEDIAN,
+    CONDITIONMEDIAN,
     TMT16plex,
     TMT11plex,
     TMT10plex,
@@ -94,7 +96,7 @@ def analyse_sdrf(sdrf_path: str) -> tuple:
     return technical_repetitions, label, sample_names, choice
 
 
-def get_label(labels: list) -> (str, dict):
+def get_label(labels: list):
     """Return label type and choice dict according to labels list.
 
     :param labels: Labels from SDRF.
@@ -349,7 +351,7 @@ def csv2parquet(csv):
         duckdb.read_csv(csv).to_parquet(parquet_path)
 
     @staticmethod
-    def get_label(labels: list) -> (str, dict):
+    def get_label(labels: list):
         """Return label type and choice dict according to labels list.
 
         :param labels: Labels from SDRF.
@@ -543,9 +545,9 @@ def peptide_normalization(
     if remove_low_frequency_peptides:
         low_frequency_peptides = feature.low_frequency_peptides
     header = False
-    if not skip_normalization and pnmethod == "globalMedian":
+    if not skip_normalization and pnmethod == GLOBALMEDIAN:
         med_map = feature.get_median_map()
-    elif not skip_normalization and pnmethod == "conditionMedian":
+    elif not skip_normalization and pnmethod == CONDITIONMEDIAN:
         med_map = feature.get_median_map_to_condition()
     for samples, df in feature.iter_samples():
         df.dropna(subset=["pg_accessions"], inplace=True)
@@ -591,11 +593,11 @@ def peptide_normalization(
                 )
             # Step9: Normalize the data.
             if not skip_normalization:
-                if pnmethod == "globalMedian":
+                if pnmethod == GLOBALMEDIAN:
                     dataset_df.loc[:, NORM_INTENSITY] = (
                         dataset_df[NORM_INTENSITY] / med_map[sample]
                     )
-                elif pnmethod == "conditionMedian":
+                elif pnmethod == CONDITIONMEDIAN:
                     con = dataset_df[CONDITION].unique()[0]
                     dataset_df.loc[:, NORM_INTENSITY] = (
                         dataset_df[NORM_INTENSITY] / med_map[con][sample]

diff --git a/ibaqpy/bin/peptides2protein.py b/ibaqpy/bin/peptides2protein.py
@@ -16,6 +16,12 @@
     NORM_INTENSITY,
     PROTEIN_NAME,
     SAMPLE_ID,
+    TPA,
+    MOLECULARWEIGHT,
+    COPYNUMBER,
+    CONCENTRATION_NM,
+    MOLES_NMOL,
+    WEIGHT_NG,
     plot_box_plot,
     plot_distributions,
     get_accession,
@@ -109,13 +115,13 @@ def calculate(protein_intensity, histone_intensity, mw):
     def proteomic_ruler(df):
         histone_intensity = df[df[PROTEIN_NAME].isin(histones_list)][NORM_INTENSITY].sum()
         histone_intensity = histone_intensity if histone_intensity > 0 else 1
-        df[["Copy", "Moles[nmol]", "Weight[ng]"]] = df.apply(
-            lambda x: calculate(x[NORM_INTENSITY], histone_intensity, x["MolecularWeight"]),
+        df[[COPYNUMBER, MOLES_NMOL, WEIGHT_NG]] = df.apply(
+            lambda x: calculate(x[NORM_INTENSITY], histone_intensity, x[MOLECULARWEIGHT]),
             axis=1,
             result_type="expand",
         )
-        volume = df["Weight[ng]"].sum() * 1e-9 / cpc  # unit L
-        df["Concentration[nM]"] = df["Moles[nmol]"] / volume  # unit nM
+        volume = df[WEIGHT_NG].sum() * 1e-9 / cpc  # unit L
+        df[CONCENTRATION_NM] = df[MOLES_NMOL] / volume  # unit nM
         return df
 
     res = res.groupby([CONDITION]).apply(proteomic_ruler)
@@ -205,10 +211,10 @@ def get_protein_group_mw(group: str) -> float:
     res = res.reset_index(drop=True)
     # tpa
     if tpa:
-        res["MolecularWeight"] = res.apply(lambda x: get_protein_group_mw(x[PROTEIN_NAME]), axis=1)
-        res["MolecularWeight"] = res["MolecularWeight"].fillna(1)
-        res["MolecularWeight"] = res["MolecularWeight"].replace(0, 1)
-        res["TPA"] = res[NORM_INTENSITY] / res["MolecularWeight"]
+        res[MOLECULARWEIGHT] = res.apply(lambda x: get_protein_group_mw(x[PROTEIN_NAME]), axis=1)
+        res[MOLECULARWEIGHT] = res[MOLECULARWEIGHT].fillna(1)
+        res[MOLECULARWEIGHT] = res[MOLECULARWEIGHT].replace(0, 1)
+        res[TPA] = res[NORM_INTENSITY] / res[MOLECULARWEIGHT]
     # calculate protein weight(ng) and concentration(nM)
     if ruler:
         if not ploidy or not cpc or not organism or not tpa:
@@ -218,7 +224,7 @@ def get_protein_group_mw(group: str) -> float:
         res = calculate_weight_and_concentration(res, ploidy, cpc, organism, histones)
     # Print the distribution of the protein IBAQ values
     if verbose:
-        plot_width = len(set(res["SampleID"])) * 0.5 + 10
+        plot_width = len(set(res[SAMPLE_ID])) * 0.5 + 10
         pdf = PdfPages(qc_report)
         density1 = plot_distributions(
             res,
@@ -241,11 +247,11 @@ def get_protein_group_mw(group: str) -> float:
         pdf.savefig(box1)
         if tpa:
             density2 = plot_distributions(
-                res, "TPA", SAMPLE_ID, log2=True, width=plot_width, title="TPA Distribution"
+                res, TPA, SAMPLE_ID, log2=True, width=plot_width, title="TPA Distribution"
             )
             box2 = plot_box_plot(
                 res,
-                "TPA",
+                TPA,
                 SAMPLE_ID,
                 log2=True,
                 width=plot_width,
@@ -257,15 +263,15 @@ def get_protein_group_mw(group: str) -> float:
         if ruler:
             density3 = plot_distributions(
                 res,
-                "Copy",
+                COPYNUMBER,
                 SAMPLE_ID,
                 width=plot_width,
                 log2=True,
                 title="Copy numbers Distribution",
             )
             box3 = plot_box_plot(
                 res,
-                "Copy",
+                COPYNUMBER,
                 SAMPLE_ID,
                 width=plot_width,
                 log2=True,
@@ -276,15 +282,15 @@ def get_protein_group_mw(group: str) -> float:
             pdf.savefig(box3)
             density4 = plot_distributions(
                 res,
-                "Concentration[nM]",
+                CONCENTRATION_NM,
                 SAMPLE_ID,
                 width=plot_width,
                 log2=True,
                 title="Concentration[nM] Distribution",
             )
             box4 = plot_box_plot(
                 res,
-                "Concentration[nM]",
+                CONCENTRATION_NM,
                 SAMPLE_ID,
                 width=plot_width,
                 log2=True,