Unify control condition arguments, add sanity check for control condi…

…tion samples
pinellolab · Mar 28, 2024 · 25c0a1b · 25c0a1b
1 parent 97d464e
commit 25c0a1b
Show file tree

Hide file tree

Showing 9 changed files with 51 additions and 35 deletions.
diff --git a/README.md b/README.md
@@ -201,9 +201,9 @@ Above command produces `prefix_editing_preference.[html,ipynb]` as editing prefe
 ### Parameters
   * `-o`, `--output-prefix` (default: `None`): Output prefix of editing pattern report (prefix.html, prefix.ipynb). If not provided, base name of `bdata_path` is used.
   * `--replicate-col` (default: `"rep"`): Column name in `bdata.samples` that describes replicate ID.
-  * `--condition-col` (default: `"bin"`): Column name in `bdata.samples` that describes experimental condition. (sorting bin, time, etc.)
+  * `--condition-col` (default: `"condition"`): Column name in `bdata.samples` that describes experimental condition. (sorting bin, time, etc.)
   * `--pam-col` (default: `None`): Column name describing PAM of each gRNA in `bdata.guides`.
-  * `--control-condition` (default: `"bulk"`): Control condition where editing preference would be profiled at. Pre-filters data where `bdata.samples[condition_col] == control_condition`.
+  * `--control-condition` (default: `"bulk"`): Control condition where editing preference would be profiled at. Pre-filters data where `bdata.samples[condition_col] == control_condition`. DO NOT use plasmid library as control here where we do not expect editing.
   * `-w`, `--window-length` (default: `6`): Window length of editing window of maximal editing efficiency to be identified. This window is used to quantify context specificity within the window.
 
 
@@ -215,7 +215,7 @@ bean-qc \
   my_sorting_screen.h5ad             `# Input ReporterScreen .h5ad file path` \
   -o my_sorting_screen_masked.h5ad   `# Output ReporterScreen .h5ad file path` \
   -r qc_report_my_sorting_screen     `# Prefix for QC report` \
-
+  --ctrl-cond presort                `# "condition" column in the control sample before selection. Mean gRNA editing rates in these samples are reported. ` \
 # Inspect the output qc_report_my_sorting_screen.html to tweak QC threshold
 
 bean-qc \
@@ -281,9 +281,9 @@ Note that these arguements will change the way the QC metrics are calculated for
                         Label of column in `bdata.samples` that describes experimental condition. (sorting bin, time,
                         etc.)
 ###### Editing rate calculation
-  * `--ctrl-cond CTRL_COND`
+  * `--control-condition CTRL_COND`
                         Values in of column in `ReporterScreen.samples[condition_label]` for guide-level editing rate
-                        to be calculated
+                        to be calculated. Default is `None`, which considers all samples.
   * `--rel-pos-is-reporter`
                         Specifies whether `edit_start_pos` and `edit_end_pos` are relative to reporter position. If
                         `False`, those are relative to spacer position.
@@ -453,6 +453,7 @@ Above command produces
   * `--guide-activity-col`: Column in `bdata.guides` DataFrame showing the editing rate estimated via external tools.
 * Sample annotations (`bdata.samples` column keys)
   * `--condition-column` (default: `condition`): Column key in `bdata.samples` that describes experimental condition.
+  * `--control-condition` (default: `bulk`): Value in `bdata.samples[condition_col]` that indicates control experimental condition.
   * `-uq`, `--sorting-bin-upper-quantile-column` (default: `upper_quantile`): Column name with upper quantile values of each sorting bin in bdata.samples
   * `-lq`, `--sorting-bin-lower-quantile-column` (default: `lower_quantile`): Column name with lower quantile values of each sorting bin in bdata.samples
 

diff --git a/bean/framework/Edit.py b/bean/framework/Edit.py
@@ -68,7 +68,7 @@ def from_str(cls, edit_str):  # pos:strand:start>end
     def match_str(cls, edit_str):
         if isinstance(edit_str, Edit):
             return True
-        pattern = r"(((chr)?\d+|nan):)?-?\d+:-?\d+:[+-]:[A-Z*-]>[A-Z*-]"
+        pattern = r"(((chr)?\w+|nan):)?-?\d+:-?\d+:[+-]:[A-Z*-]>[A-Z*-]"
         pattern2 = r"[\w*]!-?\d+:-?\d+:[+-]:[A-Z*-]>[A-Z*-]"
         return re.fullmatch(pattern, edit_str) or re.fullmatch(pattern2, edit_str)
 
@@ -175,11 +175,11 @@ def get_range(self):
             min(edit.pos for edit in self.edits),
             max(edit.pos for edit in self.edits),
         )
-    
+
     def set_uid(self, uid):
         self.edits = {edit.set_uid(uid) for edit in self.edits}
         return self
-    
+
     def get_uid(self):
         uid = None
         if (

diff --git a/bean/framework/ReporterScreen.py b/bean/framework/ReporterScreen.py
@@ -476,7 +476,7 @@ def get_guide_edit_rate(
         return_result=False,
         count_layer="X_bcmatch",
         edit_layer="edits",
-        unsorted_condition_label="bulk",
+        unsorted_condition_label=None,
     ):
         """
         prior_weight:
@@ -497,9 +497,14 @@ def get_guide_edit_rate(
             num_targetable_sites = self.guides.sequence.map(
                 lambda s: s[editable_base_start:editable_base_end].count(edited_base)
             )
-        bulk_idx = np.where(
-            self.samples.index.astype(str).map(lambda s: unsorted_condition_label in s)
-        )[0]
+        if unsorted_condition_label is not None:
+            bulk_idx = np.where(
+                self.samples.index.astype(str).map(
+                    lambda s: unsorted_condition_label in s
+                )
+            )[0]
+        else:
+            bulk_idx = np.arange(0, len(self.samples)).astype(int)
 
         if prior_weight is None:
             prior_weight = 1

diff --git a/bean/model/run.py b/bean/model/run.py
@@ -113,7 +113,7 @@ def parse_args():
         help="Column key in `bdata.samples` that describes time elapsed.",
     )
     parser.add_argument(
-        "--control-condition-label",
+        "--control-condition",
         default="bulk",
         type=str,
         help="Value in `bdata.samples[condition_col]` that indicates control experimental condition.",
@@ -334,19 +334,21 @@ def check_args(args, bdata):
         raise ValueError(
             f"Condition column set by `--replicate-col` {args.replicate_col} not in ReporterScreen.samples.columns:{bdata.samples.columns}. Check your input."
         )
-    if (
-        args.control_guide_tag is not None
-        and not bdata.guides.index.map(lambda s: args.control_guide_tag in s).any()
-    ):
-        raise ValueError(
-            f"Negative control guide label {args.control_guide_tag} provided by `--control-guide-tag` doesn't appear in any of the guide names. Check your input."
-        )
+    if args.control_guide_tag is not None:
+        if args.library_design == "variant":
+            raise ValueError(
+                "`--control-guide-tag` is not used for the variant mode. Make sure you provide the separate `target` column for negative control guide that targets different negative control variant."
+            )
+        elif not bdata.guides.index.map(lambda s: args.control_guide_tag in s).any():
+            raise ValueError(
+                f"Negative control guide label {args.control_guide_tag} provided by `--control-guide-tag` doesn't appear in any of the guide names. Check your input."
+            )
     if args.alpha_if_overdispersion_fitting_fails is not None:
         try:
             b0, b1 = args.alpha_if_overdispersion_fitting_fails.split(",")
             args.popt = (float(b0), float(b1))
         except TypeError as e:
-            raise e(
+            raise ValueError(
                 f"Input --alpha-if-overdispersion-fitting-fails {args.alpha_if_overdispersion_fitting_fails} is malformatted! Provide [float].[float] format."
             )
     else:

diff --git a/bean/qc/utils.py b/bean/qc/utils.py
@@ -148,7 +148,7 @@ def parse_args():
         default="top,bot",
     )
     input_parser.add_argument(
-        "--ctrl-cond",
+        "--control-condition",
         help="Values in of column in `ReporterScreen.samples[condition_label]` for guide-level editing rate to be calculated",
         type=str,
         default="bulk",

diff --git a/bin/bean-qc b/bin/bean-qc
@@ -31,7 +31,7 @@ def main():
             condition_label=args.condition_label,
             comp_cond1=args.lfc_cond1,
             comp_cond2=args.lfc_cond2,
-            ctrl_cond=args.ctrl_cond,
+            ctrl_cond=args.control_condition,
             exp_id=args.out_report_prefix,
             recalculate_edits=~args.dont_recalculate_edits,
             base_edit_data=args.base_edit_data,

diff --git a/bin/bean-run b/bin/bean-run
@@ -85,7 +85,7 @@ def main(args, bdata):
         use_const_pi=args.const_pi,
         condition_column=args.condition_col,
         time_column=args.time_col,
-        control_condition=args.control_condition_label,
+        control_condition=args.control_condition,
         control_can_be_selected=args.include_control_condition_for_inference,
         allele_df_key=args.allele_df_key,
         control_guide_tag=args.control_guide_tag,
@@ -101,7 +101,9 @@ def main(args, bdata):
             if "edit_rate" not in ndata.screen.guides.columns:
                 ndata.screen.get_edit_from_allele()
                 ndata.screen.get_edit_mat_from_uns(rel_pos_is_reporter=True)
-                ndata.screen.get_guide_edit_rate(unsorted_condition_label=args.control_condition_label)
+                ndata.screen.get_guide_edit_rate(
+                    unsorted_condition_label=args.control_condition
+                )
         target_info_df = _get_guide_target_info(
             ndata.screen, args, cols_include=[args.negctrl_col]
         )
@@ -182,16 +184,18 @@ def main(args, bdata):
         prefix=f"{prefix}/",
         suffix=args.result_suffix,
         guide_index=guide_index,
-        guide_acc=ndata.guide_accessibility.cpu().numpy()
-        if hasattr(ndata, "guide_accessibility")
-        and ndata.guide_accessibility is not None
-        else None,
+        guide_acc=(
+            ndata.guide_accessibility.cpu().numpy()
+            if hasattr(ndata, "guide_accessibility")
+            and ndata.guide_accessibility is not None
+            else None
+        ),
         adjust_confidence_by_negative_control=args.adjust_confidence_by_negative_control,
         adjust_confidence_negatives=adj_negctrl_idx,
         sd_is_fitted=(args.selection == "sorting"),
-        sample_covariates=ndata.sample_covariates
-        if hasattr(ndata, "sample_covariates")
-        else None,
+        sample_covariates=(
+            ndata.sample_covariates if hasattr(ndata, "sample_covariates") else None
+        ),
     )
     info("Done!")
 

diff --git a/notebooks/profile_editing_preference.ipynb b/notebooks/profile_editing_preference.ipynb
@@ -52,7 +52,7 @@
    "source": [
     "bdata_path = \"../../../bean_manuscript/workflow/results/mapped/LDLRCDS/bean_count_LDLRCDS_combined.h5ad\"\n",
     "replicate_col = \"rep\"\n",
-    "condition_col = \"bin\"\n",
+    "condition_col = \"condition\"\n",
     "control_condition = \"bulk\"\n",
     "output_prefix = \"editing pattern\"\n",
     "max_editing_window_length = 6\n",
@@ -97,7 +97,11 @@
     }
    ],
    "source": [
-    "cdata_bulk = cdata[:,cdata.samples[condition_col] == control_condition]"
+    "cdata_bulk = cdata[:,cdata.samples[condition_col] == control_condition]\n",
+    "if len(cdata_bulk) == 0:\n",
+    "    raise ValueError(\n",
+    "        f\"No samples with bdata.samples['{condition_col}'] == {control_condition}\"\n",
+    "    )"
    ]
   },
   {

diff --git a/setup.py b/setup.py
@@ -8,7 +8,7 @@
 
 setup(
     name="crispr-bean",
-    version="0.3.1",
+    version="1.0.0",
     python_requires=">=3.8.0",
     author="Jayoung Ryu",
     author_email="[email protected]",