From 25c0a1b5a5452c1d9492a5c7b6d51fe34cd3ab88 Mon Sep 17 00:00:00 2001 From: jykr Date: Thu, 28 Mar 2024 17:30:30 -0400 Subject: [PATCH] Unify control condition arguments, add sanity check for control condition samples --- README.md | 11 ++++++----- bean/framework/Edit.py | 6 +++--- bean/framework/ReporterScreen.py | 13 +++++++++---- bean/model/run.py | 20 +++++++++++--------- bean/qc/utils.py | 2 +- bin/bean-qc | 2 +- bin/bean-run | 22 +++++++++++++--------- notebooks/profile_editing_preference.ipynb | 8 ++++++-- setup.py | 2 +- 9 files changed, 51 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index b78a3f2..9cf9992 100644 --- a/README.md +++ b/README.md @@ -201,9 +201,9 @@ Above command produces `prefix_editing_preference.[html,ipynb]` as editing prefe ### Parameters * `-o`, `--output-prefix` (default: `None`): Output prefix of editing pattern report (prefix.html, prefix.ipynb). If not provided, base name of `bdata_path` is used. * `--replicate-col` (default: `"rep"`): Column name in `bdata.samples` that describes replicate ID. - * `--condition-col` (default: `"bin"`): Column name in `bdata.samples` that describes experimental condition. (sorting bin, time, etc.) + * `--condition-col` (default: `"condition"`): Column name in `bdata.samples` that describes experimental condition. (sorting bin, time, etc.) * `--pam-col` (default: `None`): Column name describing PAM of each gRNA in `bdata.guides`. - * `--control-condition` (default: `"bulk"`): Control condition where editing preference would be profiled at. Pre-filters data where `bdata.samples[condition_col] == control_condition`. + * `--control-condition` (default: `"bulk"`): Control condition where editing preference would be profiled at. Pre-filters data where `bdata.samples[condition_col] == control_condition`. DO NOT use plasmid library as control here where we do not expect editing. * `-w`, `--window-length` (default: `6`): Window length of editing window of maximal editing efficiency to be identified. This window is used to quantify context specificity within the window. @@ -215,7 +215,7 @@ bean-qc \ my_sorting_screen.h5ad `# Input ReporterScreen .h5ad file path` \ -o my_sorting_screen_masked.h5ad `# Output ReporterScreen .h5ad file path` \ -r qc_report_my_sorting_screen `# Prefix for QC report` \ - + --ctrl-cond presort `# "condition" column in the control sample before selection. Mean gRNA editing rates in these samples are reported. ` \ # Inspect the output qc_report_my_sorting_screen.html to tweak QC threshold bean-qc \ @@ -281,9 +281,9 @@ Note that these arguements will change the way the QC metrics are calculated for Label of column in `bdata.samples` that describes experimental condition. (sorting bin, time, etc.) ###### Editing rate calculation - * `--ctrl-cond CTRL_COND` + * `--control-condition CTRL_COND` Values in of column in `ReporterScreen.samples[condition_label]` for guide-level editing rate - to be calculated + to be calculated. Default is `None`, which considers all samples. * `--rel-pos-is-reporter` Specifies whether `edit_start_pos` and `edit_end_pos` are relative to reporter position. If `False`, those are relative to spacer position. @@ -453,6 +453,7 @@ Above command produces * `--guide-activity-col`: Column in `bdata.guides` DataFrame showing the editing rate estimated via external tools. * Sample annotations (`bdata.samples` column keys) * `--condition-column` (default: `condition`): Column key in `bdata.samples` that describes experimental condition. + * `--control-condition` (default: `bulk`): Value in `bdata.samples[condition_col]` that indicates control experimental condition. * `-uq`, `--sorting-bin-upper-quantile-column` (default: `upper_quantile`): Column name with upper quantile values of each sorting bin in bdata.samples * `-lq`, `--sorting-bin-lower-quantile-column` (default: `lower_quantile`): Column name with lower quantile values of each sorting bin in bdata.samples diff --git a/bean/framework/Edit.py b/bean/framework/Edit.py index 5abba2e..afd0c26 100644 --- a/bean/framework/Edit.py +++ b/bean/framework/Edit.py @@ -68,7 +68,7 @@ def from_str(cls, edit_str): # pos:strand:start>end def match_str(cls, edit_str): if isinstance(edit_str, Edit): return True - pattern = r"(((chr)?\d+|nan):)?-?\d+:-?\d+:[+-]:[A-Z*-]>[A-Z*-]" + pattern = r"(((chr)?\w+|nan):)?-?\d+:-?\d+:[+-]:[A-Z*-]>[A-Z*-]" pattern2 = r"[\w*]!-?\d+:-?\d+:[+-]:[A-Z*-]>[A-Z*-]" return re.fullmatch(pattern, edit_str) or re.fullmatch(pattern2, edit_str) @@ -175,11 +175,11 @@ def get_range(self): min(edit.pos for edit in self.edits), max(edit.pos for edit in self.edits), ) - + def set_uid(self, uid): self.edits = {edit.set_uid(uid) for edit in self.edits} return self - + def get_uid(self): uid = None if ( diff --git a/bean/framework/ReporterScreen.py b/bean/framework/ReporterScreen.py index ce56061..d470de5 100644 --- a/bean/framework/ReporterScreen.py +++ b/bean/framework/ReporterScreen.py @@ -476,7 +476,7 @@ def get_guide_edit_rate( return_result=False, count_layer="X_bcmatch", edit_layer="edits", - unsorted_condition_label="bulk", + unsorted_condition_label=None, ): """ prior_weight: @@ -497,9 +497,14 @@ def get_guide_edit_rate( num_targetable_sites = self.guides.sequence.map( lambda s: s[editable_base_start:editable_base_end].count(edited_base) ) - bulk_idx = np.where( - self.samples.index.astype(str).map(lambda s: unsorted_condition_label in s) - )[0] + if unsorted_condition_label is not None: + bulk_idx = np.where( + self.samples.index.astype(str).map( + lambda s: unsorted_condition_label in s + ) + )[0] + else: + bulk_idx = np.arange(0, len(self.samples)).astype(int) if prior_weight is None: prior_weight = 1 diff --git a/bean/model/run.py b/bean/model/run.py index e09dcd5..26b24a6 100644 --- a/bean/model/run.py +++ b/bean/model/run.py @@ -113,7 +113,7 @@ def parse_args(): help="Column key in `bdata.samples` that describes time elapsed.", ) parser.add_argument( - "--control-condition-label", + "--control-condition", default="bulk", type=str, help="Value in `bdata.samples[condition_col]` that indicates control experimental condition.", @@ -334,19 +334,21 @@ def check_args(args, bdata): raise ValueError( f"Condition column set by `--replicate-col` {args.replicate_col} not in ReporterScreen.samples.columns:{bdata.samples.columns}. Check your input." ) - if ( - args.control_guide_tag is not None - and not bdata.guides.index.map(lambda s: args.control_guide_tag in s).any() - ): - raise ValueError( - f"Negative control guide label {args.control_guide_tag} provided by `--control-guide-tag` doesn't appear in any of the guide names. Check your input." - ) + if args.control_guide_tag is not None: + if args.library_design == "variant": + raise ValueError( + "`--control-guide-tag` is not used for the variant mode. Make sure you provide the separate `target` column for negative control guide that targets different negative control variant." + ) + elif not bdata.guides.index.map(lambda s: args.control_guide_tag in s).any(): + raise ValueError( + f"Negative control guide label {args.control_guide_tag} provided by `--control-guide-tag` doesn't appear in any of the guide names. Check your input." + ) if args.alpha_if_overdispersion_fitting_fails is not None: try: b0, b1 = args.alpha_if_overdispersion_fitting_fails.split(",") args.popt = (float(b0), float(b1)) except TypeError as e: - raise e( + raise ValueError( f"Input --alpha-if-overdispersion-fitting-fails {args.alpha_if_overdispersion_fitting_fails} is malformatted! Provide [float].[float] format." ) else: diff --git a/bean/qc/utils.py b/bean/qc/utils.py index 3297d11..6ed50cf 100644 --- a/bean/qc/utils.py +++ b/bean/qc/utils.py @@ -148,7 +148,7 @@ def parse_args(): default="top,bot", ) input_parser.add_argument( - "--ctrl-cond", + "--control-condition", help="Values in of column in `ReporterScreen.samples[condition_label]` for guide-level editing rate to be calculated", type=str, default="bulk", diff --git a/bin/bean-qc b/bin/bean-qc index 970cb19..a6880bf 100644 --- a/bin/bean-qc +++ b/bin/bean-qc @@ -31,7 +31,7 @@ def main(): condition_label=args.condition_label, comp_cond1=args.lfc_cond1, comp_cond2=args.lfc_cond2, - ctrl_cond=args.ctrl_cond, + ctrl_cond=args.control_condition, exp_id=args.out_report_prefix, recalculate_edits=~args.dont_recalculate_edits, base_edit_data=args.base_edit_data, diff --git a/bin/bean-run b/bin/bean-run index 550d0ec..b4eb5f4 100644 --- a/bin/bean-run +++ b/bin/bean-run @@ -85,7 +85,7 @@ def main(args, bdata): use_const_pi=args.const_pi, condition_column=args.condition_col, time_column=args.time_col, - control_condition=args.control_condition_label, + control_condition=args.control_condition, control_can_be_selected=args.include_control_condition_for_inference, allele_df_key=args.allele_df_key, control_guide_tag=args.control_guide_tag, @@ -101,7 +101,9 @@ def main(args, bdata): if "edit_rate" not in ndata.screen.guides.columns: ndata.screen.get_edit_from_allele() ndata.screen.get_edit_mat_from_uns(rel_pos_is_reporter=True) - ndata.screen.get_guide_edit_rate(unsorted_condition_label=args.control_condition_label) + ndata.screen.get_guide_edit_rate( + unsorted_condition_label=args.control_condition + ) target_info_df = _get_guide_target_info( ndata.screen, args, cols_include=[args.negctrl_col] ) @@ -182,16 +184,18 @@ def main(args, bdata): prefix=f"{prefix}/", suffix=args.result_suffix, guide_index=guide_index, - guide_acc=ndata.guide_accessibility.cpu().numpy() - if hasattr(ndata, "guide_accessibility") - and ndata.guide_accessibility is not None - else None, + guide_acc=( + ndata.guide_accessibility.cpu().numpy() + if hasattr(ndata, "guide_accessibility") + and ndata.guide_accessibility is not None + else None + ), adjust_confidence_by_negative_control=args.adjust_confidence_by_negative_control, adjust_confidence_negatives=adj_negctrl_idx, sd_is_fitted=(args.selection == "sorting"), - sample_covariates=ndata.sample_covariates - if hasattr(ndata, "sample_covariates") - else None, + sample_covariates=( + ndata.sample_covariates if hasattr(ndata, "sample_covariates") else None + ), ) info("Done!") diff --git a/notebooks/profile_editing_preference.ipynb b/notebooks/profile_editing_preference.ipynb index db1eaa5..f806f5c 100644 --- a/notebooks/profile_editing_preference.ipynb +++ b/notebooks/profile_editing_preference.ipynb @@ -52,7 +52,7 @@ "source": [ "bdata_path = \"../../../bean_manuscript/workflow/results/mapped/LDLRCDS/bean_count_LDLRCDS_combined.h5ad\"\n", "replicate_col = \"rep\"\n", - "condition_col = \"bin\"\n", + "condition_col = \"condition\"\n", "control_condition = \"bulk\"\n", "output_prefix = \"editing pattern\"\n", "max_editing_window_length = 6\n", @@ -97,7 +97,11 @@ } ], "source": [ - "cdata_bulk = cdata[:,cdata.samples[condition_col] == control_condition]" + "cdata_bulk = cdata[:,cdata.samples[condition_col] == control_condition]\n", + "if len(cdata_bulk) == 0:\n", + " raise ValueError(\n", + " f\"No samples with bdata.samples['{condition_col}'] == {control_condition}\"\n", + " )" ] }, { diff --git a/setup.py b/setup.py index 4745dcb..5e00fcf 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setup( name="crispr-bean", - version="0.3.1", + version="1.0.0", python_requires=">=3.8.0", author="Jayoung Ryu", author_email="jayoung_ryu@g.harvard.edu",