Skip to content

Commit

Permalink
Merge pull request #45 from pinellolab/dev
Browse files Browse the repository at this point in the history
Debug with duplicated mapping
  • Loading branch information
jykr authored Nov 8, 2024
2 parents 992f4dc + e338c6d commit 6081cfb
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 22 deletions.
59 changes: 43 additions & 16 deletions bean/mapping/GuideEditCounter.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def tqdm(iterable, **kwargs):
_write_alignment_matrix,
_write_paired_end_reads,
revcomp,
find_closest_sequence_index
find_closest_sequence_index,
)

logging.basicConfig(
Expand Down Expand Up @@ -128,7 +128,9 @@ def __init__(self, **kwargs):
- self.screen.guides["guide_len"].max()
)
self.map_duplicated_to_best = kwargs["map_duplicated_to_best"]
self.map_duplicated_hamming_threshold = kwargs["map_duplicated_hamming_threshold"]
self.map_duplicated_hamming_threshold = kwargs[
"map_duplicated_hamming_threshold"
]
self.count_guide_edits = kwargs["count_guide_edits"]
if self.count_guide_edits:
self.screen.uns["guide_edit_counts"] = {}
Expand Down Expand Up @@ -603,9 +605,16 @@ def _get_guide_counts_bcmatch_semimatch(
)
self.duplicate_match_wo_barcode += 1
if self.map_duplicated_to_best:
best_matched_guide_idx = self._find_closest_sequence(R1_seq, R2_seq, semimatch, self.map_duplicated_hamming_threshold)
if best_matched_guide_idx is not None:
self.screen.layers[semimatch_layer][best_matched_guide_idx, 0] += 1
best_matched_guide_idx = self._find_closest_sequence(
R1_seq,
R2_seq,
semimatch,
self.map_duplicated_hamming_threshold,
)
if best_matched_guide_idx is not None:
self.screen.layers[semimatch_layer][
best_matched_guide_idx, 0
] += 1
else: # guide match with no barcode match
matched_guide_idx = semimatch[0]
self.screen.layers[semimatch_layer][matched_guide_idx, 0] += 1
Expand All @@ -616,8 +625,10 @@ def _get_guide_counts_bcmatch_semimatch(
if self.keep_intermediate:
_write_paired_end_reads(r1, r2, outfile_R1_dup, outfile_R2_dup)
if self.map_duplicated_to_best:
best_matched_guide_idx = self._find_closest_sequence(R1_seq, R2_seq, bc_match, self.map_duplicated_hamming_threshold)
if best_matched_guide_idx is not None:
best_matched_guide_idx = self._find_closest_sequence(
R1_seq, R2_seq, bc_match, self.map_duplicated_hamming_threshold
)
if best_matched_guide_idx is not None:
mapped = True
else: # unique barcode match
mapped = True
Expand Down Expand Up @@ -744,7 +755,9 @@ def get_barcode(self, R1_seq, R2_seq):
R2_seq[barcode_start_idx : (barcode_start_idx + self.guide_bc_len)]
)

def _match_read_to_sgRNA_bcmatch_semimatch(self, R1_seq: str, R2_seq: str) -> Tuple[int, int, int]:
def _match_read_to_sgRNA_bcmatch_semimatch(
self, R1_seq: str, R2_seq: str
) -> Tuple[int, int, int]:
# This should be adjusted for each experimental recipes.
bc_start_idx, guide_barcode = self.get_barcode(R1_seq, R2_seq)
bc_match_idx = np.array([])
Expand All @@ -759,23 +772,37 @@ def _match_read_to_sgRNA_bcmatch_semimatch(self, R1_seq: str, R2_seq: str) -> Tu
)[0]
if self.mask_barcode:
_bc_match = np.where(
self.mask_sequence(guide_barcode) == self.screen.guides.masked_barcode
self.mask_sequence(guide_barcode)
== self.screen.guides.masked_barcode
)[0]
else:
_bc_match = np.where(
guide_barcode == self.screen.guides.barcode
)[0]
_bc_match = np.where(guide_barcode == self.screen.guides.barcode)[0]

bc_match_idx = np.append(
bc_match_idx, np.intersect1d(_seq_match, _bc_match)
)
semimatch_idx = np.append(semimatch_idx, _seq_match)
return bc_match_idx.astype(int), semimatch_idx.astype(int), bc_start_idx

def _find_closest_sequence(self, R1_seq, R2_seq, ref_guide_indices: Sequence[int], hamming_distance_threshold: int = 0.1,) -> int:
guide_lens = self.screen.guides.iloc[ref_guide_indices].map(len)
query_guide_seqs = [self.get_guide_seq(R1_seq, R2_seq, guide_len) for guide_len in guide_lens]
closest_idx = find_closest_sequence_index(query_guide_seqs, self.screen.guides.sequence.iloc[ref_guide_indices].values.tolist(), hamming_distance_threshold, match_score, mismatch_penalty, allowed_substitution_penalties = {(k, v):0.2 for k, v in self.target_base_edits})
def _find_closest_sequence(
self,
R1_seq,
R2_seq,
ref_guide_indices: Sequence[int],
hamming_distance_threshold: int = 0.1,
) -> int:
guide_lens = self.screen.guides.guide_len.iloc[ref_guide_indices]
query_guide_seqs = [
self.get_guide_seq(R1_seq, R2_seq, guide_len) for guide_len in guide_lens
]
closest_idx = find_closest_sequence_index(
query_guide_seqs,
self.screen.guides.sequence.iloc[ref_guide_indices].values.tolist(),
hamming_distance_threshold,
allowed_substitutions_penalties={
(k, v): 0.2 for k, v in self.target_base_edits.items()
},
)
return ref_guide_indices[closest_idx]

def _get_guide_position_seq_of_read(self, seq):
Expand Down
3 changes: 2 additions & 1 deletion tests/data/test_guide_info.csv
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ CONTROL_1_g3,CONTROL,,1,g3,5,12,ABE,NegCtrl,CCCTACGCGGTAGGGAACT,GTCCAAGCCCTACGCG
CONTROL_1_g4,CONTROL,,1,g4,7,13,ABE,NegCtrl,AGCCCTACGCGGTAGGGAAC,CGTCCAAGCCCTACGCGGTAGGGAACTTTGGG,TGAG,TTTGG,-3
CONTROL_1_g5,CONTROL,,1,g5,8,14,ABE,NegCtrl,AAGCCCTACGCGGTAGGGAA,ACGTCCAAGCCCTACGCGGTAGGGAACTTTGG,GTAT,CTTTG,-4
CONTROL_10_g1,CONTROL,,10,g1,4,10,ABE,NegCtrl,CTGAAGGTGTCTGGCAGAGC,TTCGTACTGAAGGTGTCTGGCAGAGCGAGGAA,TGTT,GAGGA,0
CONTROL_10_g1_dup,CONTROL,,10,g1,4,10,ABE,NegCtrl,CTGAGGGTGTCTGGCAGAGC,TTCGTACTGAGGGTGTCTGGCAGAGCGAGGAA,TGTT,GAGGA,0
CONTROL_10_g2,CONTROL,,10,g2,5,11,ABE,NegCtrl,ACTGAAGGTGTCTGGCAGAG,CTTCGTACTGAAGGTGTCTGGCAGAGCGAGGA,TTCG,CGAGG,-1
CONTROL_10_g3,CONTROL,,10,g3,6,12,ABE,NegCtrl,TACTGAAGGTGTCTGGCAGA,GCTTCGTACTGAAGGTGTCTGGCAGAGCGAGG,ACAA,GCGAG,-2
CONTROL_10_g4,CONTROL,,10,g4,6,13,ABE,NegCtrl,TACTGAAGGTGTCTGGCAG,CGCTTCGTACTGAAGGTGTCTGGCAGAGCGAG,CATG,AGCGA,-3
Expand Down Expand Up @@ -3453,4 +3454,4 @@ rs9987289_Maj_ABE_347_g1,rs9987289,Maj,347,g1,3,10,ABE,Variant,GCATCAATATCACGTGG
rs9987289_Maj_ABE_347_g2,rs9987289,Maj,347,g2,4,11,ABE,Variant,GGCATCAATATCACGTGGA,ATGCTTGGGCATCAATATCACGTGGAACCAGC,TCGC,ACCAG,-1
rs9987289_Maj_ABE_347_g3,rs9987289,Maj,347,g3,6,12,ABE,Variant,TGGGCATCAATATCACGTGG,GATGCTTGGGCATCAATATCACGTGGAACCAG,GCAC,AACCA,-2
rs9987289_Maj_ABE_347_g4,rs9987289,Maj,347,g4,7,13,ABE,Variant,TTGGGCATCAATATCACGTG,AGATGCTTGGGCATCAATATCACGTGGAACCA,TTGC,GAACC,-3
rs9987289_Maj_ABE_347_g5,rs9987289,Maj,347,g5,8,14,ABE,Variant,CTTGGGCATCAATATCACGT,TAGATGCTTGGGCATCAATATCACGTGGAACC,GCGA,GGAAC,-4
rs9987289_Maj_ABE_347_g5,rs9987289,Maj,347,g5,8,14,ABE,Variant,CTTGGGCATCAATATCACGT,TAGATGCTTGGGCATCAATATCACGTGGAACC,GCGA,GGAAC,-4
23 changes: 18 additions & 5 deletions tests/test_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def test_count_samples_match():

@pytest.mark.order(105)
def test_count_samples_nofilter():
cmd = "bean count-samples -i tests/data/sample_list.csv -b A -f tests/data/test_guide_info.csv -o tests/test_res/var/ -r --guide-start-seq=GGAAAGGACGAAACACCG --skip-filtering"
cmd = "bean count-samples -i tests/data/sample_list.csv -b A -f tests/data/test_guide_info.csv -o tests/test_res/var/ -r --guide-start-seq=GGAAAGGACGAAACACCG --skip-filtering --rerun"
try:
subprocess.check_output(
cmd,
Expand All @@ -58,6 +58,19 @@ def test_count_samples_nofilter():


@pytest.mark.order(106)
def test_count_samples_duplicated():
cmd = "bean count-samples -i tests/data/sample_list.csv -b A -f tests/data/test_guide_info.csv -o tests/test_res/var/ -r --guide-start-seq=GGAAAGGACGAAACACCG --rerun --map-duplicated-to-best"
try:
subprocess.check_output(
cmd,
shell=True,
universal_newlines=True,
)
except subprocess.CalledProcessError as exc:
raise exc


@pytest.mark.order(107)
def test_count_samples_dual():
cmd = "bean count-samples -i tests/data/sample_list.csv -b A,C -f tests/data/test_guide_info.csv -o tests/test_res/var/ -r --guide-start-seq=GGAAAGGACGAAACACCG"
try:
Expand All @@ -70,9 +83,9 @@ def test_count_samples_dual():
raise exc


@pytest.mark.order(107)
@pytest.mark.order(108)
def test_count_samples_bcstart():
cmd = "bean count-samples -i tests/data/sample_list.csv -b A -f tests/data/test_guide_info.csv -o tests/test_res/var2/ -r --barcode-start-seq=GGAA --map-duplicated-to-best"
cmd = "bean count-samples -i tests/data/sample_list.csv -b A -f tests/data/test_guide_info.csv -o tests/test_res/var2/ -r --barcode-start-seq=GGAA --skip-filtering --rerun "
try:
subprocess.check_output(
cmd,
Expand Down Expand Up @@ -108,7 +121,7 @@ def test_barcode_start_idx():
assert bc == "AGAA"


@pytest.mark.order(108)
@pytest.mark.order(109)
def test_count_samples_tiling():
cmd = "bean count-samples -i tests/data/sample_list_tiling.csv -b A -f tests/data/test_guide_info_tiling_chrom.csv -o tests/test_res/tiling/ -r --tiling"
try:
Expand All @@ -121,7 +134,7 @@ def test_count_samples_tiling():
raise exc


@pytest.mark.order(109)
@pytest.mark.order(110)
def test_count_chroms():
cmd = "bean count --R1 tests/data/test_tiling_R1.fastq --R2 tests/data/test_tiling_R2.fastq -b A -f tests/data/test_guide_info_tiling_chrom.csv -o tests/test_res/tiling_chrom/ -r --tiling"
try:
Expand Down

0 comments on commit 6081cfb

Please sign in to comment.