From 92852dfcdcb28312e737644f03e08e6aff0de680 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Tue, 13 Feb 2024 08:54:28 +0000 Subject: [PATCH 1/6] Adjust to new UMI notation inside adaptor --- anglerfish/demux/adaptor.py | 38 ++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/anglerfish/demux/adaptor.py b/anglerfish/demux/adaptor.py index 69e7ea1..03dd7cc 100644 --- a/anglerfish/demux/adaptor.py +++ b/anglerfish/demux/adaptor.py @@ -91,23 +91,35 @@ def __init__(self, sequence, name, delim, index): self.delim = delim self.index = index + # Dynamically assign attributes + self.umi = re.findall(udelim, self.sequence) + + # TODO Duplicated from Adaptor class, will be merged later + # Check if UMI is before or after index + if len(self.umi) > 0 and ">" + self.umi[0] in self.sequence: + # The index region is INDEX+UMI + self.umi_after = int(re.search(ulen, self.umi[0]).group(1)) + self.len_before_index = len(delim.split(self.sequence)[0]) + self.len_after_index = len(udelim.split(self.sequence)[-1]) + elif len(self.umi) > 0 and self.umi[0] + "<" in self.sequence: + # The index region is UMI+INDEX + self.umi_before = int(re.search(ulen, self.umi[0]).group(1)) + self.len_before_index = len(udelim.split(self.sequence)[0]) + self.len_after_index = len(delim.split(self.sequence)[-1]) + elif len(self.umi) > 0: + # TODO give details which adaptor has the problem + raise UserWarning( + "Found adaptor with UMI but it does not flank an index. This is not supported." 
+ ) + def has_index(self): return self.sequence.find(self.delim) > -1 - def len_before_index(self): - return self.sequence.find(self.delim) + def len_before_index_region(self): + return self.len_before_index - def len_after_index(self): - return len(self.sequence) - self.sequence.find(self.delim) - len(self.delim) - - def get_mask(self, insert_Ns): - if self.has_index(): - if not insert_Ns: - return self.sequence.replace(self.delim, "") - else: - return self.sequence.replace(self.delim, "N" * len(self.index)) - else: - return self.sequence + def len_after_index_region(self): + return self.len_after_index # General function to check if a string contains a pattern From e273ca69fdb85ed5a3d18cabf535dd1a1e7f6670 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Tue, 13 Feb 2024 08:55:27 +0000 Subject: [PATCH 2/6] Changed name from index to index_region in some methods --- anglerfish/explore/explore.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py index 1fd9cf8..4011ba8 100644 --- a/anglerfish/explore/explore.py +++ b/anglerfish/explore/explore.py @@ -104,9 +104,11 @@ def run_explore( # Alignment thresholds before_thres = round( - adaptor_end.len_before_index() * good_hit_threshold + adaptor_end.len_before_index_region() * good_hit_threshold + ) + after_thres = round( + adaptor_end.len_after_index_region() * good_hit_threshold ) - after_thres = round(adaptor_end.len_after_index() * good_hit_threshold) insert_thres_low = insert_thres_low insert_thres_high = insert_thres_high @@ -133,7 +135,10 @@ def run_explore( ] = match_col_df thres = round( - (adaptor_end.len_before_index() + adaptor_end.len_after_index()) + ( + adaptor_end.len_before_index_region() + + adaptor_end.len_after_index_region() + ) * good_hit_threshold ) df_good_hits = df_good_hits[df_good_hits["match_1_len"] >= thres] From 87bb9ff329930be737f1a0274012b8cabbf0d706 Mon Sep 17 00:00:00 2001 From: Johannes 
Alneberg Date: Tue, 13 Feb 2024 10:12:08 +0100 Subject: [PATCH 3/6] Needed to have initialized values for adaptor --- anglerfish/demux/adaptor.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/anglerfish/demux/adaptor.py b/anglerfish/demux/adaptor.py index 03dd7cc..b8787ac 100644 --- a/anglerfish/demux/adaptor.py +++ b/anglerfish/demux/adaptor.py @@ -90,6 +90,11 @@ def __init__(self, sequence, name, delim, index): self.name = name self.delim = delim self.index = index + self.umi_after = 0 + self.umi_before = 0 + self.len_after_index = 0 + self.len_before_index = 0 + # Dynamically assign attributes self.umi = re.findall(udelim, self.sequence) From 65a854c5c4a2cac5929f452c98d41e5600709f06 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Tue, 13 Feb 2024 09:37:59 +0000 Subject: [PATCH 4/6] Ruff fix --- anglerfish/demux/adaptor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/anglerfish/demux/adaptor.py b/anglerfish/demux/adaptor.py index b8787ac..6f51b5c 100644 --- a/anglerfish/demux/adaptor.py +++ b/anglerfish/demux/adaptor.py @@ -95,7 +95,6 @@ def __init__(self, sequence, name, delim, index): self.len_after_index = 0 self.len_before_index = 0 - # Dynamically assign attributes self.umi = re.findall(udelim, self.sequence) From e8a065f3743f39512b18095b1ad0dfc9f190954a Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Tue, 13 Feb 2024 10:51:49 +0100 Subject: [PATCH 5/6] Forgot about the non-UMI case for adaptor methods --- anglerfish/demux/adaptor.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/anglerfish/demux/adaptor.py b/anglerfish/demux/adaptor.py index 6f51b5c..e668e6f 100644 --- a/anglerfish/demux/adaptor.py +++ b/anglerfish/demux/adaptor.py @@ -115,6 +115,10 @@ def __init__(self, sequence, name, delim, index): raise UserWarning( "Found adaptor with UMI but it does not flank an index. This is not supported." 
) + # Non UMI cases + elif has_match(delim, self.sequence): + self.len_before_index = len(delim.split(self.sequence)[0]) + self.len_after_index = len(delim.split(self.sequence)[-1]) def has_index(self): return self.sequence.find(self.delim) > -1 From de9ff8bccc32c0b0fb4077b85904116415e417a8 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Tue, 13 Feb 2024 13:39:25 +0100 Subject: [PATCH 6/6] delim is not idelim --- anglerfish/demux/adaptor.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/anglerfish/demux/adaptor.py b/anglerfish/demux/adaptor.py index e668e6f..2e821c7 100644 --- a/anglerfish/demux/adaptor.py +++ b/anglerfish/demux/adaptor.py @@ -103,22 +103,22 @@ def __init__(self, sequence, name, delim, index): if len(self.umi) > 0 and ">" + self.umi[0] in self.sequence: # The index region is INDEX+UMI self.umi_after = int(re.search(ulen, self.umi[0]).group(1)) - self.len_before_index = len(delim.split(self.sequence)[0]) + self.len_before_index = len(idelim.split(self.sequence)[0]) self.len_after_index = len(udelim.split(self.sequence)[-1]) elif len(self.umi) > 0 and self.umi[0] + "<" in self.sequence: # The index region is UMI+INDEX self.umi_before = int(re.search(ulen, self.umi[0]).group(1)) self.len_before_index = len(udelim.split(self.sequence)[0]) - self.len_after_index = len(delim.split(self.sequence)[-1]) + self.len_after_index = len(idelim.split(self.sequence)[-1]) elif len(self.umi) > 0: # TODO give details which adaptor has the problem raise UserWarning( "Found adaptor with UMI but it does not flank an index. This is not supported." 
) # Non UMI cases - elif has_match(delim, self.sequence): - self.len_before_index = len(delim.split(self.sequence)[0]) - self.len_after_index = len(delim.split(self.sequence)[-1]) + elif has_match(idelim, self.sequence): + self.len_before_index = len(idelim.split(self.sequence)[0]) + self.len_after_index = len(idelim.split(self.sequence)[-1]) def has_index(self): return self.sequence.find(self.delim) > -1