diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index cbbd087..1c10fd3 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -18,7 +18,7 @@ "esbenp.prettier-vscode", "wholroyd.jinja", "ms-python.python", - "charliermarsh.ruff", + "charliermarsh.ruff@2024.2.0", "ms-azuretools.vscode-docker", ], }, diff --git a/.gitignore b/.gitignore index 17aab89..9c7393b 100755 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,3 @@ .*_cache node_modules .vscode -build diff --git a/anglerfish/anglerfish.py b/anglerfish/anglerfish.py index 0ce1337..537606e 100755 --- a/anglerfish/anglerfish.py +++ b/anglerfish/anglerfish.py @@ -28,19 +28,6 @@ MAX_PROCESSES = 64 # Ought to be enough for anybody -anglerfish_logo = r""" - ___ - ( ) \ -..__ - _.|~”~~~”…_ - ^´ `>. -(+ (+ ) “<..<^( - `´ ``´ ___ ( - \__..~ __( _…_( - \ / - “--…_ _..~%´ - ```´´ -""" - def run_demux(args): multiprocessing.set_start_method("spawn") @@ -51,54 +38,44 @@ def run_demux(args): ss = SampleSheet(args.samplesheet, args.ont_barcodes) version = pkg_resources.get_distribution("bio-anglerfish").version report = Report(args.run_name, run_uuid, version) - sys.stderr.write(anglerfish_logo) + sys.stderr.write(""" + ___ + ( ) \ -..__ + _.|~”~~~”…_ + ^´ `>. +(+ (+ ) “<..<^( + `´ ``´ ___ ( + \__..~ __( _…_( + \ / + “--…_ _..~%´ + ```´´ +""") log.info(f" version {version}") log.info(f" arguments {vars(args)}") log.info(f" run uuid {run_uuid}") - min_distance = ss.minimum_bc_distance() + bc_dist = ss.minimum_bc_distance() if args.max_distance is None: - # Default: Set the maximum distance for barcode matching to 0, 1 or 2 - # depending on the smallest detected edit distance between indices in the samplesheet - args.max_distance = min(min_distance - 1, 2) + if bc_dist > 1: + args.max_distance = 2 + else: + args.max_distance = 1 log.info(f"Using maximum edit distance of {args.max_distance}") - if args.max_distance >= min_distance: + if args.max_distance >= bc_dist: log.error( - f" The maximum allowed edit distance for barcode matching (={args.max_distance})" - + f"is greater than the smallest detected edit distance between indices in samplesheet (={min_distance})" - + ", which will result in ambiguous matches." + f" Edit distance of barcodes in samplesheet are less than the minimum specified {args.max_distance}>={bc_dist}" ) exit() - log.debug(f"Samplesheet bc_dist == {min_distance}") + log.debug(f"Samplesheet bc_dist == {bc_dist}") if args.threads > MAX_PROCESSES: log.warning( f" Setting threads to {MAX_PROCESSES} as the maximum number of processes is {MAX_PROCESSES}" ) args.threads = MAX_PROCESSES - ## Sort the adaptors by type and size - - # Get a list of tuples with the adaptor name and ONT barcode - adaptor_tuples: list[tuple[str, str]] = [ - (entry.adaptor.name, entry.ont_barcode) for entry in ss - ] - - # Convert to set to enforce uniqueness - adaptor_set: set[tuple[str, str]] = set(adaptor_tuples) - - # Create a dictionary with the adaptors as keys and an empty list as value - adaptors_sorted: dict[tuple[str, str], list] = dict([(i, []) for i in adaptor_set]) - - # Populate the dictionary values with sample-specific information - """ - adaptors_sorted = { - ( adaptor_name, ont_barcode ) : [ - (sample_name, adaptor, fastq), - (sample_name, adaptor, fastq), - ... - ], - ... - } - """ + # Sort the adaptors by type and size + adaptors_t = [(entry.adaptor.name, entry.ont_barcode) for entry in ss] + adaptor_set = set(adaptors_t) + adaptors_sorted = dict([(i, []) for i in adaptor_set]) for entry in ss: adaptors_sorted[(entry.adaptor.name, entry.ont_barcode)].append( (entry.sample_name, entry.adaptor, os.path.abspath(entry.fastq)) @@ -114,19 +91,18 @@ def run_demux(args): adaptor_name, ont_barcode = key fastq_path = sample[0][2] # If there are multiple ONT barcodes, we need to add the ONT barcode to the adaptor name + adaptor_bc_name = adaptor_name if ont_barcode: - adaptor_bc_name = f"{adaptor_name}_{ont_barcode}" - else: - adaptor_bc_name = adaptor_name + adaptor_bc_name = adaptor_name + "_" + ont_barcode fastq_files = glob.glob(fastq_path) # Align - align_path = os.path.join(args.out_fastq, f"{adaptor_bc_name}.paf") + aln_path = os.path.join(args.out_fastq, f"{adaptor_bc_name}.paf") adaptor_path = os.path.join(args.out_fastq, f"{adaptor_name}.fasta") with open(adaptor_path, "w") as f: f.write(ss.get_fastastring(adaptor_name)) for fq in fastq_files: - run_minimap2(fq, adaptor_path, align_path, args.threads) + run_minimap2(fq, adaptor_path, aln_path, args.threads) # Easy line count in input fastq files num_fq = 0 @@ -135,7 +111,7 @@ def run_demux(args): for i in f: num_fq += 1 num_fq = int(num_fq / 4) - paf_entries = parse_paf_lines(align_path) + paf_entries = parse_paf_lines(aln_path) # Make stats log.info(f" Searching for adaptor hits in {adaptor_bc_name}") @@ -277,8 +253,7 @@ def run_demux(args): sample_dists = [ ( lev.distance( - i[0], - f"{x.adaptor.i7.index_seq}+{x.adaptor.i5.index_seq}".lower(), + i[0], f"{x.adaptor.i7_index}+{x.adaptor.i5_index}".lower() ), x.sample_name, ) diff --git a/anglerfish/cli.py b/anglerfish/cli.py index 9e87279..48f737e 100644 --- a/anglerfish/cli.py +++ b/anglerfish/cli.py @@ -193,9 +193,7 @@ def run( typer.Option( "--max-distance", "-m", - help="Manually set maximum allowed edit distance for index matching," - + "by default this is set to 0, 1 or 2 based on the minimum detected" - + "index distance in the samplesheet.", + help="Manually set maximum edit distance for BC matching, automatically set this is set to either 1 or 2", ), ] = 2, max_unknowns: Annotated[ diff --git a/anglerfish/demux/adaptor.py b/anglerfish/demux/adaptor.py index 21b6a06..2e821c7 100644 --- a/anglerfish/demux/adaptor.py +++ b/anglerfish/demux/adaptor.py @@ -4,237 +4,155 @@ import yaml -# These variables correspond to the tokens used in the adaptors.yaml file -# Not compiled with re.compile to enable pattern concatenation -INDEX_TOKEN = r"()" -UMI_TOKEN = r"(\)" -UMI_LENGTH_TOKEN = r"\" - -# This pattern is used to validate the adaptor sequences in the config -VALID_SEQUENCE_TOKEN_PATTERN = re.compile(f"^({INDEX_TOKEN}|{UMI_TOKEN}|([ACTG]*))*$") +idelim = re.compile(r"\") +udelim = re.compile(r"(\)") +ulen = re.compile(r"\") class Adaptor: - def __init__( - self, - name: str, - adaptors: dict, - i7_index: str | None = None, - i5_index: str | None = None, - ): - self.name: str = name + def __init__(self, adaptors, delim, adaptor_type, i7_index=None, i5_index=None): self.i5 = AdaptorPart( - sequence_token=adaptors[name]["i5"], name=name, index_seq=i5_index + adaptors[adaptor_type]["i5"], adaptor_type, delim, i5_index ) self.i7 = AdaptorPart( - sequence_token=adaptors[name]["i7"], name=name, index_seq=i7_index + adaptors[adaptor_type]["i7"], adaptor_type, delim, i7_index ) - - def get_fastastring(self, insert_Ns: bool = True) -> str: - """Create fasta text entries for the i5 and i7 adaptors. - - Example: - - An Adaptor object with the name "truseq_dual" and the following properties: - - i5 sequence token 'AAAATTTT', index length 4 - - i7 sequence token 'GGGGCCCC', index length 4, UMI length 8 - - get_fastastring(insert_Ns=True) -> ''' - >truseq_dual_i5 - AAAANNNNTTTT - >truseq_dual_i7 - GGGGNNNNNNNNNNNNCCCC - ''' - - get_fastastring(insert_Ns=False) -> ''' - >truseq_dual_i5 - AAAATTTT - >truseq_dual_i7 - GGGGCCCC - ''' - - """ - fasta_i5 = f">{self.name}_i5\n{self.i5.get_mask(insert_Ns)}\n" - fasta_i7 = f">{self.name}_i7\n{self.i7.get_mask(insert_Ns)}\n" - return fasta_i5 + fasta_i7 - - -class AdaptorPart: - """This class is used for the i5 or i7 adaptor.""" - - def __init__(self, sequence_token: str, name: str, index_seq: str | None): - # Assign attributes from args - self.sequence_token: str = sequence_token - self.name: str = name - self.index_seq: str | None = index_seq - - # Index bool and len - if has_match(INDEX_TOKEN, self.sequence_token): - split_by_index = re.split(INDEX_TOKEN, self.sequence_token) - - self.has_index = True - self.len_index = len(index_seq) if index_seq else None - - else: - self.has_index = False - self.len_index = 0 - - # UMI bool and len - umi_tokens = re.findall(UMI_TOKEN, self.sequence_token) - if len(umi_tokens) > 1: + self.i5_index = i5_index + self.i7_index = i7_index + self.i5_umi = re.findall(udelim, self.i5.sequence) + self.i5_umi_before = 0 + self.i5_umi_after = 0 + self.i7_umi = re.findall(udelim, self.i7.sequence) + self.i7_umi_before = 0 + self.i7_umi_after = 0 + self.name = f"{adaptor_type}" + self.delim = delim + + if len(self.i5_umi) > 1 or len(self.i7_umi) > 1: raise UserWarning( - f"Found adaptor {self.name} with multiple UMIs. This is not supported." + f"Adaptor {adaptor_type} has more than one UMI in either i5 or i7. This is not supported." ) - elif len(umi_tokens) == 1: - self.has_umi = True - self.len_umi = int( - re.search(UMI_LENGTH_TOKEN, self.sequence_token).group(1) + # Check if UMI is before or after i5 index + if len(self.i5_umi) > 0 and ">" + self.i5_umi[0] in self.i5.sequence: + self.i5_umi_after = int(re.search(ulen, self.i5_umi[0]).group(1)) + elif len(self.i5_umi) > 0 and self.i5_umi[0] + "<" in self.i5.sequence: + self.i5_umi_before = int(re.search(ulen, self.i5_umi[0]).group(1)) + elif len(self.i5_umi) > 0: + raise UserWarning( + f"Adaptor {adaptor_type} has UMI but it does not flank an index. This is not supported." ) - else: - self.has_umi = False - self.len_umi = 0 - - # Type declaration of attributes to be assigned - self.len_before_index: int | None - self.len_after_index: int | None - self.len_umi_before_index: int | None - self.len_umi_after_index: int | None - self.len_total: int | None - self.len_constant: int - - # Lengths - if self.has_index and self.has_umi: - # Index and UMI - index_umi_match = re.search(INDEX_TOKEN + UMI_TOKEN, self.sequence_token) - umi_index_match = re.search(UMI_TOKEN + INDEX_TOKEN, self.sequence_token) - - if index_umi_match: - self.len_umi_before_index = 0 - self.len_umi_after_index = self.len_umi - self.len_before_index = len( - self.sequence_token[: index_umi_match.start()] - ) - self.len_after_index = ( - len(self.sequence_token[index_umi_match.end() :]) + self.len_umi - ) - elif umi_index_match: - self.len_umi_before_index = self.len_umi - self.len_umi_after_index = 0 - self.len_before_index = ( - len(self.sequence_token[: umi_index_match.start()]) + self.len_umi - ) - self.len_after_index = len(self.sequence_token[umi_index_match.end() :]) - else: - raise UserWarning( - f"Found adaptor {self.name} with UMI but it does not flank an index. This is not supported." - ) - - elif self.has_index and not self.has_umi: - # Index, no UMI - self.len_umi_before_index = 0 - self.len_umi_after_index = 0 - self.len_before_index = len(split_by_index[0]) - self.len_after_index = len(split_by_index[-1]) - - elif not self.has_index and self.has_umi: - # No index, UMI + # Check if UMI is before or after i7 index + if len(self.i7_umi) > 0 and ">" + self.i7_umi[0] in self.i7.sequence: + self.i7_umi_after = int(re.search(ulen, self.i7_umi[0]).group(1)) + elif len(self.i7_umi) > 0 and self.i7_umi[0] + "<" in self.i7.sequence: + self.i7_umi_before = int(re.search(ulen, self.i7_umi[0]).group(1)) + elif len(self.i7_umi) > 0: raise UserWarning( - f"Adaptor {self.name} has UMI but no index. This is not supported." + f"Adaptor {adaptor_type} has UMI but it does not flank an index. This is not supported." ) + def get_i5_mask(self, insert_Ns=True): + ilen = len(self.i5_index) if self.i5_index is not None and insert_Ns else 0 + ulen = max(self.i5_umi_after, self.i5_umi_before) if insert_Ns else 0 + # Test if the index is specified in the adaptor sequence when it shouldn't be + if has_match(idelim, self.i5.sequence) and self.i5_index is None and insert_Ns: + raise UserWarning("Adaptor has i5 but no sequence was specified") + if self.i5_index is not None or not insert_Ns: + new_i5 = re.sub(idelim, "N" * ilen, self.i5.sequence) + new_i5 = re.sub(udelim, "N" * ulen, new_i5) + return new_i5 else: - # No index, no UMI - self.len_umi_before_index = None - self.len_umi_after_index = None - self.len_before_index = None - self.len_after_index = None - - self.len_total = len(self.get_mask(insert_Ns=True)) if self.index_seq else None - self.len_constant = len(self.get_mask(insert_Ns=False)) - - def get_mask(self, insert_Ns: bool = True) -> str: - """Get the mask of the adaptor part. - - insert_Ns = True -> Returns the sequence with index and UMI tokens replaced with Ns of the correct length - insert_Ns = False -> Returns the sequence without index and UMI tokens - """ - - index_mask_length = ( - len(self.index_seq) - if self.index_seq is not None and insert_Ns and self.has_index - else 0 - ) - - umi_mask_length = ( - max(self.len_umi_after_index, self.len_umi_before_index) - if insert_Ns and self.has_umi - else 0 - ) + return self.i5.sequence + def get_i7_mask(self, insert_Ns=True): + ilen = len(self.i7_index) if self.i7_index is not None and insert_Ns else 0 + ulen = max(self.i7_umi_after, self.i7_umi_before) if insert_Ns else 0 # Test if the index is specified in the adaptor sequence when it shouldn't be - if ( - has_match(INDEX_TOKEN, self.sequence_token) - and self.index_seq is None - and insert_Ns - ): - raise UserWarning( - f"Can't create mask for adaptor '{self.name}' with unspecified index." - ) - - if self.index_seq is not None or not insert_Ns: - mask = re.sub(INDEX_TOKEN, "N" * index_mask_length, self.sequence_token) - mask = re.sub(UMI_TOKEN, "N" * umi_mask_length, mask) - return mask + if has_match(idelim, self.i7.sequence) and self.i7_index is None and insert_Ns: + raise UserWarning("Adaptor has i7 but no sequence was specified") + if self.i7_index is not None or not insert_Ns: + new_i7 = re.sub(idelim, "N" * ilen, self.i7.sequence) + new_i7 = re.sub(udelim, "N" * ulen, new_i7) + return new_i7 else: - return self.sequence_token + return self.i7.sequence + def get_fastastring(self, insert_Ns=True): + fasta_i5 = f">{self.name}_i5\n{self.get_i5_mask(insert_Ns)}\n" + fasta_i7 = f">{self.name}_i7\n{self.get_i7_mask(insert_Ns)}\n" + return fasta_i5 + fasta_i7 -def has_match(pattern: re.Pattern, query: str) -> bool: - """General function to check if a string contains a pattern.""" - match = re.search(pattern, query) - if match is None: - return False - return True +class AdaptorPart: + # This class is used either the i5 or i7 adaptor + def __init__(self, sequence, name, delim, index): + self.sequence = sequence + self.name = name + self.delim = delim + self.index = index + self.umi_after = 0 + self.umi_before = 0 + self.len_after_index = 0 + self.len_before_index = 0 + + # Dynamically assign attributes + self.umi = re.findall(udelim, self.sequence) + + # TODO Duplicated from Adaptor class, will be merged later + # Check if UMI is before or after index + if len(self.umi) > 0 and ">" + self.umi[0] in self.sequence: + # The index region is INDEX+UMI + self.umi_after = int(re.search(ulen, self.umi[0]).group(1)) + self.len_before_index = len(idelim.split(self.sequence)[0]) + self.len_after_index = len(udelim.split(self.sequence)[-1]) + elif len(self.umi) > 0 and self.umi[0] + "<" in self.sequence: + # The index region is UMI+INDEX + self.umi_before = int(re.search(ulen, self.umi[0]).group(1)) + self.len_before_index = len(udelim.split(self.sequence)[0]) + self.len_after_index = len(idelim.split(self.sequence)[-1]) + elif len(self.umi) > 0: + # TODO give details which adaptor has the problem + raise UserWarning( + "Found adaptor with UMI but it does not flank an index. This is not supported." + ) + # Non UMI cases + elif has_match(idelim, self.sequence): + self.len_before_index = len(idelim.split(self.sequence)[0]) + self.len_after_index = len(idelim.split(self.sequence)[-1]) -def validate_adaptors(adaptors_dict: dict): - """Validate that the adaptor config sequences only consist of expected patterns.""" + def has_index(self): + return self.sequence.find(self.delim) > -1 - for adaptor_name in adaptors_dict: - for i in ["i5", "i7"]: - sequence_token = adaptors_dict[adaptor_name][i] - match = re.match(VALID_SEQUENCE_TOKEN_PATTERN, sequence_token) - if not match: - raise UserWarning( - f"Adaptor {adaptor_name} has an invalid sequence for {i}: {sequence_token}. Does not conform to the pattern {VALID_SEQUENCE_TOKEN_PATTERN}." - ) + def len_before_index_region(self): + return self.len_before_index + def len_after_index_region(self): + return self.len_after_index -def load_adaptors(raw: bool = False) -> list[Adaptor] | dict: - """Fetch all adaptors. - Return them as a list of adaptor objects or optionally as a raw yaml dict. - """ +# General function to check if a string contains a pattern +def has_match(delim, seq): + match = re.search(delim, seq) + if match is None: + return False + return True - # Load adaptors from config file - adaptors_config_path = importlib.resources.files("anglerfish.config").joinpath( - "adaptors.yaml" - ) - assert isinstance(adaptors_config_path, os.PathLike) - with open(adaptors_config_path) as f: - adaptors_dict = yaml.safe_load(f) +# Fetch all adaptors +def load_adaptors(raw=False): + p = importlib.resources.files("anglerfish.config").joinpath("adaptors.yaml") + assert isinstance(p, os.PathLike) - # Validate input - validate_adaptors(adaptors_dict) + adaptors_raw = [] + with open(p) as f: + adaptors_raw = yaml.safe_load(f) - # Optionally, return raw dict if raw: - return adaptors_dict - - # By default, return list of Adaptor objects - else: - adaptors = [] - for adaptor_name in adaptors_dict: - adaptors.append(Adaptor(name=adaptor_name, adaptors=adaptors_dict)) - return adaptors + return adaptors_raw + adaptors = [] + for adaptor in adaptors_raw: + adaptors.append( + Adaptor(adaptors_raw, "N", adaptor, i7_index=None, i5_index=None) + ) + + return adaptors diff --git a/anglerfish/demux/demux.py b/anglerfish/demux/demux.py index 48faef6..5abb564 100644 --- a/anglerfish/demux/demux.py +++ b/anglerfish/demux/demux.py @@ -16,11 +16,11 @@ def parse_cs(cs_string, index, umi_before=0, umi_after=0): """ Parses the CS string of a paf alignment and matches it to the given index using a max Levenshtein distance """ - nt = re.compile(r"\*n([atcg])") + nt = re.compile("\*n([atcg])") nts = "".join(re.findall(nt, cs_string)) - if umi_before is not None and umi_before > 0: + if umi_before > 0: nts = nts[umi_before:] - if umi_after is not None and umi_after > 0: + if umi_after > 0: nts = nts[:-umi_after] # Allow for mismatches return nts, lev.distance(index.lower(), nts) @@ -184,26 +184,26 @@ def cluster_matches( fi7 = "" for _, adaptor, _ in sample_adaptor: try: - i5_seq = adaptor.i5.index_seq + i5_seq = adaptor.i5.index if i5_reversed and i5_seq is not None: i5_seq = str(Seq(i5_seq).reverse_complement()) fi5, d1 = parse_cs( i5["cs"], i5_seq, - adaptor.i5.len_umi_before_index, - adaptor.i5.len_umi_after_index, + adaptor.i5_umi_before, + adaptor.i5_umi_after, ) except AttributeError: d1 = 0 # presumably it's single index, so no i5 - i7_seq = adaptor.i7.index_seq + i7_seq = adaptor.i7.index if i7_reversed and i7_seq is not None: i7_seq = str(Seq(i7_seq).reverse_complement()) fi7, d2 = parse_cs( i7["cs"], i7_seq, - adaptor.i7.len_umi_before_index, - adaptor.i7.len_umi_after_index, + adaptor.i7_umi_before, + adaptor.i7_umi_after, ) dists.append(d1 + d2) @@ -217,8 +217,8 @@ def cluster_matches( continue if dists[index_min] > max_distance: # Find only full length i7(+i5) adaptor combos. Basically a list of "known unknowns" - if len(fi7) + len(fi5) == len(adaptor.i7.index_seq or "") + len( - adaptor.i5.index_seq or "" + if len(fi7) + len(fi5) == len(adaptor.i7.index or "") + len( + adaptor.i5.index or "" ): fi75 = "+".join([i for i in [fi7, fi5] if not i == ""]) unmatched_bed.append([read, start_insert, end_insert, fi75, "999", "."]) diff --git a/anglerfish/demux/report.py b/anglerfish/demux/report.py index 49e8b67..ec433e7 100644 --- a/anglerfish/demux/report.py +++ b/anglerfish/demux/report.py @@ -102,8 +102,8 @@ def write_dataframe(self, outdir, samplesheet): sen_dict = asdict(sentry) if sen_dict["sample_name"] == s_dict["sample_name"]: s_dict["adaptor_name"] = sen_dict["adaptor"].name - s_dict["i7_index"] = sen_dict["adaptor"].i7.index_seq - s_dict["i5_index"] = sen_dict["adaptor"].i5.index_seq + s_dict["i7_index"] = sen_dict["adaptor"].i7.index + s_dict["i5_index"] = sen_dict["adaptor"].i5.index out_list.append(s_dict) for key, unmatch in self.unmatched_stats.items(): for unmatch_sample in unmatch: diff --git a/anglerfish/demux/samplesheet.py b/anglerfish/demux/samplesheet.py index 90adbe5..f01c9cc 100644 --- a/anglerfish/demux/samplesheet.py +++ b/anglerfish/demux/samplesheet.py @@ -8,26 +8,27 @@ from anglerfish.demux.adaptor import Adaptor, load_adaptors -ADAPTORS = load_adaptors(raw=True) +idelim = re.compile(r"\") +udelim = re.compile(r"(\)") +ulen = re.compile(r"\") +adaptors = load_adaptors(raw=True) +# This is some leftover ugliness from a merge conflict to reconcile the old and new adaptor classes +delim = "" @dataclass class SampleSheetEntry: sample_name: str - adaptor: Adaptor + adaptor: object fastq: str - ont_barcode: str | None + ont_barcode: str class SampleSheet: - def __init__(self, input_csv: str, ont_barcodes_enabled: bool): - """Read samplesheet in format: - - sample_name, adaptors, i7_index(-i5_index), fastq_path - - If we are demuxing a run with ONT barcodes, we will have to assume - fastq files are located in "barcode##" folders. - """ + def __init__(self, input_csv, ont_bc): + # Read samplesheet in format: + # sample_name, adaptors, i7_index(-i5_index), fastq_path + # If we are demuxing a run with ONT barcodes, we will have to assume fastq files are located in "barcode##" folders self.samplesheet = [] try: @@ -39,68 +40,54 @@ def __init__(self, input_csv: str, ont_barcodes_enabled: bool): fieldnames=["sample_name", "adaptors", "index", "fastq_path"], dialect=dialect, ) - row_number = 1 + rn = 1 test_globs = {} for row in data: - if row["adaptors"] not in ADAPTORS: + if row["adaptors"] not in adaptors: raise UserWarning( - f"'{row['adaptors']}' not in the list of valid adaptors: {ADAPTORS.keys()}" + f"'{row['adaptors']}' not in the list of valid adaptors: {adaptors.keys()}" ) - i7i5_split = row["index"].split("-") - i7_index = i7i5_split[0] - if len(i7i5_split) > 1: - i5_index = i7i5_split[1] - else: - i5_index = None + i7i5 = row["index"].split("-") + i7 = i7i5[0] + i5 = None + if len(i7i5) > 1: + i5 = i7i5[1] sample_name = row["sample_name"] test_globs[row["fastq_path"]] = glob.glob(row["fastq_path"]) - barcode_dir_pattern = re.compile(r"\/(barcode\d\d|unclassified)\/") - - if ont_barcodes_enabled: - barcode_dir_match = re.findall( - barcode_dir_pattern, row["fastq_path"] - ) + bc_re = re.compile(r"\/(barcode\d\d|unclassified)\/") + ont_barcode = None + if ont_bc: + ob = re.findall(bc_re, row["fastq_path"]) assert ( - len(barcode_dir_match) > 0 and len(barcode_dir_match[0][-1]) > 0 + len(ob) > 0 and len(ob[0][-1]) > 0 ), "ONT barcode not found in fastq path. In ONT barcode mode (-n), fastq files must be located in barcode## folders" - ont_barcode = barcode_dir_match[0] - else: - ont_barcode = None + ont_barcode = ob[0] ss_entry = SampleSheetEntry( sample_name, - Adaptor( - name=row["adaptors"], - adaptors=ADAPTORS, - i5_index=i5_index, - i7_index=i7_index, - ), + Adaptor(adaptors, delim, row["adaptors"], i7, i5), row["fastq_path"], ont_barcode, ) self.samplesheet.append(ss_entry) - row_number += 1 + rn += 1 - # Explanation: Don't mess around with the globs too much. - # Don't refer to the same file twice but using globs, e.g, ./input.fastq and ./[i]nput.fastq + # Explanation: Don't mess around with the globs too much. Don't refer to the same file twice but using globs, + # e.g, ./input.fastq and ./[i]nput.fastq for a, b in combinations(test_globs.values(), 2): if len(set(a) & set(b)) > 0: raise UserWarning( - "Fastq paths are inconsistent. Please check samplesheet." + "Fastq paths are inconsistent. Please check samplesheet" ) - if ( - not ont_barcodes_enabled - and len(set([v[0] for v in test_globs.values()])) > 1 - ): + if not ont_bc and len(set([v[0] for v in test_globs.values()])) > 1: raise UserWarning( - "Found several different fastq files in samplesheet. Please carefully check any glob patterns." - + " If you are using ONT barcodes, please specify the --ont_barcodes flag." - + " Or if you are trying to input several sets of fastqs into anglerfish," - + " please run anglerfish separately for each set." + """Found several different fastq files in samplesheet. Please carefully check any glob patterns. + If you are using ONT barcodes, please specify the --ont_barcodes flag. Or if you are trying to input several + sets of fastqs into anglerfish, please run anglerfish separately for each set.""" ) except: @@ -108,52 +95,47 @@ def __init__(self, input_csv: str, ont_barcodes_enabled: bool): finally: csvfile.close() - def minimum_bc_distance(self) -> int: - """Compute the minimum edit distance between all barcodes in samplesheet, - or within each ONT barcode group. - """ + def minimum_bc_distance(self): + """Compute the minimum edit distance between all barcodes in samplesheet, or within each ONT barcode group""" - ont_bc_to_adaptors = {} + ss_by_bc = {} + testset = {} for entry in self.samplesheet: - if entry.ont_barcode in ont_bc_to_adaptors: - ont_bc_to_adaptors[entry.ont_barcode].append(entry.adaptor) + if entry.ont_barcode in ss_by_bc: + ss_by_bc[entry.ont_barcode].append(entry.adaptor) else: - ont_bc_to_adaptors[entry.ont_barcode] = [entry.adaptor] + ss_by_bc[entry.ont_barcode] = [entry.adaptor] - testset = {} - for ont_barcode, adaptors in ont_bc_to_adaptors.items(): + for ont_barcode, adaptors in ss_by_bc.items(): testset[ont_barcode] = [] for adaptor in adaptors: - if adaptor.i5.has_index: - testset[ont_barcode].append( - adaptor.i5.index_seq + adaptor.i7.index_seq - ) + if adaptor.i5.has_index(): + testset[ont_barcode].append(adaptor.i5.index + adaptor.i7.index) else: - testset[ont_barcode].append(adaptor.i7.index_seq) + testset[ont_barcode].append(adaptor.i7.index) - min_distances_all_barcodes = [] + fq_distances = [] for ont_barcode, adaptors in testset.items(): - distances_within_barcode = [] + distances = [] if len(adaptors) == 1: - # If there is only one adaptor in the group, the distance is simply the length of the adaptor - distances_within_barcode = [len(adaptors[0])] + distances = [len(adaptors[0])] else: for a, b in [i for i in combinations(adaptors, 2)]: - distance_this_pair = lev.distance(a, b) + dist = lev.distance(a, b) assert ( - distance_this_pair > 0 + dist > 0 ), f"""There is one or more identical barcodes in the input samplesheet. First one found: {a}. If these exist in different ONT barcodes, please specify the --ont_barcodes flag.""" - distances_within_barcode.append(distance_this_pair) - min_distances_all_barcodes.append(min(distances_within_barcode)) - return min(min_distances_all_barcodes) + distances.append(dist) + fq_distances.append(min(distances)) + return min(fq_distances) - def get_fastastring(self, adaptor_name: str = None) -> str: + def get_fastastring(self, adaptor_name=None): fastas = {} for entry in self.samplesheet: if entry.adaptor.name == adaptor_name or adaptor_name is None: - fastas[entry.adaptor.name + "_i7"] = entry.adaptor.i7.get_mask() - fastas[entry.adaptor.name + "_i5"] = entry.adaptor.i5.get_mask() + fastas[entry.adaptor.name + "_i7"] = entry.adaptor.get_i7_mask() + fastas[entry.adaptor.name + "_i5"] = entry.adaptor.get_i5_mask() assert len(fastas) > 0 diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py index 25e9c1a..fe0f846 100644 --- a/anglerfish/explore/explore.py +++ b/anglerfish/explore/explore.py @@ -104,10 +104,14 @@ def run_explore( for adaptor_end_name, adaptor_end in zip( ["i5", "i7"], [adaptor.i5, adaptor.i7] ): - if adaptor_end.has_index: + if adaptor_end.has_index(): # Alignment thresholds - before_thres = round(adaptor_end.len_before_index * good_hit_threshold) - after_thres = round(adaptor_end.len_after_index * good_hit_threshold) + before_thres = round( + adaptor_end.len_before_index_region() * good_hit_threshold + ) + after_thres = round( + adaptor_end.len_after_index_region() * good_hit_threshold + ) insert_thres_low = insert_thres_low insert_thres_high = insert_thres_high @@ -134,7 +138,13 @@ def run_explore( match_col_df ) - thres = round(adaptor_end.len_constant * good_hit_threshold) + thres = round( + ( + adaptor_end.len_before_index_region() + + adaptor_end.len_after_index_region() + ) + * good_hit_threshold + ) df_good_hits = df_good_hits[df_good_hits["match_1_len"] >= thres] median_insert_length = None @@ -166,7 +176,7 @@ def run_explore( ["i5", "i7"], [adaptor.i5, adaptor.i7] ): df_good_hits = entries[adaptor.name][adaptor_end_name] - if adaptor_end.has_index: + if adaptor_end.has_index(): median_insert_length = df_good_hits["insert_len"].median() if median_insert_length > umi_threshold: # Calculate entropies here