|
| 1 | +#!/usr/bin/env python3 |
# Version string; reported by the -v/--version flag and interpolated into
# DESCRIPTION below.
VERSION = "1.2.0"
# Free-form program description handed to argparse in parse_args(). It is an
# f-string so that the current VERSION is embedded in the help output.
DESCRIPTION = f"""
---
Script for sanitising FASTA headers and sequences:
- Shortens headers by splitting by whitespace and keeping only the first element
- Replaces problematic characters in headers (commas, spaces, etc.) with underscores
- Converts sequences to uppercase and replaces non-ATGC bases with N
Version: {VERSION}
---

Written by Eerik Aunin (ea10)
Modified by Damon-Lee Pointon (@dp24/@DLBPointon)
Further modified by Eerik Aunin (ea10)

"""
| 17 | + |
| 18 | +# MIT License |
| 19 | +# |
| 20 | +# Copyright (c) 2020-2022 Genome Research Ltd. |
| 21 | +# |
| 22 | +# Author: Eerik Aunin (eeaunin@gmail.com) |
| 23 | +# |
| 24 | +# Permission is hereby granted, free of charge, to any person obtaining a copy |
| 25 | +# of this software and associated documentation files (the "Software"), to deal |
| 26 | +# in the Software without restriction, including without limitation the rights |
| 27 | +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 28 | +# copies of the Software, and to permit persons to whom the Software is |
| 29 | +# furnished to do so, subject to the following conditions: |
| 30 | +# |
| 31 | +# The above copyright notice and this permission notice shall be included in all |
| 32 | +# copies or substantial portions of the Software. |
| 33 | +# |
| 34 | +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 35 | +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 36 | +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 37 | +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 38 | +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 39 | +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 40 | +# SOFTWARE. |
| 41 | + |
| 42 | +import general_purpose_functions as gpf |
| 43 | +import argparse |
| 44 | +import textwrap |
| 45 | +import sys |
| 46 | +import tempfile |
| 47 | +import re |
| 48 | + |
| 49 | + |
def is_all_n_sequence(seq):
    """Report whether a sequence contains nothing but the base N.

    Leading/trailing whitespace is stripped and the check is
    case-insensitive. A sequence that is empty after stripping also yields
    True, because it contains no base other than N.
    """
    bases = seq.strip().upper()
    # Every character is "N" exactly when the N count equals the length.
    return bases.count("N") == len(bases)
| 53 | + |
| 54 | + |
def sanitise_sequence(seq):
    """Return the sequence upper-cased with every non-A/T/G/C character set to N.

    The substitution is applied after upper-casing, so lower-case a/t/g/c are
    kept. Any other character (ambiguity codes, gaps, whitespace, ...) is
    replaced one-for-one by "N".
    """
    non_canonical_base = r"[^ATGC]"
    return re.sub(non_canonical_base, "N", seq.upper())
| 59 | + |
| 60 | + |
def sanitise_header(header):
    """Return a FASTA header with awkward characters turned into underscores.

    Commas, semicolons, pipes, colons and any whitespace character are each
    replaced by "_". A single leading ">" is dropped before the replacement
    and a ">" is always prepended to the result, so input without the marker
    gains one.
    """
    # Work on the bare name; the marker is re-attached unconditionally below.
    name = header[1:] if header.startswith(">") else header
    return ">" + re.sub(r"[,;\s|:]", "_", name)
| 72 | + |
| 73 | + |
def parse_args(argv=None):
    """Parse the command-line arguments of the FASTA sanitiser.

    Args:
        argv: Optional list of argument strings. When None, argparse falls
            back to the process command line.

    Returns:
        argparse.Namespace with the attributes ``fasta_path``, ``delimiter``,
        ``allow_duplicate_headers`` and ``keep_n_sequences``.
    """
    parser = argparse.ArgumentParser(
        prog="sanitise_input_fasta_file",
        # Raw formatter keeps the line breaks of the multi-line DESCRIPTION.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent(DESCRIPTION),
    )
    parser.add_argument("fasta_path", type=str, help="Path to input FASTA file")
    parser.add_argument(
        "--delimiter",
        type=str,
        help="Delimiter string for splitting FASTA headers. Default: any whitespace character",
        default="",
    )
    parser.add_argument(
        "--allow_duplicate_headers",
        dest="allow_duplicate_headers",
        action="store_true",
        help="Do not exit with an error when truncated headers are duplicated (default: False)",
    )
    parser.add_argument(
        "--keep_n_sequences", action="store_true", help="Keep sequences that are all Ns (default: False)"
    )
    parser.add_argument("-v", "--version", action="version", version=VERSION)
    return parser.parse_args(argv)
| 93 | + |
| 94 | + |
def main(fasta_path, delimiter, allow_duplicate_headers, keep_n_sequences=False):
    """Sanitise a FASTA file and print the cleaned records to stdout.

    Each header is truncated at the first occurrence of ``delimiter`` (or at
    the first run of whitespace when ``delimiter`` is the empty string) and
    passed through ``sanitise_header``. Each sequence line is passed through
    ``sanitise_sequence``; the lines of a record are joined and printed as a
    single line. Records whose header has no sequence lines are not printed.

    Args:
        fasta_path: Path to the input FASTA file. Paths ending in ``.gz`` are
            decompressed into a temporary directory first.
        delimiter: String at which headers are truncated; "" means whitespace.
        allow_duplicate_headers: When exactly False, a repeated (truncated and
            sanitised) header aborts the run.
        keep_n_sequences: When true, records consisting only of N are printed
            instead of being skipped with a message on stderr.

    Raises:
        SystemExit: With exit status 1 when a duplicate header is found and
            ``allow_duplicate_headers`` is False.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        input_file = fasta_path
        # NOTE(review): the '.gz"' variant presumably tolerates paths that
        # arrive with a literal closing quote -- confirm against callers.
        if fasta_path.endswith((".gz", '.gz"')):
            input_file = "{}/input_file.fa".format(tmp_dir)
            # NOTE(review): the path is interpolated into a shell command
            # unescaped; paths with spaces or shell metacharacters will
            # misbehave. Left unchanged because callers may pre-quote paths.
            gpf.run_system_command("gunzip -c {} > {}".format(fasta_path, input_file))

        # A set gives O(1) duplicate checks; a list made this quadratic in
        # the number of records.
        seen_headers = set()
        headers_with_commas = 0

        current_header = None
        current_sequence = []

        def process_sequence():
            """Emit the record accumulated so far, if there is one."""
            if current_header and current_sequence:
                sequence = "".join(current_sequence)
                if keep_n_sequences or not is_all_n_sequence(sequence):
                    print(current_header)
                    print(sequence)
                else:
                    sys.stderr.write("Skipping all-N sequence: {}\n".format(current_header[1:].strip()))

        for line in gpf.ll(input_file):
            if line.startswith(">"):
                # A new header closes the previous record.
                process_sequence()

                original_header = line.strip()
                if delimiter == "":
                    truncated_header = original_header.split()[0]
                else:
                    truncated_header = original_header.split(delimiter)[0]

                # Count only commas in the retained part of the header:
                # commas in the discarded part are dropped, not replaced, so
                # counting them would make the warning below inaccurate.
                if "," in truncated_header:
                    headers_with_commas += 1

                current_header = sanitise_header(truncated_header)

                if allow_duplicate_headers is False and current_header in seen_headers:
                    sys.stderr.write(
                        "Duplicate FASTA headers ({}) were found in the input file ({}) after truncating the headers with a delimiter\n".format(
                            current_header[1:], fasta_path
                        )
                    )
                    sys.exit(1)
                seen_headers.add(current_header)
                current_sequence = []
            else:
                # Sequence line: sanitise now, join when the record is emitted.
                current_sequence.append(sanitise_sequence(line))

        # The final record has no following header to trigger its output.
        process_sequence()

        if headers_with_commas > 0:
            sys.stderr.write(
                "Warning: {} FASTA header(s) contained commas that were replaced with underscores\n".format(
                    headers_with_commas
                )
            )
| 160 | + |
| 161 | + |
# Script entry point: parse the command line and run the sanitiser.
if __name__ == "__main__":
    args = parse_args()
    main(
        fasta_path=args.fasta_path,
        delimiter=args.delimiter,
        allow_duplicate_headers=args.allow_duplicate_headers,
        keep_n_sequences=args.keep_n_sequences,
    )
0 commit comments