Skip to content

Commit

Permalink
Revert "Readability improvements"
Browse files (browse the repository at this point in the history)
  • Loading branch information
kedhammar authored Jun 10, 2024
1 parent: 519f74b; commit: 93452ec
Show file tree
Hide file tree
Showing 9 changed files with 239 additions and 357 deletions.
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
"esbenp.prettier-vscode",
"wholroyd.jinja",
"ms-python.python",
"charliermarsh.ruff",
"charliermarsh.ruff@2024.2.0",
"ms-azuretools.vscode-docker",
],
},
Expand Down
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,3 @@
.*_cache
node_modules
.vscode
build
85 changes: 30 additions & 55 deletions anglerfish/anglerfish.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,6 @@

MAX_PROCESSES = 64 # Ought to be enough for anybody

anglerfish_logo = r"""
___
( ) \ -..__
_.|~”~~~”…_
^´ `>.
(+ (+ ) “<..<^(
`´ ``´ ___ (
\__..~ __( _…_(
\ /
“--…_ _..~%´
```´´
"""


def run_demux(args):
multiprocessing.set_start_method("spawn")
Expand All @@ -51,54 +38,44 @@ def run_demux(args):
ss = SampleSheet(args.samplesheet, args.ont_barcodes)
version = pkg_resources.get_distribution("bio-anglerfish").version
report = Report(args.run_name, run_uuid, version)
sys.stderr.write(anglerfish_logo)
sys.stderr.write("""
___
( ) \ -..__
_.|~”~~~”…_
^´ `>.
(+ (+ ) “<..<^(
`´ ``´ ___ (
\__..~ __( _…_(
\ /
“--…_ _..~%´
```´´
""")
log.info(f" version {version}")
log.info(f" arguments {vars(args)}")
log.info(f" run uuid {run_uuid}")
min_distance = ss.minimum_bc_distance()
bc_dist = ss.minimum_bc_distance()
if args.max_distance is None:
# Default: Set the maximum distance for barcode matching to 0, 1 or 2
# depending on the smallest detected edit distance between indices in the samplesheet
args.max_distance = min(min_distance - 1, 2)
if bc_dist > 1:
args.max_distance = 2
else:
args.max_distance = 1
log.info(f"Using maximum edit distance of {args.max_distance}")
if args.max_distance >= min_distance:
if args.max_distance >= bc_dist:
log.error(
f" The maximum allowed edit distance for barcode matching (={args.max_distance})"
+ f"is greater than the smallest detected edit distance between indices in samplesheet (={min_distance})"
+ ", which will result in ambiguous matches."
f" Edit distance of barcodes in samplesheet are less than the minimum specified {args.max_distance}>={bc_dist}"
)
exit()
log.debug(f"Samplesheet bc_dist == {min_distance}")
log.debug(f"Samplesheet bc_dist == {bc_dist}")
if args.threads > MAX_PROCESSES:
log.warning(
f" Setting threads to {MAX_PROCESSES} as the maximum number of processes is {MAX_PROCESSES}"
)
args.threads = MAX_PROCESSES

## Sort the adaptors by type and size

# Get a list of tuples with the adaptor name and ONT barcode
adaptor_tuples: list[tuple[str, str]] = [
(entry.adaptor.name, entry.ont_barcode) for entry in ss
]

# Convert to set to enforce uniqueness
adaptor_set: set[tuple[str, str]] = set(adaptor_tuples)

# Create a dictionary with the adaptors as keys and an empty list as value
adaptors_sorted: dict[tuple[str, str], list] = dict([(i, []) for i in adaptor_set])

# Populate the dictionary values with sample-specific information
"""
adaptors_sorted = {
( adaptor_name, ont_barcode ) : [
(sample_name, adaptor, fastq),
(sample_name, adaptor, fastq),
...
],
...
}
"""
# Sort the adaptors by type and size
adaptors_t = [(entry.adaptor.name, entry.ont_barcode) for entry in ss]
adaptor_set = set(adaptors_t)
adaptors_sorted = dict([(i, []) for i in adaptor_set])
for entry in ss:
adaptors_sorted[(entry.adaptor.name, entry.ont_barcode)].append(
(entry.sample_name, entry.adaptor, os.path.abspath(entry.fastq))
Expand All @@ -114,19 +91,18 @@ def run_demux(args):
adaptor_name, ont_barcode = key
fastq_path = sample[0][2]
# If there are multiple ONT barcodes, we need to add the ONT barcode to the adaptor name
adaptor_bc_name = adaptor_name
if ont_barcode:
adaptor_bc_name = f"{adaptor_name}_{ont_barcode}"
else:
adaptor_bc_name = adaptor_name
adaptor_bc_name = adaptor_name + "_" + ont_barcode
fastq_files = glob.glob(fastq_path)

# Align
align_path = os.path.join(args.out_fastq, f"{adaptor_bc_name}.paf")
aln_path = os.path.join(args.out_fastq, f"{adaptor_bc_name}.paf")
adaptor_path = os.path.join(args.out_fastq, f"{adaptor_name}.fasta")
with open(adaptor_path, "w") as f:
f.write(ss.get_fastastring(adaptor_name))
for fq in fastq_files:
run_minimap2(fq, adaptor_path, align_path, args.threads)
run_minimap2(fq, adaptor_path, aln_path, args.threads)

# Easy line count in input fastq files
num_fq = 0
Expand All @@ -135,7 +111,7 @@ def run_demux(args):
for i in f:
num_fq += 1
num_fq = int(num_fq / 4)
paf_entries = parse_paf_lines(align_path)
paf_entries = parse_paf_lines(aln_path)

# Make stats
log.info(f" Searching for adaptor hits in {adaptor_bc_name}")
Expand Down Expand Up @@ -277,8 +253,7 @@ def run_demux(args):
sample_dists = [
(
lev.distance(
i[0],
f"{x.adaptor.i7.index_seq}+{x.adaptor.i5.index_seq}".lower(),
i[0], f"{x.adaptor.i7_index}+{x.adaptor.i5_index}".lower()
),
x.sample_name,
)
Expand Down
4 changes: 1 addition & 3 deletions anglerfish/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,9 +193,7 @@ def run(
typer.Option(
"--max-distance",
"-m",
help="Manually set maximum allowed edit distance for index matching,"
+ "by default this is set to 0, 1 or 2 based on the minimum detected"
+ "index distance in the samplesheet.",
help="Manually set maximum edit distance for BC matching, automatically set this is set to either 1 or 2",
),
] = 2,
max_unknowns: Annotated[
Expand Down
Loading

0 comments on commit 93452ec

Please sign in to comment.