Skip to content

Commit

Permalink
Explore command working except entropy
Browse files Browse the repository at this point in the history
  • Loading branch information
alneberg committed Jan 18, 2024
1 parent 5b7e9ab commit 078a667
Show file tree
Hide file tree
Showing 2 changed files with 126 additions and 1 deletion.
126 changes: 125 additions & 1 deletion anglerfish/explore/explore.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
import os
import uuid

import pandas as pd

from anglerfish.demux.adaptor import load_adaptors
from anglerfish.demux.demux import run_minimap2
from anglerfish.demux.demux import parse_paf_lines, run_minimap2

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("explore")
Expand Down Expand Up @@ -58,3 +60,125 @@ def run_explore(

log.info(f"Aligning {adaptor_name}")
run_minimap2(fastq, adaptor_path, aln_path, threads, minimap_b)

# Parse alignments
entries = {}
adaptors_included = []
for adaptor, aln_path in alignments:
log.info(f"Parsing {adaptor.name}")
aln_dict_with_lists = parse_paf_lines(aln_path, complex_identifier=True)

# Choose only the highest scoring alignment for each combination of read, adaptor end and strand
aln_dict = dict(
[(k, max(v, key=lambda x: x["q"])) for k, v in aln_dict_with_lists.items()]
)

df = pd.DataFrame.from_dict(aln_dict, orient="index")
nr_good_hits = {}
for adaptor_end_name, adaptor_end in zip(
["i5", "i7"], [adaptor.i5, adaptor.i7]
):
if adaptor_end.has_index():
# Match Insert Match = mim
# The cs string filter is quite strict, requiring 10+ perfect match before insert and 10+ perfect match after insert
# The cg string allows for mismatches within the matching strings but no insertions or deletions
# All cg string matches are also cs string matches (subset) but not vice versa
mim_re_cs = r"^cs:Z::[1-9][0-9]*\+([a,c,t,g]*):[1-9][0-9]*$" # Match Insert Match = mim
mim_re_cg = r"^cg:Z:([0-9]*)M([0-9]*)I([0-9]*)M$"
df_mim = df[df.cs.str.match(mim_re_cs)]

# Extract the match lengths
match_col_df = df_mim.cg.str.extract(mim_re_cg).rename(
{0: "match_1_len", 1: "insert_len", 2: "match_2_len"}, axis=1
)
match_col_df = match_col_df.astype(
{
"match_1_len": "int32",
"insert_len": "int32",
"match_2_len": "int32",
}
)

df_mim.loc[match_col_df.index, match_col_df.columns] = match_col_df

# Alignment thresholds
before_thres = round(
adaptor_end.len_before_index() * good_hit_threshold
)
after_thres = round(adaptor_end.len_after_index() * good_hit_threshold)
insert_thres_low = insert_thres_low
insert_thres_high = insert_thres_high

requirements = (
(df_mim["match_1_len"] >= (before_thres))
& (df_mim["insert_len"] >= insert_thres_low)
& (df_mim["insert_len"] <= insert_thres_high)
& (df_mim["match_2_len"] >= (after_thres))
)
df_good_hits = df_mim[requirements]

median_insert_length = df_good_hits["insert_len"].median()
insert_lengths = df_good_hits["insert_len"].value_counts()
else:
m_re_cs = r"^cs:Z::([1-9][0-9]*)$"
df_good_hits = df[df.cg.str.match(m_re_cs)]
match_col_df = df_good_hits.cg.str.extract(m_re_cs).rename(
{0: "match_1_len"}, axis=1
)
match_col_df = match_col_df.astype({"match_1_len": "int32"})

df_good_hits.loc[
match_col_df.index, match_col_df.columns
] = match_col_df

thres = round(
(adaptor_end.len_before_index() + adaptor_end.len_after_index())
* good_hit_threshold
)
df_good_hits = df_good_hits[df_good_hits["match_1_len"] >= thres]

median_insert_length = None
insert_lengths = None

# Filter multiple hits per read and adaptor end
df_good_hits = df_good_hits.sort_values(
by=["read", "match_1_len"], ascending=False
).drop_duplicates(subset=["read", "adapter"], keep="first")

if adaptor.name not in entries.keys():
entries[adaptor.name] = {}
entries[adaptor.name][adaptor_end_name] = df_good_hits

nr_good_hits[adaptor_end_name] = len(df_good_hits)
log.info(
f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits."
)

if min(nr_good_hits.values()) >= min_hits_per_adaptor:
log.info(f"Adaptor {adaptor.name} is included in the analysis")
adaptors_included.append(adaptor)
else:
log.info(f"Adaptor {adaptor.name} is excluded from the analysis")

# Print insert length info for adaptor types included in the analysis
for adaptor in adaptors_included:
for adaptor_end_name, adaptor_end in zip(
["i5", "i7"], [adaptor.i5, adaptor.i7]
):
df_good_hits = entries[adaptor.name][adaptor_end_name]
if adaptor_end.has_index():
median_insert_length = df_good_hits["insert_len"].median()
if median_insert_length > umi_threshold:
# Calculate entropies here
pass
insert_lengths = df_good_hits["insert_len"].value_counts()
log.info(
f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits with median insert length {median_insert_length}"
)
log.info(insert_lengths[sorted(insert_lengths.index)])
else:
median_insert_length = None
insert_lengths = None
log.info(
f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits (no insert length since no index)"
)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ biopython==1.83
click==8.1.7
levenshtein==0.23.0
numpy==1.26.3
pandas==2.1.4
pyyaml==6.0.1

0 comments on commit 078a667

Please sign in to comment.