Skip to content

Commit

Permalink
Added entropy calculation and printing to the explore command
Browse files Browse the repository at this point in the history
  • Loading branch information
alneberg committed Jan 18, 2024
1 parent 078a667 commit 76e499a
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 1 deletion.
60 changes: 60 additions & 0 deletions anglerfish/explore/entropy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import itertools
from collections import deque

import numpy as np


# Rolling window implementation
def _window(seq, n=3):
win = deque(maxlen=n)
for char in seq:
win.append(char)
if len(win) >= n:
yield tuple(win)


def _entropy(pk, qk):
"""Taken from https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html
Felt unecessary to add scipy as dependency just for this function.
"""
# TODO: Catch potential exceptions here for weird input values.
return -np.sum(pk * np.log(qk))


# Each sequence gives matrix of position and k-mer occurence.
def _seq_to_count_matrix(dna_seq, kmer_positions, max=50, k=2):
"""max is the length of the prefix that will be considered"""
if len(dna_seq) < max:
max = len(dna_seq)

matrix = np.zeros((4**k, max - k + 1))
for i, word in enumerate(_window(dna_seq[:max], n=k)):
matrix[kmer_positions[word], i] += 1
return matrix


def _count_for_seqs(seqs, kmer_positions, insert_length, k=2):
seqs = [seq for seq in seqs if len(seq) == insert_length]
matrices = [_seq_to_count_matrix(seq, kmer_positions, max=50, k=k) for seq in seqs]
stacked = np.stack(matrices)
sum_a = np.sum(stacked, axis=0)
even_dist = np.ones(4**k) / (4**k)
entropies = [_entropy(a, even_dist) for a in sum_a.T]

return entropies


def _extract_inserts_from_df(df):
mim_re_cs = r"^cs:Z::[1-9][0-9]*\+([a,c,t,g]*):[1-9][0-9]*$"
return df.cs.str.extract(mim_re_cs, expand=False).str.upper().tolist()


def calculate_relative_entropy(df_good_hits, kmer_length, insert_length):
all_kmers = itertools.product("ACTG", repeat=kmer_length)
kmer_positions = dict(
(kmer, position) for kmer, position in zip(all_kmers, range(4**kmer_length))
)
seqs = _extract_inserts_from_df(df_good_hits)
entropies = _count_for_seqs(seqs, kmer_positions, insert_length, k=kmer_length)
return entropies
8 changes: 7 additions & 1 deletion anglerfish/explore/explore.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from anglerfish.demux.adaptor import load_adaptors
from anglerfish.demux.demux import parse_paf_lines, run_minimap2
from anglerfish.explore.entropy import calculate_relative_entropy

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("explore")
Expand Down Expand Up @@ -170,7 +171,12 @@ def run_explore(
median_insert_length = df_good_hits["insert_len"].median()
if median_insert_length > umi_threshold:
# Calculate entropies here
pass
entropies = calculate_relative_entropy(
df_good_hits, kmer_length, median_insert_length
)
log.info(
f"{adaptor.name}:{adaptor_end_name} had entropy {entropies}"
)
insert_lengths = df_good_hits["insert_len"].value_counts()
log.info(
f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits with median insert length {median_insert_length}"
Expand Down

0 comments on commit 76e499a

Please sign in to comment.