Added entropy calculation and printing to the explore command

NationalGenomicsInfrastructure · Jan 18, 2024 · 76e499a · 76e499a
1 parent 078a667
commit 76e499a
Show file tree

Hide file tree

Showing 2 changed files with 67 additions and 1 deletion.
diff --git a/anglerfish/explore/entropy.py b/anglerfish/explore/entropy.py
@@ -0,0 +1,60 @@
+import itertools
+from collections import deque
+
+import numpy as np
+
+
+# Rolling window implementation
+def _window(seq, n=3):
+    win = deque(maxlen=n)
+    for char in seq:
+        win.append(char)
+        if len(win) >= n:
+            yield tuple(win)
+
+
+def _entropy(pk, qk):
+    """Taken from https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html
+
+    Felt unecessary to add scipy as dependency just for this function.
+    """
+    # TODO: Catch potential exceptions here for weird input values.
+    return -np.sum(pk * np.log(qk))
+
+
+# Each sequence gives matrix of position and k-mer occurence.
+def _seq_to_count_matrix(dna_seq, kmer_positions, max=50, k=2):
+    """max is the length of the prefix that will be considered"""
+    if len(dna_seq) < max:
+        max = len(dna_seq)
+
+    matrix = np.zeros((4**k, max - k + 1))
+    for i, word in enumerate(_window(dna_seq[:max], n=k)):
+        matrix[kmer_positions[word], i] += 1
+    return matrix
+
+
+def _count_for_seqs(seqs, kmer_positions, insert_length, k=2):
+    seqs = [seq for seq in seqs if len(seq) == insert_length]
+    matrices = [_seq_to_count_matrix(seq, kmer_positions, max=50, k=k) for seq in seqs]
+    stacked = np.stack(matrices)
+    sum_a = np.sum(stacked, axis=0)
+    even_dist = np.ones(4**k) / (4**k)
+    entropies = [_entropy(a, even_dist) for a in sum_a.T]
+
+    return entropies
+
+
+def _extract_inserts_from_df(df):
+    mim_re_cs = r"^cs:Z::[1-9][0-9]*\+([a,c,t,g]*):[1-9][0-9]*$"
+    return df.cs.str.extract(mim_re_cs, expand=False).str.upper().tolist()
+
+
+def calculate_relative_entropy(df_good_hits, kmer_length, insert_length):
+    all_kmers = itertools.product("ACTG", repeat=kmer_length)
+    kmer_positions = dict(
+        (kmer, position) for kmer, position in zip(all_kmers, range(4**kmer_length))
+    )
+    seqs = _extract_inserts_from_df(df_good_hits)
+    entropies = _count_for_seqs(seqs, kmer_positions, insert_length, k=kmer_length)
+    return entropies
diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py
@@ -6,6 +6,7 @@
 
 from anglerfish.demux.adaptor import load_adaptors
 from anglerfish.demux.demux import parse_paf_lines, run_minimap2
+from anglerfish.explore.entropy import calculate_relative_entropy
 
 logging.basicConfig(level=logging.INFO)
 log = logging.getLogger("explore")
@@ -170,7 +171,12 @@ def run_explore(
                 median_insert_length = df_good_hits["insert_len"].median()
                 if median_insert_length > umi_threshold:
                     # Calculate entropies here
-                    pass
+                    entropies = calculate_relative_entropy(
+                        df_good_hits, kmer_length, median_insert_length
+                    )
+                    log.info(
+                        f"{adaptor.name}:{adaptor_end_name} had entropy {entropies}"
+                    )
                 insert_lengths = df_good_hits["insert_len"].value_counts()
                 log.info(
                     f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits with median insert length {median_insert_length}"