Skip to content

Commit

Permalink
Merge pull request #167 from rhpvorderman/duplicatedoverrepresented
Browse files Browse the repository at this point in the history
Stage hashes to prevent in-sequence duplication to affect overrepresented sequences
  • Loading branch information
rhpvorderman authored May 30, 2024
2 parents 2751024 + 34465b8 commit 14e1cfb
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 8 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ Changelog
version 0.10.0-dev
------------------
+ Overrepresented sequence analysis now only counts unique fragments per read.
Fragments that are duplicated inside the read are counted only once. This
prevents long stretches of genomic repeats from getting inflated reported
frequencies.
+ Speed up sequence identity calculations on AVX2 enabled CPUs by using a
reverse-diagonal parallelized version of Smith-Waterman.

Expand Down
9 changes: 6 additions & 3 deletions docs/module_options.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,12 @@ such the adapter sequences will always be sampled in the same frame.
sequence is sampled when the length is not divisible by the size of the
fragments.

Fragments are stored and counted in a hash table. When the hash table is full
only fragments that are already present will be counted. To diminish the time
spent on the module, by default 1 in 8 sequences is analysed.
For each sequence only unique fragments within the sequence are sampled, to
avoid overrepresenting common genomic repeats.
Unique fragments are then stored and counted in a hash table. When the hash
table is full only fragments that are already present will be counted.
To diminish the time spent on the module, by default 1 in 8 sequences is
analysed.

After the module is run, stored fragments are checked for their counts. If the
count exceeds a certain threshold it is considered overrepresented. Sequali
Expand Down
63 changes: 61 additions & 2 deletions src/sequali/_qcmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -3154,6 +3154,8 @@ typedef struct _SequenceDuplicationStruct {
size_t fragment_length;
uint64_t number_of_sequences;
uint64_t sampled_sequences;
uint64_t staging_hash_table_size;
uint64_t *staging_hash_table;
uint64_t hash_table_size;
uint64_t *hashes;
uint32_t *counts;
Expand All @@ -3166,6 +3168,7 @@ typedef struct _SequenceDuplicationStruct {
static void
SequenceDuplication_dealloc(SequenceDuplication *self)
{
PyMem_Free(self->staging_hash_table);
PyMem_Free(self->hashes);
PyMem_Free(self->counts);
Py_TYPE(self)->tp_free((PyObject *)self);
Expand Down Expand Up @@ -3232,6 +3235,8 @@ SequenceDuplication__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs)
self->hash_table_size = hash_table_size;
self->total_fragments = 0;
self->fragment_length = fragment_length;
self->staging_hash_table_size = 0;
self->staging_hash_table = NULL;
self->hashes = hashes;
self->counts = counts;
self->sample_every = sample_every;
Expand Down Expand Up @@ -3265,6 +3270,44 @@ Sequence_duplication_insert_hash(SequenceDuplication *self, uint64_t hash)
}
}

/*
 * Ensure the staging hash table can hold at least new_size entries.
 * The table only ever grows: a request smaller than the current
 * capacity is a no-op. Old contents may be carried over by realloc,
 * but callers zero the table before every use, so that is irrelevant.
 *
 * Returns 0 on success, or -1 with a Python MemoryError set when the
 * allocation fails (the previous table remains valid in that case).
 */
static int
SequenceDuplication_resize_staging(SequenceDuplication *self, uint64_t new_size)
{
    if (new_size <= self->staging_hash_table_size) {
        /* Current capacity already suffices. */
        return 0;
    }
    uint64_t *resized =
        PyMem_Realloc(self->staging_hash_table, new_size * sizeof(uint64_t));
    if (resized == NULL) {
        /* Keep self->staging_hash_table intact; report OOM to Python. */
        PyErr_NoMemory();
        return -1;
    }
    self->staging_hash_table = resized;
    self->staging_hash_table_size = new_size;
    return 0;
}

/*
 * Stage a fragment hash in an open-addressing hash table with linear
 * probing. A slot value of 0 marks an empty slot; consequently a hash
 * of 0 is never stored (it is treated as already present), which is
 * consistent with the flush loop that skips zero entries. A hash that
 * is already in the table is left as-is, so each unique fragment is
 * staged at most once per read.
 *
 * The caller sizes the table at ~1.5x the fragment count, so at least
 * one empty slot always exists and the probe loop terminates.
 */
static inline void
add_to_staging(uint64_t *staging_hash_table, uint64_t staging_hash_table_size,
               uint64_t hash)
{
    /* Size is a power of two, so a mask maps hashes to slots. */
    uint64_t mask = staging_hash_table_size - 1;
    uint64_t slot = hash & mask;
    for (;;) {
        uint64_t entry = staging_hash_table[slot];
        if (entry == hash) {
            /* Duplicate within this read: count only once. */
            return;
        }
        if (entry == 0) {
            staging_hash_table[slot] = hash;
            return;
        }
        /* Collision: advance to the next slot, wrapping around. */
        slot = (slot + 1) & mask;
    }
}

static int
SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta)
{
Expand Down Expand Up @@ -3299,6 +3342,16 @@ SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta)
still all of the sequence is being sampled.
*/
Py_ssize_t total_fragments = (sequence_length + fragment_length - 1) / fragment_length;
size_t staging_hash_bits = (size_t)ceil(log2((double)total_fragments * 1.5));
size_t staging_hash_size = 1ULL << staging_hash_bits;
if (staging_hash_size > self->staging_hash_table_size) {
if (SequenceDuplication_resize_staging(self, staging_hash_size) < 0) {
return -1;
}
}
uint64_t *staging_hash_table = self->staging_hash_table;
memset(staging_hash_table, 0 , staging_hash_size * sizeof(uint64_t));

Py_ssize_t from_mid_point_fragments = total_fragments / 2;
Py_ssize_t mid_point = sequence_length - (from_mid_point_fragments * fragment_length);
bool warn_unknown = false;
Expand All @@ -3313,7 +3366,7 @@ SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta)
}
fragments += 1;
uint64_t hash = wanghash64(kmer);
Sequence_duplication_insert_hash(self, hash);
add_to_staging(staging_hash_table, staging_hash_size, hash);
}

// Sample back sequences
Expand All @@ -3327,7 +3380,13 @@ SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta)
}
fragments += 1;
uint64_t hash = wanghash64(kmer);
Sequence_duplication_insert_hash(self, hash);
add_to_staging(staging_hash_table, staging_hash_size, hash);
}
for (size_t i=0; i<staging_hash_size; i++) {
uint64_t hash = staging_hash_table[i];
if (hash != 0) {
Sequence_duplication_insert_hash(self, hash);
}
}
if (warn_unknown) {
PyObject *culprit = PyUnicode_DecodeASCII((char *)sequence, sequence_length, NULL);
Expand Down
5 changes: 3 additions & 2 deletions src/sequali/report_modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -1426,8 +1426,9 @@ def to_html(self) -> str:
<p class="explanation">
The percentage shown is an estimate based on the number of
occurrences of the fragment in relation to the number of
sampled sequences. This makes the assumption that a
fragment only occurs once in each sequence.
sampled sequences. Fragments are only counted once per
sequence. Fragments that occur more than once in a sequence are
counted as one.
</p>
<table>
<tr>
Expand Down
3 changes: 2 additions & 1 deletion tests/test_sequence_duplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,8 @@ def test_sequence_duplication_sampling_rate(divisor):
("GATTACAAA", {"ATC": 1, "GTA": 1, "AAA": 1}),
("GA", {}),
("GATT", {"ATC": 1, "AAT": 1}),
("GATTACGATTAC", {"ATC": 2, "GTA": 2}),
# Fragments that are duplicated in the sequence should only be recorded once.
("GATTACGATTAC", {"ATC": 1, "GTA": 1}),
])
def test_sequence_duplication_all_fragments(sequence, result):
seqdup = SequenceDuplication(fragment_length=3, sample_every=1)
Expand Down

0 comments on commit 14e1cfb

Please sign in to comment.