Skip to content

Commit

Permalink
Merge pull request #167 from rhpvorderman/duplicatedoverrepresented
Browse files Browse the repository at this point in the history
Stage hashes to prevent in-sequence duplication to affect overrepresented sequences
  • Loading branch information
rhpvorderman authored May 30, 2024
2 parents 2751024 + 34465b8 commit 14e1cfb
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 8 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ Changelog
version 0.10.0-dev
------------------
+ Overrepresented sequence analysis now only counts unique fragments per read.
Fragments that are duplicated inside the read are counted only once. This
prevents long stretches of genomic repeats from getting inflated reported
frequencies.
+ Speed up sequence identity calculations on AVX2 enabled CPUs by using a
reverse-diagonal parallelized version of Smith-Waterman.

Expand Down
9 changes: 6 additions & 3 deletions docs/module_options.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,12 @@ such the adapter sequences will always be sampled in the same frame.
sequence is sampled when the length is not divisible by the size of the
fragments.

Fragments are stored and counted in a hash table. When the hash table is full
only fragments that are already present will be counted. To diminish the time
spent on the module, by default 1 in 8 sequences is analysed.
For each sequence only unique fragments within the sequence are sampled, to
avoid overrepresenting common genomic repeats.
Unique fragments are then stored and counted in a hash table. When the hash
table is full only fragments that are already present will be counted.
To diminish the time spent on the module, by default 1 in 8 sequences is
analysed.

After the module is run, stored fragments are checked for their counts. If the
count exceeds a certain threshold it is considered overrepresented. Sequali
Expand Down
63 changes: 61 additions & 2 deletions src/sequali/_qcmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -3154,6 +3154,8 @@ typedef struct _SequenceDuplicationStruct {
size_t fragment_length;
uint64_t number_of_sequences;
uint64_t sampled_sequences;
uint64_t staging_hash_table_size;
uint64_t *staging_hash_table;
uint64_t hash_table_size;
uint64_t *hashes;
uint32_t *counts;
Expand All @@ -3166,6 +3168,7 @@ typedef struct _SequenceDuplicationStruct {
static void
SequenceDuplication_dealloc(SequenceDuplication *self)
{
PyMem_Free(self->staging_hash_table);
PyMem_Free(self->hashes);
PyMem_Free(self->counts);
Py_TYPE(self)->tp_free((PyObject *)self);
Expand Down Expand Up @@ -3232,6 +3235,8 @@ SequenceDuplication__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs)
self->hash_table_size = hash_table_size;
self->total_fragments = 0;
self->fragment_length = fragment_length;
self->staging_hash_table_size = 0;
self->staging_hash_table = NULL;
self->hashes = hashes;
self->counts = counts;
self->sample_every = sample_every;
Expand Down Expand Up @@ -3265,6 +3270,44 @@ Sequence_duplication_insert_hash(SequenceDuplication *self, uint64_t hash)
}
}

/*
 * Ensure the staging hash table can hold at least new_size entries.
 * The table only ever grows: a request smaller than the current
 * capacity is a no-op. Old contents may be carried over by realloc,
 * but callers zero the table before every use, so that is irrelevant.
 *
 * Returns 0 on success, or -1 with a Python MemoryError set when the
 * allocation fails (the previous table remains valid in that case).
 */
static int
SequenceDuplication_resize_staging(SequenceDuplication *self, uint64_t new_size)
{
    if (new_size <= self->staging_hash_table_size) {
        /* Current capacity already suffices. */
        return 0;
    }
    uint64_t *resized =
        PyMem_Realloc(self->staging_hash_table, new_size * sizeof(uint64_t));
    if (resized == NULL) {
        /* Keep self->staging_hash_table intact; report OOM to Python. */
        PyErr_NoMemory();
        return -1;
    }
    self->staging_hash_table = resized;
    self->staging_hash_table_size = new_size;
    return 0;
}

/*
 * Stage a fragment hash in an open-addressing hash table with linear
 * probing. A slot value of 0 marks an empty slot; consequently a hash
 * of 0 is never stored (it is treated as already present), which is
 * consistent with the flush loop that skips zero entries. A hash that
 * is already in the table is left as-is, so each unique fragment is
 * staged at most once per read.
 *
 * The caller sizes the table at ~1.5x the fragment count, so at least
 * one empty slot always exists and the probe loop terminates.
 */
static inline void
add_to_staging(uint64_t *staging_hash_table, uint64_t staging_hash_table_size,
               uint64_t hash)
{
    /* Size is a power of two, so a mask maps hashes to slots. */
    uint64_t mask = staging_hash_table_size - 1;
    uint64_t slot = hash & mask;
    for (;;) {
        uint64_t entry = staging_hash_table[slot];
        if (entry == hash) {
            /* Duplicate within this read: count only once. */
            return;
        }
        if (entry == 0) {
            staging_hash_table[slot] = hash;
            return;
        }
        /* Collision: advance to the next slot, wrapping around. */
        slot = (slot + 1) & mask;
    }
}

static int
SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta)
{
Expand Down Expand Up @@ -3299,6 +3342,16 @@ SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta)
still all of the sequence is being sampled.
*/
Py_ssize_t total_fragments = (sequence_length + fragment_length - 1) / fragment_length;
size_t staging_hash_bits = (size_t)ceil(log2((double)total_fragments * 1.5));
size_t staging_hash_size = 1ULL << staging_hash_bits;
if (staging_hash_size > self->staging_hash_table_size) {
if (SequenceDuplication_resize_staging(self, staging_hash_size) < 0) {
return -1;
}
}
uint64_t *staging_hash_table = self->staging_hash_table;
memset(staging_hash_table, 0 , staging_hash_size * sizeof(uint64_t));

Py_ssize_t from_mid_point_fragments = total_fragments / 2;
Py_ssize_t mid_point = sequence_length - (from_mid_point_fragments * fragment_length);
bool warn_unknown = false;
Expand All @@ -3313,7 +3366,7 @@ SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta)
}
fragments += 1;
uint64_t hash = wanghash64(kmer);
Sequence_duplication_insert_hash(self, hash);
add_to_staging(staging_hash_table, staging_hash_size, hash);
}

// Sample back sequences
Expand All @@ -3327,7 +3380,13 @@ SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta)
}
fragments += 1;
uint64_t hash = wanghash64(kmer);
Sequence_duplication_insert_hash(self, hash);
add_to_staging(staging_hash_table, staging_hash_size, hash);
}
for (size_t i=0; i<staging_hash_size; i++) {
uint64_t hash = staging_hash_table[i];
if (hash != 0) {
Sequence_duplication_insert_hash(self, hash);
}
}
if (warn_unknown) {
PyObject *culprit = PyUnicode_DecodeASCII((char *)sequence, sequence_length, NULL);
Expand Down
5 changes: 3 additions & 2 deletions src/sequali/report_modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -1426,8 +1426,9 @@ def to_html(self) -> str:
<p class="explanation">
The percentage shown is an estimate based on the number of
occurrences of the fragment in relation to the number of
sampled sequences. This makes the assumption that a
fragment only occurs once in each sequence.
sampled sequences. Fragments are only counted once per
sequence. Fragments that occur more than once in a sequence are
counted as one.
</p>
<table>
<tr>
Expand Down
3 changes: 2 additions & 1 deletion tests/test_sequence_duplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,8 @@ def test_sequence_duplication_sampling_rate(divisor):
("GATTACAAA", {"ATC": 1, "GTA": 1, "AAA": 1}),
("GA", {}),
("GATT", {"ATC": 1, "AAT": 1}),
("GATTACGATTAC", {"ATC": 2, "GTA": 2}),
# Fragments that are duplicated in the sequence should only be recorded once.
("GATTACGATTAC", {"ATC": 1, "GTA": 1}),
])
def test_sequence_duplication_all_fragments(sequence, result):
seqdup = SequenceDuplication(fragment_length=3, sample_every=1)
Expand Down

0 comments on commit 14e1cfb

Please sign in to comment.