From 1c5a428948407d87dc0c2752690376d6d0878cc0 Mon Sep 17 00:00:00 2001
From: Ruben Vorderman <r.h.p.vorderman@lumc.nl>
Date: Fri, 9 Aug 2024 13:05:49 +0200
Subject: [PATCH 1/2] Use floor rather than round for phred scores

---
 CHANGELOG.rst            |  3 +++
 src/sequali/_qcmodule.c  |  4 +++-
 tests/test_qc_metrics.py | 10 +++++++---
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 3125366..1d9d873 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -9,6 +9,9 @@ Changelog
 
 version 0.11.0-dev
 ------------------
++ Fix a bug where the average phred score per read would be rounded, not
+  floored. This would lead reads with a phred score such as 9.7 to be counted
+  towards the Q>=10 results.
 + Replace some of the hand vectorized code with more generic code that can be
   automatically be optimized by the compiler. This should make things faster on
   Windows and ARM64 platforms.
diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c
index 7979319..ea2434e 100644
--- a/src/sequali/_qcmodule.c
+++ b/src/sequali/_qcmodule.c
@@ -1849,7 +1849,9 @@ QCMetrics_add_meta(QCMetrics *self, struct FastqMeta *meta)
     meta->accumulated_error_rate = accumulated_error_rate;
     double average_error_rate = accumulated_error_rate / (double)sequence_length;
     double average_phred = -10.0 * log10(average_error_rate);
-    uint64_t phred_score_index = (uint64_t)round(average_phred);
+    // Floor the average phred so q9.7 does not get represented as q10 but
+    // q9. Otherwise the Q>=10 count is going to be off.
+    uint64_t phred_score_index = (uint64_t)floor(average_phred);
     assert(phred_score_index >= 0);
     assert(phred_score_index <= PHRED_MAX);
     self->phred_scores[phred_score_index] += 1;
diff --git a/tests/test_qc_metrics.py b/tests/test_qc_metrics.py
index 633912b..8fefa32 100644
--- a/tests/test_qc_metrics.py
+++ b/tests/test_qc_metrics.py
@@ -42,7 +42,7 @@ def test_qc_metrics():
     phred_content = metrics.phred_scores()
     this_read_error = (10 ** -1) * 25 + (10 ** -3) * 25
     this_read_phred = -10 * math.log10(this_read_error / len(sequence))
-    phred_index = round(this_read_phred)
+    phred_index = math.floor(this_read_phred)
     assert phred_content[phred_index] == 1
     assert sum(phred_content) == 1
     phred_array = metrics.phred_count_table()
@@ -81,8 +81,12 @@ def test_long_sequence():
     # properly.
     sequence = 4096 * 'A' + 4096 * 'C'
     qualities = 8192 * chr(20 + 33)
+    # Use a sum range and multiply by four because that is how the algorithm
+    # in sequali works. Keeping 4 separate counters for efficiency.
+    calculated_phred = -10 * math.log10(sum(0.01 for _ in range(2048)) * 4 / 8192)
+    floored_phred = math.floor(calculated_phred)
     metrics.add_read(FastqRecordView("name", sequence, qualities))
-    assert metrics.phred_scores()[20] == 1
+    assert metrics.phred_scores()[floored_phred] == 1
     assert metrics.gc_content()[50] == 1
 
 
@@ -97,4 +101,4 @@ def test_average_long_quality():
     error_rate = (1 + 19999 * 10 ** -5) / 20000
     phred = -10 * math.log10(error_rate)
     metrics.add_read(FastqRecordView("name", sequence, qualities))
-    assert metrics.phred_scores()[round(phred)] == 1
+    assert metrics.phred_scores()[math.floor(phred)] == 1

From 8e3a085ccb223a1afb22fe951f0b192c930d7a19 Mon Sep 17 00:00:00 2001
From: Ruben Vorderman <r.h.p.vorderman@lumc.nl>
Date: Fri, 9 Aug 2024 13:36:36 +0200
Subject: [PATCH 2/2] Rewrite test so it is not dependent on platform dependent
 flooring

---
 tests/test_qc_metrics.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/tests/test_qc_metrics.py b/tests/test_qc_metrics.py
index 8fefa32..39fb2a6 100644
--- a/tests/test_qc_metrics.py
+++ b/tests/test_qc_metrics.py
@@ -76,15 +76,18 @@ def test_qc_metrics():
 
 def test_long_sequence():
     metrics = QCMetrics()
-    # This will test the base counting in vectors properly as that is limited
-    # at 255 * 16 nucleotides (4080 bytes) and thus needs to flush the counts
-    # properly.
+
     sequence = 4096 * 'A' + 4096 * 'C'
-    qualities = 8192 * chr(20 + 33)
-    # Use a sum range and multiply by four because that is how the algorithm
-    # in sequali works. Keeping 4 separate counters for efficiency.
-    calculated_phred = -10 * math.log10(sum(0.01 for _ in range(2048)) * 4 / 8192)
-    floored_phred = math.floor(calculated_phred)
+    qualities = (2048 * chr(00 + 33) +
+                 2048 * chr(10 + 33) +
+                 2048 * chr(20 + 33) +
+                 2048 * chr(30 + 33))
+    errors = (2048 * (10 ** -0) +
+              2048 * (10 ** -1) +
+              2048 * (10 ** -2) +
+              2048 * (10 ** -3))
+    phred = -10 * math.log10(errors / 8192)
+    floored_phred = math.floor(phred)
     metrics.add_read(FastqRecordView("name", sequence, qualities))
     assert metrics.phred_scores()[floored_phred] == 1
     assert metrics.gc_content()[50] == 1