test: check whether pharmcat_position.vcf and pharmcat_position.uniallelic.vcf have the same number of lines
PharmGKB · Aug 9, 2024 · 3eebdca · 3eebdca
1 parent 7250db4
commit 3eebdca
Show file tree

Hide file tree

Showing 2 changed files with 32 additions and 10 deletions.
diff --git a/preprocessor/tests/helpers.py b/preprocessor/tests/helpers.py
@@ -1,6 +1,7 @@
 import hashlib
 import shutil
 import urllib.request
+import gzip
 from pathlib import Path
 from typing import Optional, List
 
@@ -21,6 +22,7 @@
 test_dir: Path = Path(globals().get("__file__", "./_")).absolute().parent
 src_dir: Path = test_dir / '../preprocessor'
 pharmcat_positions_file: Path = test_dir / '../../pharmcat_positions.vcf.bgz'
+uniallelic_pharmcat_positions_file: Path = test_dir / '../../pharmcat_positions.uniallelic.vcf.bgz'
 
 
 def get_reference_fasta(pharmcat_positions: Path) -> Path:
@@ -51,17 +53,25 @@ def md5hash(file: Path):
         return file_hash.hexdigest()
 
 
-def read_vcf(file: Path, skip_comments: bool = True):
+def read_vcf(file: Path, bgzipped: bool = False, skip_comments: bool = True):
+    if bgzipped:
+        with gzip.open(file, mode='rt', encoding='utf-8') as in_f:
+            return _read_vcf(in_f, skip_comments=skip_comments)
+    else:
+        with open(file, mode='r', encoding='utf-8') as in_f:
+            return _read_vcf(in_f, skip_comments=skip_comments)
+
+
+def _read_vcf(in_f, skip_comments: bool = True):
     """Reads VCF file and (1) strips trailing spaces, (2) removes empty lines and (3) normalizes line endings."""
-    with open(file, 'r') as f:
-        lines = []
-        for line in f:
-            line = line.rstrip()
-            if line.startswith('##') and skip_comments:
-                continue
-            if line:
-                lines.append(line)
-        return '\n'.join(lines)
+    lines = []
+    for line in in_f:
+        line = line.rstrip()
+        if line.startswith('##') and skip_comments:
+            continue
+        if line:
+            lines.append(line)
+    return '\n'.join(lines)
 
 
 def compare_vcf_files(expected: Path, tmp_dir: Path, basename: str, sample: str = None, split_sample: bool = False,

diff --git a/preprocessor/tests/test_utilities.py b/preprocessor/tests/test_utilities.py
@@ -411,6 +411,18 @@ def test_prep_pharmcat_positions():
         assert tmp_uniallelic.is_file()
         assert uniallelic_mtime == tmp_uniallelic.stat().st_mtime
 
+        # check whether the uniallelic position file has the same number of positions as the position file
+        uniallelic_file_lines = helpers.read_vcf(helpers.uniallelic_pharmcat_positions_file, bgzipped=True).split('\n')
+        n_uniallelic_file_lines: int = len(uniallelic_file_lines)
+        # note that tmp_uniallelic is the equivalent uniallelic file generated from the pharmcat position file
+        tmp_uniallelic_lines = helpers.read_vcf(tmp_uniallelic, bgzipped=True).split('\n')
+        n_tmp_uniallelic_lines: int = len(tmp_uniallelic_lines)
+        # uniallelic_pharmcat_positions_file should
+        # have the same number of lines as its equivalent file generated from the pharmcat position file
+        assert n_uniallelic_file_lines == n_tmp_uniallelic_lines, \
+            'mismatching numbers of positions between %s and %s' % \
+            (helpers.uniallelic_pharmcat_positions_file.name, helpers.pharmcat_positions_file.name)
+
 
 def test_extract_pgx_regions():
     vcf_file = helpers.test_dir / 'raw.vcf.bgz'