From 6854ec3d7cd51982cfa8b712ed1286f3381d4472 Mon Sep 17 00:00:00 2001 From: Binglan Li Date: Mon, 4 Nov 2024 18:10:16 -0800 Subject: [PATCH] fix(preprocessor): improve the check on the gVCF END block check on the block length annotated by the END field in the INFO column. Avoid confusions caused by accidental END string and variant annotations Fix-for: #199 --- preprocessor/preprocessor/utilities.py | 16 +++++++++++++--- preprocessor/tests/test_gvcf_block.vcf | 8 ++++++++ preprocessor/tests/test_not_gvcf_block.vcf | 9 +++++++++ preprocessor/tests/test_utilities.py | 6 ++++++ 4 files changed, 36 insertions(+), 3 deletions(-) create mode 100644 preprocessor/tests/test_gvcf_block.vcf create mode 100644 preprocessor/tests/test_not_gvcf_block.vcf diff --git a/preprocessor/preprocessor/utilities.py b/preprocessor/preprocessor/utilities.py index 9160b3c4..1a9af114 100644 --- a/preprocessor/preprocessor/utilities.py +++ b/preprocessor/preprocessor/utilities.py @@ -320,9 +320,19 @@ def _check_for_gvcf(in_f) -> bool: else: line = line.rstrip('\n') fields: List[str] = line.split('\t') - # check whether input is a block gVCF - if re.search('END', fields[7]): - return True + v_len: int = abs(len(fields[4]) - len(fields[3])) + v_start: int = int(fields[1]) + end_in_info = re.search(r"[;|]?END=(\d+)", fields[7]) + if end_in_info: + end_pos: int = int(end_in_info.group(1)) + # calculate the length of the variant block indicated by the END annotation in the INFO column + end_block = end_pos - v_start + if end_block > v_len: + return True + elif end_block == v_len: # not a gVCF if an END block simply annotates the variant length + return False + else: # when end_len < v_len, it is unclear what END annotation stands for + return False return False diff --git a/preprocessor/tests/test_gvcf_block.vcf b/preprocessor/tests/test_gvcf_block.vcf new file mode 100644 index 00000000..8315a3f7 --- /dev/null +++ b/preprocessor/tests/test_gvcf_block.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.1 +##source=PharmCAT allele definitions +##reference=hg38 +##contig= +##FILTER= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample_1 +chr1 123456 . G A . PASS END=223456 GT 0|0 diff --git a/preprocessor/tests/test_not_gvcf_block.vcf b/preprocessor/tests/test_not_gvcf_block.vcf new file mode 100644 index 00000000..8f439114 --- /dev/null +++ b/preprocessor/tests/test_not_gvcf_block.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.1 +##source=PharmCAT allele definitions +##reference=hg38 +##contig= +##FILTER= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample_1 +chr1 123456 . A T . PASS P|ABEND5|ABEND5 GT 0|1 +chr1 123456 . GGT G . PASS END=123458;annotation_of_variant_ending_position GT 0|1 diff --git a/preprocessor/tests/test_utilities.py b/preprocessor/tests/test_utilities.py index 4d171219..e1d1d132 100644 --- a/preprocessor/tests/test_utilities.py +++ b/preprocessor/tests/test_utilities.py @@ -160,8 +160,14 @@ def test_is_vcf_file(): def test_is_gvcf_file(): test_vcf_file = helpers.test_dir / 'raw.vcf.bgz' test_gvcf_file = helpers.test_dir / 'test.g.vcf.bgz' + test_vcf_end_file = helpers.test_dir / 'test_not_gvcf_block.vcf' + test_gvcf_end_file = helpers.test_dir / 'test_gvcf_block.vcf' with tempfile.TemporaryDirectory() as td: assert not utils.is_gvcf_file(test_vcf_file) + # is gVCF as the END annotation indicates a large genomic block + assert utils.is_gvcf_file(test_gvcf_end_file) + # is not gVCF as the END annotation simply indicates the variant length + assert not utils.is_gvcf_file(test_vcf_end_file) tmp_dir = Path(td) f1 = tmp_dir / 'test.g.vcf.bgz'