Skip to content

Commit

Permalink
fix(preprocessor): improve the check on the gVCF END block
Browse files Browse the repository at this point in the history
check on the block length annotated by the END field in the INFO column. Avoid confusions caused by accidental END string and variant annotations

Fix-for: #199
  • Loading branch information
BinglanLi committed Nov 5, 2024
1 parent cacece3 commit 6854ec3
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 3 deletions.
16 changes: 13 additions & 3 deletions preprocessor/preprocessor/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,9 +320,19 @@ def _check_for_gvcf(in_f) -> bool:
else:
line = line.rstrip('\n')
fields: List[str] = line.split('\t')
# check whether input is a block gVCF
if re.search('END', fields[7]):
return True
v_len: int = abs(len(fields[4]) - len(fields[3]))
v_start: int = int(fields[1])
end_in_info = re.search(r"[;|]?END=(\d+)", fields[7])
if end_in_info:
end_pos: int = int(end_in_info.group(1))
# calculate the length of the variant block indicated by the END annotation in the INFO column
end_block = end_pos - v_start
if end_block > v_len:
return True
elif end_block == v_len: # not a gVCF if an END block simply annotates the variant length
return False
else: # when end_len < v_len, it is unclear what END annotation stands for
return False
return False


Expand Down
8 changes: 8 additions & 0 deletions preprocessor/tests/test_gvcf_block.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
##fileformat=VCFv4.1
##source=PharmCAT allele definitions
##reference=hg38
##contig=<ID=chr11,assembly=hg38,species="Homo sapiens">
##FILTER=<ID=PASS,Description="All filters passed">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample_1
chr1 123456 . G A . PASS END=223456 GT 0|0
9 changes: 9 additions & 0 deletions preprocessor/tests/test_not_gvcf_block.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
##fileformat=VCFv4.1
##source=PharmCAT allele definitions
##reference=hg38
##contig=<ID=chr11,assembly=hg38,species="Homo sapiens">
##FILTER=<ID=PASS,Description="All filters passed">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample_1
chr1 123456 . A T . PASS P|ABEND5|ABEND5 GT 0|1
chr1 123456 . GGT G . PASS END=123458;annotation_of_variant_ending_position GT 0|1
6 changes: 6 additions & 0 deletions preprocessor/tests/test_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,14 @@ def test_is_vcf_file():
def test_is_gvcf_file():
test_vcf_file = helpers.test_dir / 'raw.vcf.bgz'
test_gvcf_file = helpers.test_dir / 'test.g.vcf.bgz'
test_vcf_end_file = helpers.test_dir / 'test_not_gvcf_block.vcf'
test_gvcf_end_file = helpers.test_dir / 'test_gvcf_block.vcf'
with tempfile.TemporaryDirectory() as td:
assert not utils.is_gvcf_file(test_vcf_file)
# is gVCF as the END annotation indicates a large genomic block
assert utils.is_gvcf_file(test_gvcf_end_file)
# is not gVCF as the END annotation simply indicates the variant length
assert not utils.is_gvcf_file(test_vcf_end_file)

tmp_dir = Path(td)
f1 = tmp_dir / 'test.g.vcf.bgz'
Expand Down

0 comments on commit 6854ec3

Please sign in to comment.