Skip to content

Commit

Permalink
fix(preprocessor): add a check and test on the GVCFBlock comment in t…
Browse files Browse the repository at this point in the history
…he gVCF header lines
  • Loading branch information
BinglanLi committed Nov 6, 2024
1 parent 6854ec3 commit 76b31dd
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 17 deletions.
45 changes: 31 additions & 14 deletions preprocessor/preprocessor/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,23 +316,40 @@ def _is_gvcf_file(file: Path) -> bool:
def _check_for_gvcf(in_f) -> bool:
for line in in_f:
if line[0] == '#':
continue
# a gVCF block in the header is indicative of a gVCF file
if re.search('##GVCFBlock', line):
return True
# when there is no indicative gVCF header lines, continue to check the vcf content
else:
continue
else:
line = line.rstrip('\n')
# vcf header fields: CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample
fields: List[str] = line.split('\t')
v_len: int = abs(len(fields[4]) - len(fields[3]))
v_start: int = int(fields[1])
end_in_info = re.search(r"[;|]?END=(\d+)", fields[7])
if end_in_info:
end_pos: int = int(end_in_info.group(1))
# calculate the length of the variant block indicated by the END annotation in the INFO column
end_block = end_pos - v_start
if end_block > v_len:
return True
elif end_block == v_len: # not a gVCF if an END block simply annotates the variant length
return False
else: # when end_len < v_len, it is unclear what END annotation stands for
return False

# according to the vcf specs, the ALT column must
# be an unspecified allele, <NON_REF> or <*> (recommended by the VCF specs)
v_alt = fields[4]
if v_alt == '<NON_REF>' or v_alt == '<*>': # check if ALT has an unspecified allele
# find the END annotation in the INFO column
end_in_info = re.search(r"[;|]?END=(\d+)", fields[7])
if end_in_info:
# get the starting position
v_start: int = int(fields[1])
# identify the END genomic position
end_pos: int = int(end_in_info.group(1))
# calculate the length of the variant block indicated by the END annotation in the INFO column
end_block = end_pos - v_start
if end_block > 1:
return True
elif end_block == 1: # not a gVCF if an END block simply annotates the variant length
# when the gVCF block is only of 1 basepair long, it's equivalent to a SNP and
# it's possible to annotate ALT based on PharmCAT position vcf
return False
else: # when end_len < 1, it is unclear what END annotation stands for
return False
else:
continue
return False


Expand Down
4 changes: 2 additions & 2 deletions preprocessor/tests/test_gvcf_block.vcf
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
##fileformat=VCFv4.1
##source=PharmCAT allele definitions
##reference=hg38
##contig=<ID=chr11,assembly=hg38,species="Homo sapiens">
##contig=<ID=chr1,assembly=hg38,species="Homo sapiens">
##FILTER=<ID=PASS,Description="All filters passed">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample_1
chr1 123456 . G A . PASS END=223456 GT 0|0
chr1 123456 . G <*> . PASS END=223456 GT 0|0
9 changes: 9 additions & 0 deletions preprocessor/tests/test_gvcf_block_in_header.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
##fileformat=VCFv4.1
##source=PharmCAT allele definitions
##reference=hg38
##GVCFBlock55-56=minGQ=55(inclusive),maxGQ=56(exclusive)
##contig=<ID=chr1,assembly=hg38,species="Homo sapiens">
##FILTER=<ID=PASS,Description="All filters passed">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample_1
chr1 123456 . G <*> . PASS END=123457 GT 0|0
2 changes: 1 addition & 1 deletion preprocessor/tests/test_not_gvcf_block.vcf
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
##fileformat=VCFv4.1
##source=PharmCAT allele definitions
##reference=hg38
##contig=<ID=chr11,assembly=hg38,species="Homo sapiens">
##contig=<ID=chr1,assembly=hg38,species="Homo sapiens">
##FILTER=<ID=PASS,Description="All filters passed">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample_1
Expand Down
3 changes: 3 additions & 0 deletions preprocessor/tests/test_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,10 +162,13 @@ def test_is_gvcf_file():
test_gvcf_file = helpers.test_dir / 'test.g.vcf.bgz'
test_vcf_end_file = helpers.test_dir / 'test_not_gvcf_block.vcf'
test_gvcf_end_file = helpers.test_dir / 'test_gvcf_block.vcf'
test_gvcf_header_file = helpers.test_dir / 'test_gvcf_block_in_header.vcf'
with tempfile.TemporaryDirectory() as td:
assert not utils.is_gvcf_file(test_vcf_file)
# is gVCF as the END annotation indicates a large genomic block
assert utils.is_gvcf_file(test_gvcf_end_file)
# is gVCF as indicated by the 'GVCFBlock' comment in the header
assert utils.is_gvcf_file(test_gvcf_header_file)
# is not gVCF as the END annotation simply indicates the variant length
assert not utils.is_gvcf_file(test_vcf_end_file)

Expand Down

0 comments on commit 76b31dd

Please sign in to comment.