diff --git a/tests/test_data/vep_annotation_reporter/input.duplicate_variant.vcf.gz b/tests/test_data/vep_annotation_reporter/input.duplicate_variant.vcf.gz new file mode 100644 index 0000000..8cd8c83 Binary files /dev/null and b/tests/test_data/vep_annotation_reporter/input.duplicate_variant.vcf.gz differ diff --git a/tests/test_data/vep_annotation_reporter/output.duplicate_variant.tsv b/tests/test_data/vep_annotation_reporter/output.duplicate_variant.tsv new file mode 100644 index 0000000..2bf263c --- /dev/null +++ b/tests/test_data/vep_annotation_reporter/output.duplicate_variant.tsv @@ -0,0 +1,2 @@ +CHROM POS REF ALT SYMBOL +chr17 7675088 C T TP53 diff --git a/tests/test_vep_annotation_reporter.py b/tests/test_vep_annotation_reporter.py index b32a1bf..7834513 100644 --- a/tests/test_vep_annotation_reporter.py +++ b/tests/test_vep_annotation_reporter.py @@ -120,3 +120,17 @@ def test_vcf_with_multiple_transcripts_and_no_pick(self): vep_annotation_reporter.main(command) self.assertTrue(cmp(os.path.join(self.test_data_dir, 'output.merge_multiple_transcripts.tsv'), os.path.join(temp_path.name, 'input.tsv'))) temp_path.cleanup() + + def test_vcf_with_duplicate_variant(self): + logging.disable(logging.NOTSET) + with LogCapture() as l: + temp_path = tempfile.TemporaryDirectory() + os.symlink(os.path.join(self.test_data_dir, 'input.duplicate_variant.vcf.gz'), os.path.join(temp_path.name, 'input.vcf.gz')) + command = [ + os.path.join(temp_path.name, 'input.vcf.gz'), + 'SYMBOL', + ] + vep_annotation_reporter.main(command) + self.assertTrue(cmp(os.path.join(self.test_data_dir, 'output.duplicate_variant.tsv'), os.path.join(temp_path.name, 'input.tsv'))) + temp_path.cleanup() + l.check_present(('root', 'WARNING', "VEP entry at CHR chr17, POS 7675088, REF C , ALT T already exists. Skipping subsequent entries.")) diff --git a/vatools/vep_annotation_reporter.py b/vatools/vep_annotation_reporter.py index b70e733..5ad5c8e 100644 --- a/vatools/vep_annotation_reporter.py +++ b/vatools/vep_annotation_reporter.py @@ -8,6 +8,7 @@ import tempfile import csv import binascii +import logging def define_parser(): parser = argparse.ArgumentParser( @@ -146,7 +147,7 @@ def extract_vep_fields(args): else: vep[chr][pos][ref][alt] = None else: - sys.exit("VEP entry for at CHR %s, POS %s, REF %s , ALT % already exists" % (chr, pos, ref, alt) ) + logging.warning("VEP entry at CHR %s, POS %s, REF %s , ALT %s already exists. Skipping subsequent entries." % (chr, pos, ref, alt) ) vcf_reader.close() return vep @@ -192,6 +193,7 @@ def main(args_input = sys.argv[1:]): with open(output_file, 'w') as output_filehandle: writer = csv.DictWriter(output_filehandle, fieldnames = ['CHROM', 'POS', 'REF', 'ALT'] + args.vep_fields, delimiter = "\t") writer.writeheader() + rows = [] for variant in vcf_reader: row = { 'CHROM': str(variant.CHROM), @@ -200,7 +202,9 @@ def main(args_input = sys.argv[1:]): 'ALT' : ','.join(map(lambda a: a.serialize(), variant.ALT)), } row = add_vep_fields_to_row(args, row, vep) - writer.writerow(row) + if row not in rows: + rows.append(row) + writer.writerows(rows) if __name__ == '__main__': main()