diff --git a/README.md b/README.md index c92aaf9..2af9a05 100644 --- a/README.md +++ b/README.md @@ -5,21 +5,35 @@ Small library for parsing vcf files. Based on [PyVCF](https://github.com/jamesca ```python3 from vcf_parser import parser - my_parser = parser.VCFParser('infile.vcf') + my_parser = parser.VCFParser(infile='infile.vcf') for variant in my_parser: print(variant) ``` -**vcf_parser also works on streams now.** +**vcf_parser can split multiallelic calls in vcf now.** Vcf parser is really a lightweight version of [PyVCF](https://github.com/jamescasbon/PyVCF) with most of it's code borrowed and modified from there. -The idea was to make a faster and more flexible tool that mostly work with python dictionarys. -The drawback is inacurracy, while **PyVCF** tests if each row in the vcf is on the correct format vcf_parser is much more sloppier. +The idea was to make a faster and more flexible tool that mostly works with python dictionaries. +It is less accurate: while **PyVCF** tests if each row in the vcf is in the correct format, vcf_parser is much sloppier. It is easy to access information for each variant, edit the information and edit the headers. +## Basic function ## + + Returns a dictionary with the vcf info for each variant. +To split the multiallelic calls (with accurate splitting of the INFO field, including the VEP CSQ field) use: + + my_parser = parser.VCFParser(infile='infile.vcf', split_variants=True) + +The ordinary vcf entries are stored by their header names, like + + variant['CHROM'] + variant['ALT'] + +etc. 
+ The genotype information is converted to a genotype object and stored in a dictionary variant['genotypes'] @@ -53,7 +67,8 @@ Vep information, if present, is parsed into and looks like: - 'vep_info': {'NOC2L': {'Allele': 'G', + 'vep_info': {: { + 'Allele': 'G', 'Amino_acids': '', 'CDS_position': '', 'Codons': '', @@ -74,7 +89,8 @@ and looks like: 'SYMBOL': 'NOC2L', 'SYMBOL_SOURCE': '', 'cDNA_position': ''}, - 'SAMD11': {'Allele': 'G', + : { + 'Allele': 'G', 'Amino_acids': '', 'CDS_position': '', 'Codons': '', @@ -94,36 +110,43 @@ and looks like: 'STRAND': '1', 'SYMBOL': 'SAMD11', 'SYMBOL_SOURCE': '', - 'cDNA_position': ''}} + 'cDNA_position': '' + } + 'gene_ids':set([SAMD11, NOC2L]) + } -INFO field is parsed into +The INFO field is parsed into a dictionary whose keys are the names of the info fields. Values are lists; if there is no value in the vcf, the value in info_dict is False. The dictionary is stored in variant['info_dict] and looks like - 'info_dict': {'AC': '1', - 'AF': '0.167', - 'AN': '6', - 'BaseQRankSum': '2.286', - 'DB': True, - 'DP': '1306', - 'FS': '1.539', - 'InbreedingCoeff': '0.1379', - 'MQ': '39.83', - 'MQ0': '0', - 'MQRankSum': '-2.146', - 'POSITIVE_TRAIN_SITE': True, - 'QD': '29.57', - 'ReadPosRankSum': '0.897', - 'VQSLOD': '4.52', - 'culprit': 'FS', - 'set': 'variant'} - - -###Print a variant in it´s original format:### - - print '\t'.join([[variant[head] for head in my_parser.header]) + 'info_dict': {'AC': ['1'], + 'AF': ['0.167'], + 'AN': ['6'], + 'BaseQRankSum': ['2.286'], + 'DB': False, + 'DP': ['1306'], + 'FS': ['1.539'], + 'InbreedingCoeff': ['0.1379'], + 'MQ': ['39.83'], + 'MQ0': ['0'], + 'MQRankSum': ['-2.146'], + 'POSITIVE_TRAIN_SITE': False, + 'QD': ['29.57'], + 'ReadPosRankSum': ['0.897'], + 'VQSLOD': ['4.52'], + 'culprit': ['FS'], + 'set': ['variant']} + + +### Print a vcf in its original format: ### + + my_parser = parser.VCFParser(infile='infile.vcf') + for line in my_parser.metadata.print_header(): + print(line) + for variant in my_parser: + 
print('\t'.join([variant[head] for head in my_parser.header])) ###Add metadata information:### diff --git a/setup.py b/setup.py index 6a1ec47..d709c52 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ long_description = 'Tool for parsing Variant Call Format (VCF) files. Works like a lightweight version of PyVCF.' setup(name='vcf_parser', - version='0.8.3', + version='0.9', description='Parsing vcf files', author = 'Mans Magnusson', author_email = 'mans.magnusson@scilifelab.se',