Skip to content

Commit

Permalink
Merge pull request #66 from griffithlab/gene_expr_id
Browse files (browse the repository at this point in the history)
Update the expression annotator to use the gene (ID) column in Kallisto files
  • Loading branch information
susannasiebert authored Oct 16, 2023
2 parents a25f1f1 + a2e8f86 commit 47e6688
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 19 deletions.
2 changes: 1 addition & 1 deletion tests/test_data/input.kallisto.gx.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,4 @@
##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence type as predicted by VEP. Format: Allele|Gene|Feature|Feature_type|Consequence|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|DISTANCE|STRAND|CANONICAL|SYMBOL|SYMBOL_SOURCE|SIFT|PolyPhen|HGVSc|HGVSp|DownstreamProtein|ProteinLengthChange|WildtypeProtein">
##FORMAT=<ID=GX,Number=.,Type=String,Description="Gene Expressions">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT H_NJ-HCC1395-HCC1395
22 18644673 . C T . . CSQ=T|ENSG00000184979|ENST00000215794|Transcript|missense_variant|801|371|124|A/V|gCc/gTc|||||||1|YES|USP18|HGNC|tolerated(1)|benign(0.162)|ENST00000215794.7%3Ac.371C>T|ENSP00000215794.7%3Ap.Ala124Val|||MSKAFGLLRQICQSILAESSQSPADLEEKKEEDSNMKREQPRERPRAWDYPHGLVGLHNIGQTCCLNSLIQVFVMNVDFTRILKRITVPRGADEQRRSVPFQMLLLLEKMQDSRQKAVRPLELAYCLQKCNVPLFVQHDAAQLYLKLWNLIKDQITDVHLVERLQALYTIRVKDSLICVDCAMESSRNSSMLTLPLSLFDVDSKPLKTLEDALHCFFQPRELSSKSKCFCENCGKKTRGKQVLKLTHLPQTLTIHLMRFSIRNSQTRKICHSLYFPQSLDFSQILPMKRESCDAEEQSGGQYELFAVIAHVGMADSGHYCVYIRNAVDGKWFCFNDSNICLVSWEDIQCTYGNPNYHWQETAYLLVYMKMEC GT:BQ:SS:FDP:SDP:SUBDP:AU:CU:GU:TU:FT:FA:TLOD:GX 0/1:.:2:1:0:0:0,0:106,108:0,0:6,6:PASS:0.04:7.56609:USP18|18.4008276190378
22 18644673 . C T . . CSQ=T|ENSG00000184979|ENST00000215794|Transcript|missense_variant|801|371|124|A/V|gCc/gTc|||||||1|YES|USP18|HGNC|tolerated(1)|benign(0.162)|ENST00000215794.7%3Ac.371C>T|ENSP00000215794.7%3Ap.Ala124Val|||MSKAFGLLRQICQSILAESSQSPADLEEKKEEDSNMKREQPRERPRAWDYPHGLVGLHNIGQTCCLNSLIQVFVMNVDFTRILKRITVPRGADEQRRSVPFQMLLLLEKMQDSRQKAVRPLELAYCLQKCNVPLFVQHDAAQLYLKLWNLIKDQITDVHLVERLQALYTIRVKDSLICVDCAMESSRNSSMLTLPLSLFDVDSKPLKTLEDALHCFFQPRELSSKSKCFCENCGKKTRGKQVLKLTHLPQTLTIHLMRFSIRNSQTRKICHSLYFPQSLDFSQILPMKRESCDAEEQSGGQYELFAVIAHVGMADSGHYCVYIRNAVDGKWFCFNDSNICLVSWEDIQCTYGNPNYHWQETAYLLVYMKMEC GT:BQ:SS:FDP:SDP:SUBDP:AU:CU:GU:TU:FT:FA:TLOD:GX 0/1:.:2:1:0:0:0,0:106,108:0,0:6,6:PASS:0.04:7.56609:ENSG00000184979|18.4008276190378
20 changes: 10 additions & 10 deletions tests/test_data/kallisto.genes
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
gene_name gene abundance counts length
A1BG A1BG 9.5072780253347 253.812537262238 1113.77371499879
A1CF A1CF 0.170005984522626 33.3184303003113 8176.36764900294
A2M A2M 4.49761004558132 110 1020.35424350425
A2ML1 A2ML1 1.78407197701224 59.8726175228087 1400.09002532749
A2MP1 A2MP1 0.233702196740902 15 2677.74090252527
A3GALT2 A3GALT2 0.119577011382624 2.38061461180681 830.580036203776
A4GALT A4GALT 0.569898611254872 15 1098.07941073092
A4GNT A4GNT 0.0264285588681354 1 1578.58003620378
AAAS AAAS 18.4008276190378 475 1076.95198457067
USP18 USP18 18.4008276190378 475 1076.95198457067
A1BG ENSG00000121410 9.5072780253347 253.812537262238 1113.77371499879
A1CF ENSG00000148584 0.170005984522626 33.3184303003113 8176.36764900294
A2M ENSG00000175899 4.49761004558132 110 1020.35424350425
A2ML1 ENSG00000166535 1.78407197701224 59.8726175228087 1400.09002532749
A2MP1 ENSG00000291190 0.233702196740902 15 2677.74090252527
A3GALT2 ENSG00000184389 0.119577011382624 2.38061461180681 830.580036203776
A4GALT ENSG00000128274 0.569898611254872 15 1098.07941073092
A4GNT ENSG00000118017 0.0264285588681354 1 1578.58003620378
AAAS ENSG00000094914 18.4008276190378 475 1076.95198457067
USP18 ENSG00000184979 18.4008276190378 475 1076.95198457067
12 changes: 4 additions & 8 deletions vatools/vcf_expression_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def resolve_id_column(args):
return 'tracking_id'
elif args.format == 'kallisto':
if args.mode == 'gene':
- return 'gene_name'
+ return 'gene'
elif args.mode == 'transcript':
return 'target_id'
elif args.format == 'stringtie':
Expand Down Expand Up @@ -156,7 +156,7 @@ def define_parser():
)
parser.add_argument(
"-i", "--id-column",
- help="The column header in the expression_file for the column containing gene names/transcript ids. Required when using the `custom` format."
+ help="The column header in the expression_file for the column containing gene/transcript ids. Required when using the `custom` format."
)
parser.add_argument(
"-e", "--expression-column",
Expand Down Expand Up @@ -210,12 +210,8 @@ def main(args_input = sys.argv[1:]):
for key, value in zip(csq_format, transcript.split('|')):
if key == 'Feature' and value != '' and not value.startswith('ENSR'):
transcript_ids.add(value)
- if args.format == 'kallisto':
- if key == 'SYMBOL' and value != '':
- genes.add(value)
- else:
- if key == 'Gene' and value != '':
- genes.add(value)
+ if key == 'Gene' and value != '':
+ genes.add(value)

if args.mode == 'gene':
genes = list(genes)
Expand Down

0 comments on commit 47e6688

Please sign in to comment.