#!/usr/bin/env python3
# FICLE
# Authors: Szi Kay Leung
__author__ = "[email protected]"
__version__ = '1.1.4' # Python 3.7
## Load Libraries
import gtfparse
from gtfparse import read_gtf
import pandas as pd
import numpy as np
import collections
from collections import Counter
import csv
import sys
import shutil
import os
import argparse
sys.path.append("/src")
from src import prepare_and_parse as prep
from src import matching_exons as me
from src import identify_exon_skipping as es
from src import alternative_promoter_terminator as apat
from src import identify_novel_exons as ne
from src import identify_intron_retention as ir
from src import identify_alternativeprime as aprime
from src import finalise_output as fo
from src import generate_multiregion as gm
from src import classify_mapt_isoforms as mapt
pd.options.mode.chained_assignment = None # default='warn'
def directory(raw_path):
    if not os.path.isdir(raw_path):
        raise argparse.ArgumentTypeError('"{}" is not an existing directory'.format(raw_path))
    return os.path.abspath(raw_path)
def annotate_gene(args):
    # prepare output directories
    args = prep.create_output_dir(args)

    # subset reference from gene
    prep.subset_reference(args)
    output_log = open(args.gene_stats_dir + args.genename + "parsed_transcripts.txt", "w")

    # check the species dataset based on the gene input
    # human gene symbols are all in capitals
    if args.genename.isupper():
        species = "human"
    else:
        species = "mouse"
    print("Working with", species, "dataset")

    # ORF for NMD prediction
    if args.cpat:
        ORF = pd.read_csv(args.cpat, sep="\t")
    else:
        print("Not using ORF for classification")
        ORF = None

    # read in gencode and transcriptome gtf
    gencode, order = prep.parse_gencode_reference(args)
    gencode.rename({'transcript': 'transcript_id'}, axis=1, inplace=True)
    df = prep.parse_transcriptome_gtf(args, order)

    if len(df) > 0:
        # parse through the transcriptome, classify and filter
        All_FilteredParsed = []
        for count, transcript in enumerate(df['transcript_id'].unique()):
            #print(transcript)
            parsed = prep.parse_transcript(gencode, df, transcript, 10, order)
            All_FilteredParsed.append(prep.filter_parsed_transcript(gencode, parsed, output_log))
            if count % 50 == 0:
                print("Parsing through transcript", count)

        # aggregate the filtered parsed output from each transcript into one flat list
        All_FilteredParsed = [x for l in All_FilteredParsed for x in l]

        # QC: check that the transcripts in the original transcriptome gtf are captured in the final list
        if not np.all(np.isin([i.split(';', 3)[0] for i in All_FilteredParsed], df["transcript_id"].unique())):
            print("Mismatch transcripts")
            print(set(df["transcript_id"].unique()) - set([i.split(';', 3)[0] for i in All_FilteredParsed]))
            sys.exit(-1)

        print("Identifying transcripts with novel exons")
        NE = ne.identify_novel_exon(args, df, gencode, All_FilteredParsed)
        NE_classify, NExons_BeyondFirst, NExons_BeyondFirstLast, NExons_BeyondLast, NExons_Internal, NExons_First, NExons_Last = ne.classify_novel_exon(args, gencode, order, df, NE, All_FilteredParsed)
        NE_Counts = ne.novel_exon_stats(args, NE, NE_classify)

        if not args.novelexon:
            # check for transcripts matching all or some known exons
            AllKnownMatch, SomeMatch = me.identify_all_matching_exons(gencode, df, All_FilteredParsed)

            # tabulate exon presence
            print("Tabulating exon presence")
            exon_tab = es.tabulate_exon_presence(args, gencode, df, All_FilteredParsed)

            if args.genename == "MAPT" or args.genename == "Mapt":
                print("Further classifying MAPT isoforms by exons 2, 3 and 10")
                mapt_exon_tab, mapt_exon_tab_counts = mapt.classify_mapt_isoforms(species, exon_tab, gencode)
                mapt_exon_tab.to_csv(args.gene_stats_dir + args.genename + "_further_classifications.csv", index_label="isoform")
                mapt_exon_tab_counts.to_csv(args.gene_stats_dir + args.genename + "_further_classifications_counts.csv")

            # exon skipping
            print("Processing transcripts for exon skipping")
            ES = es.identify_exon_skipping(args, gencode, exon_tab, All_FilteredParsed)
            ES = es.skip_not_AFexons(ES, gencode)
            ES_Counts, ES_Transcripts = es.output_exon_skipping_stats(args, ES)

            # alternative first promoter
            Alternative_First_Promoter = apat.identify_alternative_promoter(df, gencode, All_FilteredParsed)

            # alternative termination
            Alternative_Termination = apat.identify_alternative_termination(df, gencode, All_FilteredParsed)

            # alternative first exon
            AF, AF_Counts, AF_Transcripts = aprime.identify_alternative_first(df, All_FilteredParsed)

            # intron retention
            print("Identifying transcripts with intron retention")
            IR_Counts, IR_Transcripts, IR_Exon1, IR_LastExon = ir.identify_intron_retention(args, df, All_FilteredParsed, gencode)

            # alternative 5' and 3' splice sites
            print("Identifying transcripts with alternative 5' and 3' sites")
            A5A3_Counts, A5A3_Transcripts = aprime.identify_A5A3_transcripts(args, df, All_FilteredParsed)

            # final output
            # event lists: each category is generated by the functions above and contains the transcripts under that event type
            categories = [AllKnownMatch, SomeMatch, A5A3_Transcripts, AF_Transcripts,
                          Alternative_First_Promoter, Alternative_Termination,
                          ES_Transcripts, IR_Transcripts, IR_Exon1, IR_LastExon, NExons_BeyondFirst,
                          NExons_Internal, NExons_BeyondLast, NExons_BeyondFirstLast]
            Transcript_Classifications = fo.generate_aggregated_classification(df, categories)

            if args.input_bed is not None:
                print("**** Generating bed files by AS events ****")
                fo.prioritise_write_output(args, df, Transcript_Classifications)
            else:
                print("**** Skipping generation of bed files by AS events ****")

            fo.populate_classification(args, Transcript_Classifications, A5A3_Counts, IR_Counts, ES_Counts, NE_Counts)
    else:
        print("No detected isoforms")
        shutil.rmtree(args.gene_root_dir)

    print("**** All Done! ****")
def main():
    parser = argparse.ArgumentParser(description="Full Isoform Characterisation from (Targeted) Long-read Experiments")
    parser.add_argument("-n", "--genename", help='\t\tTarget gene symbol')
    parser.add_argument("-r", "--reference", help='\t\tGene reference annotation (<gene>_gencode.gtf)')
    parser.add_argument("-b", "--input_bed", help='\t\tInput bed file of all the final transcripts in the long-read-derived transcriptome.', required=False, default=None)
    parser.add_argument("-g", "--input_gtf", help='\t\tInput gtf file of all the final transcripts in the long-read-derived transcriptome.')
    parser.add_argument("-c", "--input_class", help='\t\tSQANTI classification file')
    parser.add_argument("--cpat", help='\t\tORF_prob.best.tsv file generated from CPAT', required=False)
    parser.add_argument("--novelexon", action='store_true', help='\t\tExtract novel exons only', required=False)
    parser.add_argument("-o", "--output_dir", type=directory, help='\t\tOutput path for the annotation and associated files')
    parser.add_argument("-v", "--version", help="Display program version number.", action='version', version='FICLE ' + str(__version__))
    args = parser.parse_args()

    print("************ Running FICLE...", file=sys.stdout)
    print("version:", __version__)
    if args.novelexon:
        print("**** Extracting novel exons only ****")

    annotate_gene(args)


if __name__ == "__main__":
    main()
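
# Example invocation (an illustrative sketch only; the gene name and input file names
# below are assumptions for demonstration and are not shipped with the repository,
# but the flags match the argparse options defined above):
#
#   python ficle.py \
#       --genename MAPT \
#       --reference MAPT_gencode.gtf \
#       --input_gtf isoseq_transcripts.gtf \
#       --input_bed isoseq_transcripts.bed \
#       --input_class sqanti_classification.txt \
#       --cpat ORF_prob.best.tsv \
#       --output_dir ./ficle_output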