Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
danilotat authored Dec 27, 2020
0 parents commit 6b85ce5
Show file tree
Hide file tree
Showing 5 changed files with 614 additions and 0 deletions.
173 changes: 173 additions & 0 deletions Extend_by_threshold_fw.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
#!/usr/bin/env python3

from collections import OrderedDict,namedtuple
from gtf_advanced_parser import *
import argparse

class GTF_element(object):
def __init__(self,chromosome,source,feature_type,start,end,score,strand,phase,attributes):
self.chromosome=chromosome
self.source=source
self.feature_type=feature_type
self.start=int(start)
self.end=int(end)
self.score=score
self.strand=strand
self.phase=phase
self.attributes=attributes

def gtf_line(self):
'''Passing a GTF_element, return a line as string'''
return f'{self.chromosome}\t{self.source}\t{self.feature_type}\t{self.start}\t{self.end}\t{self.score}\t{self.strand}\t{self.phase}\t{self.attributes}'

def extend_return(dict_type,threshold,transcript_attributes):
try:
for item in dict_type[transcript_attributes['transcript_id']]:
item_as_gtf=GTF_element(*item.split('\t'))
if item_as_gtf.feature_type == 'stop_codon' or item_as_gtf.feature_type == 'CDS':
print(item_as_gtf.gtf_line())
else:
if item_as_gtf.end == gene_id_coords[prev_gene_id][2]:
item_as_gtf.end+=threshold
print(item_as_gtf.gtf_line())
else:
print(item_as_gtf.gtf_line())
except KeyError:
pass
def iterate_and_extend(gene_id,threshold):
gene_record = GTF_element(*gene_id_records[gene_id].split('\t'))
gene_record.end+=threshold
gene_gtf_line=gene_record.gtf_line()
print(gene_gtf_line)
for transcript in transcript_id_records[gene_id]:
transcript_as_gtf=GTF_element(*transcript.split('\t'))
transcript_attributes=parse_attributes(transcript_as_gtf.attributes)
if transcript_as_gtf.end == gene_id_coords[gene_id][2]:
transcript_as_gtf.end+=threshold
transcript_gtf_line=transcript_as_gtf.gtf_line()
print(transcript_gtf_line)
# iterate through dict to return features
for k in list_of_dicts:
extend_return(k,threshold,transcript_attributes)
else:
print(transcript)
#internal transcript
for dict_type in list_of_dicts:
try:
for record in dict_type[transcript_attributes['transcript_id']]:
print(record)
except KeyError:
continue
try:
for misc in misc_features[gene_id]:
print(misc)
except KeyError:
pass

def iterate_not_extend(gene_id):
gene_record = GTF_element(*gene_id_records[gene_id].split('\t'))
gene_gtf_line=gene_record.gtf_line()
print(gene_gtf_line)
for transcript in transcript_id_records[gene_id]:
transcript_as_gtf=GTF_element(*transcript.split('\t'))
transcript_attributes=parse_attributes(transcript_as_gtf.attributes)
print(transcript)
for k in list_of_dicts:
try:
for record in k[transcript_attributes['transcript_id']]:
print(record)
except KeyError:
continue
try:
for misc in misc_features[gene_id]:
print(misc)
except KeyError:
pass

# ---- PARSE ARGUMENTS ----

parser = argparse.ArgumentParser(description="Extend 3' terminus of a GTF forward")
parser.add_argument("--input", help="Input file. Must be a fw only version of a valid GTF")
parser.add_argument("--threshold", type=int, default=1000, help="Threshold to use for 3'-terminal extension. Default 1000")
args=parser.parse_args()

# check if no argument was passed
if args.input == None:
print('No input provided. Exiting..')
exit()



fill_dict(gtf_file=args.input)
threshold=args.threshold



#first make a list of keys
list_keys=[]
for k in gene_id_coords.keys():
list_keys.append(k)

#iterate through records
list_of_dicts=[exons_records,cdss_records,five_prime_utrs_records,three_prime_utrs_records,start_stop_codons]
for idx,key in enumerate(list_keys):
if idx == 0:
continue
elif 0 < idx < len(list_keys) - 1:
cur_gene_id=list_keys[idx]
prev_gene_id=list_keys[idx-1]
#check if prev_gene is coding
prev_gene_as_gtf=GTF_element(*gene_id_records[prev_gene_id].split('\t'))
prev_gene_attributes=parse_attributes(prev_gene_as_gtf.attributes)
if prev_gene_attributes['gene_biotype'] == '"protein_coding"':
#check chromosome if same
if gene_id_coords[cur_gene_id][0] == gene_id_coords[prev_gene_id][0]:
residue=int(gene_id_coords[cur_gene_id][1]) - int(gene_id_coords[prev_gene_id][2])
#calc residue if higher than threshold
if residue > threshold:
iterate_and_extend(prev_gene_id,threshold)
elif 0 < residue < threshold:
# here extension could be done by a value lower than threshold
corrected_threshold=int(residue-1)
iterate_and_extend(prev_gene_id,corrected_threshold)
elif int(gene_id_coords[cur_gene_id][1]) < int(gene_id_coords[prev_gene_id][2]):
iterate_not_extend(prev_gene_id)
elif gene_id_coords[cur_gene_id][0] != gene_id_coords[prev_gene_id][0]:
#chromosome is different. Extend if possible
iterate_and_extend(prev_gene_id,threshold)
else:
#not coding gene
iterate_not_extend(prev_gene_id)
elif idx == len(list_keys) - 1:
cur_gene_id=list_keys[idx]
prev_gene_id=list_keys[idx-1]
#check if prev_gene is coding
prev_gene_as_gtf=GTF_element(*gene_id_records[prev_gene_id].split('\t'))
prev_gene_attributes=parse_attributes(prev_gene_as_gtf.attributes)
if prev_gene_attributes['gene_biotype'] == '"protein_coding"':
#check chromosome if same
if gene_id_coords[cur_gene_id][0] == gene_id_coords[prev_gene_id][0]:
residue=int(gene_id_coords[cur_gene_id][1]) - int(gene_id_coords[prev_gene_id][2])
#calc residue if higher than threshold
if residue > threshold:
iterate_and_extend(prev_gene_id,threshold)
elif 0 < residue < threshold:
# here extension could be done by a value lower than threshold
corrected_threshold=int(residue-1)
iterate_and_extend(prev_gene_id,corrected_threshold)
elif int(gene_id_coords[cur_gene_id][1]) < int(gene_id_coords[prev_gene_id][2]):
iterate_not_extend(prev_gene_id)
elif gene_id_coords[cur_gene_id][0] != gene_id_coords[prev_gene_id][0]:
#chromosome is different. Extend if possible
iterate_and_extend(prev_gene_id,threshold)
else:
#not coding gene
iterate_not_extend(prev_gene_id)
#last entry, check only if coding
cur_gene_as_gtf=GTF_element(*gene_id_records[cur_gene_id].split('\t'))
cur_gene_attributes=parse_attributes(cur_gene_as_gtf.attributes)
if cur_gene_attributes['gene_biotype'] == '"protein_coding"':
iterate_and_extend(cur_gene_id,threshold)
else:
iterate_not_extend(cur_gene_id)

166 changes: 166 additions & 0 deletions Extend_by_threshold_rev.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
from collections import OrderedDict,namedtuple
from gtf_advanced_parser import *

class GTF_element(object):
def __init__(self,chromosome,source,feature_type,start,end,score,strand,phase,attributes):
self.chromosome=chromosome
self.source=source
self.feature_type=feature_type
self.start=int(start)
self.end=int(end)
self.score=score
self.strand=strand
self.phase=phase
self.attributes=attributes

def gtf_line(self):
'''Passing a GTF_element, return a line as string'''
return f'{self.chromosome}\t{self.source}\t{self.feature_type}\t{self.start}\t{self.end}\t{self.score}\t{self.strand}\t{self.phase}\t{self.attributes}'

def extend_return_rev(dict_type, threshold, transcript_attributes):
try:
for item in dict_type[transcript_attributes['transcript_id']]:
item_as_gtf=GTF_element(*item.split('\t'))
if item_as_gtf.feature_type == 'stop_codon' or item_as_gtf.feature_type == 'CDS':
print(item_as_gtf.gtf_line())
else:
if item_as_gtf.start == gene_id_coords[cur_gene_id][1]:
item_as_gtf.start-=threshold
print(item_as_gtf.gtf_line())
else:
print(item_as_gtf.gtf_line())
except KeyError:
pass
def iterate_and_extend_rev(gene_id, threshold):
gene_record = GTF_element(*gene_id_records[gene_id].split('\t'))
gene_record.start-=threshold
gene_gtf_line=gene_record.gtf_line()
print(gene_gtf_line)
for transcript in transcript_id_records[gene_id]:
transcript_as_gtf=GTF_element(*transcript.split('\t'))
transcript_attributes=parse_attributes(transcript_as_gtf.attributes)
if transcript_as_gtf.start == gene_id_coords[gene_id][1]:
transcript_as_gtf.start-=threshold
transcript_gtf_line=transcript_as_gtf.gtf_line()
print(transcript_gtf_line)
# iterate through dict to return features
for k in list_of_dicts:
extend_return_rev(k,threshold,transcript_attributes)
else:
print(transcript)
#internal transcript
for dict_type in list_of_dicts:
try:
for record in dict_type[transcript_attributes['transcript_id']]:
print(record)
except KeyError:
continue
try:
for misc in misc_features[gene_id]:
print(misc)
except KeyError:
pass

def iterate_not_extend_rev(gene_id):
gene_record = GTF_element(*gene_id_records[gene_id].split('\t'))
gene_gtf_line=gene_record.gtf_line()
print(gene_gtf_line)
for transcript in transcript_id_records[gene_id]:
transcript_as_gtf=GTF_element(*transcript.split('\t'))
transcript_attributes=parse_attributes(transcript_as_gtf.attributes)
print(transcript)
for k in list_of_dicts:
try:
for record in k[transcript_attributes['transcript_id']]:
print(record)
except KeyError:
continue
try:
for misc in misc_features[gene_id]:
print(misc)
except KeyError:
pass
#first make a list of keys

parser = argparse.ArgumentParser(description="Extend 3' terminus of a GTF reverse")
parser.add_argument("--input", help="Input file. Must be a rev only version of a valid GTF")
parser.add_argument("--threshold", type=int, default=1000, help="Threshold to use for 3'-terminal extension. Default 1000")
args=parser.parse_args()

# check if no argument was passed
if args.input == None:
print('No input provided. Exiting..')
exit()



fill_dict(gtf_file=args.input)
threshold=args.threshold

list_keys=[]
for k in gene_id_coords.keys():
list_keys.append(k)

#iterate through records
list_of_dicts=[exons_records,cdss_records,five_prime_utrs_records,three_prime_utrs_records,start_stop_codons]
for idx, key in enumerate(list_keys):
if idx==0:
cur_gene_id=list_keys[idx]
cur_gene_as_gtf=GTF_element(*gene_id_records[cur_gene_id].split('\t'))
cur_gene_attributes=parse_attributes(cur_gene_as_gtf.attributes)
#now check position: remember that edits must be done against cur gene
#is it coding?
if cur_gene_attributes['gene_biotype']=='"protein_coding"':
if int(gene_id_coords[cur_gene_id][1]) > threshold:
iterate_and_extend_rev(cur_gene_id,threshold)
elif 1 < int(gene_id_coords[cur_gene_id][1]) < threshold:
adj_threshold= int(gene_id_coords[cur_gene_id][1]) - 1
iterate_and_extend_rev(cur_gene_id,threshold=threshold)
elif int(gene_id_coords[cur_gene_id][1]) == 1:
iterate_not_extend_rev(cur_gene_id)
#this is the first record: just extend if possible
if 0 < idx < len(list_keys) - 1:
cur_gene_id=list_keys[idx]
prev_gene_id=list_keys[idx-1]
cur_gene_as_gtf=GTF_element(*gene_id_records[cur_gene_id].split('\t'))
cur_gene_attributes=parse_attributes(cur_gene_as_gtf.attributes)
#now check position: remember that edits must be done against cur gene
#is it coding?
if cur_gene_attributes['gene_biotype'] == '"protein_coding"':
#first check if records are under the same chromosome
if gene_id_coords[cur_gene_id][0] == gene_id_coords[prev_gene_id][0]:
#same chromosome
residue=int(gene_id_coords[cur_gene_id][1]) - int(gene_id_coords[prev_gene_id][2])
if residue > threshold:
iterate_and_extend_rev(cur_gene_id,threshold=threshold)
elif 0 < residue <= threshold:
adj_threshold=int(residue-1)
iterate_and_extend_rev(cur_gene_id,threshold=adj_threshold)
elif int(gene_id_coords[prev_gene_id][2]) > int(gene_id_coords[cur_gene_id][1]):
iterate_not_extend_rev(cur_gene_id)
else:
#different chromosome: extend if possibile
if int(gene_id_coords[cur_gene_id][1]) > threshold:
iterate_and_extend_rev(cur_gene_id,threshold=threshold)
elif 1 < int(gene_id_coords[cur_gene_id][1]) < threshold:
adj_threshold= int(gene_id_coords[cur_gene_id][1]) - 1
iterate_and_extend_rev(cur_gene_id,threshold=adj_threshold)
elif int(gene_id_coords[cur_gene_id][1]) == 1:
iterate_not_extend_rev(cur_gene_id)

else:
#non coding: just iterate
iterate_not_extend_rev(cur_gene_id)
elif idx == len(list_keys) - 1:
#last record
cur_gene_id=list_keys[idx]
cur_gene_as_gtf=GTF_element(*gene_id_records[cur_gene_id].split('\t'))
cur_gene_attributes=parse_attributes(cur_gene_as_gtf.attributes)
if cur_gene_attributes['gene_biotype']=='"protein_coding"':
if int(gene_id_coords[cur_gene_id][1]) > threshold:
iterate_and_extend_rev(cur_gene_id,threshold=threshold)
elif int(gene_id_coords[cur_gene_id][1]) < threshold:
adj_threshold= int(gene_id_coords[cur_gene_id][1]) - 1
iterate_and_extend_rev(cur_gene_id,threshold=adj_threshold)
elif int(gene_id_coords[cur_gene_id][1]) == 1:
iterate_not_extend_rev(cur_gene_id)
42 changes: 42 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# UTR_add_extend_GTF

Provided tool will add explicit 3'UTR and extends it into a valid Ensembl GTF file.
This seems to be useful while working with 3'RNAseq.


The repository is composed by 3 script file:

**gtf_advanced_parser**

Yet another gtf parser. This is intended to be more close to the biological meaning of a GTF file, so it works by creating OrderedDict of features by exploiting gene_id or transcript_id as indexes. So *genes*, *transcripts* and miscellaneous will be stored under the *gene_id* index; any other feature, like *exon*, *CDS*, *UTR* will be stored under the *transcript_id* index. If this script is executed as a standalone python script, it will add explicit 3'UTR and reformat the provided GTF by this scheme:
For any gene:

- gene
- transcript
- exons
- CDS
- 5' UTR
- 3' UTR
- start & stop codons
- miscellaneous


**Extend_by_threshold_{fw-rev}.py**

Using the scheme above, this script use a {fw-rev} only version of a GTF* to extend the 3' terminus of genes by a given threshold when possible. By possible it means that no overlap with exisisting genes will be added and, if an overlap already exists, it won't be extended. Obviously no coding sequence will be extended, so this could be done only into the untranslated region with 3' coordinates.
It works by extending a gene, its transcript(s) with same end coordinates, its exon(s) with same end coordinates and its 3'UTR(s).


*(please refer to the bash script to see how to automatically split the GTF and pass results to these two scripts)*

**process_them**


Extremely sample bash script provided as a POC to see how to extend multiple times a valid GTF file using multiple threshold. Here a valid GTF is considered a file already processed with **gtf_advanced_parser**, like:

$ python3 gtf_advanced_parser.py --input Input.gtf --output Output.gtf





Loading

0 comments on commit 6b85ce5

Please sign in to comment.