Add files via upload

danilotat · Dec 27, 2020 · 6b85ce5 · 6b85ce5
commit 6b85ce5
Show file tree

Hide file tree

Showing 5 changed files with 614 additions and 0 deletions.
diff --git a/Extend_by_threshold_fw.py b/Extend_by_threshold_fw.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+
+from collections import OrderedDict,namedtuple
+from gtf_advanced_parser import *
+import argparse
+
+class GTF_element(object):
+	def __init__(self,chromosome,source,feature_type,start,end,score,strand,phase,attributes):
+		self.chromosome=chromosome
+		self.source=source
+		self.feature_type=feature_type
+		self.start=int(start) 
+		self.end=int(end) 
+		self.score=score
+		self.strand=strand
+		self.phase=phase
+		self.attributes=attributes 
+
+	def gtf_line(self):
+		'''Passing a GTF_element, return a line as string'''
+		return f'{self.chromosome}\t{self.source}\t{self.feature_type}\t{self.start}\t{self.end}\t{self.score}\t{self.strand}\t{self.phase}\t{self.attributes}'
+
+def extend_return(dict_type,threshold,transcript_attributes):
+	try:
+		for item in dict_type[transcript_attributes['transcript_id']]:
+			item_as_gtf=GTF_element(*item.split('\t'))
+			if item_as_gtf.feature_type == 'stop_codon' or item_as_gtf.feature_type == 'CDS':
+				print(item_as_gtf.gtf_line())
+			else:
+				if item_as_gtf.end == gene_id_coords[prev_gene_id][2]:
+					item_as_gtf.end+=threshold
+					print(item_as_gtf.gtf_line())
+				else:
+					print(item_as_gtf.gtf_line())
+	except KeyError:
+		pass
+def iterate_and_extend(gene_id,threshold):
+	gene_record = GTF_element(*gene_id_records[gene_id].split('\t'))
+	gene_record.end+=threshold
+	gene_gtf_line=gene_record.gtf_line()
+	print(gene_gtf_line)
+	for transcript in transcript_id_records[gene_id]:
+		transcript_as_gtf=GTF_element(*transcript.split('\t'))
+		transcript_attributes=parse_attributes(transcript_as_gtf.attributes)
+		if transcript_as_gtf.end == gene_id_coords[gene_id][2]:
+			transcript_as_gtf.end+=threshold
+			transcript_gtf_line=transcript_as_gtf.gtf_line()
+			print(transcript_gtf_line)
+			# iterate through dict to return features
+			for k in list_of_dicts:
+				extend_return(k,threshold,transcript_attributes)
+		else:
+			print(transcript)
+			#internal transcript
+			for dict_type in list_of_dicts:
+				try:
+					for record in dict_type[transcript_attributes['transcript_id']]:
+						print(record)
+				except KeyError:
+					continue 
+	try:
+		for misc in misc_features[gene_id]:
+			print(misc)
+	except KeyError:
+		pass
+
+def iterate_not_extend(gene_id):
+	gene_record = GTF_element(*gene_id_records[gene_id].split('\t'))
+	gene_gtf_line=gene_record.gtf_line()
+	print(gene_gtf_line)
+	for transcript in transcript_id_records[gene_id]:
+		transcript_as_gtf=GTF_element(*transcript.split('\t'))
+		transcript_attributes=parse_attributes(transcript_as_gtf.attributes)
+		print(transcript)
+		for k in list_of_dicts:
+			try:
+				for record in k[transcript_attributes['transcript_id']]:
+					print(record)
+			except KeyError:
+				continue
+	try:
+		for misc in misc_features[gene_id]:
+			print(misc)
+	except KeyError:
+		pass
+
+# ---- PARSE ARGUMENTS ----
+
+parser = argparse.ArgumentParser(description="Extend 3' terminus of a GTF forward")
+parser.add_argument("--input", help="Input file. Must be a fw only version of a valid GTF")
+parser.add_argument("--threshold", type=int, default=1000, help="Threshold to use for 3'-terminal extension. Default 1000")
+args=parser.parse_args()
+
+# check if no argument was passed 
+if args.input == None:
+	print('No input provided. Exiting..')
+	exit()
+
+
+
+fill_dict(gtf_file=args.input)
+threshold=args.threshold
+
+
+
+#first make a list of keys
+list_keys=[]
+for k in gene_id_coords.keys():
+	list_keys.append(k)
+
+#iterate through records
+list_of_dicts=[exons_records,cdss_records,five_prime_utrs_records,three_prime_utrs_records,start_stop_codons]
+for idx,key in enumerate(list_keys):
+	if idx == 0:
+		continue
+	elif 0 < idx < len(list_keys) - 1:
+		cur_gene_id=list_keys[idx]
+		prev_gene_id=list_keys[idx-1]
+		#check if prev_gene is coding
+		prev_gene_as_gtf=GTF_element(*gene_id_records[prev_gene_id].split('\t'))
+		prev_gene_attributes=parse_attributes(prev_gene_as_gtf.attributes)
+		if prev_gene_attributes['gene_biotype'] == '"protein_coding"':
+			#check chromosome if same
+			if gene_id_coords[cur_gene_id][0] == gene_id_coords[prev_gene_id][0]:
+				residue=int(gene_id_coords[cur_gene_id][1]) - int(gene_id_coords[prev_gene_id][2])
+				#calc residue if higher than threshold
+				if residue > threshold:
+					iterate_and_extend(prev_gene_id,threshold)
+				elif 0 < residue < threshold:
+					# here extension could be done by a value lower than threshold
+					corrected_threshold=int(residue-1)
+					iterate_and_extend(prev_gene_id,corrected_threshold)
+				elif int(gene_id_coords[cur_gene_id][1]) < int(gene_id_coords[prev_gene_id][2]):
+					iterate_not_extend(prev_gene_id)
+			elif gene_id_coords[cur_gene_id][0] != gene_id_coords[prev_gene_id][0]:
+				#chromosome is different. Extend if possible
+				iterate_and_extend(prev_gene_id,threshold)
+		else:
+			#not coding gene
+			iterate_not_extend(prev_gene_id)
+	elif idx == len(list_keys) - 1:
+		cur_gene_id=list_keys[idx]
+		prev_gene_id=list_keys[idx-1]
+		#check if prev_gene is coding
+		prev_gene_as_gtf=GTF_element(*gene_id_records[prev_gene_id].split('\t'))
+		prev_gene_attributes=parse_attributes(prev_gene_as_gtf.attributes)
+		if prev_gene_attributes['gene_biotype'] == '"protein_coding"':
+			#check chromosome if same
+			if gene_id_coords[cur_gene_id][0] == gene_id_coords[prev_gene_id][0]:
+				residue=int(gene_id_coords[cur_gene_id][1]) - int(gene_id_coords[prev_gene_id][2])
+				#calc residue if higher than threshold
+				if residue > threshold:
+					iterate_and_extend(prev_gene_id,threshold)
+				elif 0 < residue < threshold:
+					# here extension could be done by a value lower than threshold
+					corrected_threshold=int(residue-1)
+					iterate_and_extend(prev_gene_id,corrected_threshold)
+				elif int(gene_id_coords[cur_gene_id][1]) < int(gene_id_coords[prev_gene_id][2]):
+					iterate_not_extend(prev_gene_id)
+			elif gene_id_coords[cur_gene_id][0] != gene_id_coords[prev_gene_id][0]:
+				#chromosome is different. Extend if possible
+				iterate_and_extend(prev_gene_id,threshold)
+		else:
+			#not coding gene
+			iterate_not_extend(prev_gene_id)
+		#last entry, check only if coding
+		cur_gene_as_gtf=GTF_element(*gene_id_records[cur_gene_id].split('\t'))
+		cur_gene_attributes=parse_attributes(cur_gene_as_gtf.attributes)
+		if cur_gene_attributes['gene_biotype'] == '"protein_coding"':
+			iterate_and_extend(cur_gene_id,threshold)
+		else:
+			iterate_not_extend(cur_gene_id)
+
diff --git a/Extend_by_threshold_rev.py b/Extend_by_threshold_rev.py
@@ -0,0 +1,166 @@
+from collections import OrderedDict,namedtuple
+from gtf_advanced_parser import *
+
+class GTF_element(object):
+	def __init__(self,chromosome,source,feature_type,start,end,score,strand,phase,attributes):
+		self.chromosome=chromosome
+		self.source=source
+		self.feature_type=feature_type
+		self.start=int(start) 
+		self.end=int(end) 
+		self.score=score
+		self.strand=strand
+		self.phase=phase
+		self.attributes=attributes 
+
+	def gtf_line(self):
+		'''Passing a GTF_element, return a line as string'''
+		return f'{self.chromosome}\t{self.source}\t{self.feature_type}\t{self.start}\t{self.end}\t{self.score}\t{self.strand}\t{self.phase}\t{self.attributes}'
+
+def extend_return_rev(dict_type, threshold, transcript_attributes):
+	try:
+		for item in dict_type[transcript_attributes['transcript_id']]:
+			item_as_gtf=GTF_element(*item.split('\t'))
+			if item_as_gtf.feature_type == 'stop_codon' or item_as_gtf.feature_type == 'CDS':
+				print(item_as_gtf.gtf_line())
+			else:
+				if item_as_gtf.start == gene_id_coords[cur_gene_id][1]:
+					item_as_gtf.start-=threshold
+					print(item_as_gtf.gtf_line())
+				else:
+					print(item_as_gtf.gtf_line())
+	except KeyError:
+		pass
+def iterate_and_extend_rev(gene_id, threshold):
+	gene_record = GTF_element(*gene_id_records[gene_id].split('\t'))
+	gene_record.start-=threshold
+	gene_gtf_line=gene_record.gtf_line()
+	print(gene_gtf_line)
+	for transcript in transcript_id_records[gene_id]:
+		transcript_as_gtf=GTF_element(*transcript.split('\t'))
+		transcript_attributes=parse_attributes(transcript_as_gtf.attributes)
+		if transcript_as_gtf.start == gene_id_coords[gene_id][1]:
+			transcript_as_gtf.start-=threshold
+			transcript_gtf_line=transcript_as_gtf.gtf_line()
+			print(transcript_gtf_line)
+			# iterate through dict to return features
+			for k in list_of_dicts:
+				extend_return_rev(k,threshold,transcript_attributes)
+		else:
+			print(transcript)
+			#internal transcript
+			for dict_type in list_of_dicts:
+				try:
+					for record in dict_type[transcript_attributes['transcript_id']]:
+						print(record)
+				except KeyError:
+					continue 
+	try:
+		for misc in misc_features[gene_id]:
+			print(misc)
+	except KeyError:
+		pass
+
+def iterate_not_extend_rev(gene_id):
+	gene_record = GTF_element(*gene_id_records[gene_id].split('\t'))
+	gene_gtf_line=gene_record.gtf_line()
+	print(gene_gtf_line)
+	for transcript in transcript_id_records[gene_id]:
+		transcript_as_gtf=GTF_element(*transcript.split('\t'))
+		transcript_attributes=parse_attributes(transcript_as_gtf.attributes)
+		print(transcript)
+		for k in list_of_dicts:
+			try:
+				for record in k[transcript_attributes['transcript_id']]:
+					print(record)
+			except KeyError:
+				continue
+	try:
+		for misc in misc_features[gene_id]:
+			print(misc)
+	except KeyError:
+		pass
+#first make a list of keys
+
+parser = argparse.ArgumentParser(description="Extend 3' terminus of a GTF reverse")
+parser.add_argument("--input", help="Input file. Must be a rev only version of a valid GTF")
+parser.add_argument("--threshold", type=int, default=1000, help="Threshold to use for 3'-terminal extension. Default 1000")
+args=parser.parse_args()
+
+# check if no argument was passed 
+if args.input == None:
+	print('No input provided. Exiting..')
+	exit()
+
+
+
+fill_dict(gtf_file=args.input)
+threshold=args.threshold
+
+list_keys=[]
+for k in gene_id_coords.keys():
+	list_keys.append(k)
+
+#iterate through records
+list_of_dicts=[exons_records,cdss_records,five_prime_utrs_records,three_prime_utrs_records,start_stop_codons]
+for idx, key in enumerate(list_keys):
+	if idx==0:
+		cur_gene_id=list_keys[idx]
+		cur_gene_as_gtf=GTF_element(*gene_id_records[cur_gene_id].split('\t'))
+		cur_gene_attributes=parse_attributes(cur_gene_as_gtf.attributes)
+		#now check position: remember that edits must be done against cur gene
+		#is it coding?
+		if cur_gene_attributes['gene_biotype']=='"protein_coding"':
+			if int(gene_id_coords[cur_gene_id][1]) > threshold:
+				iterate_and_extend_rev(cur_gene_id,threshold)
+			elif 1 < int(gene_id_coords[cur_gene_id][1]) < threshold:
+				adj_threshold= int(gene_id_coords[cur_gene_id][1]) - 1
+				iterate_and_extend_rev(cur_gene_id,threshold=threshold)
+			elif int(gene_id_coords[cur_gene_id][1]) == 1:
+				iterate_not_extend_rev(cur_gene_id)
+		#this is the first record: just extend if possible
+	if 0 < idx < len(list_keys) - 1:
+		cur_gene_id=list_keys[idx]
+		prev_gene_id=list_keys[idx-1]
+		cur_gene_as_gtf=GTF_element(*gene_id_records[cur_gene_id].split('\t'))
+		cur_gene_attributes=parse_attributes(cur_gene_as_gtf.attributes)
+		#now check position: remember that edits must be done against cur gene
+		#is it coding?
+		if cur_gene_attributes['gene_biotype'] == '"protein_coding"':
+			#first check if records are under the same chromosome
+			if gene_id_coords[cur_gene_id][0] == gene_id_coords[prev_gene_id][0]:
+				#same chromosome
+				residue=int(gene_id_coords[cur_gene_id][1]) - int(gene_id_coords[prev_gene_id][2])
+				if residue > threshold:
+					iterate_and_extend_rev(cur_gene_id,threshold=threshold)
+				elif 0 < residue <= threshold:
+					adj_threshold=int(residue-1)
+					iterate_and_extend_rev(cur_gene_id,threshold=adj_threshold)
+				elif int(gene_id_coords[prev_gene_id][2]) > int(gene_id_coords[cur_gene_id][1]):
+					iterate_not_extend_rev(cur_gene_id)
+			else:
+				#different chromosome: extend if possibile
+				if int(gene_id_coords[cur_gene_id][1]) > threshold:
+					iterate_and_extend_rev(cur_gene_id,threshold=threshold)
+				elif 1 < int(gene_id_coords[cur_gene_id][1]) < threshold:
+					adj_threshold= int(gene_id_coords[cur_gene_id][1]) - 1
+					iterate_and_extend_rev(cur_gene_id,threshold=adj_threshold)
+				elif int(gene_id_coords[cur_gene_id][1]) == 1:
+					iterate_not_extend_rev(cur_gene_id)
+
+		else:
+			#non coding: just iterate
+			iterate_not_extend_rev(cur_gene_id)
+	elif idx == len(list_keys) - 1:
+		#last record 
+		cur_gene_id=list_keys[idx]
+		cur_gene_as_gtf=GTF_element(*gene_id_records[cur_gene_id].split('\t'))
+		cur_gene_attributes=parse_attributes(cur_gene_as_gtf.attributes)
+		if cur_gene_attributes['gene_biotype']=='"protein_coding"':
+			if int(gene_id_coords[cur_gene_id][1]) > threshold:
+				iterate_and_extend_rev(cur_gene_id,threshold=threshold)
+			elif int(gene_id_coords[cur_gene_id][1]) < threshold:
+				adj_threshold= int(gene_id_coords[cur_gene_id][1]) - 1
+				iterate_and_extend_rev(cur_gene_id,threshold=adj_threshold)
+			elif int(gene_id_coords[cur_gene_id][1]) == 1:
+				iterate_not_extend_rev(cur_gene_id)
diff --git a/README.md b/README.md
@@ -0,0 +1,42 @@
+# UTR_add_extend_GTF
+
+Provided tool will add explicit 3'UTR and extends it into a valid Ensembl GTF file. 
+This seems to be useful while working with 3'RNAseq.
+
+
+The repository is composed by 3 script file:
+
+**gtf_advanced_parser**
+
+Yet another gtf parser. This is intended to be more close to the biological meaning of a GTF file, so it works by creating OrderedDict of features by exploiting gene_id or transcript_id as indexes. So *genes*, *transcripts* and miscellaneous will be stored under the *gene_id* index; any other feature, like *exon*, *CDS*, *UTR* will be stored under the *transcript_id* index. If this script is executed as a standalone python script, it will add explicit 3'UTR and reformat the provided GTF by this scheme:
+For any gene:
+
+- gene
+- transcript
+- exons
+- CDS 
+- 5' UTR
+- 3' UTR
+- start & stop codons
+- miscellaneous
+
+
+**Extend_by_threshold_{fw-rev}.py**
+
+Using the scheme above, this script use a {fw-rev} only version of a GTF* to extend the 3' terminus of genes by a given threshold when possible. By possible it means that no overlap with exisisting genes will be added and, if an overlap already exists, it won't be extended. Obviously no coding sequence will be extended, so this could be done only into the untranslated region with 3' coordinates.
+It works by extending a gene, its transcript(s) with same end coordinates, its exon(s) with same end coordinates and its 3'UTR(s).
+
+
+*(please refer to the bash script to see how to automatically split the GTF and pass results to these two scripts)*
+
+**process_them**
+
+
+Extremely sample bash script provided as a POC to see how to extend multiple times a valid GTF file using multiple threshold. Here a valid GTF is considered a file already processed with **gtf_advanced_parser**, like:
+
+	$ python3 gtf_advanced_parser.py --input Input.gtf --output Output.gtf
+
+
+
+
+