-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCorrection_blast.py
executable file
·162 lines (138 loc) · 6.72 KB
/
Correction_blast.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/local/bioinfo/python/3.6.4/bin/python
# -*- coding: utf-8 -*-
# @package Correction_blast.py
# @author Florian Charriat
########## Module ###############
## Python modules
import argparse, os, sys
# Import BioPython
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
#Import module_Flo
from module_Flo import verifDir, createDir , form, isFasta, recupId ,verifFichier,fasta2dict,sort_human,indexEgale,indexDif,functionSens,comparaisonListe,isIn,selectBlast,header_script,footer_script
if __name__ == "__main__":
version="0.1"
############ Argparse #####################
parser = argparse.ArgumentParser(prog=__file__, description='''This program is used to correct orthologue groups of a sequence from fasta sequence and orthofinder output''')
parser.add_argument('-v', '--version', action='version', version='You are using %(prog)s version: ' + version, help=\
'display Correction_blast version number and exit')
filesreq = parser.add_argument_group('Input mandatory infos for running')
filesreq.add_argument('-d', '--directory',type = str, required=True, dest = 'directory', help = 'Path of the directory which contains all OG fasta file')
filesreq.add_argument('-c', '--count',type = str, required=True, dest = 'count', help = 'Path of the count result of orthofinder (format csv)')
filesreq.add_argument('-db', '--database', type=str, required=True, dest='db',
help='Path of the all assembly merge file with scafold name : Souche_Scafold_1')
filesreq.add_argument('-o', '--outdir',type = str, required=True, dest = 'outdir', help = 'Path of the output directory')
######### Recuperation arguments ###########
args = parser.parse_args()
directory = os.path.abspath(args.directory)
db = os.path.abspath(args.db)
counts = os.path.abspath(args.count)
outdir = os.path.abspath(args.outdir)
########### Gestion directory ##############
verifFichier(counts)
directory = verifDir(directory, True)
outdir = verifDir(outdir)
# Create all directory output
createDir([outdir,outdir+'OG_fasta',outdir+'blast_result',outdir+'script'])
############### start message ########################
header_script('Correction_blast',version)
############# Main ##################################
listeOG = []
i = 1
# Parse fasta OG one by one
for elt in os.listdir(directory) :
# Checks if it's a fasta file
if elt.endswith('.fasta') :
print(form(f'Traitement du fichier {elt} : ', 'green', 'bold'))
print()
listeOG.append(elt.split(".")[0])
print(form(f'\t- Open file', 'white', 'bold'))
OG_file = fasta2dict(f'{directory}/{elt}')
length_prev = 0
# This part select the smallest sequence of the fasta for blast
for name in OG_file.keys():
length = len(OG_file[name].seq)
# This condition select the smallest sequence of the fasta for blast and remove of the selection, the outgroup of the data set
# and sequence with N
if length_prev > length or length_prev == 0 and 'XX' not in str(OG_file[name].seq):
if 'BR29' not in name and 'Pm1' not in name and 'DsLIZ' not in name and 'MGG' not in name and 'XX' not in str(
OG_file[name].seq):
min = [name, OG_file[name].seq]
length_prev = length
print(form(f'\t- Lancement du Blast', 'white', 'bold'))
# Write the query for the blast analysis with the smallest sequence
with open(f'{outdir}/OG_fasta/{elt}', 'w') as og_file:
record = SeqRecord(min[1], id=elt.split(".")[0], name=elt.split(".")[0],
description=f'| length : {len(min[1])}')
SeqIO.write(record, og_file, "fasta")
# The time to make blast analysis one by one is long, if you want earn time, you can comment the os.system line and all other parts (Retrieve strain, parse blast and write output)
os.system(f'blastn -query {outdir}OG_fasta/{elt} -db {db} -evalue 1e-4 > {outdir}blast_result/{elt.split(".")[0]}_blast.txt')
# Then decomment this two line for create all sh script. Launch all script in parallel then remove comment only of 3 next part !
# with open(f'{outdir}script/launcher_{i}.sh','w') as f :
# f.write(f'#!/bin/bash\nmodule load bioinfo/ncbi-blast/2.6.0\nblastn -query {outdir}OG_fasta/{elt} -db {db} -evalue 1e-4 > {outdir}blast_result/{elt.split(".")[0]}_blast.txt') # "6 qseqid sseqid pident qlen length mismatch gapope evalue qstart qend sstart send "
i += 1
print()
print(form('-' * 100, 'white', 'bold'))
print()
# ################################## Retrieve strain absent of OG #########################
nb = 0
dico = {}
print("Retrieve strain absent of OG")
with open(counts,'r') as file_count,\
open(f'{outdir}gene.count_prov.csv','w') as result :
header = f'Groupe{file_count.readline()}'.replace('\tTotal','')
listeProtein = header.replace('\n','').split('\t')[1:len(header.split('\t'))]
result.write(header)
for line in file_count :
name = line.split()[0]
if name in listeOG :
nb += 1
total = line.split()[-1]
new_line = line.replace(f'\t{total}', '')
result.write(new_line)
OG = line.split('\t')[0]
count = line.split('\t')[1:-1]
if '0' not in count :
print(f'{OG} is in core genome')
else :
listeIndex = indexEgale(count, '0')
listeCount = []
print(OG)
for i in listeIndex:
listeCount.append(listeProtein[i])
dico[OG] = listeCount
################################## Parse blast result #######################################
dico_blast = {}
print("Parse blast result")
for name in os.listdir(f'{outdir}blast_result/'):
print(name)
OG = name.split('_')[0]
if OG in dico.keys() :
# Use a function which select only hit with 90 percent of identity and coverage as well as a max intron of 1000 pb
dico_blast[OG] = [elt.split('_')[0] for elt in selectBlast(f'{outdir}blast_result/{name}',90,1000) if elt.split('_')[0] in dico[OG]]
############################ Write new OG_count output ############################################
print("Write new OG_count output")
with open(counts,'r') as count_file ,open(f'{outdir}gene.count.csv','w') as result:
header = count_file.readline()
listeProtein = header.replace('\n', '').replace('Groupe','').split('\t')
result.write(header)
for line in count_file:
OG = line.split('\t')[0]
lineSplit = line.replace('\n','').split('\t')
print('Before : ',lineSplit)
if OG in dico_blast.keys():
print(OG)
for elt in dico_blast[OG]:
# This function retrieve the position in the line of the element to change
listeI = indexEgale(listeProtein, elt)
for i in listeI:
print(i)
print(len(lineSplit))
# Add a 999 score for all presence by blast correction
lineSplit[i] = '999'
print("After : ",lineSplit)
newline = '\t'.join(lineSplit)
result.write(newline+'\n')
############## end message ###########################
footer_script()