-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathorthology_script.sh
248 lines (210 loc) · 8.85 KB
/
orthology_script.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
#!/bin/bash
#
# Kevin M. Kocot
#
#This script takes the output of OrthoFinder and performs several "cleaning" steps to remove groups and sequences that are not suitable for phylogenomic analysis.
#Note which programs must be included in $PATH
#Run this script from the working directory containing the input fasta files (extension must be .fa) where each file represents one putatively orthologous group.
#Fasta headers must be in the following format: >species_name_abbreviation|annotation_or_sequence_ID_information
#Example: >LGIG|Contig1234
#Fasta headers may not include spaces or non-alphanumeric characters except for underscores (pipes are OK as field delimiters only).
########################################################################
################## Change these before your first use ##################
########################################################################
#Set values for variables.
MIN_SEQUENCE_LENGTH=100 #Deletes original seqs shorter than this length
MIN_ALIGNMENT_LENGTH=100 #Minimum length of a trimmed alignment in amino acids
HALF_TAXA=45 #Minimum number of OTUs to keep an OG
THREEFOURTHS_TAXA=68
CLASS_PATH=/usr/local/bin/ #Location of PhyloTreePruner .class files
CORES=35 #Number of CPU cores to use with Parallel
PREFIX=IQ-TREE2 #IQ-TREE 2 prefix for output files
########################################################################
#Fix headers and remove newlines.
echo "Removing linebreaks in sequences and fixing headers..."
for FILENAME in *.fa
do
sed -i ':a; $!N; /^>/!s/\n\([^>]\)/\1/; ta; P; D' $FILENAME
done
echo Done
echo
#Backup all sequences
echo "Making a backup of all sequences before beginning..."
mkdir unedited_sequences
cp *0.fa ./unedited_sequences/
cp *1.fa ./unedited_sequences/
cp *2.fa ./unedited_sequences/
cp *3.fa ./unedited_sequences/
cp *4.fa ./unedited_sequences/
cp *5.fa ./unedited_sequences/
cp *6.fa ./unedited_sequences/
cp *7.fa ./unedited_sequences/
cp *8.fa ./unedited_sequences/
cp *9.fa ./unedited_sequences/
echo
echo Done
echo
#Delete sequences shorter than $MIN_SEQUENCE_LENGTH
echo "Deleting sequences shorter than $MIN_SEQUENCE_LENGTH AAs..."
for FILENAME in *.fa
do
grep -B 1 "[^>].\{$MIN_SEQUENCE_LENGTH,\}" $FILENAME > $FILENAME.out
sed -i 's/--//g' $FILENAME.out
sed -i '/^$/d' $FILENAME.out
rm -rf $FILENAME
mv $FILENAME.out $FILENAME
done
echo Done
echo
#If fewer than $HALF_TAXA different species are represented in the file, move that file to the "rejected_few_taxa" directory.
echo "Removing groups with fewer than $HALF_TAXA taxa..."
mkdir -p rejected_few_taxa_1
for FILENAME in *.fa
do
awk -F"|" '/^>/{ taxon[$1]++ } END{for(o in taxon){print o,taxon[o]}}' $FILENAME > $FILENAME\.taxon_count #Creates temporary file with taxon abbreviation and number of sequences for that taxon in $FILENAME
taxon_count=`grep -v 0 $FILENAME\.taxon_count | wc -l` #Counts the number of lines with an integer >0 (= the number of taxa with at least 1 sequence)
if [ "$taxon_count" -lt "$HALF_TAXA" ] ; then
echo $FILENAME
mv $FILENAME ./rejected_few_taxa_1/
fi
done
rm -rf *0.fa.taxon_count
rm -rf *1.fa.taxon_count
rm -rf *2.fa.taxon_count
rm -rf *3.fa.taxon_count
rm -rf *4.fa.taxon_count
rm -rf *5.fa.taxon_count
rm -rf *6.fa.taxon_count
rm -rf *7.fa.taxon_count
rm -rf *8.fa.taxon_count
rm -rf *9.fa.taxon_count
echo Done
echo
#Remove redundant sequences using uniqHaplo
mkdir preUniqHaplo
cp *.fa preUniqHaplo
echo "Removing redundant sequences using uniqHaplo..."
ls *0.fa | parallel -j $CORES 'perl /usr/bin/uniqHaplo.pl -a {} > {}.uniq'
ls *1.fa | parallel -j $CORES 'perl /usr/bin/uniqHaplo.pl -a {} > {}.uniq'
ls *2.fa | parallel -j $CORES 'perl /usr/bin/uniqHaplo.pl -a {} > {}.uniq'
ls *3.fa | parallel -j $CORES 'perl /usr/bin/uniqHaplo.pl -a {} > {}.uniq'
ls *4.fa | parallel -j $CORES 'perl /usr/bin/uniqHaplo.pl -a {} > {}.uniq'
ls *5.fa | parallel -j $CORES 'perl /usr/bin/uniqHaplo.pl -a {} > {}.uniq'
ls *6.fa | parallel -j $CORES 'perl /usr/bin/uniqHaplo.pl -a {} > {}.uniq'
ls *7.fa | parallel -j $CORES 'perl /usr/bin/uniqHaplo.pl -a {} > {}.uniq'
ls *8.fa | parallel -j $CORES 'perl /usr/bin/uniqHaplo.pl -a {} > {}.uniq'
ls *9.fa | parallel -j $CORES 'perl /usr/bin/uniqHaplo.pl -a {} > {}.uniq'
rm -rf *.fa
rename 's/.fa.uniq/.fa/g' *.fa.uniq
echo Done
echo
#Align the remaining sequences using Mafft.
echo "Aligning sequences using Mafft (auto)..."
mkdir backup_alignments
ls *0.fa | parallel -j $CORES 'mafft --auto --reorder --quiet --localpair --maxiterate 1000 {} > {}.aln'
ls *1.fa | parallel -j $CORES 'mafft --auto --reorder --quiet --localpair --maxiterate 1000 {} > {}.aln'
ls *1.fa | parallel -j $CORES 'mafft --auto --reorder --quiet --localpair --maxiterate 1000 {} > {}.aln'
ls *2.fa | parallel -j $CORES 'mafft --auto --reorder --quiet --localpair --maxiterate 1000 {} > {}.aln'
ls *3.fa | parallel -j $CORES 'mafft --auto --reorder --quiet --localpair --maxiterate 1000 {} > {}.aln'
ls *4.fa | parallel -j $CORES 'mafft --auto --reorder --quiet --localpair --maxiterate 1000 {} > {}.aln'
ls *5.fa | parallel -j $CORES 'mafft --auto --reorder --quiet --localpair --maxiterate 1000 {} > {}.aln'
ls *6.fa | parallel -j $CORES 'mafft --auto --reorder --quiet --localpair --maxiterate 1000 {} > {}.aln'
ls *7.fa | parallel -j $CORES 'mafft --auto --reorder --quiet --localpair --maxiterate 1000 {} > {}.aln'
ls *8.fa | parallel -j $CORES 'mafft --auto --reorder --quiet --localpair --maxiterate 1000 {} > {}.aln'
ls *9.fa | parallel -j $CORES 'mafft --auto --reorder --quiet --localpair --maxiterate 1000 {} > {}.aln'
rm -rf *0.fa
rm -rf *1.fa
rm -rf *2.fa
rm -rf *3.fa
rm -rf *4.fa
rm -rf *5.fa
rm -rf *6.fa
rm -rf *7.fa
rm -rf *8.fa
rm -rf *9.fa
rename 's/.fa.aln/.fa/g' *.fa.aln
cp *.fa ./backup_alignments/
echo Done
echo
#Remove newlines.
echo "Removing linebreaks in sequences..."
for FILENAME in *.fa
do
sed -i ':a; $!N; /^>/!s/\n\([^>]\)/\1/; ta; P; D' $FILENAME
done
echo Done
echo
#Clean alignments with HmmCleaner
echo "Removing misaligned sequence regions with HmmCleaner..."
mkdir HmmCleaner_files
ls *.fa | parallel -j $CORES 'HmmCleaner.pl {} --specificity'
mv *.fa ./HmmCleaner_files
mv *.log ./HmmCleaner_files
mv *.score ./HmmCleaner_files
rename 's/_hmm.fasta/.fa/g' *_hmm.fasta
echo Done
echo
#Trim alignments with BMGE
echo "Trimming ambiguously aligned columns in the alignment with BMGE..."
mkdir backup_pre-BMGE
ls *.fa | parallel -j $CORES 'java -jar /usr/bin/BMGE-1.12/BMGE.jar -i {} -t AA -of {}.BMGE'
mv *.fa ./backup_pre-BMGE
cp *.BMGE ./backup_pre-BMGE
rename 's/.BMGE//g' *.BMGE
echo Done
echo
#Remove newlines.
echo "Removing linebreaks in sequences..."
for FILENAME in *.fa
do
sed -i ':a; $!N; /^>/!s/\n\([^>]\)/\1/; ta; P; D' $FILENAME
done
echo Done
echo
#Remove any sequences that don't overlap with all other sequences by at least 20 amino acids. This check runs until all sequences overlap with all other sequences by at least 20 amino acids.
echo "Removing short sequences that don't overalp with all other sequences by at least 20 AAs..."
for FILENAME in *.fa
do
java -cp /usr/bin AlignmentCompare $FILENAME
done
echo Done
echo
rm -rf myTempFile.txt
#If fewer than $HALF_TAXA different species are represented in the file, move that file to the "rejected_few_taxa" directory.
echo "Removing groups with fewer than $HALF_TAXA taxa..."
mkdir -p rejected_few_taxa_2
for FILENAME in *.fa
do
awk -F"|" '/^>/{ taxon[$1]++ } END{for(o in taxon){print o,taxon[o]}}' $FILENAME > $FILENAME\.taxon_count #Creates temporary file with taxon abbreviation and number of sequences for that taxon in $FILENAME
taxon_count=`grep -v 0 $FILENAME\.taxon_count | wc -l` #Counts the number of lines with an integer >0 (= the number of taxa with at least 1 sequence)
if [ "$taxon_count" -lt "$HALF_TAXA" ] ; then
echo $FILENAME
mv $FILENAME ./rejected_few_taxa_2/
fi
done
rm -rf *.fa.taxon_count
echo Done
echo
#Makes a tree for each OG with FastTree
echo "Making a tree for each OG using FastTreeMP..."
for FILENAME in *.fa
do
FastTreeMP -slow -gamma $FILENAME > $FILENAME.tre
done
rename 's/.fa.tre/.tre/g' *.fa.tre
echo Done
#Runs PhyloPyPruner and makes trees
echo "Running PhyloPyPruner, FastTree 2, and IQ-TREE 2..."
phylopypruner --threads $CORES --dir . --min-taxa $HALF_TAXA --min-support 0.9 --mask pdist --trim-lb 3 --trim-divergent 0.75 --min-pdist 0.01 --prune LS #Run with version 0.9.5
cd phylopypruner_output
FastTreeMP -slow -gamma supermatrix.fas > FastTree.tre
iqtree2 -T AUTO -B 1000 -s supermatrix.fas -Q partition_data.txt --prefix $PREFIX -m MFP --wbtl
cd ..
mv phylopypruner_output phylopypruner_output_min_50_pct
phylopypruner --threads $CORES --dir . --min-taxa $THREEFOURTHS_TAXA --min-support 0.9 --mask pdist --trim-lb 3 --trim-divergent 0.75 --min-pdist 0.01 --prune LS #Run with version 0.9.5
cd phylopypruner_output
FastTreeMP -slow -gamma supermatrix.fas > FastTree.tre
iqtree2 -T AUTO -B 1000 -s supermatrix.fas -Q partition_data.txt --prefix $PREFIX -m MFP --wbtl
cd ..
mv phylopypruner_output phylopypruner_output_min_75_pct
echo Done