-
Notifications
You must be signed in to change notification settings - Fork 0
/
IdentifyOverRepresented.sh
28 lines (23 loc) · 1023 Bytes
/
IdentifyOverRepresented.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# Run this from output directory of step 2 (metagenomic analysis)
# Requires list of virus positive samples (see github README)
# to run: bash IdentifyOverRepresented.sh
# Summarise the taxa most often seen in virus positive samples: link each
# listed sample's *uniq file into potentialHosts/, pool and count their taxa,
# and append the 25 most frequent ones to potentialHosts/summary.txt.

# Check input file:
if [ ! -f virusPositives.txt ]
then
  echo "No list of virus positive samples found" >&2
  exit 1
fi

# Begin process
# -p lets a re-run reuse the directory instead of printing an error.
mkdir -p potentialHosts || { echo "Could not create potentialHosts" >&2; exit 1; }
# Stop if the directory cannot be entered: the clean-up step removes every
# *uniq file in the working directory, and that must never be the directory
# holding the original step 2 output.
cd potentialHosts || { echo "Could not enter potentialHosts" >&2; exit 1; }
cp ../virusPositives.txt . || { echo "Could not copy virusPositives.txt" >&2; exit 1; }

# Count non-blank lines only, so the figure matches the samples processed
# below (and a final line without a trailing newline is still counted).
posCount=$(awk 'NF { n++ } END { print n + 0 }' virusPositives.txt)
{
  echo
  echo "You found $posCount virus positive samples"
  echo
} >> summary.txt

# Link the *uniq file(s) of every listed sample into this directory.
linkedTotal=0
while read -r sample || [ -n "$sample" ]
do
  # A blank line would expand to ../*uniq and pull in every sample.
  [ -n "$sample" ] || continue
  linked=0
  for uniqFile in ../"$sample"*uniq
  do
    # An unmatched glob stays literal; skip it rather than link to nothing.
    [ -e "$uniqFile" ] || continue
    # -f: overlapping sample prefixes may match the same file more than once.
    ln -sf "$uniqFile" . && linked=1
  done
  if [ "$linked" -eq 1 ]
  then
    linkedTotal=$((linkedTotal + 1))
  else
    echo "Warning: no *uniq file found for sample $sample" >&2
  fi
done < virusPositives.txt

# All taxa found across virus positive samples
if [ "$linkedTotal" -gt 0 ]
then
  cat ./*uniq | sort > all_taxon_occurences.txt
else
  # Nothing was linked: leave an empty list so the summary is still written.
  : > all_taxon_occurences.txt
fi

# Top hits
echo "In these, the top 25 most frequently found taxa are (sampleCount, taxon):" >> summary.txt
# NOTE: tr turns every space into a tab, including spaces inside taxon names;
# kept as is so the summary format does not change.
uniq -c all_taxon_occurences.txt \
  | sort -k1,1nr \
  | head -n 25 \
  | sed -e 's/^ *//' \
  | tr ' ' '\t' >> summary.txt
echo >> summary.txt

# Clear up
# Only the symlinks and the intermediate list inside potentialHosts are removed.
rm -f -- ./*uniq all_taxon_occurences.txt