-
Notifications
You must be signed in to change notification settings - Fork 67
/
splitDatabase.sh
executable file
·39 lines (30 loc) · 1.41 KB
/
splitDatabase.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!/usr/bin/env bash
#######
# Purpose: Split database list of genomes into multiple chunks to
# distribute fastANI execution on multiple compute nodes
# Usage: <this_script> <file containing genome paths, one per line> <count> <output_directory>
# Output: up to <count> files named partition.<number> in <output_directory>
# Make a pipeline fail when any stage fails, so a shuf error is not hidden by
# a successful split. Errors are otherwise checked explicitly (no `set -e`).
set -o pipefail

# Positional arguments. They default to empty so that a missing argument is
# reported by split_database instead of surfacing as a redirect error.
DATABASE=${1-}
COUNT=${2-}
OUTPUT_DIR=${3-}
PREFIX="partition."

######### Split database ##########

#######################################
# Shuffle a list of genome paths and split it into partition files.
# The lines-per-file value is ceil(total / count), so the number of files
# written is at most <count> (it can be lower, e.g. 10 lines / 6 -> 5 files).
# Partition files left over from an earlier run are not removed and are
# included in the reported file count.
# Globals:   PREFIX (read) - file name prefix of every partition
# Arguments: $1 - file containing genome paths, one per line
#            $2 - requested number of partitions (positive integer)
#            $3 - output directory (created when it does not exist)
# Outputs:   progress messages on stdout, diagnostics on stderr
# Returns:   0 on success, 2 on invalid arguments, 1 on any other failure
#######################################
split_database() {
  local database=${1-} count=${2-} output_dir=${3-}
  local total per_file max_index suffix_len
  local -a partitions=()

  if [[ -z "$database" || -z "$count" || -z "$output_dir" ]]; then
    printf 'Usage: %s <genome_list_file> <count> <output_directory>\n' \
      "${0##*/}" >&2
    return 2
  fi
  if [[ ! -f "$database" || ! -r "$database" ]]; then
    printf 'error: cannot read database file: %s\n' "$database" >&2
    return 2
  fi
  # 10# forces base 10 so a value such as "08" is not parsed as octal.
  if [[ ! "$count" =~ ^[0-9]+$ ]] || (( 10#$count < 1 )); then
    printf 'error: <count> must be a positive integer, got: %s\n' "$count" >&2
    return 2
  fi
  count=$(( 10#$count ))

  # awk's NR also counts a final line that lacks a trailing newline, which
  # `wc -l` would miss.
  total=$(awk 'END { print NR }' < "$database") || return 1
  if (( total == 0 )); then
    printf 'error: database file is empty: %s\n' "$database" >&2
    return 1
  fi
  echo "Total count of genomes in the given database " "$total"

  # Integer ceiling of total / count.
  per_file=$(( (total + count - 1) / count ))

  # Two-digit suffixes (the split default) cover at most 100 files; widen the
  # suffix only when the requested count needs more digits.
  max_index=$(( count - 1 ))
  suffix_len=${#max_index}
  if (( suffix_len < 2 )); then
    suffix_len=2
  fi

  mkdir -p -- "$output_dir" || return 1

  # Stream the shuffled list straight into split: no temporary file to leak.
  shuf < "$database" \
    | split -d -a "$suffix_len" -l "$per_file" -- - "${output_dir}/${PREFIX}" \
    || return 1

  # Count partitions with a glob rather than by parsing `ls`; an unmatched
  # glob stays literal, so verify that the first entry really exists.
  partitions=("${output_dir}/${PREFIX}"*)
  if [[ ! -e "${partitions[0]}" ]]; then
    partitions=()
  fi
  echo "Database is now split into " "${#partitions[@]}" \
    " files in the folder " "$output_dir"
}

# Keep this call as the last command of the splitting step so that its return
# value becomes the script's exit status.
split_database "$DATABASE" "$COUNT" "$OUTPUT_DIR"
#### END OF SCRIPT ###
######### One of many ways to run the individual database partitions in parallel on a single machine ##########
######## Assumes COUNT is the number of partition files actually produced and OUTPUT_DIR is the directory where they were saved
#for ((i = 1 ; i <= $COUNT ; i++ ));
#do
#DATABASE_PARTITION=`ls -1 ${OUTPUT_DIR}/partition.* | sed -n ${i}p`
#echo "Launching fastANI to process " $DATABASE_PARTITION
#fastANI --sl $DATABASE_PARTITION -q <query_genome> -o ${i}.output &> BATCH_OUTPUT_${i} &
#done
#echo "Waiting for jobs to finish "
#wait