-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreateDB
22 lines (15 loc) · 1.03 KB
/
createDB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
#download the nt FASTA from the NCBI FTP server
#(a bare URL on its own line is not a command; the shell would fail trying to execute it)
#NOTE(review): the blastdbcmd steps below read a BLAST database named "nt", not this FASTA file -- confirm which input is intended
wget ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nt.gz
#dump every entry of the nt BLAST db into nt_extracted
#(no -outfmt is given, so blastdbcmd's default output format is used)
blastdbcmd \
    -db nt \
    -dbtype nucl \
    -entry all \
    -out nt_extracted
#write one "<%i> <%T>" line per entry (sequence id and taxid per blastdbcmd -help);
#this file is handed to makeblastdb through -taxid_map later on
blastdbcmd \
    -db nt \
    -dbtype nucl \
    -entry all \
    -outfmt "%i %T" \
    -out nt_extracted.map
#write additional taxonomy fields per entry (%i %T %l %L %S %K, see blastdbcmd -help for their meaning);
#presumably intended as a tab-separated table, going by the \t separators and the .tsv name
blastdbcmd \
    -db nt \
    -dbtype nucl \
    -entry all \
    -outfmt "%i\t%T\t%l\t%L\t%S\t%K" \
    -out nt_extracted_taxa.tsv
#convert nt with C-->T: header lines (starting with ">") are copied unchanged,
#every C on every other line is replaced by T
#gsub replaces ALL matches in $0; the previous gensub("C","T",$0) passed $0 as gensub's
#"how" argument, so only the first C of a line was replaced unless the line began with G/g
awk '/^>/ {print; next} {gsub(/C/, "T"); print}' nt_extracted >nt_extracted_conv
#convert nt with ambiguity code (doesn't work as well)
#awk '/^>/ {print; next} {gsub(/C/, "Y"); print}' nt_extracted >nt_extracted_amb
#build a nucleotide BLAST db from the C-->T converted sequences,
#attaching the taxids listed in nt_extracted.map; output goes to nt_conv_db/nt_conv
ncbi-blast-2.4.0+/bin/makeblastdb \
    -in nt_extracted_conv \
    -parse_seqids \
    -dbtype nucl \
    -max_file_sz '2GB' \
    -taxid_map nt_extracted.map \
    -out nt_conv_db/nt_conv
#same build for the ambiguity-code variant (disabled)
#ncbi-blast-2.4.0+/bin/makeblastdb -in nt_extracted_amb -parse_seqids -dbtype nucl -max_file_sz '2GB' -taxid_map nt_extracted.map -out nt_extracted_amb_db/nt_amb