Update database builder script to support new SILVA download
Torsten Seemann
7 years ago
0 | 0 | #!/bin/bash |
1 | 1 | |
2 | 2 | CPUS=$(grep -c bogomips /proc/cpuinfo) |
3 | CURL="curl" | |
4 | GUNZIP="gzip -dc"  # -d is required to decompress; plain "gzip -c" would re-compress the downloaded .gz stream | |
3 | 5 | |
4 | 6 | RFAM="Rfam.seed" |
7 | RFAMURL="ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/${RFAM}.gz" | |
5 | 8 | if [ ! -r "$RFAM" ]; then |
6 | 9 | echo "Downloading: $RFAM" |
7 | wget --quiet ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.seed.gz | |
8 | gunzip $RFAM.gz | |
10 | $CURL "$RFAMURL" | $GUNZIP -c > "$RFAM" | |
9 | 11 | else |
10 | 12 | echo "Using existing file: $RFAM" |
11 | 13 | fi |
12 | 14 | |
13 | ||
14 | 15 | # Only the LSU (23S/28S) alignment is taken from SILVA; the 16S/18S models come from RFAM |
15 | SILVA="LSURef_115_tax_silva_full_align_trunc.fasta" | |
16 | SILVA="SILVA_128_LSURef_tax_silva_full_align_trunc.fasta" | |
17 | SILVAURL="http://www.arb-silva.de/fileadmin/silva_databases/current/Exports/${SILVA}.gz" | |
16 | 18 | if [ ! -r "$SILVA" ]; then |
17 | 19 | echo "Downloading: $SILVA" |
18 | wget --quiet http://www.arb-silva.de/fileadmin/silva_databases/current/Exports/LSURef_115_tax_silva_full_align_trunc.fasta.tgz | |
19 | tar zxf $SILVA.tgz | |
20 | rm -f $SILVA.tgz | |
20 | $CURL "$SILVAURL" | $GUNZIP -c > "$SILVA" | |
21 | 21 | else |
22 | 22 | echo "Using existing file: $SILVA" |
23 | 23 | fi |
24 | 24 | |
25 | 25 | # this will write three files: LSU.{Archaea,Bacteria,Eukaryota}.aln |
26 | ./fix-SILVA.pl --seed --type LSU $SILVA | |
26 | echo "Fixing and splitting SILVA data" | |
27 | ./fix-SILVA.pl --seed --type LSU "$SILVA" | |
27 | 28 | |
28 | 29 | # Prepare RFAM for fetches |
29 | 30 | echo "Indexing $RFAM" |
30 | rm -f $RFAM.ssi | |
31 | esl-afetch --index $RFAM | |
31 | rm -f "$RFAM.ssi" | |
32 | esl-afetch --index "$RFAM" | |
32 | 33 | |
33 | 34 | echo "Fetching models..." |
34 | 35 | |
35 | 36 | # Bact |
36 | 37 | echo "Bac" |
37 | esl-afetch $RFAM RF00001 > 5S.bac.aln | |
38 | esl-afetch "$RFAM" RF00001 > 5S.bac.aln | |
38 | 39 | esl-reformat -r stockholm LSU.Bacteria.aln > 23S.bac.aln |
39 | esl-afetch $RFAM RF00177 > 16S.bac.aln | |
40 | esl-afetch "$RFAM" RF00177 > 16S.bac.aln | |
40 | 41 | |
41 | 42 | # Arch |
42 | 43 | echo "Arc" |
43 | esl-afetch $RFAM RF00001 > 5S.arc.aln | |
44 | esl-afetch $RFAM RF00002 > 5_8S.arc.aln | |
44 | esl-afetch "$RFAM" RF00001 > 5S.arc.aln | |
45 | esl-afetch "$RFAM" RF00002 > 5_8S.arc.aln | |
45 | 46 | esl-reformat -r stockholm LSU.Archaea.aln > 23S.arc.aln |
46 | esl-afetch $RFAM RF01959 > 16S.arc.aln | |
47 | esl-afetch "$RFAM" RF01959 > 16S.arc.aln | |
47 | 48 | |
48 | 49 | # Euk |
49 | 50 | echo "Euk" |
50 | esl-afetch $RFAM RF00001 > 5S.euk.aln | |
51 | esl-afetch $RFAM RF00002 > 5_8S.euk.aln | |
51 | esl-afetch "$RFAM" RF00001 > 5S.euk.aln | |
52 | esl-afetch "$RFAM" RF00002 > 5_8S.euk.aln | |
52 | 53 | esl-reformat -r stockholm LSU.Eukaryota.aln > 28S.euk.aln |
53 | esl-afetch $RFAM RF01960 > 18S.euk.aln | |
54 | esl-afetch "$RFAM" RF01960 > 18S.euk.aln | |
54 | 55 | |
55 | 56 | # Mito |
56 | 57 | FILE="12S.mito.aln" |
66 | 67 | |
67 | 68 | |
68 | 69 | for K in arc bac euk mito ; do |
69 | for T in 5S 5_8S 16S 23S 28S ; do | |
70 | for T in 5S 5_8S 12S 16S 23S 28S ; do | |
70 | 71 | ID="$T.$K" |
71 | 72 | if [ -r "$ID.aln" ]; then |
72 | 73 | echo "*** $ID ***" |
89 | 90 | for ID in $(cat MODELS) ; do |
90 | 91 | |
91 | 92 | echo "Extracting: $ID.aln" |
92 | esl-afetch $RFAM $ID > $ID.aln | |
93 | esl-afetch "$RFAM" $ID > $ID.aln | |
93 | 94 | |
94 | 95 | echo "Building: $ID.hmm" |
95 | 96 | rm -f $ID.hmm.h?? $ID.hmm |