Codebase list centrifuge / lintian-fixes/main centrifuge-download
lintian-fixes/main

Tree @lintian-fixes/main (Download .tar.gz)

centrifuge-download @lintian-fixes/main

d9e5f02
 
 
 
 
 
 
 
64b022e
 
 
 
d9e5f02
f6f7b09
d9e5f02
 
f6f7b09
d9e5f02
64b022e
d9e5f02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64b022e
 
 
 
 
 
 
 
 
d9e5f02
64b022e
d9e5f02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64b022e
 
 
 
 
 
 
 
d9e5f02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64b022e
d9e5f02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f6f7b09
d9e5f02
 
 
 
 
 
 
 
 
 
64b022e
d9e5f02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64b022e
 
 
d9e5f02
 
64b022e
 
 
d9e5f02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64b022e
 
 
 
 
 
 
d9e5f02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f6f7b09
d9e5f02
f6f7b09
 
d9e5f02
 
 
 
 
 
 
 
 
 
 
64b022e
d9e5f02
 
 
 
 
 
 
 
64b022e
 
 
 
 
 
d9e5f02
 
 
 
 
 
 
 
 
 
 
 
 
64b022e
 
 
 
 
 
 
 
 
 
 
 
 
d9e5f02
 
 
 
f6f7b09
 
 
 
 
 
 
 
 
 
 
 
d9e5f02
 
f6f7b09
 
d9e5f02
 
 
64b022e
d9e5f02
 
 
#!/bin/bash

set -eu -o pipefail

exists() {
  command -v "$1" >/dev/null 2>&1
}

if hash rsync 2>/dev/null; then
	DL_PROG="rsync --no-motd"
        DL_MODE="rsync"
elif hash wget 2>/dev/null; then
	DL_PROG="wget -N --reject=index.html -qO"
        DL_MODE="https"
else 
	DL_PROG="curl -s -o"
        DL_MODE="https"
fi
export DL_PROG DL_MODE

cut_after_first_space_or_second_pipe() {
	grep '^>' | sed 's/ .*//' | sed 's/\([^|]*|[^|]*\).*/\1/'
}
export -f cut_after_first_space_or_second_pipe

map_headers_to_taxid() {
	grep '^>' | cut_after_first_space_or_second_pipe | sed -e "s/^>//" -e  "s/\$/	$1/"
}
export -f map_headers_to_taxid



#########################################################
## Functions

function download_n_process() {
    IFS=$'\t' read -r TAXID FILEPATH <<< "$1"

    NAME=`basename $FILEPATH .gz`

    if [ $DL_MODE = "rsync" ]; then
        FILEPATH=${FILEPATH/ftp/rsync}
        $DL_PROG "$FILEPATH" "$LIBDIR/$DOMAIN/$NAME.gz" || \
        { printf "\nError downloading $FILEPATH!\n" >&2 && exit 1; }
    else
        $DL_PROG "$LIBDIR/$DOMAIN/$NAME.gz" "$FILEPATH" || { printf "\nError downloading $FILEPATH!\n" >&2 && exit 1; }
    fi

    [[ -f "$LIBDIR/$DOMAIN/$NAME.gz" ]] || return;
    gunzip -f "$LIBDIR/$DOMAIN/$NAME.gz" ||{ printf "\nError gunzipping $LIBDIR/$DOMAIN/$NAME.gz [ downloaded from $FILEPATH ]!\n" >&2 &&  exit 255; }

	
    if [[ "$CHANGE_HEADER" == "1" ]]; then
        sed -i "s/^>/>kraken:taxid|$TAXID /" $LIBDIR/$DOMAIN/$NAME
	fi

	if [[ "$FILTER_UNPLACED" == "1" ]]; then
        echo TODO 2>&1
		##sed -n '1,/^>.*unplaced/p; /'
	fi

	## Output sequenceID to taxonomy ID map to STDOUT
	cat $LIBDIR/$DOMAIN/$NAME | map_headers_to_taxid $TAXID

    if [[ "$DO_DUST" == "1" ]]; then
      ## TODO: Consider hard-masking only low-complexity stretches with 10 or more bps
      dustmasker -infmt fasta -in $LIBDIR/$DOMAIN/$NAME -level 20 -outfmt fasta | sed '/^>/! s/[^AGCT]/N/g' > $LIBDIR/$DOMAIN/${NAME%.fna}_dustmasked.fna
      rm $LIBDIR/$DOMAIN/$NAME
    fi
    echo done
}
export -f download_n_process



function download_n_process_nofail() {
    download_n_process "$@" || true
}
export -f download_n_process_nofail


ceol=`tput el` # terminfo clr_eol

function count {
   typeset C=0
   while read L; do
      if [[ "$L" == "done" ]]; then
        C=$(( C + 1 ))
        _progress=$(( (${C}*100/${1}*100)/100 ))
        _done=$(( (${_progress}*4)/10 ))
        _left=$(( 40-$_done ))
        # Build progressbar string lengths
        _done=$(printf "%${_done}s")
        _left=$(printf "%${_left}s")

        printf "\rProgress : [${_done// /#}${_left// /-}] ${_progress}%% $C/$1"  1>&2
      else
        echo "$L"
      fi
   done
}

function check_or_mkdir_no_fail {
    #echo -n "Creating $1 ... " >&2
    if [[ -d $1 && ! -n `find $1 -prune -empty -type d` ]]; then
        echo "Directory $1 exists.  Continuing" >&2
        return `true`
    else 
        #echo "Done" >&2
        mkdir -p $1
        return `true`
    fi
}

function c_echo() {
        printf "\033[34m$*\033[0m\n"
}



## Check if GNU parallel exists
command -v parallel >/dev/null 2>&1 && PARALLEL=1 || PARALLEL=0


ALL_GENOMES="bacteria viral archaea fungi protozoa invertebrate plant vertebrate_mammalian vertebrate_other"
ALL_DATABASES="refseq genbank taxonomy contaminants"
ALL_ASSEMBLY_LEVELS="Complete\ Genome Chromosome Scaffold Contig"

## Option parsing
DATABASE="refseq"
ASSEMBLY_LEVEL="Complete Genome"
REFSEQ_CATEGORY=""
TAXID=""

BASE_DIR="."
N_PROC=1
CHANGE_HEADER=0
DOWNLOAD_RNA=0
DO_DUST=0
FILTER_UNPLACED=0

USAGE="
`basename $0` [<options>] <database>

ARGUMENT
 <database>        One of refseq, genbank, contaminants or taxonomy:
                     - use refseq or genbank for genomic sequences,
                     - contaminants gets contaminant sequences from UniVec and EmVec,
                     - taxonomy for taxonomy mappings.

COMMON OPTIONS
 -o <directory>         Folder to which the files are downloaded. Default: '$BASE_DIR'.
 -P <# of threads>      Number of processes when downloading (uses xargs). Default: '$N_PROC'

WHEN USING database refseq OR genbank:
 -d <domain>            What domain to download. One or more of ${ALL_GENOMES// /, } (comma separated).
 -a <assembly level>    Only download genomes with the specified assembly level. Default: '$ASSEMBLY_LEVEL'. Use 'Any' for any assembly level.
 -c <refseq category>   Only download genomes in the specified refseq category. Default: any.
 -t <taxids>            Only download the specified taxonomy IDs, comma separated. Default: any.
 -r                     Download RNA sequences, too.
 -u                     Filter unplaced sequences.
 -m                     Mask low-complexity regions using dustmasker. Default: off.
 -l                     Modify header to include taxonomy ID. Default: off.
 -g                     Download GI map.
"

# arguments: $OPTFIND (current index), $OPTARG (argument for option), $OPTERR (bash-specific)
while getopts "o:P:d:a:c:t:urlm" OPT "$@"; do
    case $OPT in
        o) BASE_DIR="$OPTARG" ;;
        P) N_PROC="$OPTARG" ;;
        d) DOMAINS=${OPTARG//,/ } ;;
        a) ASSEMBLY_LEVEL="$OPTARG" ;;
        c) REFSEQ_CATEGORY="$OPTARG" ;;
        t) TAXID="$OPTARG" ;;
        r) DOWNLOAD_RNA=1 ;;
		u) FILTER_UNPLACED=1 ;;
        m) DO_DUST=1 ;;
        l) CHANGE_HEADER=1 ;;
        \?) echo "Invalid option: -$OPTARG" >&2 
            exit 1 
        ;;
        :) echo "Option -$OPTARG requires an argument." >&2
           exit 1
        ;;
    esac
done
shift $((OPTIND-1))

[[ $# -eq 1 ]] || { printf "$USAGE" >&2 && exit 1; };
DATABASE=$1

#### TAXONOMY DOWNLOAD
FTP="ftp://ftp.ncbi.nih.gov"
if [[ "$DATABASE" == "taxonomy" ]]; then 
  echo "Downloading NCBI taxonomy ... " >&2
  if check_or_mkdir_no_fail "$BASE_DIR"; then
    cd "$BASE_DIR" > /dev/null
    if [ $DL_MODE = "rsync" ]; then
        $DL_PROG ${FTP/ftp/rsync}/pub/taxonomy/taxdump.tar.gz taxdump.tar.gz
    else
        $DL_PROG taxdump.tar.gz $FTP/pub/taxonomy/taxdump.tar.gz
    fi
    tar -zxvf taxdump.tar.gz nodes.dmp
    tar -zxvf taxdump.tar.gz names.dmp
    rm taxdump.tar.gz
    cd - > /dev/null
  fi
  exit 0
fi

dat_to_fna() {
	grep -E '^DE|^ ' | awk '/^DE/ { sub(/DE */,">"); gsub(/[ |]/,"_") }; { print }' | awk '/^ / { gsub(/ /,""); sub(/[0-9]*$/,"") }; { print }' 
}

#### CONTAMINANT SEQ DOWNLOAD
if [[ "$DATABASE" == "contaminants" ]]; then 
  echo "Downloading contaminant databases ... " >&2
  CONTAMINANT_TAXID=32630
  CONTAMINANT_DIR="$BASE_DIR/contaminants"
  if check_or_mkdir_no_fail "$CONTAMINANT_DIR"; then
    cd "$CONTAMINANT_DIR" > /dev/null

    # download UniVec and EmVec database
    if [ $DL_MODE = "rsync" ]; then
        $DL_PROG rsync://ftp.ncbi.nlm.nih.gov/pub/UniVec/UniVec UniVec.fna
        $DL_PROG rsync://ftp.ebi.ac.uk/pub/databases/emvec/emvec.dat.gz emvec.dat.gz
    else
        $DL_PROG UniVec.fna ftp://ftp.ncbi.nlm.nih.gov/pub/UniVec/UniVec
        $DL_PROG emvec.dat.gz ftp://ftp.ebi.ac.uk/pub/databases/emvec/emvec.dat.gz
    fi
    gunzip -c emvec.dat.gz | dat_to_fna > EmVec.fna
 
    if [[ "$CHANGE_HEADER" == "1" ]]; then
        sed -i "s/^>/>taxid|$CONTAMINANT_TAXID /" UniVec.fna
        sed -i "s/^>/>taxid|$CONTAMINANT_TAXID /" EmVec.fna
    else 
	cat UniVec.fna | map_headers_to_taxid $CONTAMINANT_TAXID
	cat EmVec.fna | map_headers_to_taxid $CONTAMINANT_TAXID
    fi

   #sed ':a $!N; s/^>.*\n>/>/; P; D' Contaminants/emvec.fa  > Contaminants/emvec.fa
    rm emvec.dat.gz

    cd - > /dev/null
	exit 0;
  fi
fi



#### REFSEQ/GENBANK DOWNLOAD

export LIBDIR="$BASE_DIR"
export DO_DUST="$DO_DUST"
export CHANGE_HEADER="$CHANGE_HEADER"

## Fields in the assembly_summary.txt file
REFSEQ_CAT_FIELD=5
TAXID_FIELD=6
SPECIES_TAXID_FIELD=7
VERSION_STATUS_FIELD=11
ASSEMBLY_LEVEL_FIELD=12
FTP_PATH_FIELD=20
FTP_PATH_FIELD2=21  ## Needed for wrongly formatted virus files - hopefully just a temporary fix

AWK_QUERY="\$$VERSION_STATUS_FIELD==\"latest\""
[[ "$ASSEMBLY_LEVEL" != "Any" ]] && AWK_QUERY="$AWK_QUERY && \$$ASSEMBLY_LEVEL_FIELD==\"$ASSEMBLY_LEVEL\""
[[ "$REFSEQ_CATEGORY" != "" ]] && AWK_QUERY="$AWK_QUERY && \$$REFSEQ_CAT_FIELD==\"$REFSEQ_CATEGORY\""

TAXID=${TAXID//,/|}
[[ "$TAXID" != "" ]] && AWK_QUERY="$AWK_QUERY && match(\$$TAXID_FIELD,\"^($TAXID)\$\")"

#echo "$AWK_QUERY" >&2

#echo "Downloading genomes for $DOMAINS at assembly level $ASSEMBLY_LEVEL" >&2
if exists wget; then
	wget -qO- --no-remove-listing ftp://ftp.ncbi.nlm.nih.gov/genomes/$DATABASE/ > /dev/null
else
	curl --disable-epsv -s ftp://ftp.ncbi.nlm.nih.gov/genomes/$DATABASE/ > .listing
fi

if [[ "$CHANGE_HEADER" == "1" ]]; then
    echo "Modifying header to include taxonomy ID" >&2
fi


for DOMAIN in $DOMAINS; do
    if [[ -f .listing ]]; then
        if [[ ! `grep "^d.* $DOMAIN" .listing` ]]; then
            c_echo "$DOMAIN is not a valid domain - use one of the following:" >&2
            grep '^d' .listing  | sed 's/.* //' | sed 's/^/  /' 1>&2
            exit 1
        fi
    fi
    
    #if [[ "$CHANGE_HEADER" != "1" ]]; then
        #echo "Writing taxonomy ID to sequence ID map to STDOUT" >&2
        #[[ -f "$LIBDIR/$DOMAIN.map" ]] && rm "$LIBDIR/$DOMAIN.map"
    #fi

    export DOMAIN=$DOMAIN
    check_or_mkdir_no_fail $LIBDIR/$DOMAIN

    FULL_ASSEMBLY_SUMMARY_FILE="$LIBDIR/$DOMAIN/assembly_summary.txt"
    ASSEMBLY_SUMMARY_FILE="$LIBDIR/$DOMAIN/assembly_summary_filtered.txt"

    echo "Downloading ftp://ftp.ncbi.nlm.nih.gov/genomes/$DATABASE/$DOMAIN/assembly_summary.txt ..." >&2
    if [ $DL_MODE = "rsync" ]; then
        $DL_PROG rsync://ftp.ncbi.nlm.nih.gov/genomes/$DATABASE/$DOMAIN/assembly_summary.txt "$FULL_ASSEMBLY_SUMMARY_FILE" || {
            c_echo "rsync Download failed! Have a look at valid domains at ftp://ftp.ncbi.nlm.nih.gov/genomes/$DATABASE ." >&2
            exit 1;
        }
    else
        $DL_PROG "$FULL_ASSEMBLY_SUMMARY_FILE" ftp://ftp.ncbi.nlm.nih.gov/genomes/$DATABASE/$DOMAIN/assembly_summary.txt > "$FULL_ASSEMBLY_SUMMARY_FILE" || {
            c_echo "Download failed! Have a look at valid domains at ftp://ftp.ncbi.nlm.nih.gov/genomes/$DATABASE ." >&2
            exit 1;
        }
    fi

    awk -F "\t" "BEGIN {OFS=\"\t\"} $AWK_QUERY" "$FULL_ASSEMBLY_SUMMARY_FILE" > "$ASSEMBLY_SUMMARY_FILE"

    N_EXPECTED=`cat "$ASSEMBLY_SUMMARY_FILE" | wc -l`
    [[ $N_EXPECTED -gt 0 ]] || { echo "Domain $DOMAIN has no genomes with specified filter." >&2; exit 1; }

    if [[ "$DOMAIN" == "viral" ]]; then
      ## Wrong columns in viral assembly summary files - the path is sometimes in field 20, sometimes 21
      cut -f "$TAXID_FIELD,$FTP_PATH_FIELD,$FTP_PATH_FIELD2" "$ASSEMBLY_SUMMARY_FILE" | \
       sed 's/^\(.*\)\t\(ftp:.*\)\t.*/\1\t\2/;s/^\(.*\)\t.*\t\(ftp:.*\)/\1\t\2/' | \
      sed 's#\([^/]*\)$#\1/\1_genomic.fna.gz#' |\
         tr '\n' '\0' | xargs -0 -n1 -P $N_PROC bash -c 'download_n_process_nofail "$@"' _ | count $N_EXPECTED
    else
      echo "Downloading $N_EXPECTED $DOMAIN genomes at assembly level $ASSEMBLY_LEVEL ... (will take a while)" >&2
      cut -f "$TAXID_FIELD,$FTP_PATH_FIELD" "$ASSEMBLY_SUMMARY_FILE" | sed 's#\([^/]*\)$#\1/\1_genomic.fna.gz#' |\
         tr '\n' '\0' | xargs -0 -n1 -P $N_PROC bash -c 'download_n_process_nofail "$@"' _ | count $N_EXPECTED
    fi
    echo >&2



    if [[ "$DOWNLOAD_RNA" == "1" && ! `echo $DOMAIN | egrep 'bacteria|viral|archaea'` ]]; then
    	echo "Downloadinging rna sequence files" >&2
        cut -f $TAXID_FIELD,$FTP_PATH_FIELD  "$ASSEMBLY_SUMMARY_FILE"| sed 's#\([^/]*\)$#\1/\1_rna.fna.gz#' |\
            tr '\n' '\0' | xargs -0 -n1 -P $N_PROC bash -c 'download_n_process_nofail "$@"' _ | count $N_EXPECTED
        echo >&2
    fi
done