#!/bin/bash

# Copyright 2016-2026, André Müller (github.com/muellan)
#
# This file is part of the MetaCache taxonomic sequence classification tool.
#
# MetaCache is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# MetaCache is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with MetaCache.  If not, see <http://www.gnu.org/licenses/>.

# Abort on the first failing command (-e) and treat the expansion of an
# unset variable as an error (-u).
set -eu

# Base location of the NCBI genomes tree; library paths get appended to it.
FTPURL='ftp://ftp.ncbi.nih.gov/genomes'


#######################################
# Helper of download_specific_lib: downloads and unpacks one library into
# the current working directory and marks it with "complete.flag".
# Every critical step is checked explicitly so that the helper also reports
# failures when it runs in a context where 'set -e' is not in effect.
# Globals:
#   FTPURL (read) - base URL; "<library>/assembly_summary.txt" is fetched
#                   from below it
# Arguments:
#   $1 - library path relative to FTPURL, e.g. "refseq/bacteria"
#   $2 - filter level: >= 1 selects every assembly marked "latest",
#        anything else selects only "Complete Genome" assemblies
# Outputs:
#   progress messages on stdout, error messages on stderr
# Returns:
#   0 on success, 1 if any step failed
#######################################
function _download_specific_lib_fetch {
  local dbase=$1
  local filter=$2
  local -r parallel=16

  echo "Downloading NCBI $dbase."

  # Remove leftovers of an earlier, unfinished run.
  rm -f -- assembly_summary.txt ftpdirpaths* ftpfilepaths*

  if ! wget "${FTPURL}/${dbase}/assembly_summary.txt" \
      || [[ ! -s assembly_summary.txt ]]; then
    echo "ERROR: Couldn't find assembly_summary text file!" >&2
    return 1
  fi

  # Column 20 of the summary is the assembly's FTP directory ("na" if there
  # is none), column 11 the version status, column 12 the assembly level.
  # Rows without a usable path are skipped in both modes; otherwise they
  # would turn into bogus URLs and make the whole download fail.
  if [ "$filter" -ge 1 ]; then
    awk -F "\t" '$11=="latest" && $20!="na"{print $20}' \
      assembly_summary.txt > ftpdirpaths || return 1
  else
    # only complete genomes with valid ftp paths
    awk -F "\t" '$12=="Complete Genome" && $11=="latest" && $20!="na"{print $20}' \
      assembly_summary.txt > ftpdirpaths || return 1
  fi

  # Turn ".../<assembly>" into ".../<assembly>/<assembly>_genomic.fna.gz".
  # The assembly name is taken from the 10th "/"-separated field, which
  # assumes the usual "scheme://host/genomes/all/XXX/nnn/nnn/nnn/<assembly>"
  # layout of the directory URLs.
  awk 'BEGIN{FS=OFS="/"}{print $0, $10 "_genomic.fna.gz"}' \
    ftpdirpaths > ftpfilepaths || return 1

  if [[ ! -s ftpfilepaths ]]; then
    echo "ERROR: No sequence files selected from assembly_summary text file!" >&2
    return 1
  fi

  # One wget per URL, several at a time; -c resumes partial downloads.
  if ! xargs -P "$parallel" -n 1 wget -c -nv < ftpfilepaths; then
    echo "ERROR: Download of at least one sequence file failed!" >&2
    return 1
  fi

  printf '%s' "Download complete. Unpacking files ... "
  # -n 1 is needed for -P to have any effect (otherwise xargs hands all
  # files to a single gunzip); -f overwrites outputs left behind by an
  # interrupted earlier run instead of failing on them.
  if ! find . -maxdepth 1 -name '*.gz' -type f -print0 \
      | xargs -0 -P "$parallel" -n 1 gunzip -f; then
    echo "ERROR: Unpacking of at least one sequence file failed!" >&2
    return 1
  fi

  rm -f -- ftpdirpaths* ftpfilepaths*

  touch complete.flag || return 1

  echo "complete."
}

#######################################
# Downloads all selected genomic FASTA files of one NCBI genomes library
# into a directory named after the library (created below the current
# directory) and unpacks them there. Does nothing if that directory already
# contains "complete.flag". The caller's working directory is restored
# before returning, also for nested library paths like "refseq/bacteria".
# Globals:
#   FTPURL (read)
# Arguments:
#   $1 - library path relative to FTPURL; also used as local directory name
#   $2 - filter level: 0 = complete genomes only, 1 = all latest assemblies
# Outputs:
#   progress messages on stdout, error messages on stderr
# Returns:
#   0 on success or if the download was already complete, non-zero otherwise
#######################################
function download_specific_lib {
  local dbase=$1
  local filter=$2
  local -r start_dir=$PWD
  local status=0

  mkdir -p -- "$dbase" || return 1
  cd -- "$dbase" || return 1

  if [[ -e complete.flag ]]; then
    echo "Download of $dbase already complete."
  else
    _download_specific_lib_fetch "$dbase" "$filter" || status=$?
  fi

  # Go back to where we started; "cd .." would only climb one level and
  # leave the caller inside the first component of a nested library path.
  cd -- "$start_dir" || return 1
  return "$status"
}


# Command line: <library> <target directory> [filter]
# Calling the script with too few arguments is a usage error, so the help
# text goes to stderr and the script exits with a non-zero status.
if [[ $# -lt 2 ]]; then
  {
    echo "Usage: $0 (refseq|genbank)/<sublibrary> <target directory> [all]"
    echo "    Example 1: $0 refseq/bacteria directory1"
    echo ""
    echo "    Example 2: $0 genbank/bacteria directory2"
    echo ""
    echo "    Example 3: All bacterial sequences from RefSeq:"
    echo "               $0 refseq/bacteria directory1 all"
  } >&2
  exit 1
fi

DATABASE=$1
TARGETDIR=$2

# FILTER=0 restricts the download to complete genomes; the optional third
# argument ("all" or "contigs") lifts that restriction.
FILTER=0
if [[ $# -gt 2 ]]; then
  case "$3" in
    contigs | all)
      echo "PLEASE NOTE: Make sure you have MetaCache compiled with support for more than 65536 reference genomes!"
      FILTER=1
      ;;
    *)
      echo "unknown filter option $3" >&2
      exit 1
      ;;
  esac
fi


# Quoted so that target directories containing spaces or glob characters
# work; "--" protects names that start with a dash.
mkdir -p -- "$TARGETDIR"
cd -- "$TARGETDIR"

download_specific_lib "$DATABASE" "$FILTER"

# Leave the working directory one level up before the script ends.
cd ..
