#!/usr/bin/env bash

# PADLOC :: Locate antiviral defence systems in prokaryotic genomes
# https://github.com/leightonpayne/padloc
# Copyright (c) 2020-2023 Leighton Payne

VERSION="v2.0.0"
ME="${0}"
SRC_DIR="$(dirname ${ME})"
DBREPO="https://github.com/padlocbio/padloc-db"

# BASH STRICT MODE ------------------------------------------------------------

# Refer to: http://redsymbol.net/articles/unofficial-bash-strict-mode/

set -o nounset    # Unbound variables cause an error on parameter expansion.
set -o errexit    # Exit if a pipeline returns non-zero.
set -o errtrace   # Traps inherited by shell functions.
set -o pipefail   # Use the last non-zero exit code in a pipeline.
IFS=$'\n\t'       # Set wordsplitting to newline and tab.

# Trap on errexit
trap 'echo "$(date "+[%H:%M:%S]") ERROR >> errexit on line $LINENO" >&2' ERR

# MESSAGING -------------------------------------------------------------------

# die <message>; Print error message and exit.
die() { printf "\nERROR: \t %s\n" "$@" >&2 ; exit 1; } 

# debug <message>; Print message when using debug.
debug() {
  if [[ "${USE_DEBUG:-"0"}" -eq 1 ]] ; then 
    printf "$(date "+[%H:%M:%S]") DEBUG >> ${@}"
  fi
}

# info <message>; Print info message.
info() { printf "$(date "+[%H:%M:%S]") >> ${@}"; }

# HELP ------------------------------------------------------------------------

# help; Print usage information.
help() {
cat << EOF

        ___       ___                           ___       ___   
       /  /\     /  /\     _____               /  /\     /  /\  
      /  /::\   /  /::\   /  /::\   ___  __   /  /::\   /  /:/_
     /  /:/:/  /  /:/::\ /  /:/\:\ /  /\/ /\ /  /:/\:\ /  /:/ /\\
     \  \::/   \  \::/\/ \  \:\/:/ \  \:\/:/ \  \:\/:/ \  \:\/:/
      \  \:\    \  \:\    \  \::/   \  \::/   \  \::/   \  \::/  
       \__\/     \__\/     \__\/     \__\/     \__\/     \__\/          

PADLOC :: Locate antiviral defence systems in prokaryotic genomes

Usage:
  padloc [options] --faa <genome.faa> --gff <features.gff>
  padloc [options] --fna <genome.fna>

General:
  --help            Print this help message
  --version         Print version information
  --citation        Print citation information
  --check-deps      Check that dependencies are installed
  --debug           Run with debug messages
Database:
  --db-list         List all PADLOC-DB releases
  --db-install [n]  Install specific PADLOC-DB release [n]
  --db-update       Install latest PADLOC-DB release
  --db-version      Print database version information
Input:
  --faa [f]         Amino acid FASTA file (only valid with [--gff])
  --gff [f]         GFF file (only valid with [--faa])
  --fna [f]         Nucleic acid FASTA file
  --crispr [f]      CRISPRDetect output file containing array data
                    To pre-compute this file see instructions at:
                    'https://github.com/padlocbio/padloc/tree/master/etc'
  --ncrna [f]       Infernal output file containing ncRNA data
                    To pre-compute this file see instructions in:
                    'https://github.com/padlocbio/padloc/tree/master/etc'
Output:
  --outdir [d]      Output directory
Optional:
  --data [d]        Data directory (default '${DATA}')
  --cpu [n]         Use [n] CPUs (default '1')
  --fix-prodigal    Set this flag when providing an FAA and GFF file
                    generated with prodigal to force fixing of sequence IDs

Example input files can be found at:
${TEST}
EOF
}

# version; Print version information.
version() { printf "padloc %s\n" "${VERSION}"; }

# citation; Print citation information.
citation() { printf "
If you use PADLOC or PADLOC-DB please cite:

  Payne, L.J., Todeschini, T.C., Wu, Y., Perry, B.J., Ronson, C.W., 
  Fineran, P.C., Nobrega, F.L., Jackson, S.A. (2021) Identification and 
  classification of antiviral defence systems in bacteria and archaea with
  PADLOC reveals new system types. Nucleic Acids Research, 49, 10868-10878.
  doi: https://doi.org/10/gzgh

The HMMs and system models in PADLOC-DB were built and curated using the data
and conclusions from many different sources, we encourage you to also give
credit to these groups by reading their work and citing them where appropriate.
References to relevant literature can be found at:
https://github.com/padlocbio/padloc-db/blob/master/system_info.md#References
  "; }

# UTILITITES ------------------------------------------------------------------

# abspath <directory>; Get the absolute path to a directory.
abspath() {
  [[ "${1}" = /* ]] && echo "${1}" || echo "$PWD/${1#./}"
}

# normpath <directory>; Get the normalised path to a directory.
normpath() {
  printf $(unset CDPATH && cd "${1}" && pwd)
}

# check_command <command>; Check whether command is in $PATH
check_command() {
  command -v "$1" > /dev/null 2>&1
}

# check_rpkg <R package>; Check whether R package is installed
check_rpkg() {
  R --slave -e "packageVersion('$1')" > /dev/null 2>&1
}

# check_deps; Check that all dependencies are installed
check_deps() {
  if check_command R; then 
    info "R installed (%s)\n" $(R --version | grep "R version" | cut -d ' ' -f 3)
  else
    info "WARNING: R not installed\n"
  fi
  if check_command hmmsearch; then 
    info "HMMER installed (%s)\n" $(hmmsearch -h | grep HMMER | cut -d ' ' -f 3)
  else
    info "WARNING: HMMER not installed\n"
  fi
  if check_command prodigal; then 
    info "Prodigal installed (%s)\n" $(prodigal -v 2>&1 >/dev/null | grep -o '[0-9]*\.[0-9]*\.[0-9]*') 
  else
    info "WARNING: Prodigal not installed\n"
  fi
  if check_rpkg tidyverse; then
    info "R package 'tidyverse' installed (%s)\n" $(R --slave -e "packageVersion('tidyverse')" | grep -o '[0-9]*\.[0-9]*\.[0-9]*')
  else
    info "WARNING: R package 'tidyverse' not installed\n"
  fi
  if check_rpkg yaml; then
    info "R package 'yaml' installed (%s)\n" $(R --slave -e "packageVersion('yaml')" | grep -o '[0-9]*\.[0-9]*\.[0-9]*')
  else
    info "WARNING: R package 'yaml' not installed\n"
  fi
  if check_rpkg getopt; then
    info "R package 'getopt' installed (%s)\n" $(R --slave -e "packageVersion('getopt')" | grep -o '[0-9]*\.[0-9]*\.[0-9]*')
  else
    info "WARNING: R package 'getopt' not installed\n"
  fi
}


# DATABASE TOOLS --------------------------------------------------------------

db_list() {
  RELEASES=($(curl -s "${DBREPO}/releases" | grep "${DBREPO#https://github.com/}/releases/tag/" | grep -oP 'v[0-9].[0-9].[0.9](?=")'))
  for i in ${RELEASES[@]}; do
    echo "${i} :: ${DBREPO}/releases/${i}"
  done
}

db_install() {
  local version="${1:-}"
  if [[ $(curl -sL "${DBREPO}/releases/${version}") == "Not Found" ]]; then
    printf "ERROR: database version \"${version}\" does not exist, try install one of these instead:\n\n"
    db_list
    LATEST_VER=$(curl -s "${DBREPO}/releases" | grep "${DBREPO#https://github.com/}/releases/tag/" | grep -oP 'v[0-9].[0-9].[0.9](?=")' | grep -m1 "")
    printf "\nFor example:\n  $ padloc --db-install ${LATEST_VER}\n"
    exit 1
  else 
    [[ -d ${DATA} ]] && rm -drf ${DATA}
    mkdir ${DATA}
    echo "Downloading database version ${version}..."
    curl -sL "${DBREPO}/archive/refs/tags/${version}.tar.gz" --output ${DATA}/data.tar.gz
    echo "Unpacking database..."
    tar -xzf ${DATA}/data.tar.gz --directory ${DATA} --strip-components 1 && rm ${DATA}/data.tar.gz
  fi
}

db_update() {
  LATEST_VER=$(curl -s "${DBREPO}/releases" | grep "${DBREPO#https://github.com/}/releases/tag/" | grep -oP 'v[0-9].[0-9].[0.9](?=")' | grep -m1 "")
  db_install "${LATEST_VER}"
}

db_compile() {
  echo "Compiling database..."
  find ${DATA}/hmm/ -maxdepth 1 -name "*.hmm" -exec cat {} > ${DATA}/padlocdb.hmm \;
  find ${DATA}/hmm/ -maxdepth 1 -name "*.hmm" -delete
  mv ${DATA}/padlocdb.hmm ${DATA}/hmm/
  find ${DATA}/cm/ -maxdepth 1 -name "*.cm" -exec cat {} > ${DATA}/padlocdb.cm \;
  find ${DATA}/cm/ -maxdepth 1 -name "*.cm" -delete
  mv ${DATA}/padlocdb.cm ${DATA}/cm/
}

db_version() {
  if DBVER=$(grep "# PADLOC-DB release v" ${DATA}/RELEASE.md); then 
    printf "padloc-db v%s\n" "${DBVER#'# PADLOC-DB release v'}"
  else 
    echo "ERROR: Database version information not found" && exit 1
  fi
}

db_dev() {
  echo "Downloading PADLOC-DB development version..."
  curl -L "${DBREPO}/archive/master.tar.gz" --output ${DATA}/data.tar.gz
  echo "Unpacking database..."
  tar -xzf ${DATA}/data.tar.gz --directory ${DATA} --strip-components 1 && rm ${DATA}/data.tar.gz
}

# OPTIONS ---------------------------------------------------------------------

# Set defaults.
USE_DEBUG=0
CPU=1
QUIET=0
mkdir -p "${SRC_DIR}/../data"
DATA=$(normpath "${SRC_DIR}/../data")
TEST=$(normpath "${SRC_DIR}/../test")
FAA_FILE=""
FNA_FILE=""
FNA_COLLAPSED=""
FASTA_NAME=""
GFF_FILE=""
CRISPR_FILE=""
NCRNA_FILE=""
OUT_DIR="."
PRODIGAL=0
FORCE=0

# If no options given, print help.
[[ "${#}" = 0 ]] && help && exit 0

# require_argument <option> <argument>; Die when <argument> is blank or another option.
require_argument() {
  local option="${1:-}"
  local argument="${2:-}"
  if [[ -z "${argument}" ]] || [[ "${argument}" =~ ^- ]]; then
    die "Option [${option}] requires an argument"
  fi
}

# Read options.
while [[ "${#}" -gt 0 ]]; do
  option="${1:-}"
  parameter="${2:-}"
  case "${option}" in
    -h|--help) help; exit 0 ;;
    -v|--version) version; exit 0 ;;
    --db-list) db_list; exit 0 ;;
    --db-install) require_argument "${option}" "${parameter}"; db_install "${parameter}" && db_compile; exit 0 ;;
    --db-update) db_update && db_compile; exit 0 ;;
    --db-dev) db_dev && db_compile; exit 0 ;;
    --db-version) db_version; exit 0 ;;
    --citation) citation; exit 0 ;;
    --check-deps) check_deps; exit 0 ;;
    -d|--debug) USE_DEBUG=1 ;;
    -q|--quiet) QUIET=1 ;;
    --faa) require_argument "${option}" "${parameter}"
      FAA_FILE=$(abspath "${parameter}") 
      FASTA_NAME=$(basename ${FAA_FILE%.faa}) ; shift ;;
    --gff) require_argument "${option}" "${parameter}"
      GFF_FILE=$(abspath "${parameter}") ; shift ;;
    --fna) require_argument "${option}" "${parameter}"
      FNA_FILE=$(abspath "${parameter}") 
      FASTA_NAME=$(basename ${FNA_FILE%.fna}) ; shift ;;
    --crispr) require_argument "${option}" "${parameter}"
      CRISPR_FILE=$(abspath "${parameter}") ; shift ;;
    --ncrna) require_argument "${option}" "${parameter}"
      NCRNA_FILE=$(abspath "${parameter}") ; shift ;;
    --data) require_argument "${option}" "${parameter}"
      DATA=$(abspath "${parameter}") ; shift ;;
    -o|--outdir) require_argument "${option}" "${parameter}"
      OUT_DIR=$(abspath "${parameter}") ; shift ;;
    -c|--cpu) require_argument "${option}" "${parameter}"
      CPU="${parameter}" ; shift ;;
    --force) FORCE=1;;
    --fix-prodigal) PRODIGAL=1 ;;
    --endopts) break ;;
    *) die "Unexpected option: [${option}]" ;;
  esac
  shift
done

# Set database paths.
HMM_DATABASE="${DATA}/hmm/padlocdb.hmm"
YAML_DIR="${DATA}/sys/"
HMM_META="${DATA}/hmm_meta.txt"
SYS_META="${DATA}/sys_meta.txt"

# CORE FUNCTIONS --------------------------------------------------------------

# check_opt; Check whether options are valid.
check_opt() {

  # Check for valid input.
  if [[ ! -z "${FAA_FILE}" ]]; then
    [[ -f "${FAA_FILE}" ]] || die "'${FAA_FILE}' does not exist"
    [[ -s "${FAA_FILE}" ]] || die "'${FAA_FILE}' is empty"
    [[ -z "${FNA_FILE}" ]] || die "Only one of [--faa] or [--fna] should be provided, not both"
    [[ ! -z "${GFF_FILE}" ]] || die "[--faa] requires [--gff]"
  elif [[ ! -z "${FNA_FILE}" ]]; then
    [[ -f "${FNA_FILE}" ]] || die "'${FNA_FILE}' does not exist"
    [[ -s "${FNA_FILE}" ]] || die "'${FNA_FILE}' is empty"
    [[ -z "${FAA_FILE}" ]] || die "Only one of [--faa] or [--fna] should be provided, not both"
    [[ -z "${GFF_FILE}" ]] || die "[--gff] can't be used with [--fna]"
  else
    die "Valid FASTA file required [--fna|--faa]"
  fi

  # Check length of FNA sequence.
  if [[ ! -z "${FNA_FILE}" ]]; then
    FNA_COLLAPSED=$(sed '/^>/d' "${FNA_FILE}" | tr -d '\n')
    [[ "${#FNA_COLLAPSED}" -ge 100000 ]] || die "Prodigal works best with sequences > 100kbp, consider carrying out gene prediction with another tool before using PADLOC"
  fi


  # Check that the specified CRISPR input exists.
   if [[ ! -z "${CRISPR_FILE}" ]]; then
    [[ -f "${CRISPR_FILE}" ]] || die "The CRISPR array input file, '${CRISPR_FILE}' does not exist"
  fi
  
  # Check that the specified retron ncRNA input exists.
   if [[ ! -z "${NCRNA_FILE}" ]]; then
    [[ -f "${NCRNA_FILE}" ]] || die "The retron ncRNA input file, '${NCRNA_FILE}' does not exist"
  fi
  
  # Check that output directory exists.
  [[ -d "${OUT_DIR}" ]] || die "Valid output directory required"

  # Check whether --cpu is a valid non-zero integer.
  [[ "${CPU}" =~ ^-?[0-9]+$ && "${CPU}" -gt 0 ]] ||
  die "Number of cpus must be a non-zero integer"

  # Check whether --data directory exists.
  [[ ! -z "${DATA}" ]] && [[ ! -d "${DATA}" ]] && die "Valid data directory required"

  # Print parameters for debug.
  OPTIONS=(FAA_FILE FNA_FILE GFF_FILE HMM_DATABASE YAML_DIR HMM_META SYS_META OUT_DIR CPU)
  for i in "${OPTIONS[@]}"; do
    debug "\$${i}: ${!i}\n"
  done

}

# prodigal_wrapper; Wrapper for running prodigal.
prodigal_wrapper() {

  local timestamp=$(date '+%Y%m%d%H%M%S')
  FAA_FILE="${OUT_DIR}/${FASTA_NAME}_prodigal.faa"
  GFF_FILE="${OUT_DIR}/${FASTA_NAME}_prodigal.gff"
  PRODIGAL=1

  if [[ "${FORCE}" -eq 0 && -f "${FAA_FILE}" && -f "${GFF_FILE}" ]] ; then
    info "${FASTA_NAME}_prodigal.faa and ${FASTA_NAME}_prodigal.gff already exist; use --force to overwrite\n"
  else
    # Print command.
    debug "prodigal -i ${FNA_FILE} -f gff -o ${GFF_FILE} -a ${FAA_FILE} -q\n"
    # Run prodigal.
    if prodigal -i "${FNA_FILE}" -f gff -o "${GFF_FILE}" -a "${FAA_FILE}" -q; then
      debug "Finished @ $(date) ... SUCCESSFUL\n"
      # Convert to short headers.
      sed -i.bak 's/#.*//g' "${FAA_FILE}" && rm "${FAA_FILE}.bak"
      # Fix error in Prodigal formatting
      sed -i.bak '/^#/d' "${GFF_FILE}" && rm "${GFF_FILE}.bak"
    else
      debug "Finished @ $(date) ... FAILED\n"
      die "prodigal failed\n"
    fi
  fi

}

# hmmsearch_wrapper; Wrapper for running hmmsearch.
hmmsearch_wrapper() {

  DOMTBL_PATH="${OUT_DIR}/${FASTA_NAME}.domtblout"

  # If domtblout already exists, skip - unless forcing.
  if [[ "${FORCE}" -eq 0 && -f "${DOMTBL_PATH}" ]] ; then
    info "${FASTA_NAME}.domtblout already exists; use --force to overwrite\n"
  else
    # Print command.
    debug "hmmsearch --cpu ${CPU} --acc --noali --domtblout ${DOMTBL_PATH} ${HMM_DATABASE} ${FAA_FILE}\n"
    # Run hmmer.
    if hmmsearch --cpu "${CPU}" --acc --noali --domtblout "${DOMTBL_PATH}" "${HMM_DATABASE}" "${FAA_FILE}" > /dev/null; then
      debug "Finished @ $(date) ... SUCCESSFUL\n"
    else
      debug "Finished @ $(date) ... FAILED\n"
      die "hmmsearch failed"
    fi
  fi

}

# padloc_wrapper; Wrapper for running padloc.R
padloc_wrapper() {

  OUT_PATH="${OUT_DIR}/${FASTA_NAME}_padloc.csv"

  # If not appending and domtblout already exists, skip - unless forcing.
  if [[ "${FORCE}" -eq 0 && -f "${OUT_PATH}" ]] ; then
    info "${FASTA_NAME}_padloc.csv already exists; use --force to overwrite\n"
  else
    # Print command.
    debug "Rscript ${SRC_DIR}/bin/padloc.R -d ${DOMTBL_PATH} -f ${GFF_FILE} -c ${CRISPR_FILE} -r ${NCRNA_FILE} -h ${HMM_META} -s ${SYS_META} -y ${YAML_DIR} -o ${OUT_DIR} -b ${USE_DEBUG} -q ${QUIET} -p ${PRODIGAL}\n"
    # Run padloc.
    Rscript "${SRC_DIR}/padloc.R" \
    -d "${DOMTBL_PATH}" \
    -f "${GFF_FILE}" \
    -c "${CRISPR_FILE}" \
    -r "${NCRNA_FILE}" \
    -h "${HMM_META}" \
    -s "${SYS_META}" \
    -y "${YAML_DIR}" \
    -o "${OUT_DIR}" \
    -b "${USE_DEBUG}" \
    -q "${QUIET}" \
    -p "${PRODIGAL}"
  fi

}

# MAIN ------------------------------------------------------------------------

# main; Main program.
main() { 

  # Check options
  check_opt

  # Run Prodigal if input is fna
  if [[ ! -z "${FNA_FILE}" ]]; then
    info "Predicting protein-coding genes with prodigal\n"
    prodigal_wrapper
  fi

  # Run HMMER
  info "Scanning ${FASTA_NAME} for defence system proteins\n"
  hmmsearch_wrapper

  # Run padloc.R
  info "Searching ${FASTA_NAME} for defence systems\n"
  padloc_wrapper

}

# Run main
main
