#!/bin/bash
# LABEL - Lineage Assignment By Extended Learning
# Predicts the clade for a given nucleotide sequence & gene.

# Affiliation: Centers for Disease Control & Prevention
# Author: Samuel S. Shepard et al. (sshepard@cdc.gov)

PROGRAM="LABEL"
YEAR="2025"
VERSION="v0.7.1"
AUTHOR="Samuel S. Shepard"
EMAIL="vfn4@cdc.gov"
AFFIL="Centers for Disease Control & Prevention"

################
# INSTALLATION #
################
# Set manual base path or allow script to locate its current folder automatically.
# The resource folder will be expected to be in the same folder.
# Idea courtesy: stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in
bpath=
if [ "$bpath" == "" ]; then
    bpath=$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)
fi
resources=LABEL_RES

## SHELL VARS ##
LANG=POSIX
TAB=$'\t'
shopt -u nocaseglob
shopt -u nocasematch
################
################

function die() {
    echo -e "$PROGRAM ERROR:\t$1" 1>&2
    exit 1
}

function echoerr() {
    echo -e "$1" 1>&2
}

function echoerrn() {
    echo -n "$1" 1>&2
}

function cd_or_die() {
    local p=$1
    cd "$p" || die "Cannot cd to '$p', aborting."
}

function print_usage() {
    echoerr "\n$PROGRAM $VERSION, updated $YEAR\n$AUTHOR ($EMAIL), $AFFIL"
    echoerr "Usage:\n\t$(basename $PROGRAM) [-E C_OPT] [-W WRK_PATH|-O OUT_PATH] [-TRD|-S] [-L LIN_PATH] <nts.fasta> <project> <Module:H5,H9,etc.>"
    echoerr "\t\t-T\tDo TRAINING again instead of using classifier files."
    echoerr "\t\t-E\tSGE clustering option. Use 1 or 2 for SGE with array jobs, else local."
    echoerr "\t\t-R\tNo RECURSIVE prediction. Limits scope, useful with -L option."
    echoerr "\t\t-D\tNo DELETION of extra intermediary files."
    echoerr "\t\t-S\tShow available protein modules."
    echoerr "\t\t-W\tWeb-server mode: requires ABSOLUTE path to WRITABLE working directory."
    echoerr "\t\t-O\tOutput directory path, do not use with web mode."
    echoerr "Example: $0 -C gisaid_H5N1.fa Bird_Flu H5\n"
}

# Idea courtesy: steve-parker.org/sh/exitcodes.shtml
function err_test() {
    if [ "$1" -ne "0" ]; then
        echoerr ""
        if [ $# -eq "1" ]; then
            echoerr "$PROGRAM ERROR: operations have been ABORTED!"
        else
            echoerr "$PROGRAM ERROR ($2): operations have been ABORTED!"
        fi
        if [ $NO_DELETE -eq 0 -a -d "$ppath" -a $DO_WEB -eq 0 ]; then
            rm -rf "$ppath"
        fi
        exit 1
    fi
}

# Idea courtesy: stackoverflow.com/questions/592620/check-if-a-program-exists-from-a-bash-script
function check_prgm() {
    command -v "$1" > /dev/null 2>&1 || die "Program '$1' not found, please check your PATH or install it."
}

# PROCESS BASH OPTIONS #
LOCAL_PROC=${IFX_LOCAL_PROCS:-16} # MAX CONCURRENT PARALLEL
GRID_PROC=128                     # MAX CONCURRENT GRID PROCESSORS
DO_TRAIN=0                        # Use original data for training.
NO_RECURSION=0                    # Do not use recursion.
NO_DELETE=0                       # Do not clean up extra data.
DO_WEB=0                          # Web-server mode
DO_OUT=0
LINEAGE_PATH=""
OUTPUT_DIR=$(pwd)
USE_SGE=0     # use SGE loosely
CLUSTER_OPT=0 # clustering option
BASH=bash

# Check for programs
check_prgm cut
check_prgm paste
check_prgm perl
check_prgm /usr/bin/env

while getopts "GE:TACRDSW:L:O:h" option; do
    case $option in
        E)
            CLUSTER_OPT=$OPTARG
            ;;
        T)
            DO_TRAIN=1
            ;;
        R)
            NO_RECURSION=1
            ;;
        D)
            NO_DELETE=1
            ;;
        L)
            LINEAGE_PATH=$OPTARG
            ;;
        O)
            DO_OUT=1
            OUTPUT_DIR=$OPTARG
            if [ ! -w "$OUTPUT_DIR" ]; then
                die "output directory '$OUTPUT_DIR' not writable."
            fi
            ;;
        S)
            ls "$bpath"/$resources/training_data/
            exit 0
            ;;
        W)
            DO_WEB=1
            tpath=$OPTARG
            if [ ! -w "$tpath" ]; then
                die "output directory '$tpath' not writable."
            fi
            ;;
        h | *)
            print_usage
            exit 0
            ;;
    esac
done

if [ $# -lt $((OPTIND + 2)) ]; then
    print_usage
    exit 1
fi

if [ $DO_WEB -eq 1 -a $DO_OUT -eq 1 ]; then
    print_usage
    exit 1
fi

SEQ_LIMIT=100
if [ "$CLUSTER_OPT" -gt 0 ]; then
    USE_SGE=1
fi

# VARIABLES & PATHS #
args=("$@")
inputFasta=${args[$OPTIND - 1]}
project=${args[$OPTIND]}
module=${args[$OPTIND + 1]}
owd=$(pwd)
bin="$bpath/$resources/scripts"
vendorp=$bpath/$resources/third_party

function set_bin() {
    unalias "$1" 2> /dev/null

    command -v "$1" 2> /dev/null || {
        local cmd="$vendorp/${1}"
        if [ -x "$cmd" ]; then
            echo "$cmd"
        else
            local target="$(uname -s)"
            [[ "$target" == "Linux" ]] && target+="_$(uname -m)"

            cmd+="_$target"
            if [ -x "$cmd" ]; then
                echo "$cmd"
            else
                die "Could could not find program: $1"
            fi
        fi
    }
}
export -f set_bin

PARALLEL=$(set_bin parallel)
hmmscore=$(set_bin hmmscore)
shogun=$(set_bin shogun)

if [ "$DO_WEB" -eq "0" ]; then
    tpath="$bpath"/$resources/test_data
    ppath="$bpath"/$resources/test_data/${project}
    check_prgm zip
else
    ppath="$tpath"/${project}
fi

[ -d "$ppath" ] && die "Project currently in use. See: '$ppath'" || mkdir -p "$ppath"

if [ "$LINEAGE_PATH" != "" ]; then
    tnpath="$bpath"/$resources/training_data/$module/$LINEAGE_PATH
    mpath="$tnpath"
    grouping=$LINEAGE_PATH

else
    tnpath="$bpath"/$resources/training_data/$module
    mpath="$tnpath"
    grouping=$module
fi
level=$(basename "$grouping")

[ ! -d "$tnpath" ] && die "module '$module' does not exist. Try: LABEL -S"

# FNC - DO_ANALYSIS #
# Main recursive function for prediction at each level.
function doAnalysis() {
    local ppath=$1
    local mpath=$2
    local tnpath=$3
    local grouping=$4
    local level=$5
    local script="$ppath"/${level}.tmp
    local m
    local check
    local c

    # Concurrency limiting courtesy "tangens"
    rm "$ppath/${project}.tab.tmp" > /dev/null 2>&1
    [ -d "$mpath"/x-rev -a ! -d "$ppath"/x-rev ] && mkdir "$ppath"/x-rev

    local HMMSCORE_OPTS=""
    if [ -r "$mpath"/null.mod ]; then
        local -a modList=("$mpath"/null.mod "$mpath"/*hmm.mod)
        local -a modNames=($(for m in "${modList[@]}"; do basename "$m"; done))
        HMMSCORE_OPTS=" -subtract_null 0"
    elif [ -d "$mpath"/x-rev ]; then
        local -a modList=("$mpath"/*hmm.mod "$mpath"/x-rev/*hmm.mod)
        local -a modNames=($(for m in "$mpath"/*hmm.mod; do basename "$m"; done) $(for m in "$mpath"/x-rev/*hmm.mod; do echo x-rev/$(basename "$m"); done))
        HMMSCORE_OPTS=" -dpstyle 1 -subtract_null 1"
    else
        local -a modList=("$mpath"/*hmm.mod)
        local -a modNames=($(for m in "${modList[@]}"; do basename "$m"; done))
    fi

    # number of modules, sequences, and groups
    local mods=${#modList[@]}
    local seqs=$(grep '>' "$ppath/${project}_${level}.fas" -c)
    local procs=$LOCAL_PROC
    local doGrid=0
    if [ "$USE_SGE" -eq 1 -a "$seqs" -ge "$SEQ_LIMIT" ]; then
        procs=$GRID_PROC
        doGrid=1
    fi

    local groups=$((seqs / 100 + 1))
    local capacity=$((procs / mods + 1))
    [ "$groups" -gt "$capacity" ] && groups=$capacity
    local partitions=$((groups * mods))

    cat > "${script}.sh" << EOL
#!/bin/bash
LANG=$LANG
shopt -u nocaseglob;shopt -u nocasematch

if [ "\$#" -eq "1" ];then
	ID=\$1
else
	ID=\$SGE_TASK_ID
fi

ID=\$((ID - 1))
m=\$((ID / $groups ))
leaf=\$(printf %04d \$((ID % $groups + 1)))
mods=(${modNames[@]})
mod="$mpath"/"\${mods[\$m]}"

if [[ "\$(dirname "\${mods[\$m]}")" == "x-rev" ]];then
	db="../leaf_\${leaf}.tmpp"
	workdir="$ppath"/x-rev
else
	db=leaf_\${leaf}.tmpp
	workdir="$ppath"
fi

run="\$(basename "\$mod" .mod)_\$leaf"
cd "\$workdir"
"$hmmscore" \$run -db "\$db" -modelfile "\$mod" $HMMSCORE_OPTS

EOL
    chmod 755 "${script}.sh"
    "$bin"/interleavedSamples.pl -X tmpp -G $groups "$ppath/${project}_${level}.fas" "$ppath/leaf" > /dev/null 2>&1
    if [ "$doGrid" -eq "1" ]; then
        qsub $IRMA_QUEUE -t 1-$partitions:1 -N "N${project}-${level}" -sync y -j y -o "${script}.o" "${script}.sh" > /dev/null 2>&1 \
            && rm "${script}.o" \
            || {
                ((DO_WEB)) || {
                    cat "$script.o"
                    echoerr "\n\nWARNING: qsub of '$project-$level' failed. Switching to master node.\n\n"
                }
                "$PARALLEL" --nn --workdir "$ppath" -j "$LOCAL_PROC" -q $BASH "${script}.sh" {} ::: $(seq $partitions) > "$script.o" 2>&1 \
                    && rm "${script}.o" \
                    || {
                        cat "$script.o"
                        die "GNU Parallel failed to run analysis."
                    }
            }
    else
        "$PARALLEL" --nn --workdir "$ppath" -j "$LOCAL_PROC" -q $BASH "${script}.sh" {} ::: $(seq $partitions) > "$script.o" 2>&1 \
            && rm "${script}.o" \
            || {
                cat "$script.o"
                die "GNU Parallel failed to run analysis."
            }
    fi
    ((DO_WEB)) || echoerrn '...'

    local mod=""
    local run=""
    local pat=""
    for mod in "${modList[@]}"; do
        run=$(basename "$mod" .mod)
        pat=$(basename "$(dirname "$mod")")

        if [[ "$pat" == "x-rev" ]]; then
            "$bin"/parseScores.pl "$ppath/x-rev/${run}_"????.dist > "$ppath/x-rev/${run}.tab" \
                && rm "$ppath/x-rev/${run}_"????.dist
        else
            "$bin"/parseScores.pl "$ppath/${run}_"????.dist > "$ppath/${run}.tab" \
                && rm "$ppath/${run}_"????.dist
        fi
    done
    rm "$ppath/$level.tmp.sh" "$ppath"/*.tmpp "$ppath"/null_????.dist > /dev/null 2>&1

    local -a OPTS=(-Q -S "$shogun")
    [ -r "$ppath/null.tab" ] && OPTS+=(-N "$ppath/null.tab")
    [ "$DO_TRAIN" -eq "1" ] && OPTS+=(-R)
    [ -r "$tnpath/x-filter.txt" ] && OPTS+=(-D "$tnpath/x-filter.txt")
    [ -d "$ppath"/x-rev ] && OPTS+=(-C "$ppath/x-rev" -F 3)

    "$bin"/doLABELlevel.pl "$ppath" "$tnpath" "${OPTS[@]}" 2> "$ppath/${project}.log"
    err_test $? $LINENO

    check=$(ls -d "$mpath"/c-* 2> /dev/null | wc -w)
    if [ "$check" -ne 0 -a "$NO_RECURSION" -eq "0" ]; then
        for c in "$mpath"/c-*; do
            c=$(basename "$c")
            check=$(cut -f2 "$ppath"/LEVEL_result.tab | grep "^${c}$" -c)
            if [ "$check" -ne "0" ]; then
                mkdir -p "$ppath/$c"
                grep "${TAB}${c}$" "$ppath"/LEVEL_result.tab \
                    | "$bin"/fastaExtractor.pl "$ppath/${project}_${level}.fas" -F 1 > "$ppath/$c/${project}_$c.fas"

                if [ "$grouping" == "$module" ]; then
                    doAnalysis "$ppath/$c" "$mpath/$c" "$tnpath/$c" "$c" "$c" &
                else
                    doAnalysis "$ppath/$c" "$mpath/$c" "$tnpath/$c" "$grouping/$c" "$c" &
                fi
            fi
        done
    fi
    wait

    if [ "$NO_DELETE" -eq "0" ]; then
        # Condtionals get added to return value of the function
        [ -d "$ppath"/x-rev ] && rm -rf "$ppath"/x-rev
        [ -r "$ppath"/null.tab ] && rm -rf "$ppath"/null.tab
        rm "$ppath"/*.o > /dev/null 2>&1
        rm "$ppath"/*_hmm.tab "$ppath"/*dat "$ppath/${project}.log"
        rm "$ppath/${project}_${level}.fas"
    fi
}

((DO_WEB)) || echoerr "LABEL: Processing input data."
if [ -d "$inputFasta" ]; then
    die "'$inputFasta' was a directory."
elif [ ! -r "$inputFasta" ]; then
    die "'$inputFasta' is missing or had unreadable permissions."
elif [ ! -s "$inputFasta" -a ! -p "$inputFasta" ]; then
    die "'$inputFasta' is an empty input file."
else
    # PROCESS INPUT #
    if [ -p "$inputFasta" ]; then
        IS_PIPE=1
        DO_WEB=1
    else
        IS_PIPE=0
    fi

    "$bin"/stripSequences.pl -F "$inputFasta" '0-9 :~.-' \
        | "$bin"/removeByRedundantHeader.pl > "$ppath/${project}_$level.fas" \
        && cp "$ppath/${project}_$level.fas" "$ppath/${project}_$level.fas2" \
        || die "Failed to process '$inputFasta'."

    if [ "$USE_SGE" -eq "1" ]; then
        AVG_LEN=$(($(grep -v '>' "$ppath/${project}_$level.fas" | wc -c) / $(grep '>' "$ppath/${project}_$level.fas" -c) + 1))
        if [ "$AVG_LEN" -le 300 ]; then
            SEQ_LIMIT=1600
        elif [ "$AVG_LEN" -le 1000 ]; then
            SEQ_LIMIT=400
        elif [ "$AVG_LEN" -le 1800 ]; then
            SEQ_LIMIT=100
        else
            SEQ_LIMIT=50
        fi
    fi
fi

# START MAIN #
cd_or_die "$ppath"
((DO_WEB)) || echoerrn "LABEL: Performing lineage/clade prediction."
doAnalysis "$ppath" "$mpath" "$tnpath" "$grouping" "$level" \
    && "$bin"/findByNamePostOrder.pl -P "$ppath" LEVEL_result.tab > "$ppath/${project}_recursive.tmp" \
    && mv "$ppath/${project}_$level.fas2" "$ppath/${project}_$level.fas" \
    && "$bin"/finalizeResults.pl "$ppath/${project}_recursive.tmp" "$ppath/${project}_${level}.fas" "$ppath/$project" \
    && mv "$ppath/${project}_${level}.fas.final" "$ppath/${project}_${level}.fas" \
    || die "Analysis or post-processing failed."

if [ $DO_WEB -eq 0 ]; then
    echoerr ""
    cat "$ppath/${project}_final.txt" >&2
fi

cd_or_die "$owd"
((DO_WEB)) || cp "$ppath/${project}_final.txt" "$OUTPUT_DIR"
cp "$bpath"/README.md "$ppath"
((DO_WEB)) || "$bin"/evaluateResults.pl -H -S "$ppath/${project}_final.txt" >&2

if [ $NO_DELETE -eq 0 ]; then
    rm "$ppath"/*tmp
fi

# FASTA MANIPULATION #
mkdir -p "$ppath"/FASTA
mv "$ppath/${project}_${level}.fas" "$ppath/FASTA/${project}_predictions.fas" \
    && "$bin"/reviseTaxa.pl "$ppath/FASTA/${project}_predictions.fas" -C -D > "$ppath/FASTA/${project}_reannotated.fas" \
    && "$bin"/partitionTaxa.pl "$ppath/FASTA/${project}_reannotated.fas" "$ppath/FASTA/" -P "${project}_clade_" \
    && cat "$ppath/FASTA/${project}_clade"* > "$ppath/FASTA/${project}_reannotated.fas" \
    || die "FASTA input processing failed."

# CLEAN UP #
if [ "$IS_PIPE" -eq "1" ]; then
    cat "$ppath/${project}_final.tab"
    rm -rf "$ppath" &
elif [ "${DO_WEB:-0}" -eq 1 ]; then
    echo "$ppath"
else
    cd_or_die "$tpath"
    zip -q -r "${project}.zip" "$project"
    cd_or_die "$owd"
    mv "$tpath/${project}.zip" "$OUTPUT_DIR"
    rm -rf "$ppath"
fi
