#!/bin/bash

usage()
{
        echo "  #############################
        earlGrey version 7.1.0 (Library Construction Only)
        Required Parameters:
                -g == genome.fasta
                -s == species name
                -o == output directory

        Optional Parameters:
                -t == Number of Threads (DO NOT specify more than are available)
                -r == RepeatMasker search term (e.g arthropoda/eukarya)
                -l == Starting consensus library for an initial mask (in fasta format)
                -i == Number of Iterations to BLAST, Extract, Extend (Default: 10)
                -f == Number flanking basepairs to extract (Default: 1000)
                -n == Max number of sequences used to generate consensus sequences (Default: 20)
                -a == minimum number of sequences required to build a consensus sequence (Default: 3)
                -h == Show help

        Example Usage:

        earlGrey -g bombyxMori.fasta -s bombyxMori -o /home/toby/bombyxMori/repeatAnnotation/ -t 16

        Queries can be sent to:
        tobias.baril[at]unine.ch

        Please make use of the GitHub Issues and Discussion Tabs at: https://github.com/TobyBaril/EarlGrey
        #############################"
}

# Subprocess Make Directories #
makeDirectory()
{
        directory=$(realpath ${directory})
        mkdir -p ${directory}/${species}_EarlGrey/ && OUTDIR=${directory}/${species}_EarlGrey
        if [ ! -z "$RepSpec" ] || [ ! -z "$startCust" ] ; then
                mkdir -p $OUTDIR/${species}_RepeatMasker/
        fi
        mkdir -p $OUTDIR/${species}_Database/
        mkdir -p $OUTDIR/${species}_RepeatModeler/
        mkdir -p $OUTDIR/${species}_strainer/
        mkdir -p ${OUTDIR}/${species}_summaryFiles/
}

# Subprocess PrepGenome #
prepGenome()
{
        if [ ! -L ${genome} ]; then
                genome=$(realpath ${genome})
        fi
        if [ -L $genome ]; then
                genome=$(realpath -s ${genome})
        fi
        if [ ! -f ${genome}.prep ] || [ ! -f ${genome}.dict ]; then
                cp ${genome} ${genome}.bak && gzip -f ${genome}.bak 
                sed '/>/ s/[[:space:]].*//g; /^$/d' ${genome} > ${genome}.tmp
                ${SCRIPT_DIR}/headSwap.sh -i ${genome}.tmp -o ${genome}.prep && rm ${genome}.tmp
                mv ${genome}.tmp.dict ${genome}.dict
                sed -i.bak '/^>/! s/[DVHBPE]/N/g' ${genome}.prep
                dict=${genome}.dict
                genOrig=${genome}
                genome=${genome}.prep
        else
                dict=${genome}.dict
                genOrig=${genome}
                genome=${genome}.prep
        fi
}

# Subprocess getRepeatMaskerFasta
# Generate a copy of the RepeatMasker library subset used for the initial TE mask in FASTA format (only used if a RepeatMasker run is specified)
getRepeatMaskerFasta()
{
	if [[ $RepSpec = *" "* ]]; then
		echo "ERROR: You have entered a species name that contains a space, please use the NCBI TaxID rather than name. E.G In place of \"Homo sapiens\" use \"9606\""
		exit 2
	fi
	if [[ $(which RepeatMasker) == *"bin"* ]]; then
		libpath="$(which RepeatMasker | sed 's|bin/RepeatMasker|share/RepeatMasker/Libraries/famdb/|')"
	else
		libpath="$(which RepeatMasker | sed 's|/[^/]*$||g')/Libraries/famdb/"
	fi
	if [[ $(which RepeatMasker) == *"bin"* ]]; then
		PATH=$PATH:"$(which RepeatMasker | sed 's|bin/RepeatMasker|share/RepeatMasker/|')"
	fi
	mkdir -p $OUTDIR/${species}_Curated_Library/
	famdb.py -i $libpath families -f fasta_name --include-class-in-name -a -d --curated $RepSpec > ${OUTDIR}/${species}_Curated_Library/${RepSpec}.RepeatMasker.lib
	RepSub=${OUTDIR}/${species}_Curated_Library/${RepSpec}.RepeatMasker.lib
}

# Subprocess firstMask
# Run the initial RepeatMasker run with the specified species subset (only used if a RepeatMasker run is specified)
firstMask()
{
        cd ${OUTDIR}/${species}_RepeatMasker
        rmthreads=$(expr $ProcNum / 4)
        RepeatMasker -species $RepSpec -no_is -lcambig -s -a -pa $rmthreads -dir $OUTDIR/${species}_RepeatMasker $genome
        if [ ! -f ${OUTDIR}/${species}_RepeatMasker/*.masked ]; then
                echo "ERROR: RepeatMasker failed, please check logs. This is likely because of an invalid species search term, if issue persists please use NCBI Taxids (E.G Drosophila is replaced with 7125)"; exit 2
        fi
}

# Subprocess firstMaskCustomLib
# Run the initial RepeatMasker run with the specific fasta consensus library
firstMaskCustomLib()
{
        cd ${OUTDIR}/${species}_RepeatMasker
        rmthreads=$(expr $ProcNum / 4)
        RepeatMasker -lib ${startCust} -no_is -lcambig -s -a -pa $rmthreads -dir $OUTDIR/${species}_RepeatMasker $genome
        RepSub=${startCust}
        if [ ! -f ${OUTDIR}/${species}_RepeatMasker/*.masked ]; then
                echo "ERROR: RepeatMasker failed, please check logs. This is likely because of an invalid custom consensus file, check the fasta file looks as expected"; exit 2
        fi
}


# Subprocess buildDB 
# if masked genome exists, build database from this. If not, build database from original input genome
buildDB()
{
        if [ -f ${OUTDIR}/${species}_RepeatMasker/*.masked ]; then
                cd ${OUTDIR}/${species}_Database
                BuildDatabase -name ${species} ${OUTDIR}/${species}_RepeatMasker/*.masked
        else
                cd ${OUTDIR}/${species}_Database
                BuildDatabase -name ${species} $genome
        fi
}

# Subprocess deNovo1
# Run the initial de novo annotation run with RepeatModeler2, with error catches for weird genome sizes
deNovo1()
{
        cd ${OUTDIR}/${species}_RepeatModeler
        RepeatModeler -threads ${ProcNum} -database ${OUTDIR}/${species}_Database/${species}
        if [ ! -e ${OUTDIR}/${species}_Database/${species}-families.fa ]; then
                echo "ERROR: RepeatModeler Failed, Retrying with limit set as Round 5"
                RepeatModeler -threads ${ProcNum} -database ${OUTDIR}/${species}_Database/${species} -genomeSampleSizeMax 81000000
                if [ ! -e ${OUTDIR}/${species}_Database/${species}-families.fa ]; then
                        echo "ERROR: RepeatModeler Failed, Retrying with limit set as Round 4"
                        RepeatModeler -threads ${ProcNum} -database ${OUTDIR}/${species}_Database/${species} -genomeSampleSizeMax 27000000
                        if [ ! -e ${OUTDIR}/${species}_Database/${species}-families.fa ]; then
                                echo "ERROR: RepeatModeler Failed"
                                exit 2
                        fi
                fi
        fi
}

# Subprocess strainer # CHECK FILE STRUCTURE FOR EARL GREY RUN
# contains the BLAST, Extract, Extend, Trim pipeline from James Galbraith
strainer()
{
        cd ${OUTDIR}/${species}_strainer/
	if [ "$ProcNum" -ge 4 ]; then
		strainthreads=$(expr $ProcNum / 4)
	else
		strainthreads=$ProcNum
	fi
        ${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh -g $genome -l ${OUTDIR}/${species}_Database/${species}-families.fa -t ${strainthreads} -f $Flank -r $num -n $no_seq -m $min_seq
        latestFile="$(realpath $(ls -td -- ${OUTDIR}/${species}_strainer/*/ | head -n 1))/${species}-families.fa.strained"
        cp $latestFile ${OUTDIR}/${species}_strainer/
}

# Subprocess strainerResume
# resumes interrupted TEstrainer runs
strainerResume() {
    cd ${OUTDIR}/${species}_strainer/
    strainDataDir=$(find . -maxdepth 1 -type d -name "TS_${species}-families.fa_*")
    if [ ! $(echo "$strainDataDir" | wc -l) -eq 1 ]; then
        echo "ERROR: There are more TEstrainer directories than expected present"
        exit 2
    fi
    if [ "$ProcNum" -ge 4 ]; then
        strainthreads=$(expr $ProcNum / 4)
    else
        strainthreads=$ProcNum
    fi
    ${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh -g $genome -l ${OUTDIR}/${species}_Database/${species}-families.fa -t ${strainthreads} -f $Flank -r $num -n $no_seq -m $min_seq -d ${strainDataDir} -R
    latestFile="$(realpath $(ls -td -- ${OUTDIR}/${species}_strainer/*/ | head -n 1))/${species}-families.fa.strained"
    cp $latestFile ${OUTDIR}/${species}_strainer/
}

# Subprocess sweepUp
# Puts required files into a summary folder
sweepUp()
{
        cd ${OUTDIR}/${species}_summaryFiles/
        if [ -s ${latestFile} ]; then
                cp $latestFile ${OUTDIR}/${species}_summaryFiles/ 
        else
                echo "ERROR: TEstrainer failed to generate a TE library, please check logs" && exit 1
        fi
}

# Subprocess runningTea 
runningTea()
{
        echo "    
              )  (
             (   ) )
             ) ( (
           _______)_
        .-'---------|  
       ( C|/\/\/\/\/|
        '-./\/\/\/\/|
          '_________'
           '-------'
        <<< $stage >>>"
}

# Subprocess GetTime
convertsecs() 
{
        h=$(bc <<< "${1}/3600")
        m=$(bc <<< "(${1}%3600)/60")
        s=$(bc <<< "${1}%60")
        printf "%02d:%02d:%05.2f\n" $h $m $s
}

# Subprocess Checks
Checks()
{
        if [ -z "$genome" ] || [ -z "$species" ] || [ -z "$directory" ] ; then
                usage; exit 1
        fi

        if [ -z "$ProcNum" ] ; then
                ProcNum=1; echo "$ProcNum Cores Will Be Used"
        else
                echo "$ProcNum Cores Will Be Used"
        fi

        if [ -z "$RepSpec" ] && [ -z "$startCust" ] ; then
                echo "RepeatMasker species not specified, running Earl Grey without an initial mask with known repeats"
        else
                echo "Running Earl Grey with an intial mask with known repeats"
        fi

        if [ -z "$num" ] ; then
                num=10; echo "De Novo Sequences Will Be Extended Through a Maximum of $num Iterations"
        else
                echo "De Novo Sequences Will Be Extended Through a Maximum of $num Iterations"
        fi

        if [ -z "$no_seq" ] ; then
                no_seq=20; echo "$no_seq sequences will be used in BEAT consensus generation"
        else
                echo "$no_seq sequences will be used in BEAT consensus generation"
        fi

        if [ -z "$Flank" ]; then
                Flank=1000; echo "Blast, Extend, Align, Trim Process Will Add 1000bp to Each End in Each Iteration"
        else
                echo "Blast, Extract, Extend, Trim Process Will Add $Flank to Each End in Each Iteration"
        fi

        if [ -z "$min_seq" ]; then
                min_seq=3; echo "Blast, Extend, Align, Trim Process Will Require 3 Sequences to Generate a New Consensus Sequence"
        else
                echo "Blast, Extend, Align, Trim Process Will Require $min_seq Sequences to Generate a New Consensus Sequence"
        fi

        if [ ! -d "$SCRIPT_DIR" ]; then
                echo "ERROR: Script directory variable not set, please run the configure script in the Earl Grey directory before attempting to run Earl Grey"; exit 1
        fi

        if [ ! -d "$SCRIPT_DIR/TEstrainer/" ]; then
                echo "ERROR: teStrainer module not found, please check all modules are present and run the configure script in the Earl Grey directory before attempting to run Earl Grey"; exit 1
        fi

	echo "Please cite the following paper when using this software:"
	echo "Baril, T., Galbraith, J. and Hayward, A., 2024. Earl Grey: a fully automated user-friendly transposable element annotation and analysis pipeline. Molecular Biology and Evolution, 41(4), p.msae068."
}

# Subprocess Dfam Checks
dfamCheck()
{
	# Dfam 3.9 checks
	library_path=$(which RepeatMasker | sed 's|/bin/RepeatMasker|/share/RepeatMasker/Libraries/famdb|g')
	expected_file="${library_path}/dfam39_full.0.h5"
	file_count=$(find "$library_path" -maxdepth 1 -type f | wc -l)
	if [ $file_count -eq 2 ] && [ -f $expected_file ] && [ -f ${library_path}/rmlib.config ] ; then
    		echo "WARNING: Earl Grey v6.1.0 has updated to Dfam v3.9." 
    		echo "Before using Earl Grey, you MUST download the required partitions from Dfam (https://dfam.org/releases/current/families/FamDB/)"
    		echo "These must be located at ${library_path}/. Then, you must reconfigure RepeatMasker before using Earl Grey."
    		echo "I have written the required commands for you to faciliate the process."
    		echo "Please follow these carefully to ensure successful configuration of RepeatMasker with Dfam 3.9."
    		echo "As a note, these DB paritions are BIG, so only download the ones you need for your studies (or, if you do need all of them, make sure you have the space for them)"
    		echo "########################################################"
    		echo "A script for modification and automation of these steps has been generated in $(pwd)/configure_dfam39.sh"
    		echo "Please open the script in your preferred text editor (e.g. nano or vim) and change the partition numbers in [] to those that you would like to download"
    		echo "then, make the script executable (chmod +x $(pwd)/configure_dfam39.sh) and run it: ./configure_dfam39.sh"
    		echo "Alternatively, you can run the steps below one-by-one to achieve the same outcome:"
    		echo "########################################################"
    		echo ""
    		if [ -f "$(pwd)/configure_dfam39.sh" ]; then
        		rm $(pwd)/configure_dfam39.sh
    		fi
    		echo "# first, change directory to the famdb library location" | tee -a $(pwd)/configure_dfam39.sh
    		echo "cd ${library_path}/" | tee -a $(pwd)/configure_dfam39.sh
    		echo "" | tee -a $(pwd)/configure_dfam39.sh
    		echo "# download the partitions you require from Dfam 3.9. In the below, change the numbers or range inside the square brackets to choose your subsets." | tee -a $(pwd)/configure_dfam39.sh
    		echo "# e.g. to download partitions 0 to 10: [0-10]; or to download partitions 3,5, and 7: [3,5,7]; [0-16] is ALL PARTITIONS" | tee -a $(pwd)/configure_dfam39.sh
    		echo "curl -o 'dfam39_full.#1.h5.gz' 'https://dfam.org/releases/current/families/FamDB/dfam39_full.[0-16].h5.gz'" | tee -a $(pwd)/configure_dfam39.sh
    		echo "" | tee -a $(pwd)/configure_dfam39.sh
    		echo "# decompress Dfam 3.9 paritions" | tee -a $(pwd)/configure_dfam39.sh
    		echo "gunzip *.gz" | tee -a $(pwd)/configure_dfam39.sh
    		echo "" | tee -a $(pwd)/configure_dfam39.sh
    		echo "# move up to RepeatMasker main directory" | tee -a $(pwd)/configure_dfam39.sh
    		echo "cd $(which RepeatMasker | sed 's|/bin/RepeatMasker|/share/RepeatMasker/|g')" | tee -a $(pwd)/configure_dfam39.sh
    		echo "" | tee -a $(pwd)/configure_dfam39.sh
    		#echo "# Replace famdb with version 2.0.1 (required for the Dfam 3.9)" | tee -a $(pwd)/configure_dfam39.sh
    		#echo "wget https://github.com/Dfam-consortium/FamDB/archive/refs/tags/2.0.1.tar.gz && tar --wildcards -zxvf 2.0.1.tar.gz FamDB-2.0.1/famdb.py FamDB-2.0.1/famdb_*.py" | tee -a $(pwd)/configure_dfam39.sh
    		#echo "mv FamDB-2.0.1/*.py . && rm -rf 2.0.1.tar.gz FamDB-2.0.1/" | tee -a $(pwd)/configure_dfam39.sh
    		#echo "" | tee -a $(pwd)/configure_dfam39.sh
    		echo "# save the min_init partition as a backup, just in case!" | tee -a $(pwd)/configure_dfam39.sh
    		echo "mv ${library_path}/min_init.0.h5 ${library_path}/min_init.0.h5.bak" | tee -a $(pwd)/configure_dfam39.sh
    		echo "" | tee -a $(pwd)/configure_dfam39.sh
    		echo "# Rerun RepeatMasker configuration" | tee -a $(pwd)/configure_dfam39.sh
    		echo "perl ./configure -libdir $(which RepeatMasker | sed 's|/bin/RepeatMasker|/share/RepeatMasker/Libraries/|g') \
    		-trf_prgm $(which RepeatMasker | sed 's|/bin/RepeatMasker|/bin/trf|g') \
   			-rmblast_dir $(which RepeatMasker | sed 's|/bin/RepeatMasker|/bin|g') \
    		-hmmer_dir $(which RepeatMasker | sed 's|/bin/RepeatMasker|/bin|g') \
    		-abblast_dir $(which RepeatMasker | sed 's|/bin/RepeatMasker|/bin|g') \
    		-crossmatch_dir $(which RepeatMasker | sed 's|/bin/RepeatMasker|/bin|g') \
    		-default_search_engine rmblast" | tee -a $(pwd)/configure_dfam39.sh
    		echo "" | tee -a $(pwd)/configure_dfam39.sh
    		sed -i.bak '/^perl/  s/\s\s*/ /g' $(pwd)/configure_dfam39.sh
    		echo "########################################################"
    		echo ""
    		echo "Following these steps should enable you to use Dfam 3.9 with RepeatMasker and Earl Grey"
    		echo "You will not see this message again if configuration is successful"
			echo "If you only want partition 0, you do not have to do anything except rerun your Earl Grey command"
    		echo "If you are still having trouble, please open an issue on the Earl Grey github and we will try to help!"
			touch ${library_path}/.earlgrey.config.complete
    		exit 2
	fi
}

# Main #

while getopts g:s:o:t:f:i:r:l:n:a:h option
do
        case "${option}" 
                in
                g) genome=${OPTARG};;
                s) species=${OPTARG};;
                o) directory=${OPTARG};;
                t) ProcNum=${OPTARG};;
                f) Flank=${OPTARG};;
                i) num=${OPTARG};;
                r) RepSpec=${OPTARG};;
                l) startCust=${OPTARG};;
                n) no_seq=${OPTARG};;
                a) min_seq=${OPTARG};;
                h) usage; exit 0;; 
        esac
done

SECONDS=0

# Step 1 - set up the directories and make sure all modules are present

stage="Checking Parameters" && runningTea
SCRIPT_DIR=/opt/conda/conda-bld/earlgrey_1773324219515/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_plac/share/earlgrey-7.1.0-0/scripts/
dfamCheck
Checks
stage="Making Directories" && runningTea
makeDirectory

# Start Logs - only bother starting if everything is present and correct
exec > >(tee "${OUTDIR}/${species}EarlGrey.log") 2>&1
stage="Cleaning Genome" && runningTea
prepGenome
sleep 1

# Step 2 - initial annotation, either de novo, or if required an initial RepeatMasker followed by a de novo run
if [ -z "$RepSpec" ]; then
        sleep 1
else
        if [ ! -f ${OUTDIR}/${species}_RepeatMasker/*.masked ]; then
                stage="Getting RepeatMasker Sequences for $RepSpec and Saving as Fasta" && runningTea
                getRepeatMaskerFasta
                stage="Running Initial Mask with Known Repeats" && runningTea
                firstMask
        else
                stage="Genome has already been masked, skipping..." && runningTea
                getRepeatMaskerFasta
        fi
fi

if [ -z "$startCust" ]; then
        sleep 1
else
        if [ ! -f ${OUTDIR}/${species}_RepeatMasker/*.masked ]; then
                stage="Running Initial Mask with Known Repeats in Custom Library" && runningTea
                firstMaskCustomLib
        else
                stage="Genome has already been masked, skipping..." && runningTea
        fi
fi

# Step 3 - make a database from the genome, and then run a de novo TE annotation using RepeatModeler2
if [ ! -f ${OUTDIR}/${species}_Database/${species}-families.fa ]; then
        stage="Detecting Novel Repeats" && runningTea
        buildDB
        deNovo1
        sleep 1
else
        stage="De novo repeats have already been detected, skipping..." && runningTea
        sleep 1
fi

# Step 4 - run TEstrainer, which runs a BLAST, extract, extent, trim, on the the de novo repeat library to refine consensus sequences
if [ ! -s ${OUTDIR}/${species}_strainer/${species}-families.fa.strained ]; then
        stage="Straining TEs and Refining de novo Consensus Sequences" && runningTea
        if [ -f ${OUTDIR}/${species}_strainer/${species}-families.fa.strained ]; then
                rm -r ${OUTDIR}/${species}_strainer/${species}-families.fa.strained ${OUTDIR}/${species}_strainer/TS*
                strainer
        elif [ -d ${OUTDIR}/${species}_strainer/TS_${species}-families.fa_* ]; then
                stage="Resuming TEstrainer" && runningTea
                strainerResume
        else
                strainer
        fi
        if [ ! -s ${OUTDIR}/${species}_strainer/${species}-families.fa.strained ]; then
                echo "WARNING: TEstrainer failed to produce a strain file, please check the log file for more information. If you have run an intial mask with known repeats, this could be due RepeatModeler2 failing to identify any new repeats. Please check if this is expected."
        fi
        sleep 1
else
        stage="TEs already strained, skipping..." && runningTea
        latestFile=${OUTDIR}/${species}_strainer/${species}-families.fa.strained
        cp ${latestFile} ${OUTDIR}/${species}_summaryFiles/
        sleep 1
fi

# Stage 5 - Tidy up!
stage="Tidying Directories and Organising Important Files" && runningTea
sweepUp
sleep 1

# Stage 6
time=$(convertsecs $SECONDS)
stage="Done in $time" && runningTea
sleep 5

# Stage 10
stage="TE library in Standard Format Can Be Found in ${OUTDIR}/${species}_summaryFiles/" && runningTea
sleep 5
