#!/bin/bash

usage()
{
	echo "	#############################
	earlGrey version 7.1.0
	Required Parameters:
		-g == genome.fasta
		-s == species name
		-o == output directory

	Optional Parameters:
		-t == Number of Threads (DO NOT specify more than are available)
		-r == RepeatMasker search term (e.g arthropoda/eukarya)
		-l == Starting consensus library for an initial mask (in fasta format)
		-i == Number of Iterations to BLAST, Extract, Extend (Default: 10)
		-f == Number flanking basepairs to extract (Default: 1000)
		-c == Cluster TE library to reduce redundancy? (yes/no, Default: no)
		-m == Remove putative spurious TE annotations <100bp? (yes/no, Default: no)
		-d == Create soft-masked genome at the end? (yes/no, Default: no)
		-n == Max number of sequences used to generate consensus sequences (Default: 20)
		-a == minimum number of sequences required to build a consensus sequence (Default: 3)
		-e == Run HELIANO as an optional step to detect Helitrons (yes/no, Default: no)
		-h == Show help

	Example Usage:

	earlGrey -g bombyxMori.fasta -s bombyxMori -o /home/toby/bombyxMori/repeatAnnotation/ -t 16

	Queries can be sent to:
	tobias.baril[at]unine.ch

	Please make use of the GitHub Issues and Discussion Tabs at: https://github.com/TobyBaril/EarlGrey
	#############################"
}

# Subprocess Make Directories #
makeDirectory()
{
	directory=$(realpath ${directory})
	mkdir -p ${directory}/${species}_EarlGrey/ && OUTDIR=${directory}/${species}_EarlGrey
	if [ ! -z "$RepSpec" ] || [ ! -z "$startCust" ] ; then
		mkdir -p $OUTDIR/${species}_RepeatMasker/
	fi
	mkdir -p $OUTDIR/${species}_Database/
	mkdir -p $OUTDIR/${species}_RepeatModeler/
	mkdir -p $OUTDIR/${species}_strainer/
	mkdir -p $OUTDIR/${species}_RepeatMasker_Against_Custom_Library/
	mkdir -p $OUTDIR/${species}_RepeatLandscape/
	mkdir -p $OUTDIR/${species}_mergedRepeats/
	mkdir -p ${OUTDIR}/${species}_summaryFiles/
	if [ ! -z "$heli" ]; then
		mkdir -p ${OUTDIR}/${species}_heliano/
	fi
}

# Subprocess PrepGenome #
prepGenome()
{
	if [ ! -L ${genome} ]; then
		genome=$(realpath ${genome})
	fi
	if [ -L $genome ]; then
		genome=$(realpath -s ${genome})
	fi
	if [ ! -f ${genome}.prep ] || [ ! -f ${genome}.dict ]; then
		cp ${genome} ${genome}.bak && gzip -f ${genome}.bak 
		sed '/>/ s/[[:space:]].*//g; /^$/d' ${genome} > ${genome}.tmp
		${SCRIPT_DIR}/headSwap.sh -i ${genome}.tmp -o ${genome}.prep && rm ${genome}.tmp
		mv ${genome}.tmp.dict ${genome}.dict
		sed -i.bak '/^>/! s/[DVHBPE]/N/g' ${genome}.prep
		dict=${genome}.dict
		genOrig=${genome}
		genome=${genome}.prep
	else
		dict=${genome}.dict
		genOrig=${genome}
		genome=${genome}.prep
	fi
}

# Subprocess getRepeatMaskerFasta
# Generate a copy of the RepeatMasker library subset used for the initial TE mask in FASTA format (only used if a RepeatMasker run is specified)
getRepeatMaskerFasta()
{
	if [[ $RepSpec = *" "* ]]; then
		echo "ERROR: You have entered a species name that contains a space, please use the NCBI TaxID rather than name. E.G In place of \"Homo sapiens\" use \"9606\""
		exit 2
	fi
	if [[ $(which RepeatMasker) == *"bin"* ]]; then
		libpath="$(which RepeatMasker | sed 's|bin/RepeatMasker|share/RepeatMasker/Libraries/famdb/|')"
	else
		libpath="$(which RepeatMasker | sed 's|/[^/]*$||g')/Libraries/famdb/"
	fi
	if [[ $(which RepeatMasker) == *"bin"* ]]; then
		PATH=$PATH:"$(which RepeatMasker | sed 's|bin/RepeatMasker|share/RepeatMasker/|')"
	fi
	mkdir -p $OUTDIR/${species}_Curated_Library/
	famdb.py -i $libpath families -f fasta_name --include-class-in-name -a -d --curated $RepSpec > ${OUTDIR}/${species}_Curated_Library/${RepSpec}.RepeatMasker.lib
	RepSub=${OUTDIR}/${species}_Curated_Library/${RepSpec}.RepeatMasker.lib
}

# Subprocess firstMask
# Run the initial RepeatMasker run with the specified species subset (only used if a RepeatMasker run is specified)
firstMask()
{
	cd ${OUTDIR}/${species}_RepeatMasker
        rmthreads=$(expr $ProcNum / 4)
	RepeatMasker -species $RepSpec -no_is -lcambig -s -a -pa $rmthreads -dir $OUTDIR/${species}_RepeatMasker $genome
	if [ ! -f ${OUTDIR}/${species}_RepeatMasker/*.masked ]; then
		echo "ERROR: RepeatMasker failed, please check logs. This is likely because of an invalid species search term, if issue persists please use NCBI Taxids (E.G Drosophila is replaced with 7125)"; exit 2
	fi
}

# Subprocess firstMaskCustomLib
# Run the initial RepeatMasker run with the specific fasta consensus library
firstMaskCustomLib()
{
	cd ${OUTDIR}/${species}_RepeatMasker
	rmthreads=$(expr $ProcNum / 4)
	RepeatMasker -lib ${startCust} -no_is -lcambig -s -a -pa $rmthreads -dir $OUTDIR/${species}_RepeatMasker $genome
	RepSub=${startCust}
	if [ ! -f ${OUTDIR}/${species}_RepeatMasker/*.masked ]; then
		echo "ERROR: RepeatMasker failed, please check logs. This is likely because of an invalid custom consensus file, check the fasta file looks as expected"; exit 2
	fi
}


# Subprocess buildDB 
# if masked genome exists, build database from this. If not, build database from original input genome
buildDB()
{
	if [ -f ${OUTDIR}/${species}_RepeatMasker/*.masked ]; then
		cd ${OUTDIR}/${species}_Database
		BuildDatabase -name ${species} ${OUTDIR}/${species}_RepeatMasker/*.masked
	else
		cd ${OUTDIR}/${species}_Database
		BuildDatabase -name ${species} $genome
	fi
}

# Subprocess deNovo1
# Run the initial de novo annotation run with RepeatModeler2, with error catches for weird genome sizes
deNovo1()
{
	cd ${OUTDIR}/${species}_RepeatModeler
	RepeatModeler -threads ${ProcNum} -database ${OUTDIR}/${species}_Database/${species}
	if [ ! -e ${OUTDIR}/${species}_Database/${species}-families.fa ]; then
		echo "ERROR: RepeatModeler Failed, Retrying with limit set as Round 5"
		RepeatModeler -threads ${ProcNum} -database ${OUTDIR}/${species}_Database/${species} -genomeSampleSizeMax 81000000
		if [ ! -e ${OUTDIR}/${species}_Database/${species}-families.fa ]; then
			echo "ERROR: RepeatModeler Failed, Retrying with limit set as Round 4"
			RepeatModeler -threads ${ProcNum} -database ${OUTDIR}/${species}_Database/${species} -genomeSampleSizeMax 27000000
			if [ ! -e ${OUTDIR}/${species}_Database/${species}-families.fa ]; then
				echo "ERROR: RepeatModeler Failed"
				exit 2
			fi
		fi
	fi
}

# Subprocess strainer 
# contains the BLAST, Extract, Extend, Trim pipeline from James Galbraith
strainer()
{
	cd ${OUTDIR}/${species}_strainer/
	if [ "$ProcNum" -ge 4 ]; then
		strainthreads=$(expr $ProcNum / 4)
	else
		strainthreads=$ProcNum
	fi
	${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh -g $genome -l ${OUTDIR}/${species}_Database/${species}-families.fa -t ${strainthreads} -f $Flank -r $num -n $no_seq -m $min_seq
	latestFile="$(realpath $(ls -td -- ${OUTDIR}/${species}_strainer/*/ | head -n 1))/${species}-families.fa.strained"
	cp $latestFile ${OUTDIR}/${species}_strainer/
}

# Subprocess strainerResume
# resumes interrupted TEstrainer runs
strainerResume() {
	cd ${OUTDIR}/${species}_strainer/
	strainDataDir=$(find . -maxdepth 1 -type d -name "TS_${species}-families.fa_*")
	if [ ! $(echo "$strainDataDir" | wc -l) -eq 1 ]; then
		echo "ERROR: There are more TEstrainer directories than expected present"
		exit 2
	fi
	if [ "$ProcNum" -ge 4 ]; then
		strainthreads=$(expr $ProcNum / 4)
	else
		strainthreads=$ProcNum
	fi
	${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh -g $genome -l ${OUTDIR}/${species}_Database/${species}-families.fa -t ${strainthreads} -f $Flank -r $num -n $no_seq -m $min_seq -d ${strainDataDir} -R
	latestFile="$(realpath $(ls -td -- ${OUTDIR}/${species}_strainer/*/ | head -n 1))/${species}-families.fa.strained"
	cp $latestFile ${OUTDIR}/${species}_strainer/
}

# Subprocess clust
# optional wicker clustering of TE library consensus sequences to reduce redundancy
clust()
{
	cd ${OUTDIR}/${species}_strainer/
	cd-hit-est -d 0 -aS 0.8 -c 0.8 -G 0 -g 1 -b 500 -r 1 -i $latestFile -o ${latestFile}.clstrd.fa
	latestFile=$latestFile.clstrd.fa
}


# Subprocess novoMask
# Run RepeatMasker with the final processed library
novoMask()
{
	if [ -z "$RepSpec" ] && [ -z "$startCust" ]; then
		cd ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/
		rmthreads=$(expr $ProcNum / 4)
		RepeatMasker -lib $latestFile -no_is -lcambig -s -a -pa $rmthreads -dir ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/ $genome
	else
		cd ${OUTDIR}/${species}_Curated_Library/
		cat $latestFile $RepSub > ${species}_combined_library.fasta
		latestFile=$(realpath ${species}_combined_library.fasta)
		cd ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/
		rmthreads=$(expr $ProcNum / 4)
		RepeatMasker -lib $latestFile -no_is -lcambig -s -a -pa $rmthreads -dir ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/ $genome
	fi
}

# Subprocess heliano
# Run HELIANO as an optional step, then replace overlapping repeats in the merged output with HELIANO outputs
### TODO:
heliano_optional()
{
	cd ${OUTDIR}/${species}_heliano/
	heliano -g $genome --nearest -dn 6000 -flank_sim 0.5 -o ${OUTDIR}/${species}_heliano/HEL_${timestamp} -w 10000 -n $ProcNum
	awk '{OFS="\t"}{print $1, "HELIANO", "RC/Helitron", $2+1, $3, $5, $6, ".", "ID="$9"_"$11";shortTE=F"}' ${OUTDIR}/${species}_heliano/HEL_${timestamp}/RC.representative.bed > ${OUTDIR}/${species}_heliano/HEL_${timestamp}/RC.representative.gff
	helitron_gff=${OUTDIR}/${species}_heliano/HEL_${timestamp}/RC.representative.gff
}

# Subprocess rcMergeRepeats
# Defragment repeat sequences to adjust for insertion times
mergeRep()
{
	mkdir ${OUTDIR}/${species}_mergedRepeats/looseMerge
	if [ -s "$helitron_gff" ]; then
		echo "Running loose merge with HELIANO output"
		${SCRIPT_DIR}/rcMergeRepeatsLoose -f $genome -s $species -d ${OUTDIR}/${species}_mergedRepeats/looseMerge -u ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).out -q ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).tbl -t $ProcNum -b ${dict} -m $margin -e $helitron_gff
	else
		${SCRIPT_DIR}/rcMergeRepeatsLoose -f $genome -s $species -d ${OUTDIR}/${species}_mergedRepeats/looseMerge -u ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).out -q ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).tbl -t $ProcNum -b ${dict} -m $margin
	fi

	if [ -f "${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed" ]; then
		awk '{OFS="\t"}{print $1, $2, $3, $4, $5, $6, $7, $8, toupper($9)}' ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff > ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff.1 && mv ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff{.1,}
	fi

	if [ ! -f "${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed" ]; then
		echo "ERROR: loose merge defragmentation failed, trying strict merge..."
		cd ${OUTDIR}/${species}_mergedRepeats/
		if [ -s "$helitron_gff" ]; then
			${SCRIPT_DIR}/rcMergeRepeats -f $genome -s $species -d ${OUTDIR}/${species}_mergedRepeats/ -u ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).out -q ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).tbl -t $ProcNum -b ${dict} -m $margin -e $helitron_gff
		else
			${SCRIPT_DIR}/rcMergeRepeats -f $genome -s $species -d ${OUTDIR}/${species}_mergedRepeats/ -u ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).out -q ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).tbl -t $ProcNum -b ${dict} -m $margin
		fi
		if [ ! -f "${OUTDIR}/${species}_mergedRepeats/${species}.filteredRepeats.bed" ]; then
			echo "ERROR: strict merge also failed, check ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).out looks as expected"
			exit 2
		fi
	fi
}

# Subprocess pieChart
# Generate a pie chart summary of TE content
charts()
{
	cd ${OUTDIR}/${species}_summaryFiles/
	if [ -f "${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed" ]; then
		${SCRIPT_DIR}/autoPie.sh -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.summary -t ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).tbl -p ${OUTDIR}/${species}_summaryFiles/${species}.summaryPie.pdf -o ${OUTDIR}/${species}_summaryFiles/${species}.highLevelCount.txt
	else
		${SCRIPT_DIR}/autoPie.sh -i ${OUTDIR}/${species}_mergedRepeats/${species}.filteredRepeats.summary -t ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).tbl -p ${OUTDIR}/${species}_summaryFiles/${species}.summaryPie.pdf -o ${OUTDIR}/${species}_summaryFiles/${species}.highLevelCount.txt
	fi
}

# Subprocess calcDivRL
# Calculate divergence estimates
calcDivRL()
{
	cd ${OUTDIR}/${species}_RepeatLandscape
	if [ -z "$RepSpec" ] && [ -z "$startCust" ]; then
		python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l $latestFile -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -t $ProcNum
		Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ && \
		awk -F'\t' 'BEGIN{OFS="\t"} {gsub("NAME=","Name=",$9); gsub("TSTART=","tstart=",$9); gsub("TEND=","tend=",$9); gsub("SHORTTE=","shortte=",$9); gsub("TEGROUP=","tegroup=",$9); gsub("KIMURA80=","kimura80=",$9); gsub("NESTED=", "nested=", $9); print}' ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff > ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff
		rm -rf ${OUTDIR}/${species}_RepeatLandscape/tmp/
		cp ${OUTDIR}/${species}_RepeatLandscape/*.pdf ${OUTDIR}/${species}_summaryFiles/
		cp ${OUTDIR}/${species}_RepeatLandscape/*_summary_table.tsv ${OUTDIR}/${species}_summaryFiles/${species}_divergence_summary_table.tsv
	else
		python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -t $ProcNum
		Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ && \
		awk -F'\t' 'BEGIN{OFS="\t"} {gsub("NAME=","Name=",$9); gsub("TSTART=","tstart=",$9); gsub("TEND=","tend=",$9); gsub("SHORTTE=","shortte=",$9); gsub("TEGROUP=","tegroup=",$9); gsub("KIMURA80=","kimura80=",$9); gsub("NESTED=", "nested=", $9); print}' ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff > ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff
		rm -rf ${OUTDIR}/${species}_RepeatLandscape/tmp/
		cp ${OUTDIR}/${species}_RepeatLandscape/*.pdf ${OUTDIR}/${species}_summaryFiles/
		cp ${OUTDIR}/${species}_RepeatLandscape/*_summary_table.tsv ${OUTDIR}/${species}_summaryFiles/${species}_divergence_summary_table.tsv
	fi
}

# Subprocess sweepUp
# Puts required files into a summary folder
sweepUp()
{
	cd ${OUTDIR}/${species}_summaryFiles/
	if [ -f "${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed" ] && [ ! -f "${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta" ]; then
		cp ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff $latestFile ${OUTDIR}/${species}_summaryFiles/
	elif [ -f "${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed" ] && [ -f "${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta" ]; then
		cp ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta $latestFile ${OUTDIR}/${species}_summaryFiles/
	elif [ ! -f "${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed" ] && [ ! -f "${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta" ]; then
		cp ${OUTDIR}/${species}_mergedRepeats/${species}.filteredRepeats.bed ${OUTDIR}/${species}_mergedRepeats/${species}.filteredRepeats.gff $latestFile ${OUTDIR}/${species}_summaryFiles/
	elif [ ! -f "${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed" ] && [ -f "${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta" ]; then
		cp ${OUTDIR}/${species}_mergedRepeats/${species}.filteredRepeats.bed ${OUTDIR}/${species}_mergedRepeats/${species}.filteredRepeats.gff ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta $latestFile ${OUTDIR}/${species}_summaryFiles/
	else
		echo "Error: Cannot find required summary files, please check one of the following is present: ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed and/or ${OUTDIR}/${species}_mergedRepeats/${species}.filteredRepeats.bed"
	fi
}

# Subprocess runningTea 
runningTea()
{
	echo "    
              )  (
	     (   ) )
	     ) ( (
	   _______)_
	.-'---------|  
       ( C|/\/\/\/\/|
	'-./\/\/\/\/|
	  '_________'
	   '-------'
	<<< $stage >>>"
}

# Subprocess GetTime
convertsecs() 
{
	h=$(bc <<< "${1}/3600")
	m=$(bc <<< "(${1}%3600)/60")
	s=$(bc <<< "${1}%60")
	printf "%02d:%02d:%05.2f\n" $h $m $s
}

# Subprocess Checks
Checks()
{
	if [ -z "$genome" ] || [ -z "$species" ] || [ -z "$directory" ] ; then
		usage; exit 1
	fi

	if [ -z "$ProcNum" ] ; then
		ProcNum=1; echo "$ProcNum Cores Will Be Used"
	else
		echo "$ProcNum Cores Will Be Used"
	fi

	if [ -z "$RepSpec" ] && [ -z "$startCust" ] ; then
		echo "RepeatMasker species not specified, running Earl Grey without an initial mask with known repeats"
	else
		echo "Running Earl Grey with an intial mask with known repeats"
	fi

	if [ -z "$num" ] ; then
		num=10; echo "De Novo Sequences Will Be Extended Through a Maximum of $num Iterations"
	else
		echo "De Novo Sequences Will Be Extended Through a Maximum of $num Iterations"
	fi

	if [ -z "$no_seq" ] ; then
		no_seq=20; echo "$no_seq sequences will be used in BEAT consensus generation"
	else
		echo "$no_seq sequences will be used in BEAT consensus generation"
	fi
	
	if [ -z "$cluster" ] || [ "$cluster" == "no" ] ; then
		cluster=no; echo "TE library consensus sequences will not be clustered"
	elif [ "$cluster" == "yes" ]; then
		cluster=yes; echo "TE library consensus sequences will be clustered, be aware of the impact this can have on subfamilies and chimeric TEs!"
	else
		cluster=no; echo "Clustering not specified using (yes/no). Using default parameter (no)."
	fi

	if [ -z "$softMask" ] || [ "$softMask" == "no" ] ; then
		softMask=no; echo "Softmasked genome will not be generated"
	elif [ "$softMask" == "yes" ]; then
		softMask=yes; echo "Softmasked genome will be generated"
	else
		softMask=no; echo "Softmask not specified using (yes/no). Using default parameter (no)."
	fi

	if [ "$margin" == "yes" ] ; then
		margin=yes; echo "Short TE sequences <100bp will be removed from annotation"
	elif [ -z "$margin" ] || [ "$margin" == "no" ]; then
		margin=no; echo "Short TE sequences <100bp will not be removed from annotation"
	else
		margin=no; echo "Margin not specified using (yes/no). Using default parameter (no)."
	fi

	if [ -z "$Flank" ]; then
		Flank=1000; echo "Blast, Extend, Align, Trim Process Will Add 1000bp to Each End in Each Iteration"
	else
		echo "Blast, Extract, Extend, Trim Process Will Add $Flank to Each End in Each Iteration"
	fi

	if [ -z "$min_seq" ]; then
		min_seq=3; echo "Blast, Extend, Align, Trim Process Will Require 3 Sequences to Generate a New Consensus Sequence"
	else
		echo "Blast, Extend, Align, Trim Process Will Require $min_seq Sequences to Generate a New Consensus Sequence"
	fi

	if [ ! -d "$SCRIPT_DIR" ]; then
		echo "ERROR: Script directory variable not set, please run the configure script in the Earl Grey directory before attempting to run Earl Grey"; exit 1
	fi

	if [ ! -d "$SCRIPT_DIR/TEstrainer/" ]; then
		echo "ERROR: teStrainer module not found, please check all modules are present and run the configure script in the Earl Grey directory before attempting to run Earl Grey"; exit 1
	fi

	if [ "$heli" == "yes" ] ; then
		heli=yes; echo "HELITRON detection will be run using HELIANO"
	elif [ -z "$heli" ] || [ "$heli" == "no" ]; then
		heli=no; echo "HELITRON detection will not be run"
	else
		heli=no; echo "HELITRON detection not specified using (yes/no). Using default parameter (no)."
	fi

	echo "Please cite the following paper when using this software:"
	echo "Baril, T., Galbraith, J. and Hayward, A., 2024. Earl Grey: a fully automated user-friendly transposable element annotation and analysis pipeline. Molecular Biology and Evolution, 41(4), p.msae068."
}

# Subprocess Dfam Checks
dfamCheck()
{
	# Dfam 3.9 checks
	library_path=$(which RepeatMasker | sed 's|/bin/RepeatMasker|/share/RepeatMasker/Libraries/famdb|g')
	expected_file="${library_path}/dfam39_full.0.h5"
	file_count=$(find "$library_path" -maxdepth 1 -type f | wc -l)
	if [ $file_count -eq 2 ] && [ -f $expected_file ] && [ -f ${library_path}/rmlib.config ] ; then
    		echo "WARNING: Earl Grey v6.1.0 has updated to Dfam v3.9." 
    		echo "Before using Earl Grey, you MUST download the required partitions from Dfam (https://dfam.org/releases/current/families/FamDB/)"
    		echo "These must be located at ${library_path}/. Then, you must reconfigure RepeatMasker before using Earl Grey."
    		echo "I have written the required commands for you to faciliate the process."
    		echo "Please follow these carefully to ensure successful configuration of RepeatMasker with Dfam 3.9."
    		echo "As a note, these DB paritions are BIG, so only download the ones you need for your studies (or, if you do need all of them, make sure you have the space for them)"
    		echo "########################################################"
    		echo "A script for modification and automation of these steps has been generated in $(pwd)/configure_dfam39.sh"
    		echo "Please open the script in your preferred text editor (e.g. nano or vim) and change the partition numbers in [] to those that you would like to download"
    		echo "then, make the script executable (chmod +x $(pwd)/configure_dfam39.sh) and run it: ./configure_dfam39.sh"
    		echo "Alternatively, you can run the steps below one-by-one to achieve the same outcome:"
    		echo "########################################################"
    		echo ""
    		if [ -f "$(pwd)/configure_dfam39.sh" ]; then
        		rm $(pwd)/configure_dfam39.sh
    		fi
    		echo "# first, change directory to the famdb library location" | tee -a $(pwd)/configure_dfam39.sh
    		echo "cd ${library_path}/" | tee -a $(pwd)/configure_dfam39.sh
    		echo "" | tee -a $(pwd)/configure_dfam39.sh
    		echo "# download the partitions you require from Dfam 3.9. In the below, change the numbers or range inside the square brackets to choose your subsets." | tee -a $(pwd)/configure_dfam39.sh
    		echo "# e.g. to download partitions 0 to 10: [0-10]; or to download partitions 3,5, and 7: [3,5,7]; [0-16] is ALL PARTITIONS" | tee -a $(pwd)/configure_dfam39.sh
    		echo "curl -o 'dfam39_full.#1.h5.gz' 'https://dfam.org/releases/current/families/FamDB/dfam39_full.[0-16].h5.gz'" | tee -a $(pwd)/configure_dfam39.sh
    		echo "" | tee -a $(pwd)/configure_dfam39.sh
    		echo "# decompress Dfam 3.9 paritions" | tee -a $(pwd)/configure_dfam39.sh
    		echo "gunzip *.gz" | tee -a $(pwd)/configure_dfam39.sh
    		echo "" | tee -a $(pwd)/configure_dfam39.sh
    		echo "# move up to RepeatMasker main directory" | tee -a $(pwd)/configure_dfam39.sh
    		echo "cd $(which RepeatMasker | sed 's|/bin/RepeatMasker|/share/RepeatMasker/|g')" | tee -a $(pwd)/configure_dfam39.sh
    		echo "" | tee -a $(pwd)/configure_dfam39.sh
    		#echo "# Replace famdb with version 2.0.1 (required for the Dfam 3.9)" | tee -a $(pwd)/configure_dfam39.sh
    		#echo "wget https://github.com/Dfam-consortium/FamDB/archive/refs/tags/2.0.1.tar.gz && tar --wildcards -zxvf 2.0.1.tar.gz FamDB-2.0.1/famdb.py FamDB-2.0.1/famdb_*.py" | tee -a $(pwd)/configure_dfam39.sh
    		#echo "mv FamDB-2.0.1/*.py . && rm -rf 2.0.1.tar.gz FamDB-2.0.1/" | tee -a $(pwd)/configure_dfam39.sh
    		#echo "" | tee -a $(pwd)/configure_dfam39.sh
    		echo "# save the min_init partition as a backup, just in case!" | tee -a $(pwd)/configure_dfam39.sh
    		echo "mv ${library_path}/min_init.0.h5 ${library_path}/min_init.0.h5.bak" | tee -a $(pwd)/configure_dfam39.sh
    		echo "" | tee -a $(pwd)/configure_dfam39.sh
    		echo "# Rerun RepeatMasker configuration" | tee -a $(pwd)/configure_dfam39.sh
    		echo "perl ./configure -libdir $(which RepeatMasker | sed 's|/bin/RepeatMasker|/share/RepeatMasker/Libraries/|g') \
    		-trf_prgm $(which RepeatMasker | sed 's|/bin/RepeatMasker|/bin/trf|g') \
   			-rmblast_dir $(which RepeatMasker | sed 's|/bin/RepeatMasker|/bin|g') \
    		-hmmer_dir $(which RepeatMasker | sed 's|/bin/RepeatMasker|/bin|g') \
    		-abblast_dir $(which RepeatMasker | sed 's|/bin/RepeatMasker|/bin|g') \
    		-crossmatch_dir $(which RepeatMasker | sed 's|/bin/RepeatMasker|/bin|g') \
    		-default_search_engine rmblast" | tee -a $(pwd)/configure_dfam39.sh
    		echo "" | tee -a $(pwd)/configure_dfam39.sh
    		sed -i.bak '/^perl/  s/\s\s*/ /g' $(pwd)/configure_dfam39.sh
    		echo "########################################################"
    		echo ""
    		echo "Following these steps should enable you to use Dfam 3.9 with RepeatMasker and Earl Grey"
    		echo "You will not see this message again if configuration is successful"
			echo "If you only want partition 0, you do not have to do anything except rerun your Earl Grey command"
    		echo "If you are still having trouble, please open an issue on the Earl Grey github and we will try to help!"
			touch ${library_path}/.earlgrey.config.complete
    		exit 2
	fi
}

# Main #

while getopts g:s:o:t:f:i:r:c:l:m:d:n:a:e:h option
do
	case "${option}" 
		in
		g) genome=${OPTARG};;
		s) species=${OPTARG};;
		o) directory=${OPTARG};;
		t) ProcNum=${OPTARG};;
		f) Flank=${OPTARG};;
		i) num=${OPTARG};;
		r) RepSpec=${OPTARG};;
		l) startCust=${OPTARG};;
		c) cluster=${OPTARG};;
		m) margin=${OPTARG};;
		d) softMask=${OPTARG};;
		n) no_seq=${OPTARG};;
		a) min_seq=${OPTARG};;
		e) heli=${OPTARG};;
		h) usage; exit 0;; 
	esac
done

SECONDS=0

# Step 1 - set up the directories and make sure all modules are present

stage="Checking Parameters" && runningTea
SCRIPT_DIR=/opt/conda/conda-bld/earlgrey_1773324219515/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_plac/share/earlgrey-7.1.0-0/scripts/
dfamCheck
Checks
stage="Making Directories" && runningTea
makeDirectory

# Start Logs - only bother starting if everything is present and correct
exec > >(tee "${OUTDIR}/${species}EarlGrey.log") 2>&1
stage="Cleaning Genome" && runningTea
prepGenome
sleep 1

# Step 2 - initial annotation, either de novo, or if required an initial RepeatMasker followed by a de novo run
if [ -z "$RepSpec" ]; then
	sleep 1
else
	if [ ! -f ${OUTDIR}/${species}_RepeatMasker/*.masked ]; then
		stage="Getting RepeatMasker Sequences for $RepSpec and Saving as Fasta" && runningTea
		getRepeatMaskerFasta
		stage="Running Initial Mask with Known Repeats" && runningTea
		firstMask
	else
		stage="Genome has already been masked, skipping..." && runningTea
		getRepeatMaskerFasta
	fi
fi

if [ -z "$startCust" ]; then
	sleep 1
else
	if [ ! -f ${OUTDIR}/${species}_RepeatMasker/*.masked ]; then
		stage="Running Initial Mask with Known Repeats in Custom Library" && runningTea
		firstMaskCustomLib
	else
		stage="Genome has already been masked, skipping..." && runningTea
	fi
fi

# Step 3 - make a database from the genome, and then run a de novo TE annotation using RepeatModeler2
if [ ! -f ${OUTDIR}/${species}_Database/${species}-families.fa ]; then
	stage="Detecting Novel Repeats" && runningTea
	buildDB
	deNovo1
	sleep 1
else
	stage="De novo repeats have already been detected, skipping..." && runningTea
	sleep 1
fi

# Step 4 - run TEstrainer, which runs a BLAST, extract, extent, trim, on the the de novo repeat library to refine consensus sequences
if [ ! -s ${OUTDIR}/${species}_strainer/${species}-families.fa.strained ]; then
	stage="Straining TEs and Refining de novo Consensus Sequences" && runningTea
	if [ -f ${OUTDIR}/${species}_strainer/${species}-families.fa.strained ]; then
		rm -r ${OUTDIR}/${species}_strainer/${species}-families.fa.strained ${OUTDIR}/${species}_strainer/TS*
		strainer
	elif [ -d ${OUTDIR}/${species}_strainer/TS_${species}-families.fa_* ]; then
		stage="Resuming TEstrainer" && runningTea
		strainerResume
	else
		strainer
	fi
	if [ ! -s ${OUTDIR}/${species}_strainer/${species}-families.fa.strained ]; then
		echo "WARNING: TEstrainer failed to produce a strain file, please check the log file for more information. If you have run an intial mask with known repeats, this could be due RepeatModeler2 failing to identify any new repeats. Please check if this is expected."
	fi
	sleep 1
else
	stage="TEs already strained, skipping..." && runningTea
	latestFile=${OUTDIR}/${species}_strainer/${species}-families.fa.strained
	sleep 1
fi

# Step 4.5 - cluster TE library IF REQUIRED

if [ "$cluster" == "yes" ]; then
	stage="Clustering TE sequences to Wicker TE Family Level (80-80-80 rule)" && runningTea
	clust
	sleep 1
fi

# Stage 5 - run the final RepeatMasker annotation using the refined TE consensus library
if [ ! -f ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/*.tbl ]; then
	stage="Identifying Repeats Using Species-Specific Library" && runningTea
	novoMask
	if [ ! -f ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/*.tbl ]; then
		echo "ERROR: RepeatMasker failed, please check logs" && exit 2
	fi
	sleep 1
else
	stage="Final masking already complete, skipping..." && runningTea
	sleep 1
fi

# Stage 5.5 - Run HELIANO as an optional step

if [ "$heli" == "yes" ]; then
	if [ ! -s ${OUTDIR}/${species}_heliano/*/RC.representative.gff ]; then
		stage="Running HELIANO to Detect Helitrons" && runningTea
		timestamp=$(date +"%Y%m%d_%H%M")
		heliano_optional
	else
		stage="HELITRON detection already complete, skipping..." && runningTea
		helitron_gff="$(realpath $(ls -td -- ${OUTDIR}/${species}_heliano/*/ | head -n 1))/RC.representative.gff"
		echo "HELITRON GFF: $helitron_gff"
	fi
	sleep 1
fi

# Stage 6
if [ ! -f ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed ] ; then
	stage="Defragmenting Repeats" && runningTea
	mergeRep
	sleep 1
else
	stage="Repeats already defragmented, skipping..." && runningTea
	sleep 1
fi

# Stage 7
stage="Generating Summary Plots" && runningTea
charts
calcDivRL
sleep 1

# Stage 8
stage="Tidying Directories and Organising Important Files" && runningTea
sweepUp
sleep 1

# Stage 8.5 Optional Softmask
if [ "$softMask" == "yes" ]; then
	stage="Generating Softmasked Genome" && runningTea
	gunzip ${genome%.prep}.bak.gz && bedtools maskfasta -fi ${genome%.prep}.bak -bed ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed -fo ${OUTDIR}/${species}_summaryFiles/${species}.softmasked.fasta -soft && gzip ${genome%.prep}.bak
	sleep 1
fi

# Stage 9
time=$(convertsecs $SECONDS)
stage="Done in $time" && runningTea
sleep 5

# Stage 10
stage="TE library, Summary Figures, and TE Quantifications in Standard Formats Can Be Found in ${OUTDIR}/${species}_summaryFiles/" && runningTea
cat ${OUTDIR}/${species}_summaryFiles/${species}.highLevelCount.kable
sleep 5
