#!/bin/bash
#
#  ccfind - circular complete genome finder
#
#    Copyright: 2017-2020 (C) Yosuke Nishimura (ynishimura@aori.u-tokyo.ac.jp)
#    License: MIT license
#

# Script identity and colored message prefixes.
# The ANSI color sequences are spelled with $'\033' so that the ESC byte is
# explicit in the source instead of depending on an invisible raw control
# character that editors and copy/paste can silently drop.
PROGNAME=$(basename "$0")
VERSION="1.4.7"
PROGNAMEc=$'\033[35m'"$PROGNAME"$'\033[0m'   # program name in magenta
Errmsg=$'\033[31m[error]\033[0m'             # red "[error]" prefix
Sccmsg=$'\033[32m[success]\033[0m'           # green "[success]" prefix
Moddate="2026-03-05"
# command line as invoked (script path followed by all arguments)
incmd="$0 $*"


# {{{ usage
# Help text shown by -h/--help and after argument errors.
# The heredoc delimiter is unquoted, so $PROGNAME, $VERSION, $Moddate and the
# color variables below are expanded when the text is read into $usage.
# Colors are defined with $'\033' to keep the ESC byte explicit in the source.
usage_title=$'\033[35m'   # magenta: banner line
usage_head=$'\033[32m'    # green: section headings
usage_off=$'\033[0m'      # reset attributes
read -r -d '' usage <<EOF
${usage_title}### $PROGNAME ver $VERSION ($Moddate) ###

${usage_head}[description]${usage_off}
$PROGNAME - Circular Complete genome FINDer.

$PROGNAME is a general tool to detect circular complete genomes with clues of terminal redundancy.
$PROGNAME was originally designed for identification of complete virus genomes from metagenome assembly.
$PROGNAME can be used for any contig/genome, but cutoff values should be carefully considered (see README).
It should be noted that terminal redundancy (circularity) does not necessarily mean completion of the sequence. Partial genomes might be detected as circular contigs for some reasons (e.g., sequence repeats).

${usage_head}[dependencies]${usage_off}
- ssearch36      -- Smith-Waterman alignment, included in FASTA program
                    (http://fasta.bioch.virginia.edu/fasta_www2/fasta_list2.shtml)
- blastn         -- nucleotide blast, included in the BLAST+ program
                    (https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=Download)
- prodigal       -- a gene prediction tool, used to detect a well-arranged start position of a circular complete genome (do not split ORF).
                    (https://github.com/hyattpd/Prodigal)
- ruby (ver >=2.0) 

${usage_head}[usage]${usage_off}
$ $PROGNAME <input fasta> <output dir> [options]

<output dir> should not exist.

${usage_head}[options]${usage_off}
  (general)
    -h, --help
    -v, --version

  (cutoff values)
    -L, --terminal-fragment-size  [int] (default: 500) -- should be longer than estimated terminal redundancy length. If sequence length is less than 2 x (this value), the sequence will be ignored and listed in "result/too_short_seq.list".
    -i, --min-percent-identity    [int] (default: 94)
    -l, --min-aligned-length      [int] (default: 50)

  (use GNU parallel)
    --ncpus        [int]        -- number of jobs in parallel	

  (misc)
    --preserve-tmpdir (default: off) -- do not remove temporary files

${usage_head}[output files]${usage_off}
result/circ.detected.list    -- list of complete genomes (circular sequences).
result/circ.fasta            -- circular sequences in fasta format. Terminal redundancy is preserved (without modification).
result/circ.noTR.fasta       -- circular sequences in fasta format. Terminal redundancy is trimmed (redundant region at the end is removed).
result/circ.noTR.cPerm.fasta -- circular sequences in fasta format. Terminal redundancy is trimmed and circular permutation is performed, trying not to split any ORF into two fragments (the last intergenic region detected by prodigal is selected as termini).
result/too_short_seq.list    -- list of too short sequences that are excluded from analysis. See the above description of '--terminal-fragment-size'.
EOF
# }}} usage


# {{{ parse options
# Options and positional arguments may be given in any order; every word after
# a bare '--' (or '-') is taken as positional. Positional arguments are
# collected in params[] (params[0]: input fasta, params[1]: output directory).
# The loop always inspects and consumes "$1", so the word being examined and
# the word being shifted can never get out of step.
params=()
while [[ $# -gt 0 ]]
do
	case "$1" in
		'-h'|'--help' )
			echo "$usage"
			exit 0
			;;
		'-v'|'--version' )
			echo "$VERSION"
			exit 0
			;;
		'--preserve-tmpdir' )
			keeptmp=true
			shift 1
			;;
		'-i'|'--min-percent-identity' )
			# the value must be present and must not look like another option
			if [[ -z "$2" ]] || [[ "$2" =~ ^-+ ]] ; then
				echo "$PROGNAME: option requires an argument -- $1" 1>&2
				exit 1
			fi
			idt="$2"
			shift 2
			;;
		'-l'|'--min-aligned-length' )
			if [[ -z "$2" ]] || [[ "$2" =~ ^-+ ]] ; then
				echo "$PROGNAME: option requires an argument -- $1" 1>&2
				exit 1
			fi
			len="$2"
			shift 2
			;;
		'-L'|'--terminal-fragment-size' )
			if [[ -z "$2" ]] || [[ "$2" =~ ^-+ ]] ; then
				echo "$PROGNAME: option requires an argument -- $1" 1>&2
				exit 1
			fi
			size="$2"
			shift 2
			;;
		'--ncpus' )
			if [[ -z "$2" ]] || [[ "$2" =~ ^-+ ]] ; then
				echo "$Errmsg option requires an argument -- $1" 1>&2
				exit 1
			fi
			ncpus="$2"
			shift 2
			;;
		'--'|'-' )
			# end of options: the remaining words are all positional
			shift 1
			params+=( "$@" )
			break
			;;
		-*)
			# report the option name without its leading dashes
			echo "$PROGNAME: illegal option -- '$(printf '%s\n' "$1" | sed 's/^-*//')'" 1>&2
			exit 1
			;;
		*)
			# positional argument; an empty word is dropped rather than stored
			if [[ -n "$1" ]] ; then
				params+=( "$1" )
			fi
			shift 1
			;;
	esac
done

# validate positional arguments; all diagnostics go to stderr
if [[ ${#params[@]} -lt 2 ]] ; then
	echo "[error] need 2 arguments" 1>&2
	echo 1>&2
	echo "$usage" 1>&2
	exit 1
elif ! [[ -f "${params[0]}" ]] ; then
	echo "[error] first argument should be a fasta file." 1>&2
	echo 1>&2
	echo "$usage" 1>&2
	exit 1
elif [[ -e "${params[1]}" ]] ; then
	echo "[error] output directory already exists." 1>&2
	echo 1>&2
	echo "$usage" 1>&2
	exit 1
fi
# }}} parse options


# {{{ check dependencies
# Each check aborts with exit status 1 and a message on stderr when the tool
# is not available.

### check availability of ssearch36
command -v ssearch36 >/dev/null 2>&1 || { echo >&2 "ssearch36 is required but not found.  Aborting."; exit 1; }

### check availability of blastn
command -v blastn >/dev/null 2>&1 || { echo >&2 "$Errmsg blastn (BLAST+ package) is required but not found.  Aborting."; exit 1; }
command -v makeblastdb >/dev/null 2>&1 || { echo >&2 "$Errmsg makeblastdb (BLAST+ package) is required but not found.  Aborting."; exit 1; }

### check availability of prodigal
command -v prodigal >/dev/null 2>&1 || { echo >&2 "prodigal is required but not found. Aborting."; exit 1; }

### check ruby version
### The exit status of ruby itself is tested, so a missing interpreter
### (command not found) is rejected in the same way as a too-old one.
if ! ruby -e 'exit(RUBY_VERSION.to_f >= 2.0 ? 0 : 1)' >/dev/null 2>&1 ; then
	echo >&2 "ruby (version >=2.0) is required.  Aborting."; exit 1;
fi

### check availability of GNU parallel
### only needed when --ncpus was given; the variable is quoted so that an
### empty value really tests as empty
if [[ -n "$ncpus" ]]; then
	command -v parallel >/dev/null 2>&1 || { echo >&2 "$Errmsg GNU parallel is required for parallel computation but not found.  Aborting."; exit 1; }
fi
# }}} check dependencies


# {{{ assign variables 
### parse args and default params
fin="${params[0]}"      # input fasta (first positional argument)
dir="${params[1]}"      # output directory (second positional argument)
idt="${idt:-94}"        # -i, --min-percent-identity
len="${len:-50}"        # -l, --min-aligned-length
size="${size:-500}"     # -L, --terminal-fragment-size

### rakefile and log directory
### scrdir is the physical path (pwd -P) of the directory part of $0. The cd
### happens inside the command substitution, so the working directory of this
### shell is untouched, and "$0" is quoted so that a script path containing
### spaces is handled.
scrdir=$(cd "$(dirname "$0")" > /dev/null && pwd -P)
jobname=$(basename "$0")
rakefile="$scrdir/$jobname.rake"
logdir="$dir/log"
# }}} assign variables 


# {{{ after run function
# after_run <status>
#   Record the outcome of the run under $logdir.
#   <status> greater than 0 : write "$startTime.$jobname.error", append a
#                             message to "$jobname.log" (also echoed to stdout
#                             by tee) and terminate the script with exit 1.
#   otherwise               : write "$startTime.$jobname.success" and return.
#   Reads globals Errmsg, Sccmsg, logdir, startTime, jobname; sets endTime.
function after_run(){
  local marker_prefix="$logdir/$startTime.$jobname"
  endTime=$(LC_ALL=C date +"%Y-%m-%d %H:%M:%S %z")

  # success: leave the marker file and hand control back to the caller
  if ! [[ $1 -gt 0 ]] ; then
    echo "$Sccmsg run finished: $endTime" > "$marker_prefix.success"
    return
  fi

  # failure: marker file, log entry, then stop the script
  echo "$Errmsg run failed: $endTime" > "$marker_prefix.error"
  echo "$Errmsg process is killed: $endTime" |tee -a "$logdir/$jobname.log"
  exit 1
}
### }}} after run function


# {{{ input command
# Banner text stored in $incmdlog: the command line as invoked, the log file
# location and the program version. The heredoc delimiter is unquoted, so the
# variables are expanded when the text is read. Section titles are bold green;
# the sequences are written with $'\033' to keep the ESC byte explicit in the
# source.
banner_on=$'\033[1;32m'
banner_off=$'\033[0m'
read -r -d '' incmdlog <<EOF

${banner_on}===== command line${banner_off}

$ $incmd

${banner_on}===== log file${banner_off}

$logdir/$jobname.log

${banner_on}===== check version: $PROGNAME${banner_off}

$VERSION

EOF
# }}} input command


### main
mkdir -p "$logdir" || { echo >&2 "$Errmsg cannot create log directory: $logdir"; exit 1; }
startTime=$(LC_ALL=C date +"%Y-%m-%d %H:%M:%S")

## write input command
echo "$incmdlog" |tee -a "$logdir/$jobname.log"

## trap Ctrl-C etc... (SIGKILL cannot be trapped, so it is not listed)
trap 'after_run 1' HUP INT QUIT TERM

### run
### rake is executed from inside the output directory --- input file path of ssearch36 should not be too long (when length is 127, it returns an error '*** error [comp_lib9.c:642] cannot open library <input file>x')
### The directory change is confined to a subshell: this shell keeps its own
### working directory, so a relative "$logdir" stays valid for after_run, both
### on normal completion and when after_run is invoked from the trap.
absolute_fin=$(printf '%s\n' "$fin" |ruby -e 'puts File.absolute_path("#{$stdin.read.strip}")')

( cd "$dir" && rake -f "$rakefile" fin="$absolute_fin" size="$size" len="$len" idt="$idt" ncpus="$ncpus" keeptmp="$keeptmp" ) 2>&1 |tee -a "$logdir/$jobname.log"
## non-zero when either the rake subshell or tee failed
st=$(( ${PIPESTATUS[0]} + ${PIPESTATUS[1]} ))

after_run "$st"
