#!/usr/bin/env bash

# MIT License

# Copyright (c) [2026] [riccabolla]

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

VERSION="3.0.0" 

set -e
set -o pipefail

cat << "EOF"

    ____  ____ _  _ ____
   |  _ \ / ___| \ | | ____|
   | |_) | |   |  \| |  _|
   |  __/| |___| |\  | |___
   |_|    \____|_| \_|_____|
   Plasmid Copy Number Estimator v3.0.0
   by riccabolla

EOF

# --- Color Codes ---
RESET=$'\033[0m'
GREEN='\e[0;32m'
RED_BOLD='\e[1;31m'
BLUE_BOLD=$'\033[1;34m'
CYAN_BOLD='\e[1;36m'

start_time=$(date +%s)

# --- Default Parameters ---
THREADS=1
OUTPUT_PREFIX="pcne"
KEEP_INTERMEDIATE=0
MIN_MAPQ=0
SAM_FILTER_FLAG=0
GENERATE_PLOT=0
AGGREGATE_PLASMIDS=0
GC_CORRECTION=0
GC_WINDOW=1000
GC_LOESS_FRAC="auto" 
GENERATE_GC_PLOT=0
MINIMAP_PRESET="map-ont"
MINIMAP_OPTS=""

# --- File Path Variables ---
CHR_FASTA=""
PLS_FASTA=""
ASSEMBLY_FASTA=""
CHR_LIST_FILE=""
PLS_LIST_FILE=""
READS=""
MODE=0

# --- Find R Scripts ---
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
RSCRIPT_ANALYSIS_PATH="${SCRIPT_DIR}/PCNE_analysis.R"

if [ ! -f "${RSCRIPT_ANALYSIS_PATH}" ]; then
    echo -e "${RED_BOLD}Error: PCNE_analysis.R not found at: ${RSCRIPT_ANALYSIS_PATH}${RESET}" >&2; exit 1;
fi

# --- Usage Function ---
usage() {
  local script_name=$(basename "$0")
  echo -e "${BLUE_BOLD}Usage:${RESET}"
  echo -e "  ${script_name} [Mode 1 | Mode 2] [Common Required] [Optional]"
  echo -e "\n  ${BLUE_BOLD}Input Modes:${RESET}"
  echo -e "    ${CYAN_BOLD}Mode 1:${RESET}"
  echo "      -c, --chromosome FILE : Path to chromosome contigs FASTA file (Required)"
  echo "      -p, --plasmid FILE    : Path to plasmid contigs FASTA file (Required)"
  echo -e "    ${CYAN_BOLD}Mode 2:${RESET}"
  echo "      -a, --assembly FILE   : Path to the assembled genome FASTA file (Required)"
  echo "      -P, --plasmid-list FILE : Path to file containing plasmid contig names (Required)"
  echo "      -C, --chr-list FILE   : Path to file containing chromosome contig names (Optional)"
  echo -e "\n  ${BLUE_BOLD}Common Required:${RESET}"
  echo "      -r, --reads FILE     : Path to reads (FASTQ format) (Mandatory)"
  echo -e "\n  ${BLUE_BOLD}Optional Analysis:${RESET}"
  echo "      --gc-correction       : Enable GC correction"
  echo "      --gc-window INT       : Window size for GC analysis (default: ${GC_WINDOW})"
  echo "      --gc-frac FLOAT|auto  : LOESS smoothing fraction (default: ${GC_LOESS_FRAC})"
  echo -e "\n  ${BLUE_BOLD}Optional Filtering & Plotting:${RESET}"
  echo "      --preset STR          : Minimap2 preset (default: ${MINIMAP_PRESET})"
  echo "      --minimap-opts STR    : Additional minimap2 options (enclose in quotes)"
  echo "      -Q, --min-quality INT : Minimum mapping quality (MQ) (default: ${MIN_MAPQ})"
  echo "      -F, --filter INT      : SAM flag (default: ${SAM_FILTER_FLAG})"
  echo "      -l, --plot            : Generate a bar plot for copy numbers (<prefix>_plot.png)"
  echo "      --gc-plot FILE        : Save the GC vs. Depth diagnostic plot to this file."
  echo -e "\n  ${BLUE_BOLD}Other:${RESET}"
  echo "      -s, --single-plasmid  : Treat all contigs in '-p' FASTA as one fragmented plasmid (Mode 1 only)"
  echo "      -t, --threads INT     : Number of threads (default: ${THREADS})"
  echo "      -o, --output STR      : Prefix for output files (default: ${OUTPUT_PREFIX})"
  echo "      -k, --keep-intermediate : Keep intermediate files [Default: OFF]"
  echo "      -v, --version         : Display version information and exit"
  echo "      -h, --help            : Display this help message and exit"
}

# --- Parse Arguments ---
while [[ $# -gt 0 ]]; do
    key="$1"
    case ${key} in
        -c|--chromosome) CHR_FASTA="$2"; shift 2 ;;
        -p|--plasmid) PLS_FASTA="$2"; shift 2 ;;
        -a|--assembly) ASSEMBLY_FASTA="$2"; shift 2 ;;
        -C|--chr-list) CHR_LIST_FILE="$2"; shift 2 ;;
        -P|--plasmid-list) PLS_LIST_FILE="$2"; shift 2 ;;
        -r|--reads) READS="$2"; shift 2 ;;
        -t|--threads) THREADS="$2"; shift 2 ;;
        -o|--output) OUTPUT_PREFIX="$2"; shift 2 ;;
        --preset) MINIMAP_PRESET="$2"; shift 2 ;;
        --minimap-opts) MINIMAP_OPTS="$2"; shift 2 ;;
        -Q|--min-quality) MIN_MAPQ="$2"; shift 2 ;;
        -F|--filter) SAM_FILTER_FLAG="$2"; shift 2 ;;
        -k|--keep-intermediate) KEEP_INTERMEDIATE=1; shift ;;
        -l|--plot) GENERATE_PLOT=1; shift ;;
        -s|--single-plasmid) AGGREGATE_PLASMIDS=1; shift ;;
        --gc-correction) GC_CORRECTION=1; shift;;
        --gc-window) GC_WINDOW="$2"; shift 2;;
        --gc-frac) GC_LOESS_FRAC="$2"; shift 2;;
        --gc-plot) GENERATE_GC_PLOT=1; shift;;
        -v|--version) echo "$(basename "$0") version ${VERSION}"; exit 0 ;;
        -h|--help) usage; exit 0 ;;
        *) echo -e "${RED_BOLD}Error: Unknown option: $1${RESET}" >&2; usage; exit 1;;
    esac
done

# --- Define File Names ---
LOG_FILE="${OUTPUT_PREFIX}.log"
COMBINED_REF="${OUTPUT_PREFIX}_combined_reference.fasta"
ALIGNMENT_REF=""
SORTED_BAM="${OUTPUT_PREFIX}_alignment.sorted.bam"
FILTERED_BAM="${OUTPUT_PREFIX}_alignment.filtered.bam"
FINAL_REPORT_TSV="${OUTPUT_PREFIX}_results.tsv"
PLOT_FILE="${OUTPUT_PREFIX}_plot.png"
WINDOWS_BED="${OUTPUT_PREFIX}.windows.bed"
WINDOWS_GC="${OUTPUT_PREFIX}.windows.gc.tsv"
WINDOWS_COV="${OUTPUT_PREFIX}.windows.cov.tsv"
GC_PLOT_FILE="${OUTPUT_PREFIX}_gcplot.png"
WINDOWS_DATA="${OUTPUT_PREFIX}.windows.gc_depth.tsv" 
chr_name_list_arg=""
pls_name_list_arg=""
tmp_chr_list="${OUTPUT_PREFIX}_chr_names.tmp"
tmp_pls_list="${OUTPUT_PREFIX}_pls_names.tmp"

# --- Initialize Log ---
echo "PCNE Log File - v${VERSION}" > "${LOG_FILE}"
echo "Start Time: $(date)" >> "${LOG_FILE}"
cmd_args_orig=("$0" "$@")
cmd_args_quoted=$(printf " %q" "${cmd_args_orig[@]}")
echo "Command: ${cmd_args_quoted}" >> "${LOG_FILE}"
echo "--------------------" >> "${LOG_FILE}"

# --- Validate Arguments ---
if [[ -n "${CHR_FASTA}" && -n "${PLS_FASTA}" ]]; then
    MODE=1;
elif [[ -n "${ASSEMBLY_FASTA}" && -n "${PLS_LIST_FILE}" ]]; then
    MODE=2;
else
    MODE=0;
fi
if [[ ${MODE} -eq 0 ]]; then echo -e "${RED_BOLD}Error: Invalid or incomplete input files.${RESET}" >&2; usage; exit 1; fi
if [[ -z "${READS}" ]]; then echo -e "${RED_BOLD}Error: Missing read files (-r).${RESET}" >&2; usage; exit 1; fi

# --- Check Input Files ---
echo -e "Checking input files..." | tee -a "${LOG_FILE}"
if [ ! -f "${READS}" ]; then echo -e "${RED_BOLD}Error: Reads file not found: ${READS}${RESET}" >&2; exit 1; fi
if [[ ${MODE} -eq 1 ]]; then
    if [ ! -f "${CHR_FASTA}" ]; then echo -e "${RED_BOLD}Error: Chromosome FASTA file (-c) not found: ${CHR_FASTA}${RESET}" >&2; exit 1; fi
    if [ ! -f "${PLS_FASTA}" ]; then echo -e "${RED_BOLD}Error: Plasmid FASTA file (-p) not found: ${PLS_FASTA}${RESET}" >&2; exit 1; fi
elif [[ ${MODE} -eq 2 ]]; then
    if [ ! -f "${ASSEMBLY_FASTA}" ]; then echo -e "${RED_BOLD}Error: Assembly FASTA file (-a) not found: ${ASSEMBLY_FASTA}${RESET}" >&2; exit 1; fi
    if [ ! -f "${PLS_LIST_FILE}" ]; then echo -e "${RED_BOLD}Error: Plasmid list file (-P) not found: ${PLS_LIST_FILE}${RESET}" >&2; exit 1; fi
fi
echo -e "Done." | tee -a "${LOG_FILE}"

# --- Check Dependencies ---
echo -e "Checking dependencies..." | tee -a "${LOG_FILE}"
command -v minimap2 >/dev/null 2>&1 || { echo -e >&2 "${RED_BOLD}Error: minimap2 not found in PATH.${RESET}"; exit 1; }
command -v samtools >/dev/null 2>&1 || { echo -e >&2 "${RED_BOLD}Error: samtools not found in PATH.${RESET}"; exit 1; }
command -v Rscript >/dev/null 2>&1 || { echo -e >&2 "${RED_BOLD}Error: Rscript not found in PATH.${RESET}"; exit 1; }
command -v bedtools >/dev/null 2>&1 || { echo -e >&2 "${RED_BOLD}Error: bedtools not found in PATH.${RESET}"; exit 1; }
echo -e "Done." | tee -a "${LOG_FILE}"

# --- Prepare Contig Lists ---
echo -e "Preparing plasmid list..." | tee -a "${LOG_FILE}"
if [[ ${MODE} -eq 1 ]]; then
    grep '^>' "${PLS_FASTA}" | sed 's/>//; s/ .*//' > "${tmp_pls_list}"
    pls_name_list_arg="${tmp_pls_list}"
elif [[ ${MODE} -eq 2 ]]; then
    pls_name_list_arg="${PLS_LIST_FILE}"
fi
echo -e "Preparing chromosome list..." | tee -a "${LOG_FILE}"
if [[ ${MODE} -eq 1 ]]; then
    grep '^>' "${CHR_FASTA}" | sed 's/>//; s/ .*//' > "${tmp_chr_list}"
    chr_name_list_arg="${tmp_chr_list}"
elif [[ ${MODE} -eq 2 ]]; then
    if [[ -n "${CHR_LIST_FILE}" ]]; then
        chr_name_list_arg="${CHR_LIST_FILE}"
    else
        tmp_all_contigs_assembly="${OUTPUT_PREFIX}_all_contigs.tmp"
        grep '^>' "${ASSEMBLY_FASTA}" | sed 's/>//; s/ .*//' > "${tmp_all_contigs_assembly}"
        grep -Fxvf "${pls_name_list_arg}" "${tmp_all_contigs_assembly}" > "${tmp_chr_list}"
        chr_name_list_arg="${tmp_chr_list}"
        rm -f "${tmp_all_contigs_assembly}"
    fi
fi
echo "Done." | tee -a "${LOG_FILE}"

# --- Print Configuration ---
echo -e "${BLUE_BOLD}--- Starting Plasmid Copy Number Estimation v${VERSION} ---${RESET}" | tee -a "${LOG_FILE}"
printf "%-25s : %s\n" "Mode" "${MODE}" | tee -a "${LOG_FILE}"
printf "%-25s : %s\n" "Reads" "${READS}" | tee -a "${LOG_FILE}"
printf "%-25s : %s\n" "Threads" "${THREADS}" | tee -a "${LOG_FILE}"
printf "%-25s : %s\n" "Output Prefix" "${OUTPUT_PREFIX}" | tee -a "${LOG_FILE}"
printf "%-25s : %s\n" "Minimap2 Preset" "${MINIMAP_PRESET}" | tee -a "${LOG_FILE}"
printf "%-25s : %s\n" "Minimap2 Options" "${MINIMAP_OPTS:-None}" | tee -a "${LOG_FILE}"
printf "%-25s : %s\n" "GC Correction" "$(if [[ $GC_CORRECTION -eq 1 ]]; then echo "Enabled (LOESS)"; else echo "Disabled"; fi)" | tee -a "${LOG_FILE}"
echo -e "${BLUE_BOLD}--------------------------------------------------------${RESET}" | tee -a "${LOG_FILE}"

# --- Main Workflow ---

# STEP 1: Minimap2 Processing
echo -e "[Step 1/3] Preparing reference and aligning reads..." | tee -a "${LOG_FILE}"
{
    if [[ ${MODE} -eq 1 ]]; then
        ALIGNMENT_REF="${COMBINED_REF}"
        cat "${CHR_FASTA}" "${PLS_FASTA}" > "${ALIGNMENT_REF}"
    else
        ALIGNMENT_REF="${ASSEMBLY_FASTA}"
    fi
    #bwa index "${ALIGNMENT_REF}"
    samtools faidx "${ALIGNMENT_REF}"

    BAM_FOR_COVERAGE="${SORTED_BAM}"
    #bwa mem -t ${THREADS} "${ALIGNMENT_REF}" "${READS_R1}" "${READS_R2}" | samtools view -@ ${THREADS} -Sb | samtools sort -@ ${THREADS} -o "${SORTED_BAM}" -
    minimap2 -ax ${MINIMAP_PRESET} -t ${THREADS} "${ALIGNMENT_REF}" "${READS}" ${MINIMAP_OPTS} | samtools view -@ ${THREADS} -Sb | samtools sort -@ ${THREADS} -o "${SORTED_BAM}" -
    if [[ ${SAM_FILTER_FLAG} -gt 0 || ${MIN_MAPQ} -gt 0 ]]; then
        BAM_FOR_COVERAGE="${FILTERED_BAM}"
        samtools view -@ ${THREADS} -bh -F ${SAM_FILTER_FLAG} -q ${MIN_MAPQ} -o "${BAM_FOR_COVERAGE}" "${SORTED_BAM}"
    fi
    samtools index -@ ${THREADS} "${BAM_FOR_COVERAGE}"
} >> "${LOG_FILE}" 2>&1
echo "Done." | tee -a "${LOG_FILE}"

# STEP 2: Generate Windowed Data and Run R Analysis
echo -e "[Step 2/3] Generating windowed data and calculating copy number..." | tee -a "${LOG_FILE}"
{
    echo "Generating genomic window data..."
    awk '{print $1"\t"$2}' OFS='\t' "${ALIGNMENT_REF}.fai" | bedtools makewindows -g - -w ${GC_WINDOW} > "${WINDOWS_BED}"
    bedtools nuc -fi "${ALIGNMENT_REF}" -bed "${WINDOWS_BED}" | awk 'NR>1{print $1"\t"$2"\t"$3"\t"$5}' > "${WINDOWS_GC}"
    bedtools coverage -mean -a "${WINDOWS_BED}" -b "${BAM_FOR_COVERAGE}" | awk '{print $1"\t"$2"\t"$3"\t"$NF}' > "${WINDOWS_COV}"

    paste <(cut -f1-3 "${WINDOWS_GC}") <(cut -f4 "${WINDOWS_GC}") <(cut -f4 "${WINDOWS_COV}") | \
        awk 'BEGIN{OFS="\t"; print "contig\tstart\tend\tgc\tdepth"} {print $1,$2,$3,$4,$5}' > "${WINDOWS_DATA}"
    SAMPLE_NAME="${OUTPUT_PREFIX}"
    echo "Running R analysis script..."
    Rscript "${RSCRIPT_ANALYSIS_PATH}" \
        "${WINDOWS_DATA}" \
        "${chr_name_list_arg}" \
        "${pls_name_list_arg}" \
        "${GC_CORRECTION}" \
        "${GC_LOESS_FRAC}" \
        "${FINAL_REPORT_TSV}" \
        "${GENERATE_GC_PLOT}" \
        "${GENERATE_PLOT}" \
        "${AGGREGATE_PLASMIDS}" \
        "${PLS_FASTA}" \
        "${SAMPLE_NAME}" 
} >> "${LOG_FILE}" 2>&1
R_EXIT_CODE=$?
echo "Done." | tee -a "${LOG_FILE}"

# STEP 3: Cleanup
echo -e "[Step 3/3] Cleaning up intermediate files..." | tee -a "${LOG_FILE}"
rm -f "${tmp_chr_list}" "${tmp_pls_list}"
if [[ ${KEEP_INTERMEDIATE} -eq 0 ]]; then
    rm -f "${SORTED_BAM}" "${SORTED_BAM}.bai"
    if [[ -f "${FILTERED_BAM}" ]]; then rm -f "${FILTERED_BAM}" "${FILTERED_BAM}.bai"; fi
    rm -f "${WINDOWS_BED}" "${WINDOWS_GC}" "${WINDOWS_COV}" "${WINDOWS_DATA}"
    if [[ -n "${ALIGNMENT_REF}" && -f "${ALIGNMENT_REF}" ]]; then
      rm -f "${ALIGNMENT_REF}.fai" >> "${LOG_FILE}" 2>&1 || true
    fi
    if [[ ${MODE} -eq 1 && -n "${ALIGNMENT_REF}" && -f "${ALIGNMENT_REF}" ]]; then
        if [[ "${ALIGNMENT_REF}" == "${COMBINED_REF}" ]]; then rm -f "${ALIGNMENT_REF}"; fi
    fi
fi
echo "Done." | tee -a "${LOG_FILE}"

# --- Final Check and Exit ---
if [ ${R_EXIT_CODE} -ne 0 ]; then
     echo -e "${RED_BOLD}Error: R script failed with exit code ${R_EXIT_CODE}. Check ${LOG_FILE}${RESET}" >&2
     exit 1
fi

# --- Elapsed Time ---
end_time=$(date +%s)
duration=$((end_time - start_time))
secs=$((duration % 60))
mins=$(( (duration / 60) % 60 ))
hrs=$(( duration / 3600 ))
echo -e "${BLUE_BOLD}-----------------------------------------------${RESET}" | tee -a "${LOG_FILE}"
echo -e "${GREEN}✅ Workflow Complete ${RESET}" | tee -a "${LOG_FILE}"
printf "${GREEN}   Total Elapsed Time: %02d:%02d:%02d${RESET}\n" $hrs $mins $secs | tee -a "${LOG_FILE}"
echo -e "${GREEN}   Log file: ${LOG_FILE}${RESET}" | tee -a "${LOG_FILE}"
echo -e "${GREEN}   Final report: ${FINAL_REPORT_TSV}${RESET}" | tee -a "${LOG_FILE}"
if [[ $GENERATE_PLOT -eq 1 && -f "${PLOT_FILE}" ]]; then echo -e "${GREEN}   Plot file: ${PLOT_FILE}${RESET}" | tee -a "${LOG_FILE}"; fi
if [[ -n $GC_PLOT_FILE && -f "${GC_PLOT_FILE}" ]]; then echo -e "${GREEN}   GC Plot file: ${GC_PLOT_FILE}${RESET}" | tee -a "${LOG_FILE}"; fi
echo -e "${BLUE_BOLD}-----------------------------------------------${RESET}" | tee -a "${LOG_FILE}"

exit 0