#!/usr/bin/env python

import os
import sys
import json
from datetime import datetime
from argparse import ArgumentParser
from dqc import dqc_version
from dqc.config import config

def parse_args(argv=None):
    """Build the DFAST_QC command-line parser and parse the arguments.

    Args:
        argv: List of argument strings to parse, without the program name.
            When omitted, ``sys.argv[1:]`` is used.

    Returns:
        The ``argparse.Namespace`` holding the parsed options.

    Raises:
        SystemExit: When no argument is given (the help text is printed and
            the exit status is 0), and whenever argparse itself terminates
            (``--help``, ``--version`` or a parse error).
    """
    if argv is None:
        argv = sys.argv[1:]

    parser = ArgumentParser(description=f"DFAST_QC: Taxonomy and completeness check (ver. {dqc_version})")
    parser.add_argument('--version', version=f'DFAST_QC ver. {dqc_version}', action='version', help="Show program version", default=False)
    parser.add_argument(
        "-i",
        "--input_fasta",
        type=str,
        # Not flagged as required at the argparse level because
        # '--show_taxon' is accepted without a query file.
        help="Input FASTA file (raw or gzipped) [required]",
        metavar="PATH"
    )
    parser.add_argument(
        "-o",
        "--out_dir",
        type=str,
        help="Output directory (default: OUT)",
        metavar="PATH"
    )
    parser.add_argument(
        "-hits",
        "--num_hits",
        default=10,
        type=int,
        help="Number of top hits by MASH (default: 10)",
        metavar="INT"
    )
    parser.add_argument(
        "-a",
        "--ani",
        default=95,
        type=int,
        help="ANI threshold (default: 95%%)",
        metavar="INT"
    )
    parser.add_argument(
        "-t",
        "--taxid",
        type=int,
        default=None,
        help="NCBI taxid for completeness check. Use '--show_taxon' for available taxids. (Default: Automatically inferred from taxonomy check)",
        metavar="INT"
    )
    parser.add_argument(
        "-r",
        "--ref_dir",
        default=None,
        type=str,
        metavar="PATH",
        help="DQC reference directory (default: DQC_REFERENCE_DIR)")
    parser.add_argument(
        "-n",
        "--num_threads",
        type=int,
        default=1,
        help="Number of threads for parallel processing (default: 1)",
        metavar="INT"
    )
    parser.add_argument(
        '--enable_gtdb',
        action='store_true',
        help='Enable GTDB search'
    )
    parser.add_argument(
        '--disable_tc',
        action='store_true',
        help='Disable taxonomy check using ANI'
    )
    parser.add_argument(
        '--disable_cc',
        action='store_true',
        help='Disable completeness check using CheckM'
    )
    parser.add_argument(
        '--disable_shigapass',
        action='store_true',
        help='Disable ShigaPass analysis even when Shigella/E. coli is detected'
    )
    parser.add_argument(
        '--disable_auto_download',
        action='store_true',
        help='Disable auto-download for missing reference genomes'
    )
    parser.add_argument(
        '--force',
        action='store_true',
        help='Force overwriting result'
    )
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Debug mode'
    )
    parser.add_argument(
        "-p",
        "--prefix",
        type=str,
        help="Prefix for output (for debugging use, default: None)",
        metavar="STR"
    )
    parser.add_argument(
        '--show_taxon',
        action='store_true',
        help='Show available taxa for completeness check'
    )

    # Invoked without any argument: show the usage instead of failing later.
    if not argv:
        parser.print_help()
        # sys.exit() rather than the site-provided exit(), which is meant for
        # the interactive shell and is missing when site is not loaded.
        sys.exit()
    return parser.parse_args(argv)


# Parse the command line once; the statements below copy the parsed values
# into the shared `config` object.
args = parse_args()

# Applied first so that it is already in place when show_taxon() runs below.
if args.ref_dir:
    config.DQC_REFERENCE_DIR = args.ref_dir

# '--show_taxon' only lists the available taxa and exits (status 0); it is
# handled before the query FASTA is checked, so no input file is needed.
if args.show_taxon:
    from dqc.checkm_helper import show_taxon
    show_taxon()
    # sys.exit() rather than the site-provided exit(), which is intended for
    # the interactive shell and is not guaranteed to exist.
    sys.exit()

# set input query: it must be given and must point to an existing file
if args.input_fasta is None:
    sys.stderr.write("dfast_qc: error: Query FASTA is not specified. Aborted\n")
    sys.exit(1)
if not os.path.isfile(args.input_fasta):
    sys.stderr.write(f"dfast_qc: error: Query FASTA file not found. Aborted. [PATH:{args.input_fasta}]\n")
    sys.exit(1)
config.QUERY_GENOME = args.input_fasta

# set other options
# Values left unset (None / falsy) keep whatever `config` already holds.
if args.out_dir:
    config.OUT_DIR = args.out_dir
if args.num_hits:
    config.MASH_OPTION = args.num_hits
if args.ani:
    config.ANI_THRESHOLD = args.ani
if args.taxid is not None:
    # Compared against None explicitly because taxid 0 is a valid value.
    config.CHECKM_TAXID = args.taxid
if args.num_threads is not None:
    # Validate explicitly instead of using `assert`: assertions are removed
    # under `python -O` and otherwise surface as a bare AssertionError. Zero
    # is rejected as well; it used to bypass the check and be silently ignored.
    if args.num_threads < 1:
        sys.stderr.write(f"dfast_qc: error: '--num_threads' must be a positive integer. [VALUE:{args.num_threads}]\n")
        sys.exit(1)
    config.NUM_THREADS = args.num_threads
if args.prefix:
    config.PREFIX = args.prefix

# Boolean switches: each flag only ever flips its config entry one way.
if args.enable_gtdb:
    config.ENABLE_GTDB = True
if args.disable_tc:
    config.DISABLE_TC = True
if args.disable_cc:
    config.DISABLE_CC = True
if args.disable_shigapass:
    config.DISABLE_SHIGAPASS = True
if args.disable_auto_download:
    config.AUTO_DOWNLOAD = False
if args.debug:
    config.DEBUG = True
if args.force:
    config.FORCE = True

# check enabled processes: with taxonomy check and completeness check both
# disabled and GTDB search not enabled, there is nothing left to run.
if config.DISABLE_TC and config.DISABLE_CC and (not config.ENABLE_GTDB):
    sys.stderr.write("dfast_qc: error: At least one process must be enabled: Taxonomy Check, Completeness Check, GTDB search.\n")
    # sys.exit() rather than the site-provided exit(), which is intended for
    # the interactive shell and is not guaranteed to exist.
    sys.exit(1)

# check invalid options: without the taxonomy check the CheckM taxid cannot be
# inferred, so the completeness check needs an explicit '--taxid'.
if config.DISABLE_TC and config.CHECKM_TAXID is None and (not config.DISABLE_CC):
    sys.stderr.write("dfast_qc: error: '--taxid' is required to conduct completeness check when '--disable_tc' is specified.\n")
    sys.exit(1)

from dqc.common import get_logger, prepare_output_directory, get_ref_inf

# Prepare the output directory, then create the logger and record the start
# time used for the running-time report at the end of the script.
prepare_output_directory()
logger = get_logger(__name__)
start_time = datetime.now()
logger.info("DFAST_QC pipeline started.")
logger.info("DFAST_QC version: %s", dqc_version)

# The reference directory must exist before any check can run.
if not os.path.exists(config.DQC_REFERENCE_DIR):
    logger.error("DQC Reference Directory does not exist. Aborted. %s", config.DQC_REFERENCE_DIR)
    logger.error("Please download the reference data by 'dqc_ref_manager.py download'.")
    # sys.exit() rather than the site-provided exit(), which is intended for
    # the interactive shell and is not guaranteed to exist.
    sys.exit(1)

logger.info("DQC Reference Directory: %s", config.DQC_REFERENCE_DIR)
logger.info(get_ref_inf(as_str=True))

from dqc import taxonomy_check, completeness_check, gtdb_search, shigapass_check

# Taxonomy check. Its best hit provides the species taxid used by the genome
# size check and, unless a taxid was already configured, the CheckM taxid.
best_hit_species_taxid = None  # for genome size check
tc_result = []  # stays empty when the taxonomy check is disabled

if not config.DISABLE_TC:
    tc_result = taxonomy_check.run()  # list of dictionaries holding the ANI result
    has_hit = len(tc_result) > 0

    if has_hit:
        best_hit_species_taxid = tc_result[0]["species_taxid"]

    # No taxid configured yet: infer it from the taxonomy check.
    if config.CHECKM_TAXID is None:
        if not has_hit:
            logger.warning("Failed to determine species. Taxid 0 (Prokaryote) is set for CheckM.")
            config.CHECKM_TAXID = 0
        else:
            config.CHECKM_TAXID = tc_result[0]["species_taxid"]
            logger.info("Taxid for CheckM is set to %d.", tc_result[0]["species_taxid"])

# ShigaPass serotype prediction.
# Only considered when neither ShigaPass nor the taxonomy check is disabled;
# should_run_shigapass() then decides from the taxonomy check result.
shigapass_result = {}
shigapass_allowed = not (config.DISABLE_SHIGAPASS or config.DISABLE_TC)
if shigapass_allowed and shigapass_check.should_run_shigapass(tc_result):
    config.ENABLE_SHIGAPASS = True
    logger.info("Shigella/E. coli detected. Running ShigaPass.")
    shigapass_result = shigapass_check.run()

# Completeness check, followed by the genome size check whose result is
# merged into the same dictionary.
cc_result = {}  # stays empty when the completeness check is disabled
if not config.DISABLE_CC:
    cc_result = completeness_check.run()

    # Genome size check
    from dqc.genome_size_check import genome_size_check
    logger.info("Checking expected genome size for taxid %s", best_hit_species_taxid)
    # NOTE(review): best_hit_species_taxid is None when the taxonomy check was
    # disabled or returned no hit -- confirm genome_size_check() accepts None.
    size_check_result = genome_size_check(config.QUERY_GENOME, best_hit_species_taxid)
    cc_result.update(size_check_result)


# GTDB search (only when enabled; otherwise an empty list is reported)
gtdb_result = gtdb_search.run() if config.ENABLE_GTDB else []

# Gather the results of every step and write them out as a single JSON file.
dqc_result = {
    "tc_result": tc_result,
    "cc_result": cc_result,
    "gtdb_result": gtdb_result,
    "shigapass_result": shigapass_result,
}
dqc_result_file_json = os.path.join(config.OUT_DIR, config.DQC_RESULT_JSON)

rule = "-" * 80  # separator line framing the JSON dump in the debug log
logger.debug("DQC result json %s\n%s\n%s", rule, json.dumps(dqc_result, indent=4), rule)
with open(dqc_result_file_json, "w") as out_fh:
    json.dump(dqc_result, out_fh, indent=4)
logger.info("DFAST_QC result json was written to %s", dqc_result_file_json)


# Report the total wall-clock time since `start_time`.
end_time = datetime.now()
# Round to whole seconds BEFORE splitting into h/m/s. Formatting the
# fractional remainder afterwards could round up to "60s" (e.g. 119.6 s was
# reported as "0h1m60s" instead of "0h2m0s").
running_time = int(round((end_time - start_time).total_seconds()))
h, remainder = divmod(running_time, 3600)
m, s = divmod(remainder, 60)

logger.info("DFAST_QC completed!")
logger.info("Total running time: %dh%dm%ds", h, m, s)
