#!/usr/bin/env perl
use 5.32.0;
use strict;
use warnings;
use FindBin;
use lib "$FindBin::RealBin/../perl5";
use Data::Dumper;
use Path::Tiny;
use List::MoreUtils qw(pairwise);
use Digest::MD5 qw(md5_hex);
use JSON;
use MLST::PubMLST;
use MLST::Scheme;
use MLST::Logger qw(err wrn msg dbg);
use MLST::Requirements qw(require_exe);

#..............................................................................
# Globals

my $VERSION = '2.33.1';
my $EXE = path($FindBin::RealScript)->basename;
my $AUTHOR = 'Torsten Seemann';
my $URL = 'https://github.com/tseemann/mlst';

# this is just the default schemes to exclude
my $EXCLUDE = 'ecoli,abaumannii,vcholerae_2,senterica_achtman_2';

# internal symbols
my $SEP = '/';
my $SEEN = 'N';

# for the tabular output
my $OUTSEP = "\t";
my $OUTFH = \*STDOUT;
my $QUOTECHAR = '"';

#..............................................................................
# Command line options

my(@Options, $debug, $quiet, $blastdb, $datadir, $threads, $exclude,
             $list, $longlist, $scheme, $legacy, $check, $skipcheck, $label,
             $minid, $mincov, $minscore, $fofn, $info, $outfile,
             $full, $csv, $json_fn, $nopath, $novel);
setOptions();

#..............................................................................
# Option parsing

MLST::Logger->quiet($quiet);
MLST::Logger->debug($debug);

msg("This is $EXE $VERSION running on $^O with Perl $]");

# check we have the right tools and versions thereof
if ($skipcheck) {
  msg("Skipping dependency check due to --skipcheck");
}
else {  
  msg("Checking $EXE dependencies:");
  require_exe('blastn', 'any2fasta');
  if ($check) {
    msg("OK.");
    exit(0);
  }
}

if ($fofn) {
  msg("Using '$fofn' as source of input files");
  @ARGV = path($fofn)->lines( {chomp=>1} );;
  msg("Read",0+@ARGV,"filenames");
}

($label && @ARGV > 1) and err("Using --label when scanning multiple files does not make sense");


$legacy && ! $scheme and err("Must specify a --scheme for --legacy output mode");

my $pubmlst = MLST::PubMLST->new( dir => $datadir );
my %scheme = (map { ($_->name => $_) } @{$pubmlst->schemes});
my %nov;  # keep track of novel alleles

# in case user is forcing a scheme on us (legacy mode)
$scheme && ! exists $scheme{$scheme} and err("Invalid --scheme '$scheme'. Check using --list");

# we set minscore to 0 if --scheme is set
if ($scheme and $minscore != 0) {
  msg("Setting --minscore=0 because user chose --scheme");
  $minscore=0;
}

if ($list or $longlist or $info) {
  if ($list) { 
    print join( " ", $pubmlst->names ), "\n";
  }
  elsif ($longlist) {
    for my $s (@{ $pubmlst->schemes }) {
      print_row( $s->name, @{$s->genes} );
    }
  }
  elsif ($info) {
    print_row(qw"SCHEME LOCII TYPES ALLELES DATE LOCII_NAMES");
    for my $s (sort { $a->name cmp $b->name } @{ $pubmlst->schemes }) {
      print_row(
        $s->name,
        $s->num_genes,
        $s->num_genotypes,
        $s->num_alleles,,
        $s->last_updated,
        join(' ', $s->genes->@*),
      );
    }
  }
  exit(0);
}

@ARGV or err("Please provide some FASTA/Genbank files to genotype (can be .gz)");

# build a hash of schemes to ignore from the CSV string
my %exclude = map { ($_ => 1) } (split m/,/ , $exclude);
%exclude = () if $scheme;   # don't exclude anything is specific scheme provided
my $num_excluded = scalar(keys %exclude);
msg("Excluding $num_excluded schemes:", (keys %exclude)) if $num_excluded > 0;

#..............................................................................
# Output results

if ($outfile) {
  # defaults to STDOUT
  $OUTFH = path($outfile)->filehandle('>');
}
$OUTSEP = ',' if $csv;  # default is tab

my $json;
my $allele_cnt = 7;

# If --scheme is set, set the length for initialising @sig to the number of alleles in $scheme
if ($scheme) {
  $allele_cnt = @{$scheme{$scheme}->genes};
}

# output the header for the old style syntax 
if ($scheme and $legacy) {
  print_row( qw"FILE SCHEME ST", @{ $scheme{$scheme}->genes } );
}
elsif ($full) {
  print_row( qw"FILE SCHEME ST STATUS SCORE ALLELES" );
}

my $dir = Path::Tiny->tempdir(CLEANUP=>1);

for my $argv (@ARGV) {
  -r $argv or err("Unable to read from '$argv'");
  -d $argv and err("'$argv' seems to be a directory, not a file");
  my $LABEL;

  # use --label, else just filename
  $LABEL = $label || ($nopath ? path($argv)->basename : $argv);
  msg("Using label '$LABEL' for file $argv") if $LABEL ne $argv;
  #msg("argv=$argv infile=$infile LABEL=$LABEL label=$label");

  # Do the big function call
  my($sch, $ST, $sig, $score) = find_mlst($argv, $LABEL);
  $scheme && $sch ne $scheme and err("BUG: got back $sch despite --scheme $scheme");

  # delete all novel alleles from failed schemes
  foreach (keys %{$nov{$LABEL}}) {
    delete $nov{$LABEL}{$_} unless $_ eq $sch;
  }

  my @code = $sch eq '-' ? () : split m{$SEP}, $sig ;
  
  # sort duplicate alleles numerically (only for pure 100% ones)
  for my $as (@code) {
    if ($as =~ m/,/ and $as =~ m/^[0-9,]+/) {
      #print STDERR "DEBUG: '$as' in [@code]\n";
      $as = join ',', sort { $a <=> $b } split m/,/, $as;
      #print STDERR "DEBUG: '$as' in [@code]\n";
    }
  }

  if ($scheme and $legacy) {
    # old style "one scheme" layout
    print_row( $LABEL, $sch, $ST, @code );
  } 
  else {
    # new style "autodetect" layout
    my @gene = $sch eq '-' ? () : $scheme{$sch}->genes->@* ;
    my $status = status_column($ST,$score,@code);
    my @allele = pairwise { "$a($b)" } @gene, @code ;
    if ($full) {
      print_row(
        $LABEL,
        $sch,
        $ST,
        $status,
        $score,
        join(';',@allele)
      );
    }
    else {
      print_row( $LABEL, $sch, $ST, @allele );
    }
  }

  if ($json_fn) {
    push @$json, {
      'id' => $LABEL,
      'filename' => $argv,
      'scheme' => $sch,
      'sequence_type' => $ST,
      'alleles' => $sch eq '-' 
        ? undef 
        : { (pairwise { ($a=>$b) } @{$scheme{$sch}->genes}, @code) },
    };
  }
}

if ($json_fn) {
  msg("Writing JSON: $json_fn");
  path($json_fn)->spew(
    to_json( $json, {ascii=>1,pretty=>1} ),
  );
}

if ($novel) {
  my %seen;
  my @new;
  for my $fname (keys %nov) {
    for my $sch (keys %{$nov{$fname}}) {
      for my $gene (keys %{$nov{$fname}{$sch}}) {
        my $seq = $nov{$fname}{$sch}{$gene};
        if ($seq ne $SEEN) {
          my $hash = md5_hex($seq);
          my $id = "$sch.$gene-$hash";
          next if $seen{$id}++; # same novel allele in > 1 input file
          push @new, ">$id $fname\n$seq\n";
        }
      }
    }
  }

  msg("Found",0+@new,"novel alleles");
  path($novel)->spew(\@new);
  dbg(Dumper(\%nov));
}

# Say our goodbyes
my @motd = (
  "Please also cite: Jolley et al (2018) https://pubmed.ncbi.nlm.nih.gov/30345391",
  "You can use --label XXX to replace an ugly filename in the output.",
  "$EXE also supports --json output for the modern bioinformatician.",
  "Did you know $EXE also works on .gbk and .gz files?",
  "I am quite confident this won't be the last time you run $EXE.",
  "Somehow this tool escaped my normal Australiana based naming system.",
  "If you have problems, please file at https://github.com/tseemann/mlst/issues",
  "The manual is now quite extensive; read it at https://github.com/tseemann/mlst",
  "You can use '$EXE --longlist' to see all the supported schemes.",
  "Use --quiet or -q to avoid all the message output, including these witticisms.",
  "Thanks for using $EXE, I hope you found it useful.",
  "If you like MLST, you're going to absolutely love cgMLST!",
  "If you like MLST, you're absolutely going to love wgMLST!",
  "Remember that --minscore is only used when using automatic scheme detection.",
  "You can follow me on Twitter at \@torstenseemann",
);
msg( $motd[ int(rand(scalar(@motd))) ] );
msg("Done.");
exit(0);

#----------------------------------------------------------------------
sub status_column {
  my($ST, $score, @code) = @_;
  return 'PERFECT' if $ST =~ m/^\d+$/; # already good
  return 'NONE' if $score <= 0;
  return 'NOVEL' if ! grep m/\D/, @code;
  return 'MIXED' if grep m/,/, @code;
  return 'MISSING' if grep m/-/, @code;
  return 'BAD' if $score < 70;
  return 'OK';
}

#----------------------------------------------------------------------
sub revcom {
  my($dna) = @_;
  $dna = reverse($dna);
  $dna =~ tr/ATGCatgc/TACGtacg/;
  return $dna;
}
#----------------------------------------------------------------------
sub run_cmd {
  my($cmd) = @_;
  msg("Running: $cmd");
  system($cmd)==0
    or err("COMMAND FAILED: $cmd");
}
#----------------------------------------------------------------------
sub find_mlst {
  my($fname, $LABEL) = @_;

  my $FNA = "$dir/mlst.fna";
  my $BLS = "$dir/mlst.bls";

  run_cmd("any2fasta -q \Q$fname\E > $FNA");

  run_cmd(" blastn -query $FNA -out $BLS"
         ." -db \Q$blastdb\E -num_threads $threads"
         ." -ungapped -dust no -word_size 32"
         ." -max_target_seqs 100000"
         ." -perc_identity $minid -evalue 1E-20"
         ." -outfmt '6 sseqid slen length nident qseqid qstart qend qseq sstrand'"
         );

  my %res;
  my $res_count=0;

  open my $hits, '<', $BLS
    or err("Can;t open BLAST output");

  foreach (<$hits>) {
    chomp;
    next unless m/ ^ (\w+)\.(\w+)[_-](\d+) 
                   \t (\d+) \t (\d+) \t (\d+) 
                   \t (\S+) \t (\d+) \t (\d+) \t (\S+) \t (\S+) /x;
    my($sch, $gene, $num, $hlen, $alen, $nident, $qid, $qstart, $qend, $qseq, $sstrand) = ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11);
    $res_count++;
    dbg("[$res_count] $qid:$qstart-$qend($sstrand) | $sch $gene $num | id=$nident/$alen | cov=$alen/$hlen | seq=$qseq");

    next unless $nident/$hlen >= $mincov/100 ; # need min-cov to reach minid

    if ($scheme and $sch ne $scheme) {
      dbg("Skipping $sch.$gene.$num allele as user specified --scheme $scheme");
      next;
    }
    if ($exclude{$sch}) {
      dbg("Excluding $sch.$gene.$num due to --exclude option");
      next;
    }
    if ($hlen == $alen and $nident == $hlen) {   # need full length 100% hits
      if (exists $res{$sch}{$gene} and $res{$sch}{$gene} !~ m/[~?]/) {
        msg("WARNING: found additional exact allele match $sch.$gene-$num");
        $res{$sch}{$gene} .= ",$num";
      }
      else {
        msg("Found exact allele match $sch.$gene-$num");
        $res{$sch}{$gene} = "$num";
      }
      # force update of hash with our exact match (see below for inexact behavior)
      $nov{$LABEL}{$sch}{$gene} = $SEEN;
    }
    else { # either 100% length (with some SNPs) or partial coverage
      # next if $res{$sch}{$gene};

      my $label = ($alen == $hlen) ? "~$num" : "${num}?";
      $res{$sch}{$gene} ||= $label;
      if ($novel and $alen==$hlen) {
        # only update hash if we haven't seen a good version yet
        $qseq = revcom($qseq) if $sstrand eq 'minus';
        $nov{$LABEL}{$sch}{$gene} ||= $qseq;
      }
    }
  }

  # find the signature with the fewest missing/approximate alleles
  my @sig = ( [ ($scheme || '-'), '-', join("/", ('-') x $allele_cnt), 0 ] );  # sentinel

  for my $name (keys %res) {
    my $sig = $scheme{$name}->signature_of($res{$name});
    my $ST = $scheme{$name}->sequence_type($sig);

    # this is to handle null alleles which produce an ST type
    if ($ST ne '-' and $sig =~ m{/-}) {
      $sig =~ s{/-}{/0}g; # want gene(0) not gene(-) in these cases
    }

    my $nlocii = $scheme{$name}->num_genes;
    # score is out of 100 (90 split across alleles, 10 for an ST type)
    # penalties are given as below for inexact alleles
    my $score = $nlocii;
    #dbg("Start: score=$score ($sig)");
    $score -= 0.3*($sig =~ tr/~/~/);  # novelish
    #dbg("Novel: score=$score");
    $score -= 0.5*($sig =~ tr/?/?/);  # approx
    #dbg("Parti: score=$score");
    $score -= 1.0*($sig =~ tr/-/-/);  # absent
    #dbg("Absen: score=$score");
    $score = int($score * 90 / $nlocii);
    #dbg("Normz: score=$score");
    $score += 10 if $ST ne '-';
    #dbg("HasST: score=$score");
    dbg("SCORE=$score\t$name\t$ST\t$sig\t($nlocii genes)");
    push @sig, [ $name, $ST, $sig, $score ] if $score >= $minscore;
  }

  # sort by score highest to lowest
  @sig = sort { $b->[3] <=> $a->[3] } @sig;
  
  # mention any equal-firsts
  for my $i (1 .. $#sig) {
    my $scaw = $sig[$i][3];
    if ($scaw == $sig[0][3]) {
      wrn("$sig[0][0]($sig[0][1])==$sig[$i][0]($sig[$i][1]) score=$scaw $fname");
      #wrn(Dumper(\@sig));
    }
  }
  dbg(Dumper(@sig));

  # take the top scorer
  my @best = @{ $sig[0] };

  return @best;
}
#----------------------------------------------------------------------
sub print_row {
  # will use globals $OUTSEP and $OUTFH;
  my(@col) = @_;
  # escape any conflicts  (eg. comma)
  foreach (@col) {
    next unless m/"|$OUTSEP/;
    s{$QUOTECHAR}{$QUOTECHAR$QUOTECHAR}g;
    s/\\/\\\\/g;
    $_ = $QUOTECHAR.$_.$QUOTECHAR;
  }
  print {$OUTFH} join($OUTSEP, @col), "\n";
}
#----------------------------------------------------------------------
sub show_version {
  print "$EXE $VERSION\n";
  exit(0);
}
#----------------------------------------------------------------------
# Option setting routines

sub setOptions {
  use Getopt::Long;

  @Options = (
    "GENERAL",
    {OPT=>"help",       VAR=>\&usage,                 DESC=>"This help"},
    {OPT=>"version",   VAR=>\&show_version,          DESC=>"Print version and exit"},
    {OPT=>"check!",     VAR=>\$check,    DEFAULT=>0,  DESC=>"Just check dependencies and exit"},
    {OPT=>"skipcheck!", VAR=>\$skipcheck,    DEFAULT=>0,  DESC=>"Skip dependency check at runtime"},
    {OPT=>"quiet!",     VAR=>\$quiet,    DEFAULT=>0,  DESC=>"Quiet - no stderr output"},
    {OPT=>"threads=i",  VAR=>\$threads,  DEFAULT=>1,  DESC=>"Number of BLAST threads (suggest GNU Parallel instead)"},
    {OPT=>"debug!",     VAR=>\$debug,    DEFAULT=>0,  DESC=>"Verbose debug output to stderr"},
    {OPT=>"fofn=s",     VAR=>\$fofn,    DEFAULT=>'',  DESC=>"File of input filenames to use"},
    "SCHEME",
    {OPT=>"scheme=s",   VAR=>\$scheme,   DEFAULT=>'', DESC=>"Do not auto-detect, force this scheme"},
    {OPT=>"info!",      VAR=>\$info,     DEFAULT=>0,  DESC=>"More information about schemes"},
    {OPT=>"list!",      VAR=>\$list,     DEFAULT=>0,  DESC=>"List available MLST scheme names"},
    {OPT=>"longlist!",  VAR=>\$longlist, DEFAULT=>0,  DESC=>"List allelles for all MLST schemes"},
    {OPT=>"exclude=s",  VAR=>\$exclude,  DEFAULT=>$EXCLUDE,  DESC=>"Ignore these schemes"},
    "OUTPUT FORMAT",
    {OPT=>"full!",   VAR=>\$full,     DEFAULT=>0,  DESC=>"More detailed output format (recommended)"},
    {OPT=>"legacy!", VAR=>\$legacy,   DEFAULT=>0,  DESC=>"Use old legacy output with allele header row (requires --scheme)"},
    {OPT=>"csv!",    VAR=>\$csv,      DEFAULT=>0,  DESC=>"Output CSV instead of TSV"},
    {OPT=>"label=s", VAR=>\$label,    DEFAULT=>'', DESC=>"Replace FILE with this name instead"},
    {OPT=>"nopath!", VAR=>\$nopath,   DEFAULT=>0,  DESC=>"Strip filename paths from FILE column"},
    "OUTPUT FILES",
    {OPT=>"outfile=s", VAR=>\$outfile,    DEFAULT=>'', DESC=>"Save output to this file [STDOUT]"},
    {OPT=>"novel=s", VAR=>\$novel,    DEFAULT=>'', DESC=>"Save novel alleles to this FASTA file"},
    {OPT=>"json=s",  VAR=>\$json_fn,  DEFAULT=>'', DESC=>"Also write results to this file in JSON format"},
    "SCORING",
    {OPT=>"minid=f",    VAR=>\$minid,    DEFAULT=>95, DESC=>"DNA %identity of full allelle to consider similar/'~''"},
    {OPT=>"mincov=f",   VAR=>\$mincov,   DEFAULT=>50, DESC=>"DNA %cov to report partial/'?' allele"},
    {OPT=>"minscore=f", VAR=>\$minscore, DEFAULT=>50, DESC=>"Minumum score out of 100 to match a scheme (when auto --scheme)"},
    "DATABASE",
    {OPT=>"blastdb=s",  VAR=>\$blastdb,  DEFAULT=>path("$FindBin::RealBin/../db/blast/mlst.fa")->realpath, DESC=>"BLAST database prefix"},
    {OPT=>"datadir=s",  VAR=>\$datadir,  DEFAULT=>path("$FindBin::RealBin/../db/pubmlst")->realpath, DESC=>"PubMLST data folders"},
  );
  GetOptions(map {$_->{OPT}, $_->{VAR}} grep { ref }  @Options)
    or exit(1);

  # Now setup default values.
  foreach (@Options) {
    if (ref $_ && defined($_->{DEFAULT}) && !defined(${$_->{VAR}})) {
      ${$_->{VAR}} = $_->{DEFAULT};
    }
  }
}

sub usage {
  my($exitcode) = @_;
  $exitcode ||= 0;
  $exitcode = 0 if $exitcode eq 'help'; # what gets passed by getopt func ref
  select STDERR if $exitcode; # write to STDERR if exitcode is error

  print "SYNOPSIS\n  Automatic MLST calling from assembled contigs\n";
  print "USAGE\n";
  print "  % $EXE --longlist                                        # list known schemes\n";
  print "  % $EXE [options] <contigs.{fasta,gbk,embl}[.gz]          # auto-detect scheme\n";
  print "  % $EXE --scheme <scheme> <contigs.{fasta,gbk,embl}[.gz]> # force a scheme\n";
  foreach (@Options) {
    if (ref) {
      my $def = defined($_->{DEFAULT}) ? " (default '$_->{DEFAULT}')" : "";
      $def = ($def ? ' (default OFF)' : '(default ON)') if $_->{OPT} =~ m/!$/;
      my $opt = $_->{OPT};
      $opt =~ s/!$//;
      $opt =~ s/=s$/ STR/;
      $opt =~ s/=i$/ INT/;
      $opt =~ s/=f$/ NUM/;
      printf "  --%-15s %s%s\n",$opt,$_->{DESC}, $def;
    }
    else {
      print "$_\n"; # Subheadings in the help output
    }
  }
  print "HOMEPAGE\n  $URL - $AUTHOR\n";
  exit($exitcode);
}
 
#----------------------------------------------------------------------
