#!/usr/bin/env perl

##
# RF Jackknife
# RNA Framework [http://www.rnaframework.com]
#
# Author:  Danny Incarnato (dincarnato[at]rnaframework.com)
# Summary: Iteratively optimize slope and intercept parameters to maximize
#          FMI (or mFMI) using a set of reference structures
#
# This program is free software, and can be redistribute  and/or modified
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.
#
# Please see <http://www.gnu.org/licenses/> for more informations.
##

use strict;
use File::Basename;
use File::Spec;
use FindBin qw($Bin);
use Getopt::Long qw(:config no_ignore_case);
use Math::BigFloat qw(:constant);
use threads;
use threads::shared;

use lib $Bin . "/lib";

use Core::Utils;
use Core::Mathematics qw(:all);
use Core::Process::Queue;
use Data::IO::Sequence;
use Data::Sequence::Utils;
use Graphics::Chart::Heatmap;
use Graphics::Image;
use RF::Data::IO::XML;
use RF::Utils;
use RNA::Utils;
use Term::Constants qw(:screen);
use Term::Table;

$|++;

my ($output, $overwrite, $reference, $refio,  
    $rffold, $processmanager, $threads, $slope, 
    $intercept, $sstep, $istep, $rfparams, $relaxed, 
    $mFMI, $kp, $kl, $m, $am, $help, $tmpdir, $decimals,
    $ignoreseq, $onlyCommon, $installed, $image,
    $R, @slope, @intercept, @samples, %files, 
    %reference, %table, %results);
%results = ( used          => 0,
             ioerr         => 0,
             diffseq       => 0,
             missingstruct => 0 );
my @tested : shared;
my @results : shared;

do {

    local $SIG{__WARN__} = sub { };

    GetOptions( "h|help"               => \$help,
                "r|reference=s"        => \$reference,
                "p|processors=i"       => \$threads,
                "o|output-dir=s"       => \$output,
                "ow|overwrite"         => \$overwrite,
                "sl|slope=s"           => \$slope,
                "in|intercept=s"       => \$intercept,
                "ss|slope-step=s"      => \$sstep,
                "is|intercept-step=s"  => \$istep,
                "rf|rf-fold=s"         => \$rffold,
                "rp|rf-fold-params=s"  => \$rfparams,
                "x|relaxed"            => \$relaxed,
                "kp|keep-pseudoknots"  => \$kp,
                "kl|keep-lonelypairs"  => \$kl,
                "m|mFMI"               => \$mFMI,
                "e|median"             => \$m,
                "am|arithmetic-mean"   => \$am,
                "i|ignore-sequence"    => \$ignoreseq,
                "oc|only-common"       => \$onlyCommon,
                "g|img"                => \$image,
                "R|R-path=s"           => \$R,
                "d|decimals=s"         => \$decimals ) or help(1);

    @samples = uniq(@ARGV);

};

help() if ($help);

# Default values
$threads ||= 1;
$output ||= "rf_jackknife/";
$rffold ||= which("rf-fold");
$slope ||= "0:5";
$intercept ||= "-3:0";
$istep ||= 0.2;
$sstep ||= 0.2;
$decimals ||= 3;
$R = checkRinstall($R) if ($image);

$installed = eval { require RNA; 1; };
$output =~ s/\/?$/\//;
$tmpdir = $output . "/tmp/";
@slope = sort {$a <=> $b} split(/[,;:]/, $slope);
@intercept = sort {$a <=> $b} split(/[,;:]/, $intercept);

##
# Input validation
##

die "\n  [!] Error: ViennaRNA package's Perl module RNA.pm is required." .
    "\n             Please ensure that ViennaRNA package v2.2.0 (or greater) is installed and try again\n\n" if (!$installed);

die "\n  [!] Error: No reference structures file provided\n\n" if (!defined $reference);
die "\n  [!] Error: Reference structures file doesn't exist\n\n" if (!-e $reference);
die "\n  [!] Error: Number of processors must be an integer greater than 0\n\n" if ($threads < 1);
die "\n  [!] Error: Not enough values in slope range\n\n" if (@slope < 2);
die "\n  [!] Error: Not enough values in intercept range\n\n" if (@intercept < 2);
die "\n  [!] Error: Too many values in slope range\n\n" if (@slope > 2);
die "\n  [!] Error: Too many values in intercept range\n\n" if (@intercept > 2);
die "\n  [!] Error: Slope range upper and lower limits must be different\n\n" if ($slope[0] == $slope[1]);
die "\n  [!] Error: Intercept range upper and lower limits must be different\n\n" if ($intercept[0] == $intercept[1]);
die "\n  [!] Error: Number of decimals must be an INT comprised between 1 and 10\n\n" if (!isint($decimals) || !inrange($decimals, [1, 10]));

for (0 .. 1) {

    die "\n  [!] Error: Invalid slope range " . ($_ ? "upper" : "lower") . " limit value\n\n" unless(isreal($slope[$_]));
    die "\n  [!] Error: Invalid intercept range " . ($_ ? "upper" : "lower") . " limit value\n\n" unless(isreal($intercept[$_]));

}

die "\n  [!] Error: Parameters -m and -am are mutually exclusive\n\n" if ($m && $am);
die "\n  [!] Error: Intercept step value must be a positive value > 0\n\n" if (!ispositive($istep) || !$istep);
die "\n  [!] Error: Slope step value must be a positive value > 0\n\n" if (!ispositive($sstep) || !$sstep);

if (!defined $rffold) { die "\n  [!] Error: rf-fold is not in PATH\n\n"; }
elsif (!-e $rffold) { die "\n  [!] Error: rf-fold doesn't exist\n\n"; }
elsif (!-x $rffold) { die "\n  [!] Error: rf-fold is not executable\n\n"; }

$SIG{__DIE__} = \&cleanup;

print "\n[+] Making output directory...";

if (-e $output) {

    if ($overwrite) {

        my $error = rmtree($output);

        die "\n\n  [!] Error: " . $error . "\n\n" if ($error);

    }
    else { die "\n\n  [!] Error: Output directory already exists." .
               "\n      Please use -ow (or --overwrite) to overwrite output directory\n\n"; }

}

if (my $error = mktree($tmpdir, "755")) { die "\n\n  [!] Error: Unable to create temporary directory ($error)\n\n"; }

##
# Check structures
##

print "\n[+] Importing input reference structures and probing data...";

$refio = Data::IO::Sequence->new( file        => $reference,
                                  pseudoknots => $kp,
                                  lonelypairs => $kl );

while(my $ref = $refio->read()) {

    die "\n\n  [!] Error: Provided reference file is not a valid dot-bracket or CT file\n\n" if (!$ref->can("structure"));

    my (@files);

    foreach my $sample (@samples) {

        die "\n\n  [!] Error: Sample XML file/directory \"$sample\" does not exist\n\n" if (!-e $sample);

        if (-d $sample) {

            next if (!-e "$sample/" . $ref->id() . ".xml");

            push(@files, "$sample/" . $ref->id() . ".xml");

        }
        else {

            my ($fileid, undef, $format) = fileparse($sample, qr/\.[^.]*/);

            next if ($fileid ne $ref->id() || $format ne ".xml");

            push(@files, $sample);

        }

    }

    if (!@files || ($onlyCommon && @files != @samples)) {

        $results{missingstruct}++;

        next;

    }

    foreach my $file (@files) {

        my ($eval, $queryIO);
        $eval = do { 

            local $@;

            eval { $queryIO = RF::Data::IO::XML->new(file => $file); };

            $@;

        };

        if ($@) {

            $results{ioerr}++;

            next;

        }

        if ((!$ignoreseq && dna2rna($queryIO->sequence()) ne dna2rna($ref->sequence())) ||
            ($ignoreseq && $queryIO->length() != $ref->length())) {

            $results{diffseq}++;

            next;

        }

        $files{$ref->id()} = \@files;
        $reference{$ref->id()} = $ref->structure();

    }

}

print " " . keys(%files) . " imported.";

if (keys %files) {

    my ($table, @table);
    $results{used} = keys(%files);

    print "\n[+] Checking RF Fold parameters...\n";

    # Remove potentially conflicting or useless RF Fold parameters
    $rfparams =~ s/(?:^|\s)(?:--ignore-reactivity|--dotplot|--shannon-entropy|--help|--img|-dp|-s|-g|-sh|-i|-h)(?=\s|$)/ /g;
    $rfparams =~ s/(?:^|\s)(?:--slope|-sl)(?:\s+|=)(?:-?\d+(?:\.\d+)+?)?(?=\s|$)/ /g;
    $rfparams =~ s/(?:^|\s)(?:--intercept|-in)(?:\s+|=)(?:-?\d+(?:\.\d+)?)?(?=\s|$)/ /g;
    $rfparams =~ s/(?:^|\s)(?:--processors|-p)(?:\s+|=)(?:\d+)?(?=\s|$)/ /g;
    $rfparams =~ s/(?:^|\s)(?:--output-dir|-o)(?:\s+|=)(?:\S+)?(?=\s|$)/ /g;
    $rfparams =~ s/\s+/ /g;

    if ($rfparams) { # Evaluate rf-fold params

        my $ret = `$rffold $rfparams 2>&1`;

        die "\n  [!] Error: Invalid RF Fold parameters (\"" . $rfparams . "\")." .
            "\n             Please check  documentation for detailed parameters description\n\n" if ($ret =~ m/Error: Invalid option. Please check the help/ ||
                                                                                                     $ret =~ m/RNA Framework \[http:\/\/www.rnaframework.com\]/);

    }

    # This ensures that, if the program is not up-to-date, RF Fold won't throw a warning that
    # will end up in the error.out
    $ENV{"RF_NOCHECKUPDATES"} = 1;

    $processmanager = Core::Process::Queue->new( processors => $threads,
                                                 stderr     => "$output/error.out",
                                                 verbosity  => 1 );

    $processmanager->onstart(sub { print CLRRET . "[+] Jackknifing folding parameters (this may take a while) [Current: " . $_[0] . "]"; });

    foreach my $id (sort keys %files) {

        my (@pool);

        print CLRRET . "[+] Jackknifing folding parameters (this may take a while) [Current: " . $id . "]";

        for(my $sl = $slope[0]; $sl <= $slope[1]; $sl += $sstep) {

            for(my $in = $intercept[0]; $in <= $intercept[1]; $in += $istep) {

                push(@tested, shared_clone([$id, $sl, $in]));

                $processmanager->enqueue( command => join(" ", $rffold, $rfparams, "-sl", $sl, "-in", $in, "-o", $tmpdir . join("_", @{$tested[-1]}), $ignoreseq ? "-is" : undef, @{$files{$id}}),
                                          id      => $id . "; Slope: " . $sl . "; Intercept: " . $in,
                                          stdout  => "/dev/null" );

            }

        }

        $processmanager->start();
        $processmanager->waitall();

        while (my $test = $processmanager->dequeue()) { die "\n\n  [!] Error: Folding failed for transcript \"" . $id . "\"." .
                                                              "\n             Please check input parameters and files and try again.\n\n" if ($test->exitcode()->[0]); }

        print CLRRET . "[+] Jackknifing folding parameters (this may take a while) [Current: " . $id . "; " . ($mFMI ? "mFMI" : "FMI") . " calculation]";

        @pool = map{ threads->create(\&calc) } 1 .. $threads;
        $_->join() for(@pool);

        push(@{$table{$_->[0]}->{$_->[1]}}, $_->[2]) for (@results);

    }

    print CLRRET . "[+] Jackknifing folding parameters (this may take a while) [DONE]";
    print "\n[+] Reporting output tables...";

    open(my $gh, ">", $output . ($mFMI ? "mFMI" : "FMI") . ".csv") or die "\n\n  [!] Error: Unable to write ouput " . ($mFMI ? "mFMI" : "FMI") . " table (" . $! . ")\n\n";
    select((select($gh), $|=1)[0]);

    print $gh join(";", ($mFMI ? "mFMI" : "FMI"), sort {$a <=> $b} keys %{$table{min(@slope)}}) . "\n";

    foreach my $sl (sort {$a <=> $b} keys %table) {

        my @grow = ($sl);

        foreach my $in (sort {$a <=> $b} keys %{$table{$sl}}) {

            my $fmi = "NaN";

            if (isnumeric(@{$table{$sl}->{$in}})) {

                if ($m) { $fmi = (sprintf("%." . $decimals . "f", median(@{$table{$sl}->{$in}}))); }
                elsif ($am) { $fmi = (sprintf("%." . $decimals . "f", mean(@{$table{$sl}->{$in}}))); }
                else { $fmi = (sprintf("%." . $decimals . "f", geomean(@{$table{$sl}->{$in}}))); }

            }

            push(@grow, $fmi);
            push(@table, [$sl, $in, $fmi]) if (!isnan($fmi));

        }

        print $gh join(";", @grow) . "\n";

    }

    close($gh);

    @table = sort { $b->[2] <=> $a->[2] } @table;

    $table = Term::Table->new(indent => 2);
    $table->head("Slope", "Intercept", ($mFMI ? "mFMI" : "FMI"));

    for (0 .. ($#table < 9 ? $#table : 9)) { $table->row(@{$table[$_]}); }

    print "\n[+] Top slope/intercept value pairs:\n\n";
    $table->print();
    print "\n";

    if ($image) {

        my ($plot, $heatmap, @slope, @intercept);
        @slope = map { $_->[0] } @table;
        @intercept = map { $_->[1] } @table;

        $plot = Graphics::Image->new( file   => $output . ($mFMI ? "mFMI" : "FMI") . ".pdf",
                                      width  => 10,
                                      height => 9.2,
                                      R      => $R,
                                      tmpdir => $tmpdir );
        $heatmap = Graphics::Chart::Heatmap->new( dataLabels    => { slope     => \@slope,
                                                                     intercept => \@intercept },
                                                  dataLabelSort => { slope     => [ uniq(sort { $a <=> $b } @slope) ],
                                                                     intercept => [ reverse(uniq(sort { $a <=> $b } @intercept)) ] },
                                                  data          => [ map { $_->[2] } @table ],
                                                  background    => 0,
                                                  grid          => 0,
                                                  legendTitle   => $mFMI ? "mFMI" : "FMI",
                                                  labelTextSize => 12,
                                                  axisTextSize  => 14,
                                                  axisTitleSize => 16,
                                                  valueTextSize => 3,
                                                  x             => "intercept",
                                                  y             => "slope",
                                                  colorPalette  => "RdYlBu",
                                                  invertPalette => 1,
                                                  plotValues    => 1,
                                                  yTitle        => "Slope (kcal/mol)",
                                                  xTitle        => "Intercept (kcal/mol)",
                                                  xLabelAngle   => 90 );
        $plot->plot([$heatmap]);

    }

}
else { die "\n\n [!] Error: No reference structure passed checks. Please ensure reference ID matches XML IDs\n\n"; }

$results{missingstruct} = 1 unless (sum(values %results));

print "\n[+] Jackknifing statistics:\n" .
      "\n  [*] Used transcripts:     " . $results{used} .
      "\n  [*] Excluded transcripts: " . ($results{ioerr} + $results{diffseq} + $results{missingstruct}) . " total" .
      "\n                            " . $results{ioerr} . " XML file parsing failed" .
      "\n                            " . $results{diffseq} . " mismatch between reference and XML transcript sequences" .
      "\n                            " . $results{missingstruct} . " missing XML file\n";

cleanup();

print "\n[+] All done.\n\n";

sub calc {

    while (1) {

        my ($test, $testio, $entry, $fmi,
            $id);

        { lock(@tested);
          $test = shift(@tested) if (@tested); }

        last unless($test);

        $id = shift(@{$test});

        eval { $testio = Data::IO::Sequence->new(file => $tmpdir . join("_", $id, @{$test}) . "/structures/" . $id . ".db"); };

        if ($@) { $fmi = "NaN"; }
        else {

            $entry = $testio->read();
            $fmi = $mFMI ? mfmi($reference{$id}, $entry->structure(), $relaxed) :
                           fmi($reference{$id}, $entry->structure(), $relaxed);

        }

        { lock(@results);
          push(@results, shared_clone([@{$test}, $fmi]));

          rmtree($tmpdir . join("_", $id, @{$test})); }

    }

}

sub cleanup {

    unlink(glob($tmpdir . "*"));

    if (!-s $output . "error.out") { unlink($output . "error.out"); }
    else { print "\n  [!] Warning: Execution completed with error(s)/warning(s). Please check the \"${output}error.out\" file\n"; }

    rmtree($tmpdir);

}

sub help {

    print "\n  [!] Error: Invalid option. Please check the help\n" if ($_[0]);

    die <<HELP;

 RF JackKnife (v$Core::Utils::VERSION)
 RNA Framework [http://www.rnaframework.com]

 Author:  Danny Incarnato (dincarnato[at]rnaframework.com)
 Summary: Iteratively optimize slope and intercept parameters to maximize
          FMI (geometric mean of PPV and sensitivity), or modified FMI (mFMI)
          using a set of reference structures

 Usage:   rf-jackknife [Options] XML_folder/                                    # Multiple transcripts
          rf-jackknife [Options] file.xml                                       # Single transcript
          rf-jackknife [Options] XML_folder_1/ XML_folder_2/ .. XML_folder_n/   # Replicates

 Options                                      Description
 -r  or --reference        <string>           A file containing reference structures in Vienna format (dotbracket notation)
 -oc or --only-common                         In case of replicates, only transcripts covered across all experiments will
                                              be used to derive the optimal slope/intercept pair
 -p  or --processors       <int>              Number of processors to use (Default: 1)
 -o  or --output-dir       <string>           Output directory (Default: rf_jackknife/)
 -ow or --overwrite                           Overwrites output directory (if the specified path already exists)
 -g  or --img                                 Generates heatmap of grid search results (requires R)
 -sl or --slope            <float>,<float>    Range of slope values to test (Default: 0,5)
 -in or --intercept        <float>,<float>    Range of intercept values to test (Default: -3,0)
 -ss or --slope-step       <float>            Step for testing slope values (Default: 0.2)
 -is or --intercept-step   <float>            Step for testing intercept values (Default: 0.2)
 -x  or --relaxed                             Uses relaxed criteria (Deigan et al., 2009) to calculate FMI
 -kp or --keep-pseudoknots                    Keeps pseudoknotted basepairs in reference structure
 -kl or --keep-lonelypairs                    Keeps lonely basepairs (helices of length 1 bp) in reference structure
 -i  or --ignore-sequence                     Ignores nucleotide differences (e.g. SNVs) between the compared structures
 -m  or --mFMI                                Uses modified FMI (mFMI, Lan et al., 2022) instead of standard FMI to quantify the
                                              agreement between predicted and reference structure
 -e  or --median                              FMI values across multiple reference structures are aggregated by median
                                              Note: default is to aggregate them by geometric mean
 -am or --arithmetic-mean                     FMI values across multiple reference structures are aggregated by arithmetic mean
                                              Note: default is to aggregate them by geometric mean
 -d  or --decimals         <int>              Number of decimals for reporting FMI/mFMI values (1-10, Default: 3)
 -rf or --rf-fold          <string>           Path to rf-fold executable (Default: assumes rf-fold is in PATH)
 -rp or --rf-fold-params   <string>           Manually specify additional RF Fold parameters (e.g. -rp "-md 500 -m 2")
 -R  or --R-path           <string>           Path to R executable (Default: assumes R is in PATH)

HELP

}
