#!/usr/bin/env perl

##
# RF Eval
# RNA Framework [http://www.rnaframework.com]
#
# Author:  Danny Incarnato (dincarnato[at]rnaframework.com)
# Summary: Calculates various metrics of agreement between reactivity 
#          data and a structure model
#
# This program is free software, and can be redistribute  and/or modified
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.
#
# Please see <http://www.gnu.org/licenses/> for more informations.
##

use strict;
use File::Basename;
use FindBin qw($Bin);
use Getopt::Long qw(:config no_ignore_case);
use threads;
use threads::shared;

use lib $Bin . "/lib";

use Core::Mathematics qw(:all);
use Core::Utils;
use Data::IO::Sequence;
use Graphics::Chart::Barplot;
use Graphics::Chart::Density;
use Graphics::Chart::Linedot;
use Graphics::Image;
use RF::Data::IO::XML;
use RF::Utils;
use Term::Constants qw(:screen);
use Term::Progress;

$|++;

my ($help, $output, $overwrite, $reactCutoff, 
    $structs, $reacts, $termAsUnpaired, $ignoreTerminal,
    $keepLonelyPairs, $keepPseudoknots, $threads, $failed, 
    $image, $R, $tmpDir, $noOverall, @pool, %structs);

my $progressBar : shared;
my @overallCoeff : shared;
my @overallAuc : shared;
my @overallDsci : shared;
my @ids : shared;
my @metrics : shared;
my %results : shared;

%results = ( diffLen  => 0,
             parseErr => 0,
             success  => 0 );

do {

    local $SIG{__WARN__} = sub { };

    GetOptions( "h|help"                  => \$help,
                "o|output=s"              => \$output,
                "ow|overwrite"            => \$overwrite,
                "p|processors=s"          => \$threads,
                "s|structures=s"          => \$structs,
                "r|reactivities=s"        => \$reacts,
                "tu|terminal-as-unpaired" => \$termAsUnpaired,
                "it|ignore-terminal"      => \$ignoreTerminal,
                "kl|keep-lonelypairs"     => \$keepLonelyPairs,
                "kp|keep-pseudoknots"     => \$keepPseudoknots,
                "c|reactivity-cutoff=s"   => \$reactCutoff,
                "g|img"                   => \$image,
                "R|R-path=s"              => \$R,
                "no|no-overall"           => \$noOverall ) or help(1);

};

help() if ($help);

$threads ||= 1;
$reactCutoff ||= 0.7;
$output ||= "rf_eval/";
$R = checkRinstall($R) if ($image);

$output =~ s/\/?$/\//;
$tmpDir = "$output/tmp/";

die "\n  [!] Error: Number of processors must be an INT >= 1\n\n" if (!isint($threads) || !ispositive($threads));
die "\n  [!] Error: No structure file/directory specified\n\n" if (!defined $structs);
die "\n  [!] Error: Provided structure file/directory does not exist\n\n" if (!-e $structs);
die "\n  [!] Error: No XML reactivity file/directory specified\n\n" if (!defined $reacts);
die "\n  [!] Error: Provided XML reactivity file/directory does not exist\n\n" if (!-e $reacts);
die "\n  [!] Error: Reactivity cutoff must > 0\n\n" if (!ispositive($reactCutoff));
die "\n  [!] Error: Parameters -tu and -it are mutually exclusive\n\n" if ($termAsUnpaired && $ignoreTerminal);
die "\n  [!] Error: Output file already exists." .
    "\n             Please specify -ow (or --overwrite) to overwrite it.\n\n" if (-e $output && !$overwrite);

$SIG{__DIE__} = \&cleanup;

print "\n[+] Making output directory...";

if (-e $output) {

    if ($overwrite) {

        my $error = rmtree($output);

        die "\n\n  [!] Error: " . $error . "\n\n" if ($error);

    }
    else { die "\n\n  [!] Error: Output directory already exists." .
               "\n      Please use -ow (or --overwrite) to overwrite output directory\n\n"; }

}

if (my $error = mktree($tmpDir, "755")) { die "\n\n  [!] Error: Unable to create temporary directory ($error)\n\n"; }
mktree("${output}plots/dsci", "755") if ($image);

print "\n[+] Importing structure file(s) [0 imported]";

if (-d $structs) { # Directory of structure files

    opendir(my $dh, $structs) or die "\n\n  [!] Error: Unable to read structure files from directory\n\n";
    while(my $file = readdir($dh)) {

        next if ($file !~ m/\.(?:db|ct)$/);

        my $io = Data::IO::Sequence->new( file        => "$structs/$file",
                                          lonelypairs => $keepLonelyPairs,
                                          pseudoknots => $keepPseudoknots );
        while(my $entry = $io->read()) { 
            
            die "\n\n  [!] Error: Duplicate structure ID \"" . $entry->id() . "\"\n\n" if (exists $structs{$entry->id()});

            $structs{$entry->id()}->{structure} = $entry->structure() if ($entry->can("structure")); 
            print CLRRET . "[+] Importing structure file(s) [" . scalar(keys %structs) . " imported]" if (scalar(keys %structs) % 50);

        }

    }
    closedir($dh);

    die "\n\n  [!] Error: Specified folder doesn't contain any structure file\n\n" unless(keys %structs);

}
else { # Single structure file

    die "\n\n  [!] Error: Provided file lacks .db or .ct extension\n\n" if ($structs !~ m/\.(?:db|ct)$/);

    my $io = Data::IO::Sequence->new( file        => $structs,
                                      lonelypairs => $keepLonelyPairs,
                                      pseudoknots => $keepPseudoknots ); 
    while(my $entry = $io->read()) { 

        die "\n\n  [!] Error: Duplicate structure ID \"" . $entry->id() . "\"\n\n" if (exists $structs{$entry->id()});

        $structs{$entry->id()}->{structure} = $entry->structure() if ($entry->can("structure")); 
        print CLRRET . "[+] Importing structure file(s) [" . scalar(keys %structs) . " imported]" if (scalar(keys %structs) % 50);

    }
    
}

print CLRRET . "[+] Importing structure file(s) [" . scalar(keys %structs) . " imported]";

die "\n\n  [!] Error: No structure file imported\n\n" if (!keys %structs);

print "\n[+] Importing XML file(s) [0 imported]";

if (-d $reacts) { # Directory of structure files

    my $imported = 0;

    opendir(my $dh, $reacts) or die "\n\n  [!] Error: Unable to read XML files from directory\n\n";
    while(my $file = readdir($dh)) {

        next if ($file !~ m/\.(?:xml)$/);

        my $id = $file;
        $id =~ s/\.xml$//;

        if (exists $structs{$id}) {

            $structs{$id}->{reactFile} = "$reacts/$file";
            $imported++;

        }

    }
    closedir($dh);

    die "\n\n  [!] Error: Specified folder doesn't contain any XML file\n\n" unless ($imported);

    print CLRRET . "[+] Importing XML file(s) [$imported imported]";

}
else { # Single reactivity file

    die "\n\n  [!] Error: Provided file lacks .xml extension\n\n" if ($reacts !~ m/\.xml$/);

    my $id = (fileparse($reacts, ".xml"))[0];
    $structs{$id}->{reactFile} = $reacts if (exists $structs{$id});

    print CLRRET . "[+] Importing XML file(s) [1 imported]";
    
}

for (keys %structs) { delete($structs{$_}) if (!exists $structs{$_}->{reactFile}); }

@ids = keys %structs;

if (@ids) { print "\n[+] " . scalar(@ids) . " structures with matched reactivity file\n\n"; }
else { die "\n\n  [!] Error: No matching structure-XML pair\n\n"; }

$progressBar = shared_clone(Term::Progress->new( max     => scalar(@ids),
                                                 width   => 50,
                                                 colored => 1 ));
$progressBar->init();

@pool = map{ threads->create(\&metrics) } 1 .. $threads;
$_->join() for (@pool);

$failed = $results{diffLen} + $results{parseErr};
print "\n\n[i] Successfully evaluated " . $results{success} . " structure(s)";
print ", failed $failed (XML parsing failed: " . $results{parseErr} . "; Different length: " . $results{diffLen} . ")" if ($failed); 

if ($results{success}) {

    my ($coeffUnpaired, $dsci, $auc, @combinedRoc);

    if (!$noOverall) {

        $coeffUnpaired = @overallCoeff ? $overallCoeff[0] / $overallCoeff[1] : "NaN";
        $dsci = @overallDsci ? $overallDsci[0] / $overallDsci[1] : "NaN";
        @combinedRoc = _combineRocData();
        $auc = _auc(@combinedRoc);

        print "\n[i] Overall metrics:\n" .
                "\n  [*] Coefficient unpaired: " . sprintf("%.3f", $coeffUnpaired) .
                "\n  [*] DSCI:                 " . sprintf("%.3f", $dsci) .
                "\n  [*] AUC:                  " . sprintf("%.3f", $auc) . "\n";

    }

    print "\n[+] Writing metrics to file...";

    open(my $fh, ">", "$output/metrics.txt") or die "\n\n  [!] Error: Unable to write output file ($!)\n\n";
    select((select($fh), $|=1)[0]);
    print $fh join("\t", qw(Transcript coeffUnpaired DSCI AUROC)) . "\n";
    print $fh join("\t", @$_) . "\n" for (sort { $a->[0] cmp $b->[0] } @metrics);
    print $fh join("\t", "Overall", $coeffUnpaired, $dsci, $auc) . "\n" if (!$noOverall);
    close($fh);

    if ($image && !$noOverall) {

        print "\n[+] Generating plots of overall metrics...";

        _plotROC(@overallAuc, "Overall", @combinedRoc);
        _plotDSCI("Overall", $overallDsci[2], $overallDsci[3]);
        _plotMetricSummary(@metrics, ["Overall", $coeffUnpaired, $dsci, $auc]);

    }

    cleanup();

    print "\n[+] All done.\n\n";

}
else { 

    rmtree("$output/plots/") if ($image);
        
    die "\n\n  [!] Error: Evaluation failed for all structures\n\n"; 
    
}

sub metrics {

    while(1) {

        my ($id, $structure, $xml);

        { lock(@ids);
          $id = shift(@ids) if (@ids); }

        last unless($id);

        $structure = $structs{$id}->{structure};

        eval { $xml = RF::Data::IO::XML->new(file => $structs{$id}->{reactFile}); };

        if ($@) {

            lock(%results);
            $results{parseErr}++;

            undef($@);

            next;

        }

        if ($xml->length() != length($structure)) {

            lock(%results);
            $results{diffLen}++;

            next;

        }

        my ($coeffUnpaired, $dsci, $auc);
        $coeffUnpaired = coeffUnpaired($structure, [ $xml->reactivity() ]);
        $dsci = dsci($id, $structure, [ $xml->reactivity() ]);
        $auc = roc($id, $structure, [ $xml->reactivity() ]);

        { lock(@metrics);
          push(@metrics, shared_clone([$id, $coeffUnpaired, $dsci, $auc])); 
          $progressBar->update(1);
          $results{success}++; }

    }

}

sub coeffUnpaired {

    my ($dotbracket, $reactivity) = @_;

    my ($coeff, $ss, $total, @unpaired);
    @unpaired = @{(getBaseIndexes($dotbracket))[0]};
    @unpaired = grep { isnumeric($reactivity->[$_]) } @unpaired;
    $total = scalar(grep { $_ >= $reactCutoff } @{$reactivity});

    return("NaN") if (!@unpaired || !$total);

    $ss = scalar(grep { $reactivity->[$_] >= $reactCutoff } @unpaired);
    $coeff = $ss / $total;

    { lock(@overallCoeff);
      $overallCoeff[0] += $ss;
      $overallCoeff[1] += $total; }

    return($coeff);

}

sub dsci {

    my ($id, $dotbracket, $reactivity) = @_;

    my ($dsci, $paired, $unpaired, $total);
    ($unpaired, $paired) = getBaseIndexes($dotbracket);
    @$paired = grep { isnumeric($reactivity->[$_]) } @$paired;
    @$unpaired = grep { isnumeric($reactivity->[$_]) } @$unpaired;

    return("NaN") if (!@$paired || !@$unpaired);

    foreach my $i (@$paired) {

        foreach my $j (@$unpaired) {

            $dsci++ if ($reactivity->[$i] < $reactivity->[$j]);

        }

    }

    $total = scalar(@$paired) * scalar(@$unpaired);

    { lock(@overallDsci);
      $overallDsci[0] += $dsci; 
      $overallDsci[1] += $total;
      
      if (@overallDsci != 4) {

          $overallDsci[2] = shared_clone([]);
          $overallDsci[3] = shared_clone([]);

      } }

    if ($image) {

        my (@unpairReact, @pairReact);
        @unpairReact = map { $reactivity->[$_] } @$unpaired;
        @pairReact = map { $reactivity->[$_] } @$paired;

        _plotDSCI($id, \@unpairReact, \@pairReact);

        { lock(@overallDsci);
          push(@{$overallDsci[2]}, @unpairReact);
          push(@{$overallDsci[3]}, @pairReact); }

    }

    return($dsci / $total);

}

sub roc {

    my ($id, $dotbracket, $reactivity) = @_;

    my ($auc, $unpaired, $paired, @tp, @fp);
    $auc = 0;
    ($unpaired, $paired) = getBaseIndexes($dotbracket);
    @$unpaired = grep { isnumeric($reactivity->[$_]) } @$unpaired;
    @$paired = grep { isnumeric($reactivity->[$_]) } @$paired;

    return("NaN") if (!@$unpaired || !@$paired);

    for (my $i = 0; $i <= 1000; $i += 1) {

        my ($cutoff, $tp, $fp);
        $cutoff = $i / 1000;
        $tp = scalar(grep { $reactivity->[$_] >= $cutoff } @$unpaired);
        $fp = scalar(grep { $reactivity->[$_] >= $cutoff } @$paired);

        push(@tp, $tp);
        push(@fp, $fp);

    }

    { lock(@overallAuc);
      push(@overallAuc, $id, shared_clone(\@tp), shared_clone(\@fp), scalar(@$unpaired), scalar(@$paired)); }

    return(_auc(\@tp, \@fp, scalar(@$unpaired), scalar(@$paired)));

}

sub _auc {

    my ($tp, $fp, $nUnpaired, $nPaired) = @_;

    my ($auc);

    # Ensure curve starts at 0,0 and ends at 1,1
    unshift(@$tp, $nUnpaired);
    unshift(@$fp, $nPaired);
    push(@$tp, 0);
    push(@$fp, 0);

    for (my $i = 0; $i < $#{$tp}; $i++) {

        my ($b1, $b2, $h);
        $b1 = $tp->[$i] / $nUnpaired;
        $b2 = $tp->[$i+1] / $nUnpaired;
        $h = $fp->[$i] / $nPaired - $fp->[$i+1] / $nPaired;
        $auc += ($b1 + $b2) * $h / 2;

    }

    return($auc);

}

sub _combineRocData {

    my (@tp, @fp, $totUnpaired, $totPaired);

    for (my $i = 1; $i < @overallAuc - 3; $i += 5) {

        my ($tp, $fp, $unpaired, $paired) = @overallAuc[$i .. $i + 3];
        @tp = @tp ? map { $tp[$_] + $tp->[$_] } 0 .. $#tp : @$tp;
        @fp = @fp ? map { $fp[$_] + $fp->[$_] } 0 .. $#fp : @$fp;
        $totUnpaired += $unpaired;
        $totPaired += $paired;

    }

    return(\@tp, \@fp, $totUnpaired, $totPaired);

}

sub _plotROC {

    my @data = @_;

    my ($plot, $roc, @id, @tp, @fp);

    $plot = Graphics::Image->new( file   => "${output}plots/roc.pdf",
                                  width  => 6,
                                  height => 5,
                                  R      => $R,
                                  tmpdir => $tmpDir );

    for (my $i = 0; $i < @data; $i += 5) {

        my ($id, $tp, $fp, $unpaired, $paired) = @data[$i .. $i + 4];
        push(@id, ($id) x (scalar(@$tp) + 2));
        push(@tp, 0, reverse(map { $_ / $unpaired } @$tp), 1);
        push(@fp, 0, reverse(map { $_ / $paired } @$fp), 1);

    }

    $roc = Graphics::Chart::Linedot->new( x               => "fp",
                                          legend          => 1,
                                          data            => \@tp,
                                          dataLabels      => { fp => \@fp,
                                                               id => \@id },
                                          dataLabelType   => { fp => "numeric"},
                                          legendSort      => [ (grep { $_ ne "Overall" } sort(uniq(@id))), "Overall" ],
                                          fill            => "id",
                                          xLabelAngle     => 90,
                                          plotDataPoints  => 0,
                                          axisTitleSize   => 14,
                                          labelTextSize   => 12,
                                          legendTitleSize => 14,
                                          legendTextSize  => 12,
                                          colorPalette    => "Paired",
                                          xTitle          => "False Positive Rate",
                                          yTitle          => "True Positive Rate" );
    $plot->plot([$roc]);

}

sub _plotDSCI {

    my @data = @_;

    for (my $i = 0; $i < @data; $i += 3) {

        my ($id, $unpaired, $paired, $dsci, $plot);
        ($id, $unpaired, $paired) = @data[$i .. $i + 2];
        $plot = Graphics::Image->new( file   => "${output}plots/dsci/$id.pdf",
                                      width  => 6,
                                      height => 5,
                                      R      => $R,
                                      tmpdir => $tmpDir );
        $dsci = Graphics::Chart::Density->new( lineThickness   => 0,
                                               colorPalette    => "Set1",
                                               legend          => 1,
                                               data            => [ @$unpaired, @$paired ],
                                               dataLabels      => { type => [ ("Unpaired") x scalar(@$unpaired), 
                                                                              ("Paired") x scalar(@$paired) ] },
                                               fill            => "type",
                                               xLimit          => [0, 2],
                                               xTitle          => "Reactivity",
                                               yTitle          => "Density",
                                               axisTitleSize   => 14,
                                               labelTextSize   => 12,
                                               legendTitleSize => 14,
                                               legendTextSize  => 12 );
        $plot->plot([$dsci]);

    }


}

sub _plotMetricSummary {

    my @metrics = @_;

    my ($plot, $summary);
    $plot = Graphics::Image->new( file   => "${output}plots/summary.pdf",
                                  width  => 6,
                                  height => 5,
                                  R      => $R,
                                  tmpdir => $tmpDir );
    $summary = Graphics::Chart::Barplot->new( legend          => 1,
                                              x               => "id",
                                              data            => [ map { @$_[1..3] } @metrics ],
                                              dataLabels      => { value => [ (qw(coeffUnpaired DSCI AUROC)) x scalar(@metrics) ],
                                                                   id    => [ map { ($_->[0]) x 3 } @metrics ] },
                                              dataLabelSort   => { id    => [ sort(map { $_->[0] } @metrics[0 .. $#metrics - 1]), "Overall" ],
                                                                   value => [ qw(coeffUnpaired DSCI AUROC) ] },
                                              fill            => "value",
                                              colorPalette    => "Set1",
                                              yTitle          => "Metric value",
                                              axisTitleSize   => 14,
                                              labelTextSize   => 12,
                                              xLabelAngle     => 90,
                                              legendTitleSize => 14,
                                              legendTextSize  => 12 );

    $plot->plot([$summary]);

}

sub getBaseIndexes {

    my $dotbracket = shift;

    my ($n, @paired, @unpaired);
    $n = length($dotbracket) - 1;
    @paired = grep { substr($dotbracket, $_, 1) ne "." } 0 .. $n;
    @unpaired = grep { substr($dotbracket, $_, 1) eq "." } 0 .. $n;

    if ($ignoreTerminal || $termAsUnpaired) {

        my %terminal = map { $_ => 1 } grep { substr($dotbracket, $_, 1) ne "." && 
                                              (($_ > 0 && substr($dotbracket, $_ - 1, 1) eq ".") || 
                                               ($_ < $n && substr($dotbracket, $_ + 1, 1) eq ".")) } 0 .. $n;

        @paired = grep { !exists $terminal{$_} } @paired if ($ignoreTerminal);
        push(@unpaired, map { $_ } keys %terminal) if ($termAsUnpaired);

    }

    return(\@unpaired, \@paired);

}

sub cleanup {

    unlink(glob($tmpDir . "*"));

    rmtree($tmpDir);

}

sub help {

    print "\n  [!] Error: Invalid option. Please check the help\n" if ($_[0]);

    die <<HELP;

 RF Eval (v$Core::Utils::VERSION)
 RNA Framework [http://www.rnaframework.com]

 Author:  Danny Incarnato (dincarnato[at]rnaframework.com)
 Summary: Calculates various metrics of agreement between reactivity data and a structure model

 Usage:   rf-eval [Options] -s /path/to/structure/files -r /path/to/XML/reactivity/files       # Multiple files
          rf-eval [Options] -s /path/to/structure/file.db -r /path/to/XML/reactivity/file.xml  # Single file

 Options                                       Description
 -p  or --processors             <int>         Number of processors to use (>=1, Default: 1)
 -s  or --structures             <string>      Path to a (folder of) structure file(s)
                                               Note: files containing multiple structures are accepted
 -r  or --reactivities           <string>      Path to a (folder of) XML reactivity file(s)
                                               Note: file names must match transcript IDs in structure files
 -o  or --output                 <string>      Output folder (Default: rf_eval/)
 -ow or --overwrite                            Overwrites output folder (if the specified folder already exists)
 -g  or --img                                  Generates plots for the various metrics (requires R)
 -tu or --terminal-as-unpaired                 Treats terminal base-pairs as if they were unpaired
 -it or --ignore-terminal                      Terminal base-pairs are excluded from calculations
 -kl or --keep-lonelypairs                     Lonely base-pairs (helices of 1 bp) are retained
 -kp or --keep-pseudoknots                     Pseudoknotted base-pairs are retained
 -c  or --reactivity-cutoff      <float>       Cutoff for considering a base highly-reactive when computing the
                                               unpaired coefficient (>0, Default: 0.7)
 -no or --no-overall                           Disables overall stats computation
 -R  or --R-path                 <string>      Path to R executable (Default: assumes R is in PATH) 

HELP

}