#!/usr/bin/env perl

##
# RF Compare
# RNA Framework [http://www.rnaframework.com]
#    
# Author:  Danny Incarnato (dincarnato[at]rnaframework.com)
# Summary: Compares inferred secondary structures to a set of reference structures
#
# This program is free software, and can be redistribute  and/or modified
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.
#
# Please see <http://www.gnu.org/licenses/> for more informations.
##

use strict;
use File::Basename;
use FindBin qw($Bin);
use Getopt::Long qw(:config no_ignore_case);
use threads;
use threads::shared;

use lib $Bin . "/lib";

use Core::Mathematics qw(:all);
use Core::Utils;
use Data::IO;
use Data::IO::Sequence;
use Data::Sequence::Utils;
use Graphics::Chart::Arcs;
use Graphics::Image;
use RF::Utils;
use RNA::Utils;
use Term::Table;
use Term::Constants qw(:screen);
use Term::Progress;

$|++;

my ($input, $reference, $help, $relaxed, 
    $kp, $kl, $img, $threads, $output, 
    $overwrite, $ignoreseq, $nRefs, $R, 
    $tmpDir, @pool);
my $progressBar : shared;
my @files : shared;
my @results : shared;
my %results : shared;
my %reference : shared;

%results = ( compared      => 0,
             ioerr         => 0,
             diffseq       => 0,
             missingstruct => 0 );

do {
    
    local $SIG{__WARN__} = sub { };

    GetOptions( "h|help"               => \$help,      
                "r|reference=s"        => \$reference,
                "x|relaxed"            => \$relaxed,
                "kp|keep-pseudoknots"  => \$kp,
                "kl|keep-lonelypairs"  => \$kl,
                "g|img"                => \$img,
                "o|output-dir=s"       => \$output,
                "ow|overwrite"         => \$overwrite,
                "i|ignore-sequence"    => \$ignoreseq,
                "p|processors=s"       => \$threads,
                "R|R-path=s"           => \$R ) or help(1);

    $input = $ARGV[0];
                
};

help() if ($help);

$threads ||= 1;
$output ||= "rf_compare/";
$R = checkRinstall($R) if ($img);

$output =~ s/\/?$/\//;
$tmpDir = "${output}tmp/";
$input =~ s/\/?$/\// if (-d $input);

##
# Input validation
##

die "\n  [!] Error: Number of processors must be a positive INT >= 1\n\n" if (!isint($threads) || $threads < 1);
die "\n  [!] Error: No output directory specified\n\n" if (!defined $output && $img);
die "\n  [!] Error: Provided structure directory\/file does not exist\n\n" if (!-e $input);
die "\n  [!] Error: No reference structure directory\/file provided\n\n" if (!defined $reference);
die "\n  [!] Error: Reference structure directory\/file does not exist\n\n" if (!-e $reference);

$SIG{__DIE__} = \&cleanup;

print "\n[+] Making output directory...";

if (-e $output) {

    if ($overwrite) {
        
        my $error = rmtree($output);
    
        die "\n\n  [!] Error: " . $error . "\n\n" if ($error); 
        
    }
    else { die "\n\n  [!] Error: Output directory already exists." .
                "\n      Please use -ow (or --overwrite) to overwrite output directory\n\n"; }
    
}

if (my $error = mktree($tmpDir, "755")) { die "\n\n  [!] Error: Unable to create output directory ($error)\n\n"; }
mktree($output . "plots/", "755") if ($img);

$progressBar = shared_clone(Term::Progress->new( max     => 1,
                                                 width   => 50,
                                                 colored => 1 ));

print "\n[+] Importing reference structure file(s)...\n\n";

if (-d $reference) { # Directory of structure files

    opendir(my $dh, $reference) or die "\n\n  [!] Error: Unable to read reference structure files from directory\n\n";
    while(my $file = readdir($dh)) {

        next if ($file !~ m/\.(?:db|ct)$/);

        push(@files, $file);

    }
    closedir($dh);

    $nRefs = scalar(@files);
    $progressBar->max($nRefs);
    $progressBar->init();

    @pool = map{ threads->create(\&import) } 1 .. $threads;
    $_->join() for (@pool);

    undef(@pool);

    die "\n\n  [!] Error: Specified reference folder doesn't contain any structure file\n\n" unless(keys %reference);

}
else { # Single structure file

    die "\n\n  [!] Error: Provided reference file lacks .db or .ct extension\n\n" if ($reference !~ m/\.(?:db|ct)$/);

    my $io = Data::IO::Sequence->new( file        => $reference,
                                      lonelypairs => $kl,
                                      pseudoknots => $kp );
    while(my $entry = $io->read()) { 

        die "\n\n  [!] Error: Duplicate reference structure ID \"" . $entry->id() . "\"\n\n" if (exists $reference{$entry->id()});

        $reference{$entry->id()} = shared_clone({ sequence  => dna2rna($entry->sequence()),
                                                  structure => $entry->structure() }) if ($entry->can("structure"));
        print CLRRET . "[+] Importing structure file(s) [" . scalar(keys %reference) . " imported]";

    }
    
}

die "\n\n  [!] Error: No structure file imported\n\n" if (!keys %reference);

print "\n\n[+] Comparing transcript structures...\n\n";

$nRefs = scalar(keys %reference);
$progressBar->reset();
$progressBar->max($nRefs);
$progressBar->init();

@pool = map{ threads->create(\&compare) } 1 .. $threads;
$_->join() for (@pool);

print "\n";

if ($results{compared}) {
    
    my ($table, $top, @median);
    $top = min(10, $results{compared});
    $table = Term::Table->new(indent => 2);
    $table->head("Transcript", "PPV", "Sensitivity", "FMI", "mFMI");

    @results = sort { $b->[3] <=> $a->[3] } @results;
    $table->row(@$_) for (@results[0 .. $top - 1]);

    if ($top > 1) {

        $table->blank();
        @median = map { my $i = $_; my @i = grep { isnumeric($_) } map { $_->[$i] } @results; @i ? median(@i) : "NaN" } 1 .. 4;
        $table->row("Median (all)", @median);

    }

    print "\n[+] Top $top similar structures:\n\n";
    $table->print();
    print "\n";

    open(my $wh, ">", $output . "metrics.txt") or die "\n  [!] Error: Failed to write metrics to file ($!)\n\n";
    print $wh join("\t", "Transcript", "PPV", "Sensitivity", "FMI", "mFMI") . "\n";
    print $wh join("\t", @$_) . "\n" for (sort {$a->[0] cmp $b->[0]} @results);
    close($wh);
    
}

$results{missingstruct} = 1 unless (sum(values %results));

cleanup();

print "\n[+] Comparison statistics:\n" .
      "\n  [*] Compared transcripts: " . $results{compared} .
      "\n  [*] Excluded transcripts: " . ($results{ioerr} + $results{diffseq} + $results{missingstruct}) . " total" .
      "\n                            " . $results{ioerr} . " structure file parsing failed" .
      "\n                            " . $results{diffseq} . " mismatch between reference and query transcript sequences" .
      "\n                            " . $results{missingstruct} . " missing structure file";

print "\n\n[+] All done.\n\n";

sub import {

    while(1) {

        my ($file, $io);

        { lock(@files);
          
          if (@files) { $file = shift(@files); } }

        last unless($file);

        my $io = Data::IO::Sequence->new( file        => "$reference/$file",
                                          lonelypairs => $kl,
                                          pseudoknots => $kp );
        while(my $entry = $io->read()) { 
            
            lock(%reference);

            die "\n\n  [!] Error: Duplicate reference structure ID \"" . $entry->id() . "\"\n\n" if (exists $reference{$entry->id()});

            $reference{$entry->id()} = shared_clone({ sequence  => dna2rna($entry->sequence()),
                                                      structure => $entry->structure() }) if ($entry->can("structure")); 
            
        }

        { lock($progressBar);
          $progressBar->update(1); }

    }

}

sub compare {

    while(1) {

        my ($refId, $refSeq, $refStruct);

        { lock(%reference);
          
          if (keys %reference) {

              $refId = (keys %reference)[0];
              $refSeq = $reference{$refId}->{sequence};
              $refStruct = $reference{$refId}->{structure};
              delete($reference{$refId});

          } }

        last unless($refStruct);

        { lock($progressBar);
          $progressBar->update(1); }

        my ($file, $format, $queryIO, $querySeq,
            $queryStruct, $ppv, $sensitivity, $fmi,
            $mfmi);

        if (-d $input) {
        
            if (-e $input . $refId . ".ct") { $format = "ct"; }
            elsif (-e $input . $refId . ".db") { $format = "db"; }
            else {
            
                lock(%results);
                $results{missingstruct}++;
                
                next;
                
            }
            
            $file = $input . $refId . "." . $format;
            
        }
        else {
            
            my ($fileId);
            ($fileId, undef, $format) = fileparse($input, qr/\.[^.]*/);
            $format =~ s/^\.//;
            $file = $input;
            
            next if ($fileId ne $refId && $nRefs > 1);
            
        }

        $queryIO = Data::IO::Sequence->new( file        => $file,
                                            pseudoknots => $kp,
                                            lonelypairs => $kl );
    
        if (my $query = $queryIO->read()) {
            
            $querySeq = dna2rna($query->sequence());

            if ($query->can("structure")) { $queryStruct = $query->structure(); }
            else {

                lock(%results);
                $results{ioerr}++;

                next;

            }
            
        }
        else {
            
            lock(%results);
            $results{ioerr}++;
            
            next;
        
        }
        
        if ($querySeq ne $refSeq && !$ignoreseq) {
            
            lock(%results);
            $results{diffseq}++;
            
            next;
            
        }

        $ppv = sprintf("%.2f", ppv($refStruct, $queryStruct, $relaxed));
        $sensitivity = sprintf("%.2f", sensitivity($refStruct, $queryStruct, $relaxed)); 
        $fmi = sprintf("%.2f", fmi($refStruct, $queryStruct, $relaxed));
        $mfmi = sprintf("%.2f", mfmi($refStruct, $queryStruct, $relaxed));

        { lock(@results);
          push(@results, shared_clone([$refId, $ppv, $sensitivity, $fmi, $mfmi])); }
        
        if ($img) {

            my ($plot, $length, $common, $missing, $correct, $wrong,
                $refArcs, $queryArcs, $maxBpUp, $maxBpDown, @heights);
            $length = length($refSeq);
            ($common, $missing) = commonpairs($refStruct, $queryStruct);
            $maxBpUp = max(map { abs(diff(@$_)) } (@$common, @$missing));
            ($correct, $wrong) = commonpairs($queryStruct, $refStruct);
            $maxBpDown = max(map { abs(diff(@$_)) } (@$correct, @$wrong));
            @heights = map { $_ / $length * 2 } ($maxBpUp, $maxBpDown);

            $plot = Graphics::Image->new( file   => "${output}plots/$refId.pdf",
                                          width  => 4.5 * 3,
                                          height => sum(@heights) * 3.3,
                                          R      => $R,
                                          tmpdir => $tmpDir );

            $refArcs = Graphics::Chart::Arcs->new( data           => [ map { [ map { $_ + 1 } @$_ ] } @$common, @$missing ],
                                                   dataLabels     => { "bp" => [ ("Predicted") x @$common, ("Missing") x @$missing ] },
                                                   flip           => "up",
                                                   fill           => "bp",
                                                   legendKeyWidth  => 9,
                                                   legendKeyHeight => 2,
                                                   legendColors    => { "Predicted" => "#000000",
                                                                        "Missing"   => "#663399" },
                                                   legendSort      => [ qw(Predicted Missing) ],
                                                   legendTextSize  => 8,
                                                   xLimit          => [0, $length + 1],
                                                   lineThickness   => 0.15,
                                                   legend          => 1,
                                                   background      => 0,
                                                   grid            => 0,
                                                   xTicks          => 0,
                                                   yTicks          => 0,
                                                   xLabels         => 0,
                                                   yLabels         => 0,
                                                   yTitle          => "Reference",
                                                   axisTitleSize   => 10 );

            $queryArcs = Graphics::Chart::Arcs->new( data           => [ map { [ map { $_ + 1 } @$_ ] } @$correct, @$wrong ],
                                                     dataLabels     => { "bp" => [ ("Correct") x @$correct, ("Wrong") x @$wrong ] },
                                                     flip           => "down",
                                                     fill           => "bp",
                                                     legendKeyWidth  => 9,
                                                     legendKeyHeight => 2,
                                                     legendColors    => { "Correct" => "#0c8039",
                                                                          "Wrong"   => "#bb1b1b" },
                                                     legendSort      => [ qw(Correct Wrong) ],
                                                     legendTextSize  => 8,
                                                     xLimit          => [0, $length + 1],
                                                     lineThickness   => 0.15,
                                                     legend          => 1,
                                                     background      => 0,
                                                     grid            => 0,
                                                     xTicks          => 0,
                                                     yTicks          => 0,
                                                     xLabels         => 0,
                                                     yLabels         => 0,
                                                     yTitle          => "Query",
                                                     axisTitleSize   => 10 );

            $plot->plot([$refArcs, $queryArcs], { heights => \@heights });
            
        }
        
        { lock(%results);
          $results{compared}++; }

    }

}

sub commonpairs {
    
    my ($reference, $structure) = @_;
    
    my (@common, @missing, %reference);
    
    for (listpairs($reference)) { $reference{$_->[0] . "-" . $_->[1]} = 0; }
	for (listpairs($structure)) { $reference{$_->[0] . "-" . $_->[1]} = 1 if (exists $reference{$_->[0] . "-" . $_->[1]}); }

    @common = map { [ split("-", $_) ] } (grep { $reference{$_} } keys %reference);
    @missing = map { [ split("-", $_) ] } (grep { !$reference{$_} } keys %reference);
    
    return(\@common, \@missing);
    
}

sub cleanup {

    unlink(glob($tmpDir . "*"));

    rmtree($tmpDir);

}

sub help {
    
    print "\n  [!] Error: Invalid option. Please check the help\n" if ($_[0]);
    
    die <<HELP;
 
 RF Compare (v$Core::Utils::VERSION)
 RNA Framework [http://www.rnaframework.com]
    
 Author:  Danny Incarnato (dincarnato[at]rnaframework.com)
 Summary: Compares inferred secondary structures to a set of reference structures
 
 Usage:   rf-compare [Options] structures_folder/                # Multiple transcripts
          rf-compare [Options] file.(ct|db)                      # Single transcript
 
 Options                                Description
 -p  or --processors        <int>       Number of processors to use (>=1, Default: 1)
 -r  or --reference         <string>    Path to (a folder) of structure file(s)
                                        Note: files containing multiple structures are accepted
 -g  or --img                           Enables generation of comparison secondary structure plots (requires R)
 -o  or --output-dir        <string>    Output directory (Default: rf_compare/)
 -ow or --overwrite                     Overwrites output directory (if the specified path already exists)
 -x  or --relaxed                       Uses relaxed criteria (described in Deigan et al., 2009) to calculate similarity metrics
 -kp or --keep-pseudoknots              Keeps pseudoknotted basepairs in reference structure
 -kl or --keep-lonelypairs              Keeps lonely basepairs (helices of length 1 bp) in reference structure
 -i  or --ignore-sequence               Ignores sequence differences (e.g. SNVs) between the compared structures
 -R  or --R-path            <string>    Path to R executable (Default: assumes R is in PATH)

HELP
    
}
