#!/usr/bin/env perl

##
# RF Combine
# RNA Framework [http://www.rnaframework.com]
#
# Author:  Danny Incarnato (dincarnato[at]rnaframework.com)
# Summary: Combines multiple experiments into a single XML profile
#
# This program is free software, and can be redistribute  and/or modified
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.
#
# Please see <http://www.gnu.org/licenses/> for more informations.
##

use strict;
use Config;
use File::Basename;
use FindBin qw($Bin);
use Getopt::Long qw(:config no_ignore_case);
use threads;
use threads::shared;

use lib $Bin . "/lib";

use Core::Mathematics qw(:all);
use Core::Statistics;
use Core::Utils;
use Data::IO;
use Data::XML;
use RF::Data::IO::XML;
use Term::Constants qw(:screen);
use Term::Progress;

die "\n  [!] Error: This program requires ithreads." .
    "\n             Please recompile Perl with ithreads and try again\n\n" unless(exists $Config{useithreads});

$|++;

my ($output, $overwrite, $stdev, $help,
    $threads, $decimals, $mincorr, $minvalues, 
    $logtransform, $spearman, $ignoreNaNs,
    @pool, @xml, %xml);

my $progressBar : shared;
my @ids : shared;
my %results : shared;
%results = ( diffseq     => 0,
             diffscore   => 0,
             diffnorm    => 0,
             diffoffset  => 0,
             diffwin     => 0,
             difftool    => 0,
             failed      => 0,
             nominvalues => 0,
             lowcorr     => 0,
             combined    => 0 );

do {

    local $SIG{__WARN__} = sub { };

    GetOptions( "h|help"              => \$help,
                "o|output-dir=s"      => \$output,
                "ow|overwrite"        => \$overwrite,
                "s|stdev"             => \$stdev,
                "p|processors=i"      => \$threads,
                "d|decimals=i"        => \$decimals,
                "m|min-values=s"      => \$minvalues,
                "c|min-correlation=s" => \$mincorr,
                "l|log-transform"     => \$logtransform,
                "S|spearman"          => \$spearman,
                "i|ignore-NaNs"       => \$ignoreNaNs ) or help(1);

    @xml = uniq(@ARGV);

};

help() if ($help);

# Default
$output ||= "combined";
$decimals ||= 3;
$threads ||= 1;
$mincorr //= -1;
$minvalues = round($minvalues) if (isnumeric($minvalues) &&
                                   $minvalues > 1);

$output =~ s/\/?$/\// if (defined $output);

##
# Input validation
##

die "\n  [!] Error: No output directory specified\n\n" unless(defined $output);
die "\n  [!] Error: Not enough XML directories/files specified\n\n" if (@xml < 2);
die "\n  [!] Error: Number of processors must be an integer greater than 0\n\n" if ($threads < 1);
die "\n  [!] Error: Decimals value must be an integer comprised between 1 and 10\n\n" if (!inrange($decimals, [1, 10]));
die "\n  [!] Error: Minimum correlation value must be comprised between -1 and 1\n\n" if (!inrange($mincorr, [-1, 1]));
die "\n  [!] Error: Miminum number of values must be greater than 0\n\n" if (defined $minvalues &&
                                                                             (!isnumeric($minvalues) ||
                                                                              $minvalues == 0));

print "\n[+] Importing input XML directories\/files...";

for (@xml) {

    die "\n\n  [!] Error: Provided XML directory\/file \"$_\" does not exist\n\n" if (!-e $_);

    if (-d $_) {

        $_ =~ s/\/?$/\//;

        opendir(my $dh, $_) or die "\n\n  [!] Error: Unable to read from input directory \"" . $_ . "\" (" . $! . ")\n\n";
        while(my $file = readdir($dh)) {

            next if ($file !~ m/\.xml$/);

            $file =~ s/\.xml//;
            $xml{$file}++;

        }
        close($dh);

    }
    else { $xml{"__XML__"}++ if ($_ =~ m/\.xml$/); }

}

# Delete all transcripts that are not common to all XML directories
for (keys %xml) { delete($xml{$_}) if ($xml{$_} != @xml); }

print " " . (keys %xml) . " common transcripts.";

die "\n\n  [!] Error: No common transcript ID found between XML directories\/files\n\n" unless(keys %xml);

print "\n[+] Making output directory...";

if (-e $output) {

    if ($overwrite) {

        my $error = rmtree($output);

        die "\n\n  [!] Error: " . $error . "\n\n" if ($error);

    }
    else { die "\n\n  [!] Error: Output directory already exists." .
               "\n      Please use -ow (or --overwrite) to overwrite output directory\n\n"; }

}

if (my $error = mktree($output, "755")) { die "\n\n  [!] Error: Unable to create output directory ($error)\n\n"; }

print "\n[+] Combining reactivities...\n\n";

@ids = keys %xml;
$progressBar = shared_clone(Term::Progress->new( max     => scalar(@ids),
                                                 width   => 50,
                                                 colored => 1 ));
$progressBar->init();

@pool = map{ threads->create(\&combine) } 1 .. $threads;
$_->join() for(@pool);

print "\n\n[+] Combination statistics:\n" .
      "\n  [*] Combined transcripts:  " . $results{combined} .
      "\n  [*] Discarded transcripts: " . $results{failed} . " total" .
      "\n                             " . ($results{failed} - $results{diffseq} - $results{diffscore} - $results{diffnorm} - $results{difftool} - $results{nominvalues} - $results{lowcorr}) . " XML parsing failed" .
      "\n                             " . $results{nominvalues} . " no common bases, or not enough values for correlation calculation" .
      "\n                             " . $results{lowcorr} . " correlation too low" .
      "\n                             " . $results{difftool} . " mismatch between analysis tools" .
      "\n                             " . $results{diffseq} . " mismatch between transcript sequences" .
      "\n                             " . $results{diffscore} . " mismatch between scoring methods" .
      "\n                             " . $results{diffnorm} . " mismatch between normalization methods" .
      "\n                             " . $results{diffwin} . " mismatch between window sizes" .
      "\n                             " . $results{diffoffset} . " mismatch between window offsets";

print "\n\n[+] All done.\n\n";

sub combine {

    while (1) {

        my ($id, @commonidx1, @commonidx2);

        { lock(@ids);
          $id = shift(@ids) if (@ids); }

        last unless($id);

        my ($xmlref, $sequence, $scoring, $reactive,
            $norm, $xmlio, $mean1, $stdev1,
            $mean2, $stdev2, $attributes, $offset,
            $win, $remap, $tool, $reactive2,
            $algorithm, $xml, $nbases, $length,
            @values1, @values2);

        { lock($progressBar);
          $progressBar->update(1); }

        foreach my $dir (@xml) {

            my $file = -d $dir ? $dir . $id . ".xml" : $dir;

            eval {

                no warnings;

                my ($sequence2, $scoring2, $norm2, $win2,
                    $offset2, $remap2, $tool2, $algorithm2);

                $xmlref = RF::Data::IO::XML->new(file => $file);
                $id = $xmlref->id() if ($id eq "__XML__");
                $tool2 = $xmlref->tool();
                $sequence2 = $xmlref->sequence();
                $norm2 = $xmlref->norm();
                $scoring2 = $xmlref->scoring();
                $win2 = $xmlref->window();
                $offset2 = $xmlref->offset();
                $remap2 = $xmlref->remap();
                $reactive2 = $xmlref->reactive();
                $algorithm2 = $xmlref->algorithm();

                $tool = $tool2 if (!defined $tool);
                $sequence = $sequence2 if (!defined $sequence);
                $scoring = $scoring2 if (!defined $scoring);
                $norm = $norm2 if (!defined $norm);
                $win = $win2 if (!defined $win);
                $offset = $offset2 if (!defined $offset);
                $remap = $remap2 if (!defined $remap);
                $algorithm = $algorithm2 if (!defined $algorithm);
                $reactive .= $reactive2;

                if ($tool ne $tool2) {

                    lock(%results);
                    $results{difftool}++;

                    die;

                }

                if ($tool ne "rf-modcall") {

                    if ($scoring ne $scoring2) {

                        lock(%results);
                        $results{diffscore}++;

                        die;

                    }

                    if ($norm ne $norm2 ||
                        ($norm eq $norm2 &&
                         $remap != $remap2)) {

                        lock(%results);
                        $results{diffnorm}++;

                        die;

                    }

                    if ($offset != $offset2) {

                        lock(%results);
                        $results{diffoffset}++;

                        die;

                    }

                }

                if ($sequence ne $sequence2) {

                    lock(%results);
                    $results{diffseq}++;

                    die;

                }

                if ($win != $win2) {

                    lock(%results);
                    $results{diffwin}++;

                    die;

                }

                $algorithm = "combined" if ($algorithm ne $algorithm2);

            };

            last if ($@);

            if ($tool eq "rf-norm") {  # rf-norm

                my @reactivity = $xmlref->reactivity();
                push(@values1, \@reactivity);
                #push(@{$values1[$-[0]]}, $reactivity[$-[0]]) while($sequence =~ m/[$reactive2]/g);

            }
            else { # rf-modcall

                my (@score, @ratio);
                @score = $xmlref->score();
                @ratio = $xmlref->ratio();
                push(@values1, \@score);
                push(@values2, \@ratio);

                #for (0 .. $#score) {

                #    push(@{$values1[$_]}, $score[$_]);
                #    push(@{$values2[$_]}, $ratio[$_]);

                #}

            }

        }

        if ($@) { # Exception from eval

            lock(%results);
            $results{failed}++;

            undef($@);

            next;

        }

        @commonidx1 = grep { isnumeric($values1[0]->[$_]) } 0 .. $#{$values1[0]};

        foreach my $i (1 .. $#values1) { @commonidx1 = grep { isnumeric($values1[$i]->[$_]) } @commonidx1; }

        if ($tool eq "rf-modcall") {

            $nbases = @{$values1[0]};
            @commonidx2 = grep { isnumeric($values2[0]->[$_]) } 0 .. $#{$values2[0]};

            foreach my $i (1 .. $#values2) { @commonidx2 = grep { isnumeric($values2[$i]->[$_]) } @commonidx2; }

        }
        else {

            $reactive = join("", sort(uniq(split(//, $reactive))));
            ($nbases) = $sequence =~ s/([$reactive])/$1/g;

        }

        if ((defined $minvalues || @commonidx1 < 2) && !$ignoreNaNs) {

            if (($minvalues < 1 && @commonidx1 / $nbases < $minvalues) ||
                ($minvalues >= 1 && @commonidx1 < $minvalues) ||
                @commonidx1 < 2 ||
                ($tool eq "rf-modcall" &&
                 (($minvalues < 1 &&
                    @commonidx2 / $nbases < $minvalues) ||
                    ($minvalues >= 1 &&
                     @commonidx2 < $minvalues)))) {

                lock(%results);
                $results{nominvalues}++;
                $results{failed}++;

                next;

            }

        }

        if ($mincorr > -1) {

            if (!correlateall(\@values1, \@commonidx1) ||
                ($tool eq "rf-modcall" &&
                 !correlateall(\@values2, \@commonidx2))) {

                lock(%results);
                $results{lowcorr}++;
                $results{failed}++;

                next;

            }

        }

        ($mean1, $stdev1) = combineall(@values1);

        if ($tool eq "rf-modcall") { ($mean2, $stdev2) = combineall(@values2); }

        #for (@profile) {
        #
        #    if (ref($_) eq "ARRAY" &&
        #        !isnan(@{$_})) {
        #
        #        push(@mean, sprintf("%." . $decimals . "f", mean(@{$_})));
        #        push(@stdev, sprintf("%." . $decimals . "f", stdev(@{$_})));
        #
        #    }
        #    else {
        #
        #        push(@mean, "NaN");
        #        push(@stdev, "NaN");
        #
        #    }
        #
        #}
        #
        #push(@mean, "NaN") while (@mean < length($sequence));
        #push(@stdev, "NaN") while (@stdev < length($sequence));

        { lock(%results);
          $results{combined}++; }

        $length = length($sequence);

        # For nicer formatting
        $sequence =~ s/(\w{60})/$1\n/g;
        $mean1 = join(",", @{$mean1});
        $stdev1 = join(",", @{$stdev1});
        $mean1 =~ s/((?:[\w\.]+,){60})/$1\n/g;
        $stdev1 =~ s/((?:[\w\.]+,){60})/$1\n/g;

        $xmlio = Data::IO->new( file      => $output . $id . ".xml",
                                mode      => "w",
                                binmode   => ":encoding(utf-8)",
                                verbosity => -1 );
        $xml = Data::XML->new( heading   => 1,
                               indent    => 0,
                               autoclose => 1 );

        if ($tool eq "rf-norm") {

            $attributes = { combined => "TRUE",
                            reactive => $reactive,
                            scoring  => $scoring,
                            norm     => $norm,
                            win      => $win,
                            offset   => $offset,
                            remap    => $remap,
                            tool     => $tool };

        }
        else {

            $mean2 = join(",", @{$mean2});
            $stdev2 = join(",", @{$stdev2});
            $mean2 =~ s/((?:[\w\.]+,){60})/$1\n/g;
            $stdev2 =~ s/((?:[\w\.]+,){60})/$1\n/g;

            $attributes = { combined => "TRUE",
                            win      => $win,
                            tool     => $tool };

        }

        $xml->opentag("data", $attributes);
        $xml->opentag("transcript", { id     => $id,
                                      length => $length });
        $xml->opentag("sequence");
        $xml->addtext($sequence);
        $xml->closelasttag();

        if ($tool eq "rf-norm") {

            $xml->opentag("reactivity");
            $xml->addtext($mean1);
            $xml->closelasttag();

            if ($stdev) { # Report stdev

                $xml->opentag("reactivity-error");
                $xml->addtext($stdev1);
                $xml->closelasttag();

            }

        }
        else {

            $xml->opentag("score");
            $xml->addtext($mean1);
            $xml->closelasttag();

            if ($stdev) { # Report stdev

                $xml->opentag("score-error");
                $xml->addtext($stdev1);
                $xml->closelasttag();

            }

            $xml->opentag("ratio");
            $xml->addtext($mean2);
            $xml->closelasttag();

            if ($stdev) { # Report stdev

                $xml->opentag("ratio-error");
                $xml->addtext($stdev2);

            }

        }

        $xmlio->write($xml->xml());

    }

    threads->exit();

}

sub correlateall {

    my ($values, $commonidx) = @_;

    foreach my $i (0 .. $#{$values}) {

        foreach my $j (0 .. $#{$values}) {

            next if ($i == $j);

            my ($r, @set1, @set2);
            @set1 = @{$values->[$i]}[@{$commonidx}];
            @set2 = @{$values->[$j]}[@{$commonidx}];
            $r = $spearman ? (spearman(\@set1, \@set2))[0] : (pearson(\@set1, \@set2))[0];

            return() if ($r < $mincorr);

        }

    }

    return(1);

}

sub combineall {

    my @values = @_;

    my (@mean, @stdev);

    foreach my $i (0 .. $#{$values[0]}) {

        my @current = map { $values[$_]->[$i] } 0 .. $#values;
        @current = grep { isnumeric($_) } @current if ($ignoreNaNs);

        if (@current && isnumeric(@current)) {

            if ($logtransform) {

                my @currentLog = map { $_ > 0 ? log($_) : log(1e-308) } @current;

                push(@mean, sprintf("%." . $decimals . "f", exp(mean(@currentLog))));
                push(@stdev, sprintf("%." . $decimals . "f", stdev(@currentLog)));

            }
            else {

                push(@mean, sprintf("%." . $decimals . "f", mean(@current)));
                push(@stdev, sprintf("%." . $decimals . "f", stdev(@current)));

            }

        }
        else {

            push(@mean, "NaN");
            push(@stdev, "NaN");

        }

    }

    return(\@mean, \@stdev);

}

sub help {

    print "\n  [!] Error: Invalid option. Please check the help\n" if ($_[0]);

    die <<HELP;

 RF Combine (v$Core::Utils::VERSION)
 RNA Framework [http://www.rnaframework.com]

 Author:  Danny Incarnato (dincarnato[at]rnaframework.com)
 Summary: Combines multiple experiments into a single XML profile

 Usage:   rf-combine [Options] XML_folder_rep1/ XML_folder_rep2/ .. XML_folder_repn/           # Whole transcriptome
          rf-combine [Options] file_rep1.xml file_rep2.xml .. file_repn.xml                    # Single transcript

 Options                                        Description
 -p  or --processors             <int>          Number of processors to use (Default: 1)
 -o  or --output-dir             <string>       Output directory (Default: combined/)
 -ow or --overwrite                             Overwrites output directory (if the specified path already exists)
 -s  or --stdev                                 When combining multiple replicates, an optional "-error" tag will be reported
                                                in the output XML files, containing the per-base standard deviation of the measure
 -d  or --decimals               <int>          Number of decimals for reporting reactivities (1-10, Default: 3)
 -m  or --min-values             <float>        Minimum number of values to calculate correlation (Default: off)
                                                Note: if a value between 0 and 1 is provided, this is interpreted as a fraction of the
                                                      transcript's length
 -c  or --min-correlation        <float>        Minimum correlation to report a combined profile (-1-1, Default: off)
                                                Note: if more than two replicates are being combined, RF Combine requires this threshold
                                                to be satisfied by all pairwise comparisons
 -S  or --spearman                              Uses Spearman instead of Pearson to calculate correlation
 -l  or --log-transform                         Log transforms values before averaging replicates
 -i  or --ignore-NaNs                           NaNs are ignored when calculating mean reactivities
                                                Note: this parameter enables combining XML files from experiments with different sets of
                                                      reactive bases (e.g., A/C and G/U)

HELP

}
