#!/usr/bin/env perl

##
# RF NormFactor
# RNA Framework [http://www.rnaframework.com]
#
# Author:  Danny Incarnato (dincarnato[at]rnaframework.com)
# Summary: Calculate transcriptome-wide (and experiment-wide) normalization
#          factors to be used with rf-norm
#
# This program is free software, and can be redistribute  and/or modified
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.
#
# Please see <http://www.gnu.org/licenses/> for more informations.
##

use strict;
use Config;
use File::Basename;
use FindBin qw($Bin);
use Getopt::Long qw(:config no_ignore_case);
use threads;
use threads::shared;

use lib $Bin . "/lib";

use Core::Mathematics qw(:all);
use Core::Statistics;
use Core::Utils;
use Data::Sequence::Utils;
use RF::Data::IO::RC;

$|++;

die "\n  [!] Error: This program requires ithreads." .
    "\n             Please recompile Perl with ithreads support and try again\n\n" unless(defined $Config{useithreads});

my ($minCov, $output, $help, $overwrite,
    $reactiveBases, $scoreMethod, $normMethod, $pseudoCount,
    $maxScore, $ignoreLower, $maxUntreatedMut, $maxMutRate, 
    $treated, $untreated, $denatured, $threads, $rfnorm,
    $index, $whitelist, $medianCoverage, $runNorm,
    @ctrlFiles, @treatFiles, @denatFiles, @files, @pool, 
    %io, %commonReact, %fileMap);

my $common : shared;
my @ids : shared;
my %tmpReact : shared;
my %commonNorm : shared;

do {

    local $SIG{__WARN__} = sub { };

    GetOptions( "h|help"                         => \$help,
                "o|output=s"                     => \$output,
                "ow|overwrite"                   => \$overwrite,
                "sm|scoring-method=s"            => \$scoreMethod,
                "nm|norm-method=s"               => \$normMethod,
                "rb|reactive-bases=s"            => \$reactiveBases,
                "pc|pseudocount=s"	             => \$pseudoCount,
                "s|max-score=s"                  => \$maxScore,
                "mc|min-coverage=s"              => \$minCov,
                "u|untreated=s"                  => \$untreated,
                "t|treated=s"                    => \$treated,
                "d|denatured=s"                  => \$denatured,
                "p|processors=s"                 => \$threads,
                "i|index=s"                      => \$index,
                "mu|max-untreated-mut=s"         => \$maxUntreatedMut,
                "mm|max-mutation-rate=s"         => \$maxMutRate,
                "il|ignore-lower-than-untreated" => \$ignoreLower,
                "wl|whitelist=s"                 => \$whitelist,
                "ec|median-coverage=s"           => \$medianCoverage,
                "rn|run-norm"                    => \$runNorm,
                "rf|rf-norm=s"                   => \$rfnorm ) or help(1);

};

help() if ($help);

$common = 0;
$threads ||= 1;
$output ||= "norm_factors.txt";
$scoreMethod ||= 1;
$normMethod ||= 1;
$minCov ||= 10;
$maxUntreatedMut //= 0.05;
$medianCoverage //= 0;
$maxMutRate ||= 1;
$pseudoCount ||= 1;
$maxScore ||= 10;
$reactiveBases ||= "ACGT";
$rfnorm ||= which("rf-norm");
@treatFiles = split /,/, $treated;
@ctrlFiles = split /,/, $untreated;
@denatFiles = split /,/, $denatured;

die "\n  [!] Error: Number of processors must be an integer greater than 0\n\n" if (!isint($threads) || $threads < 1);
die "\n  [!] Error: Provided RCI index file does not exist\n\n" if (defined $index && !-e $index);
die "\n  [!] Error: No treated sample RC file provided\n\n" if (!defined $treated);
for (@treatFiles, @ctrlFiles, @denatFiles) { die "\n [!] Error: Provided file \"$_\" does not exist\n\n" if (!-e $_); }
die "\n  [!] Error: Invalid scoring method\n\n" if ($scoreMethod !~ /^[1-4]$/);
die "\n  [!] Error: Invalid normalization method\n\n" if ($normMethod !~ /^[1-3]$/);
die "\n  [!] Error: Normalization methods 1 and 3 cannot be used with scoring method 2\n\n" if ($scoreMethod == 2 && $normMethod =~ /^[13]$/);
die "\n  [!] Error: Scoring method $scoreMethod requires an untreated control\n\n" if (!defined $untreated && $scoreMethod =~ /^[13]$/);
die "\n  [!] Error: Unequal number of treated and control files\n\n" if (@ctrlFiles > 1 && @treatFiles != @ctrlFiles);
die "\n  [!] Error: Unequal number of treated and denatured files\n\n" if (@denatFiles > 1 && @treatFiles != @denatFiles);
die "\n  [!] Error: Invalid reactive bases\n\n" if ($reactiveBases !~ /^all$/i && !isiupac($reactiveBases));
die "\n  [!] Error: Maximum mutation rate for untreated sample must be comprised between 0 and 1\n\n" if (!isnumeric($maxUntreatedMut) || !inrange($maxUntreatedMut, [0, 1]));
die "\n  [!] Error: Maximum mutation rate must be comprised between >0 and 1\n\n" if (!isnumeric($maxMutRate) || !inrange($maxMutRate, [0, 1]) || !$maxMutRate);
die "\n  [!] Error: Maximum mutation rate for untreated sample cannot exceed max mutation rate\n\n" if ($maxUntreatedMut > $maxMutRate);
die "\n  [!] Error: Pseudocount must be > 0\n\n" if (!isnumeric($pseudoCount) || $pseudoCount <= 0);
die "\n  [!] Error: Minimum coverage must be an INT > 0\n\n" if (!isint($minCov) || $minCov <= 0);
die "\n  [!] Error: Median coverage must be positive\n\n" if (!ispositive($medianCoverage));
die "\n  [!] Error: Provided whitelist file does not exist\n\n" if (defined $whitelist && !-e $whitelist);

if ($runNorm) {

    die "\n  [!] Error: No path to rf-norm provided\n\n" if (!$rfnorm);
    die "\n  [!] Error: Provided path to rf-norm does not exist\n\n" if (!-e $rfnorm);
    die "\n  [!] Error: rf-norm is not executable\n\n" if (!-x $rfnorm);

}

$reactiveBases =~ /^all$/i ? "ACGT" : join("", sort(uniq(split("", join("", iupac2nt(rna2dna(uc($reactiveBases))))))));

if ($scoreMethod !~ /^[24]$/) {

    @ctrlFiles = ($ctrlFiles[0]) x @treatFiles if (@ctrlFiles == 1);

    $fileMap{$treatFiles[$_]} = { "ctrl" => $ctrlFiles[$_] } for (0 .. $#treatFiles);

    if ($scoreMethod == 3 && @denatFiles) { 
        
        @denatFiles = ($denatFiles[0]) x @treatFiles if (@denatFiles == 1); 
        $fileMap{$treatFiles[$_]}->{"denat"} = $denatFiles[$_] for (0 .. $#treatFiles);
        
    }
    else { undef(@denatFiles); }

}
else {

    # Just in case they have been passed, but won't be used
    undef(@ctrlFiles);
    undef(@denatFiles);

    warn "\n  [!] Warning: Parameter --ignore-lower-than-untreated will have no effect in the absence of an untreated sample\n" if ($ignoreLower);

}

@files = (@treatFiles, @ctrlFiles, @denatFiles);

die "\n  [!] Error: Provided RC files have unequal sizes\n\n" if (scalar(uniq(map { -s $_ } @files)) != 1);

print "\n[+] Loading RC files...";

$io{$_} = RF::Data::IO::RC->new( file  => $_,
                                 index => $index ) for (@files);

if (-e $output) {

    if ($overwrite) { unlink($output) or die "\n  [!] Error: $!\n\n"; }
    else { die "\n\n  [!] Error: Output file already exists." .
               "\n      Please use -ow (or --overwrite) to overwrite it\n\n"; }

}

%tmpReact = map { $_ => shared_clone([]) } @treatFiles;
@ids = $io{$files[0]}->ids();

if ($whitelist) {

    my (%whitelist);

    print "\n[+] Importing whitelist...";

    open(my $fh, "<", $whitelist) or die "\n\n  [!] Error: Unable to read whitelist file ($!)\n\n";
    while(<$fh>) {

        chomp();
        $whitelist{$_} = 1;

    }
    close($fh);

    print " " . scalar(keys %whitelist) . " transcripts.\n";

    die "\n\n  [!] Error: Whitelist file is empty\n\n" if (!keys %whitelist);

    @ids = grep { exists $whitelist{$_} } @ids;

    die "\n\n  [!] Error: No match between whitelist and transcript IDs in RC files\n\n" if (!@ids);

    print "\n[+] " . scalar(@ids) . " transcript IDs in RC files matching whitelist will be retained...";

}

print "\n[+] Calculating raw reactivities for bases covered across all samples...";

@pool = map{ threads->create(\&getCoveredBases) } 1 .. $threads;
$_->join() for (@pool);

print " " . $common . " bases selected.";

# It looks like, working with a shared hash of arrays from this point on would use exponentially
# more memory. To address this, we make a deep copy of %commonReact and free the shared hash
print "\n[+] Freeing unused memory...";
%commonReact = map { $_ => clonearrayref($tmpReact{$_}) } @treatFiles;
undef(%tmpReact);

die "\n\n  [!] Error: No bases covered across all samples\n\n" if (!$common);

if ($normMethod =~ /^[13]$/) {

    print "\n[+] Selecting bases for normalization...";

    foreach my $file (keys %commonReact) {

        my @norm = $normMethod == 1 ? norm_28(@{$commonReact{$file}}) : boxplot(@{$commonReact{$file}});

        die "\n\n  [!] Error: Normalization failed on sample " . (fileparse($file, ".rc"))[0] . "\n\n" if (!@norm);

        $commonNorm{$_}++ for (@norm);

    }

    $common = scalar(grep { $commonNorm{$_} == @treatFiles } keys %commonNorm);
    print " $common selected.";

    if ($common) {

        print "\n[+] Normalization factors:\n";

        open(my $wh, ">", $output) or die "\n  [!] Error: Unable to write output file ($!)\n\n";
        select((select($wh), $|=1)[0]);

        print $wh "#Experiment\tNorm factor\n";

        foreach my $file (sort keys %commonReact) {

            my ($sample, $factor, @norm);
            $sample = (fileparse($file, ".rc"))[0];
            @norm = map { $_->[1] } grep { exists $commonNorm{$_->[0]} && $commonNorm{$_->[0]} == @treatFiles } @{$commonReact{$file}};
            $factor = mean(@norm);

            print "\n  [*] $sample: $factor";
            print $wh join("\t", $sample, $factor) . "\n";

            runNorm($file, $factor) if ($runNorm);

        }

        close($wh);

    }
    else { die "\n\n  [!] Error: No bases left for normalization\n\n"; }

}
else {

    print "\n[+] Normalization factors:\n";

    open(my $wh, ">", $output) or die "\n  [!] Error: Unable to write output file ($!)\n\n";
    select((select($wh), $|=1)[0]);

    print $wh "#Experiment\tNorm factor #1\tNorm factor #2\n";

    foreach my $file (sort keys %commonReact) {

        my ($sample, @norm);
        $sample = (fileparse($file, ".rc"))[0];
        @norm = winsor_90(@{$commonReact{$file}});
        
        print "\n  [*] $sample: " . join(",", @norm);
        print $wh join("\t", $sample, @norm) . "\n";

        runNorm($file, join(",", @norm)) if ($runNorm);

    }

    close($wh);

}

print "\n\n[+] All done.\n\n";

sub runNorm {

    my ($file, $normFactor) = @_;

    my ($cmd, $ret, $outDir);
    $outDir = (fileparse($file, ".rc"))[0];
    $cmd = "$rfnorm -p $threads -sm $scoreMethod -nm $normMethod -n $minCov -ec $medianCoverage -t $file -rb $reactiveBases -nf $normFactor -ow";

    if (exists $fileMap{$file}->{"ctrl"}) { 

        $outDir .= "_vs_" . (fileparse($fileMap{$file}->{"ctrl"}, ".rc"))[0];
        $cmd .= " -u " . $fileMap{$file}->{"ctrl"}; 
        
    }

    if (exists $fileMap{$file}->{"denat"}) { 

        $outDir .= (exists $fileMap{$file}->{"ctrl"} ? "_" : "_vs_") . (fileparse($fileMap{$file}->{"denat"}, ".rc"))[0];    
        $cmd .= " -d " . $fileMap{$file}->{"denat"}; 
        
    }

    $outDir .= "_normfactor";
    $cmd .= " -o $outDir";
    $cmd .= " -il" if ($scoreMethod =~ /^[13]$/);
    $cmd .= " -mm $maxMutRate" if ($scoreMethod =~ /^[34]$/);
    $cmd .= " -mu $maxUntreatedMut" if ($scoreMethod == 3);
    $cmd .= " -pc $pseudoCount -s $maxScore" if ($scoreMethod == 1);

    $ret = `$cmd 2>&1`;

}

sub getCoveredBases {

    my $reopened = 0;

    TRANSCRIPT:
    while (1) {

        my ($id, @refCov, @i, @alli, %entries);

        { lock(@ids);
          $id = shift(@ids) if (@ids); }

        last unless(defined $id);

        if (!$reopened) {

            # Each thread will have its own independent copy of the fh
            $_->forceReopenFh() for (values %io);
            $reopened = 1;

        }

        $entries{$files[0]} = $io{$files[0]}->read($id);

        next if ($entries{$files[0]}->mediancoverage() < $medianCoverage);

        @refCov = $entries{$files[0]}->coverage();
        @alli = grep { $refCov[$_] >= $minCov } 0 .. $entries{$files[0]}->length() - 1;

        #next if (!@i);

        foreach my $file (@files[1 .. $#files]) {

            if (my $entry = $io{$file}->read($entries{$files[0]}->id())) {

                next TRANSCRIPT if ($entry->mediancoverage() < $medianCoverage);

                my @cov = $entry->coverage();
                @alli = grep { $cov[$_] >= $minCov } @alli;

                next TRANSCRIPT if (!@alli);

                $entries{$file} = $entry;

            }
            else { next TRANSCRIPT; }

        }

        @i = grep { substr($entries{$files[0]}->sequence(), $_, 1) =~ /^[$reactiveBases]$/ } @alli;

        if (@i) { 
        
            lock($common);
            $common += @i; 
            
        }
        else { next; }

        for my $i (0 .. $#treatFiles) {

            my ($treatEntry, $treatReads, @treatCounts, @treatCov, 
                @react);
            $treatEntry = $entries{$treatFiles[$i]};
            $treatReads = $treatEntry->readscount();
            @treatCounts = $treatEntry->counts();
            @treatCov = $treatEntry->coverage();

            if ($scoreMethod == 1) { @react = map { ["$id:$_", log($treatCounts[$_] + $pseudoCount)] } @i; }
            elsif ($scoreMethod == 2) { @react = map { ["$id:$_", $treatCounts[$_]] } @i; }
            elsif ($scoreMethod =~ /^[34]$/) { 
                
                @react = grep { $_->[1] <= $maxMutRate } map { ["$id:$_", $treatCounts[$_] / $treatCov[$_]] } @i;

                next TRANSCRIPT if (!@react); 
                
            }

            if (@ctrlFiles) {

                my ($ctrlEntry, $ctrlReads, @ctrlCounts, @ctrlCov);
                $ctrlEntry = $entries{$ctrlFiles[$i]};
                $ctrlReads = $ctrlEntry->readscount();
                @ctrlCounts = $ctrlEntry->counts();
                @ctrlCov = $ctrlEntry->coverage();

                if ($scoreMethod == 1) { 
                    
                    my ($tMean, $cMean);
                    $ctrlReads -= sum(@ctrlCounts);
                    $treatReads -= sum(@treatCounts);
                    $tMean = mean((map { log($treatCounts[$_] + $pseudoCount) } @alli), log(max(0, $treatReads) + $pseudoCount));
                    $cMean = mean((map { log($ctrlCounts[$_] + $pseudoCount) } @alli), log(max(0, $ctrlReads) + $pseudoCount)); 

                    @ctrlCounts = map { log($ctrlCounts[$_] + $pseudoCount) } @i;
                    @ctrlCounts = map { $_ / $cMean } @ctrlCounts;
                    @react = map { [ $react[$_]->[0], $react[$_]->[1] / $tMean ] } 0 .. $#react;
                    
                    if ($ignoreLower) {

                        my @i = grep { $react[$_]->[1] >= $ctrlCounts[$_] } 0 .. $#react;
                        @ctrlCounts = @ctrlCounts[@i];
                        @react = @react[@i];

                    }

                    next TRANSCRIPT if (!@react);

                    @react = map { [ $react[$_]->[0], min(max(0, $react[$_]->[1] - $ctrlCounts[$_]), $maxScore) ] } 0 .. $#react;

                }
                elsif ($scoreMethod == 3) {

                    my @ctrlRatio = map { $ctrlCounts[$_] / $ctrlCov[$_] } @i;

                    if ($maxUntreatedMut < 1) {

                        my @i = grep { $ctrlRatio[$_] <= $maxUntreatedMut } 0 .. $#react;
                        @ctrlRatio = @ctrlRatio[@i];
                        @react = @react[@i];

                    }

                    if ($ignoreLower) {

                        my @i = grep { $react[$_]->[1] >= $ctrlRatio[$_] } 0 .. $#react;
                        @ctrlRatio = @ctrlRatio[@i];
                        @react = @react[@i];

                    }

                    next TRANSCRIPT if (!@react);
                    
                    if (@denatFiles) {

                        my ($denatEntry, @denatCounts, @denatCov, @denatRatio);
                        $denatEntry = $entries{$denatFiles[$i]};
                        @denatCounts = $denatEntry->counts();
                        @denatCov = $denatEntry->coverage();
                        @denatRatio = map { $denatCounts[$_] / $denatCov[$_] } @i;

                        if ($maxMutRate < 1) {

                            my @i = grep { $denatRatio[$_] <= $maxMutRate } 0 .. $#react;
                            @denatRatio = @denatRatio[@i];
                            @react = @react[@i];

                        }

                        @react = map { [ $react[$_]->[0], max(0, ($react[$_]->[1] - $ctrlRatio[$_]) / $denatRatio[$_]) ] } 0 .. $#react;

                    }
                    else { @react = map { [ $react[$_]->[0], max(0, $react[$_]->[1] - $ctrlRatio[$_]) ] } 0 .. $#react; }
                    
                }

            }

            { lock(%tmpReact); 
              push(@{$tmpReact{$treatFiles[$i]}}, map { shareDataStruct($_) } @react); }

        }

    }

}

sub boxplot {

    my @reactivity = sort {$b->[1] <=> $a->[1]} @_;

    return if (!@reactivity);

    my ($avg, $perc75, $perc25, $iqrange, 
        $max, $p10, $outliers);
    $perc75 = percentile([ map { $_->[1] } @reactivity ], 0.75);
    $perc25 = percentile([ map { $_->[1] } @reactivity ], 0.25);
    $iqrange = $perc75 - $perc25;      # Interquartile range
    $max = $perc75 + ($iqrange * 1.5); # Above this value, data points are considered outliers
    $p10 = @reactivity < 100 && @reactivity >= 50 ? 10 : round(@reactivity * 0.1);

    if (@reactivity < 50) { $outliers = round(@reactivity * 0.02); }
    else { for (@reactivity) { $outliers++ if ($_->[1] > $max); } }

    $outliers = 1 if (!$outliers);
    @reactivity = @reactivity[$outliers .. $#reactivity];

    return($p10 ? map { $reactivity[$_]->[0] } 0 .. $p10 - 1 : ());

}

sub norm_28 {  # 2-8% Normalization

    my @reactivity = sort {$b->[1] <=> $a->[1]} @_;

    return if (!@reactivity);

    my ($avg, $p2, $p10, @norm);
    $p2 = max(1, round(@reactivity * 0.02) - 1);
    $p10 = max(1, round(@reactivity * 0.1) - 1);

    return($p10 ? map { $reactivity[$_]->[0] } $p2 .. $p10 : ());

}

sub winsor_90 {  # 90% Winsorizing

    my @reactivity = sort {$b->[1] <=> $a->[1]} @_;

    return if (!@reactivity);

    my ($percentile5, $percentile95);
    $percentile5 = percentile([ map { $_->[1] } @reactivity ], 0.05) || 0; # 5th percentile
    $percentile95 = percentile([ map { $_->[1] } @reactivity ], 0.95); # 95th percentile

    return($percentile5, $percentile95);

}

sub help {

    print "\n  [!] Error: Invalid option. Please check the help\n" if ($_[0]);

    die <<HELP;

 RF NormFactor (v$Core::Utils::VERSION)
 RNA Framework [http://www.rnaframework.com]

 Author:  Danny Incarnato (dincarnato[at]rnaframework.com)
 Summary: Calculate transcriptome-wide (and experiment-wide) normalization factors to be used with rf-norm

 Usage:   rf-normfactor [Options]

 Options                                            Description
 -u  or --untreated           <string>[,<string>]   A comma-separated list of untreated sample RC files (required by Ding/Siegfried 
                                                    scoring methods)
                                                    Note: if a single file is passed, this will be used for all treated files
 -d  or --denatured           <string>[,<string>]   A comma-separated list of denatured sample RC files (optional for Siegfried 
                                                    scoring method)
                                                    Note: if a single file is passed, this will be used for all treated files
 -t  or --treated             <string>[,<string>]   A comma-separated list of treated sample RC files
 -i  or --index               <string>              RCI index file
 -wl or --whitelist           <string>              A text file containing a list of IDs of transcripts to be used for calculating
                                                    normalization factors (Default: use all transcripts)
 -p  or --processors          <int>                 Number of processors to use (Default: 1)
 -o  or --output              <string>              Output file normalization factors will be reported to (Default: norm_factors.txt)
 -ow or --overwrite                                 Overwrites output file (if it already exists)
 -sm or --scoring-method      <int>                 Specifies the score calculation method (1-2, Default: 1): [1] Ding et al., 2014
                                                                                                              [2] Rouskin et al., 2014
                                                                                                              [3] Siegfried et al., 2014
                                                                                                              [4] Zubradt et al., 2016
 -nm or --norm-method         <int>                 Specifies the normalization method (1-3, Default: 1): [1] 2-8%
                                                                                                          [2] 90% Winsorizing
                                                                                                          [3] Box-plot
                                                    Note: 2-8% and Box-plot normalization methods cannot be used with Rouskin scoring method.
 -rb or --reactive-bases      <string>              Reactive bases to consider for signal normalization (Default: all)
                                                    Note: IUPAC codes are allowed
 -mc or --min-coverage        <int>                 Discards any base with coverage below this threshold (>=1, Default: 10)
 -ec or --median-coverage     <float>               Discards transcripts having median coverage below this threshold (>=0, Default: 0)
 -rn or --run-norm                                  Automatically runs rf-norm with the derived normalization factors on the input files
                                                    Note: the default output folder name of rf-norm will be used, with appended "_normfactor"
                                                          suffix. Any pre-existing folder with the same name wil be overwritten
 -rf or --rf-norm             <string>              Path to rf-norm executable (Default: assumes rf-norm is in PATH)

 Scoring method #1 options (Ding et al., 2014) options
 -pc or --pseudocount         <float>               Pseudocount added to reactivities to avoid division by 0 (>0, Default: 1)
 -s  or --max-score           <float>               Score threshold for capping raw reactivities (>0, Default: 10)

 Scoring method #3 options (Siegfried et al., 2014) options
 -mu or --max-untreated-mut   <float>               Maximum per-base mutation rate in untreated sample (0<=r<=1, Default: 0.05 [5%])

 Scoring methods #1 and #3 (Ding et al., 2014 & Siegfried et al., 2014)
 -il or --ignore-lower-than-untreated               Bases having raw reactivity in the treated sample lower than the untreated control, will
                                                    be ignored (not used during reactivity normalization)

 Scoring methods #3 and #4 (mutational profiling) options
 -mm or --max-mutation-rate   <float>               Maximum per-base mutation rate (0<=r<=1, Default: 1 [100%])

HELP

}

