#!/usr/bin/env perl

##
# RF Norm
# RNA Framework [http://www.rnaframework.com]
#
# Author:  Danny Incarnato (dincarnato[at]rnaframework.com)
# Summary: Performs normalization of RNA structure probing data
#
# This program is free software, and can be redistribute  and/or modified
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.
#
# Please see <http://www.gnu.org/licenses/> for more informations.
##

use strict;
use Config;
use File::Basename;
use FindBin qw($Bin);
use Getopt::Long qw(:config no_ignore_case);
use threads;
use threads::shared;

use lib $Bin . "/lib";

use Core::Mathematics qw(:all);
use Core::Statistics;
use Core::Utils;
use Data::IO;
use Data::XML;
use Graphics::Chart::Step;
use Graphics::Image;
use RF::Config;
use RF::Utils;
use RF::Data::IO::RC;
use Term::Constants qw(:screen);
use Term::Progress;

$|++;

die "\n  [!] Error: This program requires ithreads." .
    "\n             Please recompile Perl with ithreads support and try again\n\n" unless(defined $Config{useithreads});

my ($help, $output, $overwrite, $configfile,
    $scoremethod, $normmethod, $normwin, $winoffset,
    $reactive, $independent, $pseudocount, $maxscore,
    $meancov, $mediancov, $untreated, $treated,
    $index, $config, $threads, $R,
    $rc, $hasctrl, $decimals, $nan, $image,
    $remap, $raw, $suffix, $maxumut, $tmpDir,
    $denatured, $dynwin, $maxmutrate, $normfactor,
    $ignoreLower, @pool, @normfactor, @legendSort,
    %legendColors);

my $progressBar : shared;
my @ids : shared;
my %results : shared;
%results = ( cov      => 0,
             incov    => 0,
             diffuseq => 0,
             nouid    => 0,
             diffdseq => 0,
             nodid    => 0 );

do {

    local $SIG{__WARN__} = sub { };

    GetOptions( "h|help"                         => \$help,
                "o|output-dir=s"                 => \$output,
                "ow|overwrite"                   => \$overwrite,
                "c|config-file=s"                => \$configfile,
                "sm|scoring-method=s"            => \$scoremethod,
                "nm|norm-method=s"               => \$normmethod,
                "nw|norm-window=s"               => \$normwin,
                "wo|window-offset=s"             => \$winoffset,
                "dw|dynamic-window"              => \$dynwin,
                "rb|reactive-bases=s"            => \$reactive,
                "ni|norm-independent"            => \$independent,
                "pc|pseudocount=s"	             => \$pseudocount,
                "s|max-score=s"                  => \$maxscore,
                "mc|mean-coverage=s"             => \$meancov,
                "ec|median-coverage=s"           => \$mediancov,
                "u|untreated=s"                  => \$untreated,
                "t|treated=s"                    => \$treated,
                "d|denatured=s"                  => \$denatured,
                "i|index=s"                      => \$index,
                "D|decimals=i"                   => \$decimals,
                "p|processors=i"                 => \$threads,
                "n|nan=i"                        => \$nan,
                "rm|remap-reactivities"          => \$remap,
                "r|raw"                          => \$raw,
                "mu|max-untreated-mut=s"         => \$maxumut,
                "mm|max-mutation-rate=s"         => \$maxmutrate,
                "nf|norm-factor=s"               => \$normfactor,
                "il|ignore-lower-than-untreated" => \$ignoreLower,
                "g|img"                          => \$image,
                "R|R-path=s"                     => \$R ) or help(1);

};

help() if ($help);

$threads ||= 1;
$decimals ||= 3;
$scoremethod ||= 1;
$normmethod ||= 1;
$nan ||= 10;
$maxumut //= 0.05;
$maxmutrate ||= 1;
$normwin ||= $scoremethod =~ m/^Ding|Siegfried|Zubradt|[134]$/i ? 1e9 : 50;
$winoffset ||= $scoremethod =~ m/^Ding|Siegfried|Zubradt|[134]$/i ? 1e9 : 50;
$suffix = $raw ? "_raw/" : "_norm/";
$R = checkRinstall($R) if ($image);

die "\n  [!] Error: Number of processors must be an integer greater than 0\n\n" if ($threads < 1);
die "\n  [!] Error: Decimals value must be an integer comprised between 1 and 10\n\n" if ($decimals < 1 || $decimals > 10);
die "\n  [!] Error: Maximum allowed window size for dynamic windows is 30000 nt\n\n" if ($dynwin && $normwin > 30000);
die "\n  [!] Error: Provided RCI index file does not exist\n\n" if ($index && !-e $index);

print "\n[+] Parsing configuration...";

print "\n\n  [!] Warning: Provided configuration file doesn't exist. Will be created...\n" if (defined $configfile &&
                                                                                               !-e $configfile);

$config = RF::Config->new( file              => $configfile,
                           scoremethod       => $scoremethod,
                           normmethod        => $normmethod,
                           normwindow        => $normwin,
                           windowoffset      => $winoffset,
                           reactivebases     => $reactive,
                           normindependent   => $independent,
                           pseudocount       => $pseudocount,
                           maxscore          => $maxscore,
                           meancoverage      => $meancov,
                           mediancoverage    => $mediancov,
                           remapreactivities => $remap,
                           maxuntreatedmut   => $maxumut,
                           maxmutationrate   => $maxmutrate,
                           raw               => $raw ? 1 : 0 );

$config->summary();
$hasctrl = $config->scoremethod() =~ /^[24]$/ ? 0 : 1;

die "\n\n  [!] Error: No treated sample RC file provided\n\n" if (!defined $treated);
die "\n\n  [!] Error: Provided treated sample RC file doesn't exist\n\n" if (!-e $treated);

print "\n\n  [!] Warning: Parameter --ignore-lower-than-untreated will have no effect in the absence of an untreated sample\n" if ($ignoreLower && !$hasctrl);

if (defined $normfactor) {

    if ($config->normmethod() == 2) { 
        
        @normfactor = split(/,/, $normfactor);

        die "\n  [!] Error: 90\% Winsorizing requires two values for normalization factor\n\n" if (@normfactor != 2);
        
    }
    else { @normfactor = ($normfactor); }

    die "\n  [!] Error: Normalization factor must be >= 0\n\n" if ($normfactor[0] < 0);
    die "\n  [!] Error: Normalization factor must be > 0\n\n" if ($normfactor[-1] <= 0);

}

if (defined $untreated) { print "\n  [!] Warning: " . $config->scoremethod(1) . " scoring method has been chosen. Ignoring untreated sample file...\n" if (!$hasctrl); }
else {

    if ($hasctrl) {

        die "\n  [!] Error: No untreated sample RC file provided\n\n" if (!defined $untreated);
        die "\n  [!] Error: Provided untreated sample RC file doesn't exist\n\n" if (!-e $untreated);

    }

}

if (defined $denatured) { print "\n  [!] Warning: Denatured sample is considered only by Siegfried normalization method. Ignoring denatured sample file...\n" if ($config->scoremethod() != 3); }
else {

    die "\n  [!] Error: Provided denatured sample RC file doesn't exist\n\n" if ($config->scoremethod() == 3 &&
                                                                                 defined $denatured &&
                                                                                 !-e $denatured);

}

if (!defined $output) {

    my ($uid, $did, $tid);
    $uid = fileparse($untreated, ".rc") if (defined $untreated);
    $did = fileparse($denatured, ".rc") if (defined $denatured);
    $tid = fileparse($treated, ".rc");

    $output = ($config->scoremethod() =~ /^[24]$/ ? $tid : ($config->scoremethod() == 3 &&
                                                            defined $denatured ? $tid . "_vs_" . $uid . "_" . $did : $tid . "_vs_" . $uid)) . $suffix;

}
else { $output =~ s/\/?$/\//; }

$SIG{__DIE__} = \&cleanup;
$tmpDir = "${output}tmp/";

print "\n[+] Making output directory...";

if (-e $output) {

    if ($overwrite) {

        my $error = rmtree($output);

        die "\n\n  [!] Error: " . $error . "\n\n" if ($error);

    }
    else { die "\n\n  [!] Error: Output directory already exists." .
               "\n      Please use -ow (or --overwrite) to overwrite output directory\n\n"; }

}

if (my $error = mktree($tmpDir, "755")) { die "\n  [!] Error: Unable to create output directory ($error)\n\n"; }

mktree("${output}plots/", "755") if ($image);

# In case no index has been provided, we rebuild the index before generating the working threads
# The new index will be generated in the same path of the rc file, with extension .rci
if (!$index) {

    $rc = RF::Data::IO::RC->new( file       => $untreated,
                                 index      => "$untreated.rci",
                                 buildindex => 1 ) if ($hasctrl && !-e "$untreated.rci");

    $rc = RF::Data::IO::RC->new( file       => $denatured,
                                 index      => "$denatured.rci",
                                 buildindex => 1 ) if ($config->scoremethod() == 3 && defined $denatured && !-e "$denatured.rci");

}

$rc = RF::Data::IO::RC->new( file       => $treated,
                             index      => $index ? $index : "$treated.rci",
                             buildindex => $index || -e "$treated.rci" ? 0 : 1 );

if ($image) {

    %legendColors = ("Treated" => "#4daf4a");
    @legendSort = ("Treated");

    if ($config->scoremethod() == 3 && defined $denatured) {

        $legendColors{"Denatured"} = "#e41a1c";
        push(@legendSort, "Denatured");
    
    }

    if ($hasctrl) {

        $legendColors{"Untreated"} = "#377eb8";
        push(@legendSort, "Untreated");

    }

}

print "\n[+] Loading transcript IDs... ";

@ids = $rc->ids();

print scalar(@ids) . " transcripts loaded." .
      "\n[+] Normalizing reactivities. Please wait...\n\n";

$progressBar = shared_clone(Term::Progress->new( max     => scalar(@ids),
                                                 width   => 50,
                                                 colored => 1 ));
$progressBar->init();

@pool = map{ threads->create(\&norm) } 1 .. $threads;
$_->join() for(@pool);

print "\n\n[+] Normalization statistics:\n" .
      "\n  [*] Covered transcripts:   " . $results{cov} .
      "\n  [*] Discarded transcripts: " . ($results{incov} + $results{diffuseq} + $results{diffdseq} + $results{nocov} + $results{nouid} + $results{nodid}) . " total" .
      "\n                             " . $results{incov} . " insufficient coverage";

print "\n                             " . $results{diffuseq} . " mismatch between treated and untreated sample sequence" .
      "\n                             " . $results{nouid} . " absent in untreated sample reference" if ($hasctrl);

print "\n                             " . $results{diffdseq} . " mismatch between treated and denatured sample sequence" .
      "\n                             " . $results{nodid} . " absent in denatured sample reference" if (defined $denatured &&
                                                                                                        $config->scoremethod() == 3);
print "\n\n[+] All done.\n\n";

cleanup();

sub norm {

    my ($urc, $trc, $drc, $reactivity,
        $attributes, $xmlio, $xml, $plot);

    $urc = RF::Data::IO::RC->new( file  => $untreated,
                                  index => $index || "$untreated.rci" ) if ($hasctrl);

    $drc = RF::Data::IO::RC->new( file  => $denatured,
                                  index => $index || "$denatured.rci" ) if (defined $denatured && $config->scoremethod() == 3);

    $trc = RF::Data::IO::RC->new( file  => $treated,
                                  index => $index || "$treated.rci" );
    $plot = Graphics::Image->new( width  => 10,
                                  height => 4,
                                  R      => $R,
                                  tmpdir => $tmpDir ) if ($image);

    while (1) {

        my ($id);

        { lock(@ids);
          $id = shift(@ids) if (@ids); }

        last unless(defined $id);

        { lock($progressBar);
          $progressBar->update(1); }

        my ($tentry, $sequence, $seqmask, $uReads, $tReads,
            @seqmask, @tcounts, @ucounts, @dcounts, @tcov,
            @ucov, @dcov, @last, @raw, @norm);
        $tentry = $trc->read($id);

        if ($tentry->meancoverage() < $config->meancoverage() ||
            $tentry->mediancoverage() < $config->mediancoverage()) {

            lock(%results);
            $results{incov}++;

            next;

        }

        $sequence = $tentry->sequence();
        $tReads = $tentry->readscount();
        @seqmask = split(//, $sequence);
        @tcounts = $tentry->counts();
        @tcov = $tentry->coverage();

        if ($hasctrl) { # Ding/Siegfried methods

            if (my $uentry = $urc->read($id)) {

                if ($uentry->sequence() ne $sequence) {

                    lock(%results);
                    $results{diffuseq}++;

                    next;

                }

                if ($uentry->meancoverage() < $config->meancoverage() ||
                    $uentry->mediancoverage() < $config->mediancoverage()) {

                    lock(%results);
                    $results{incov}++;

                    next;

                }

                $uReads = $uentry->readscount();
                @ucounts = $uentry->counts();
                @ucov = $uentry->coverage();

            }
            else {

                lock(%results);
                $results{nouid}++;

                next;

            }

            if ($config->scoremethod() == 3) { # Siegfried

                if (defined $denatured) {

                    if (my $dentry = $drc->read($id)) {

                        if ($dentry->sequence() ne $sequence) {

                            lock(%results);
                            $results{diffdseq}++;

                            next;

                        }

                        if ($dentry->meancoverage() < $config->meancoverage() ||
                            $dentry->mediancoverage() < $config->mediancoverage()) {

                            lock(%results);
                            $results{incov}++;

                            next;

                        }

                        @dcounts = $dentry->counts();
                        @dcov = $dentry->coverage();

                    }
                    else {

                        lock(%results);
                        $results{nodid}++;

                        next;

                    }

                }

                for (0 .. $#tcounts) {

                    if ($tcov[$_] < $nan ||                   # Coverage per base < $nan in each sample
                        $ucov[$_] < $nan ||
                        (defined $denatured &&
                         $dcov[$_] < $nan) ||
                        $ucounts[$_] / $ucov[$_] > $maxumut ||
                        $ucounts[$_] / $ucov[$_] > $maxmutrate ||
                        $tcounts[$_] / $tcov[$_] > $maxmutrate ||
                        (defined $denatured &&
                         $dcounts[$_] / $dcov[$_] > $maxmutrate)) { # Mutation rate in untreated sample > $maxumut

                        $seqmask[$_] = "X";

                    }
                    else {

                        $ucounts[$_] = $ucounts[$_] / $ucov[$_];
                        $tcounts[$_] = $tcounts[$_] / $tcov[$_];
                        $dcounts[$_] = $dcounts[$_] / $dcov[$_] if (defined $denatured);

                        $seqmask[$_] = "X" if ($ignoreLower && $tcounts[$_] < $ucounts[$_]);

                    }

                }

                if (defined $denatured) { @raw = map { $dcounts[$_] ? max(0, ($tcounts[$_] - $ucounts[$_]) / $dcounts[$_]) : 0 } 0 .. $#tcounts; }
                else { @raw = map { max(0, ($tcounts[$_] - $ucounts[$_])) } 0 .. $#tcounts; }

            }
            else {  # Ding

                my ($umean, $tmean, @uLog, @tLog);
                @uLog = map { log($_ + $config->pseudocount()) } @ucounts;
                @tLog = map { log($_ + $config->pseudocount()) } @tcounts;
                $uReads -= sum(@ucounts);
                $tReads -= sum(@tcounts);

                for (0 .. $#tcounts) { $seqmask[$_] = "X" if ($ucov[$_] < $nan || $tcov[$_] < $nan); }

                $umean = mean(@uLog, log(max(0, $uReads) + $config->pseudocount()));
                $tmean = mean(@tLog, log(max(0, $tReads) + $config->pseudocount()));

                if (!$umean || !$tmean) {

                    lock(%results);
                    $results{incov}++;

                    next;

                }

                @uLog = map { $_ / $umean } @uLog;
                @tLog = map { $_ / $tmean } @tLog;
                @raw = map { min(max(0, $tLog[$_] - $uLog[$_]), $config->maxscore()) } 0 .. $#tcounts;

                for (0 .. $#tcounts) { $seqmask[$_] = "X" if ($ignoreLower && $tLog[$_] < $uLog[$_]); }

            }

        }
        else {

            if ($config->scoremethod() == 4) { # Zubradt method

                for (0 .. $#tcounts) { 

                    if ($tcov[$_] < $nan || $tcounts[$_] / $tcov[$_] > $maxmutrate) { $seqmask[$_] = "X"; }
                    else { $tcounts[$_] /= $tcov[$_]; }

                }

            }
            else { # Rouskin

                for (0 .. $#tcounts) { $seqmask[$_] = "X" if ($tcov[$_] < $nan); }

            }

            @raw = @tcounts;

        }

        $seqmask = join("", @seqmask);

        if ($image) {

            $plot->file("${output}plots/$id.pdf");
            
            my ($step, @maskedPos);
            @maskedPos = grep { $seqmask[$_] eq "X" } 0 .. $#seqmask; 
            @tcounts[@maskedPos] = (0) x @maskedPos;
            @ucounts[@maskedPos] = (0) x @maskedPos if (@ucounts);
            @dcounts[@maskedPos] = (0) x @maskedPos if (@dcounts); 
            $step = Graphics::Chart::Step->new( data            => [ @dcounts, @ucounts, @tcounts ],
                                                dataLabelType   => { "pos" => "numeric" },
                                                dataLabels      => { "sample" => [ ("Denatured") x @dcounts, ("Untreated") x @ucounts, ("Treated") x @tcounts ],
                                                                     "pos"    => [ (1 .. $tentry->length()) x ((@tcounts + @ucounts + @dcounts) / $tentry->length()) ] },
                                                fill            => "sample",
                                                x               => "pos",
                                                legendColors    => \%legendColors,
                                                legendSort      => \@legendSort,
                                                dataLabelSort   => { "sample" => \@legendSort },
                                                xTitle          => "Position (nt)",
                                                yTitle          => $config->scoremethod() =~ /^[34]$/ ? "Mutation rate" : "RT-stops",
                                                axisTextSize    => 12,
                                                axisTitleSize   => 14,
                                                legendTextSize  => 12,
                                                lineThickness   => 0.4 );
            
            $plot->plot([$step]);

        }

        if ($raw) { # Skips normalization

            my $bases = $config->reactivebases();
            @norm = @raw;

            undef($norm[$-[0]]) while($seqmask =~ m/[^$bases]/g);

        }
        else { # Performs normalization

            my ($winlen, $winoffset);

            if ($dynwin) { # Dynamic window (dynamically resized to contain $winlen x reactive bases)

                my ($tmpsequence, $bases, $i);
                $winlen = $config->normwindow() - 1;
                $winoffset = $config->windowoffset() - 1;
                $tmpsequence = $sequence;
                $bases = $config->reactivebases();
                $tmpsequence =~ s/^[^$bases]*//;
                $i = 0;

                while($tmpsequence =~ m/^((?:[$bases][^$bases]*){$winlen}[$bases])/) { # Regex to capture a window containing $bases x $winlen

                    my ($winseq, $winmask, @wincounts, @winnorm);
                    $winseq = $1;
                    $i = index($sequence, $winseq, $i);
                    $winmask = substr($seqmask, $i, length($winseq));
                    @wincounts = @raw[$i .. $i + length($winseq) - 1];

                    foreach my $base ($config->normindependent() ? split(//, $bases) : $bases) {

                        my (@base, @basenorm);

                        push(@base, $wincounts[$-[0]]) while($winmask =~ m/[$base]/g); # Extracts reactivities only on specified (unmasked) bases

                        @basenorm = $config->normmethod() == 1 ? norm_28(@base) : ($config->normmethod() == 2 ? winsor_90(@base) : ($config->normmethod() == 3 ? boxplot(@base) : mitchell(@base)));
                        @basenorm = (undef) x scalar(@base) if (!@basenorm); # In case this window has not been covered

                        $winnorm[$-[0]] = shift(@basenorm) while($winmask =~ m/[$base]/g);
                        push(@winnorm, undef) while (@winnorm < length($winseq)); # If winseq doesn't end with a $base, winnorm will be shorter than winseq

                    }

                    for (0 .. $#winnorm) { push(@{$norm[$_ + $i]}, $winnorm[$_]); }

                    ($winseq) = $winseq =~ m/^((?:[$bases][^$bases]*){$winoffset}[$bases])/;
                    $tmpsequence =~ s/^$winseq[^$bases]*//;
                    $i += length($winseq);

                }

                $i = 0;

                if ($tmpsequence =~ m/[$bases]/) { # There are still $bases in the remaining sequence (or the window was larger than the analyzed sequence)

                    my ($winseq, $winmask, @wincounts, @winnorm);

                    if ($sequence =~ m/((?:[$bases][^$bases]*){$winlen}[$bases])[^$bases]*$/) { # To avoid end biases, $winlen bases are extracted from the end if possible

                        $winseq = $1;
                        $i = index($sequence, $winseq, $i); #changeme with $i = length($sequence) - length($winseq);
                        $winmask = substr($seqmask, $i, length($winseq));

                    }
                    else {

                        $winseq = $sequence;
                        $winmask = substr($seqmask, -length($winseq));

                    }

                    @wincounts = @raw[$i .. $i + length($winseq) - 1];

                    foreach my $base ($config->normindependent() ? split(//, $bases) : $bases) {

                        my (@base, @basenorm);

                        push(@base, $wincounts[$-[0]]) while($winmask =~ m/[$base]/g); # Extracts reactivities only on specified bases

                        @basenorm = $config->normmethod() == 1 ? norm_28(@base) : ($config->normmethod() == 2 ? winsor_90(@base) : ($config->normmethod() == 3 ? boxplot(@base) : mitchell(@base)));
                        @basenorm = (undef) x scalar(@base) if (!@basenorm); # In case this window has not been covered

                        $winnorm[$-[0]] = shift(@basenorm) while($winmask =~ m/[$base]/g);
                        push(@winnorm, undef) while (@winnorm < length($winseq)); # If winseq doesn't end with a $base, winnorm will be shorter than winseq

                    }

                    for (0 .. $#winnorm) { push(@{$norm[$_ + $i]}, $winnorm[$_]); }

                }

            }
            else {

                $winlen = $config->normwindow() > @raw ? @raw : $config->normwindow();
                $winoffset = $winlen == @raw ? $winlen : $config->windowoffset();

                for(my $i = 0; $i + $winlen - 1 <= $#raw; $i += $winoffset) {

                    my ($winmask, @wincounts, @winnorm);
                    $winmask = substr($seqmask, $i, $winlen);
                    @wincounts = @raw[$i .. $i + $winlen - 1];

                    foreach my $base ($config->normindependent() ? split(//, $config->reactivebases()) : $config->reactivebases()) {

                        my (@base, @basenorm);

                        push(@base, $wincounts[$-[0]]) while($winmask =~ m/[$base]/g); # Extracts reactivities only on specified bases

                        @basenorm = $config->normmethod() == 1 ? norm_28(@base) : ($config->normmethod() == 2 ? winsor_90(@base) : ($config->normmethod() == 3 ? boxplot(@base) : mitchell(@base)));
                        @basenorm = (undef) x scalar(@base) if (!@basenorm); # In case this window has not been covered

                        $winnorm[$-[0]] = shift(@basenorm) while($winmask =~ m/[$base]/g);
                        push(@winnorm, undef) while (@winnorm < length($winmask)); # If winseq doesn't end with a $base, winnorm will be shorter than winseq

                    }

                    for (0 .. $#winnorm) { push(@{$norm[$_ + $i]}, $winnorm[$_]); }

                }

                if (@norm < @raw) { # Missing the last window

                    my ($winmask, $i, @wincounts, @winnorm);
                    $i = @raw - $winlen;
                    $winmask = substr($seqmask, $i);
                    @wincounts = @raw[$i .. $#raw];

                    foreach my $base ($config->normindependent() ? split(//, $config->reactivebases()) : $config->reactivebases()) {

                        my (@base, @basenorm);

                        push(@base, $wincounts[$-[0]]) while($winmask =~ m/[$base]/g); # Extracts reactivities only on specified bases

                        @basenorm = $config->normmethod() == 1 ? norm_28(@base) : ($config->normmethod() == 2 ? winsor_90(@base) : ($config->normmethod() == 3 ? boxplot(@base) : mitchell(@base)));
                        @basenorm = (undef) x scalar(@base) if (!@basenorm); # In case this window has not been covered

                        $winnorm[$-[0]] = shift(@basenorm) while($winmask =~ m/[$base]/g);
                        push(@winnorm, undef) while (@winnorm < length($winmask)); # If winseq doesn't end with a $base, winnorm will be shorter than winseq

                    }

                    for (0 .. $#winnorm) { push(@{$norm[$_ + $i]}, $winnorm[$_]); }

                }

            }

            @norm = map { ref($_) eq "ARRAY" ? (isnumeric(@{$_}) ? mean(@{$_}) : undef) : undef } @norm;
            @norm = zarringhalam(@norm) if ($config->remapreactivities());

        }

        # Set masked positions to NaN
        @norm = map { $seqmask[$_] ne "X" ? $norm[$_] : undef } 0 .. $#tcov;

        # This control has been added because in certain cases the coverage is above threshold, but the signal
        # is limited to non-probed bases, thus the final reactivity profile is NaN for all bases.
        # In such case, the transcript should be considered as non-covered, and excluded from output.
        if (!grep {defined $_} @norm) {

            lock(%results);
            $results{incov}++;

            next;

        }

        $reactivity = join(",", map { defined $_ ? sprintf("%." . $decimals . "f", $_) : "NaN" } @norm);

        # For nicer formatting
        $sequence =~ s/(\w{60})/$1\n/g;
        $reactivity =~ s/((?:[\w\.]+,){60})/$1\n/g;

        $attributes = { combined => "FALSE",
                        reactive => $config->reactivebases(),
                        scoring  => $config->scoremethod(1),
                        norm     => $raw ? "raw" : $config->normmethod(1),
                        offset   => $config->windowoffset(),
                        win      => $config->normwindow(),
                        remap    => $config->remapreactivities(),
                        tool     => "rf-norm" };

        if ($config->scoremethod() == 1) { # Ding

            $attributes->{max} = $config->maxscore();
            $attributes->{pseudo} = $config->pseudocount();

        }
        elsif ($config->scoremethod() == 3) { # Siegfried

            $attributes->{maxumut} = $config->maxuntreatedmut();

        }

        if ($config->scoremethod() =~ m/^[34]$/) { # Siegfried/Zubradt

            $attributes->{maxmutrate} = $config->maxmutationrate();

        }

        $xmlio = Data::IO->new( file      => $output . $id . ".xml",
                                mode      => "w",
                                binmode   => ":encoding(utf-8)",
                                verbosity => -1 );
        $xml = Data::XML->new( heading   => 1,
                               indent    => 0,
                               autoclose => 1 );

        $xml->opentag("data", $attributes);
        $xml->opentag("transcript", { id     => $id,
                                      length => $tentry->length() });
        $xml->opentag("sequence");
        $xml->addtext($sequence);
        $xml->closelasttag();
        $xml->opentag("reactivity");
        $xml->addtext($reactivity);
        $xmlio->write($xml->xml());

        { lock(%results);
          $results{cov}++; }

    }

    threads->exit();

}

sub cleanup {

    unlink(glob($tmpDir . "*"));

    rmtree($tmpDir);

}

sub winsor_90 {  # 90% Winsorizing

    my @reactivity = @_;

    my ($percentile5, $percentile95);

    if (!defined $normfactor) {

        return if (@reactivity < 10);

        $percentile5 = percentile(\@reactivity, 0.05) || 0; # 5th percentile
        $percentile95 = percentile(\@reactivity, 0.95); # 95th percentile

        return() unless($percentile95);

    }
    else { ($percentile5, $percentile95) = @normfactor; }

    @reactivity = map { min(($_ < $percentile5 ? $percentile5 : $_) / $percentile95, 1) } @reactivity;

    return(@reactivity);

}

sub norm_28 {  # 2-8% Normalization

    my @reactivity = @_;

    my ($avg, @norm);

    if (!defined $normfactor) {

        return if (@reactivity < 10);
        
        my ($p2, $p10);
        $p2 = max(1, round(@reactivity * 0.02) - 1);
        $p10 = max(1, round(@reactivity * 0.1) - 1);

        return() unless($p10);

        @norm = sort {$b <=> $a} @reactivity;
        $avg = mean(@norm[$p2 .. $p10]);

        return() if ($avg == 0);

    }
    else { $avg = $normfactor[0]; }

    @reactivity = map { max(0, $_ / $avg) } @reactivity;

    return(@reactivity);

}

sub boxplot {  # Box-plot Normalization

    my @reactivity = @_;

    my ($avg, @norm);

    if (!defined $normfactor) {

        return if (@reactivity < 10);

        my ($iqrange, $max, $p10, $outliers);
        $iqrange = percentile(\@reactivity, 0.75) - percentile(\@reactivity, 0.25); # Interquartile range
        $max = percentile(\@reactivity, 0.75) + ($iqrange * 1.5);                   # Above this value, data points are considered outliers
        @norm = sort {$b <=> $a} @reactivity;
        $p10 = @norm < 100 && @norm >= 50 ? 10 : round(@norm * 0.1);

        if (@norm < 50) { $outliers = round(@norm * 0.02); }
        else { for (@norm) { $outliers++ if ($_ > $max); } }

        $outliers = 1 if (!$outliers);
        @norm = @norm[$outliers .. $#norm];

        return unless($p10);

        $avg = mean(@norm[0 .. $p10 - 1]);

        return() if ($avg == 0);

    }
    else { $avg = $normfactor[0]; }

    # Values will range between 0 and ~1.5 according to
    # "SHAPE-Directed RNA Secondary Structure Prediction" (Low et al., 2010)
    @reactivity = map { max(0, $_ / $avg) } @reactivity;

    return(@reactivity);

}

sub mitchell { # Mitchell et al., 2023 (Mustoe lab)

    my @reactivity = @_;

    return if (@reactivity < 10);

    my ($avg);

    if (!defined $normfactor) {

        my ($p90, $p95, $p75, @sorted, @sortedHigh);
        @sorted = sort {$a <=> $b} @reactivity;
        @sortedHigh = grep { $_ > 0.001 } @sorted;
        $p90 = max(1, round(@sorted * 0.9) - 1);
        $p95 = max(1, round(@sorted * 0.95) - 1);
        $avg = mean(@sorted[$p90 .. $p95]);
        $p75 = max(1, round(@sortedHigh * 0.75) - 1);
        $avg = max($avg, $sortedHigh[$p75]);

    }
    else { $avg = $normfactor[0]; }

    @reactivity = map { max(0, $_ / $avg) } @reactivity;

    return(@reactivity);

}

sub zarringhalam {

    my @reactivity = @_;

    return if (!@reactivity);

    my ($oldmin, $newmin, $oldmax, $newmax,
        $max);
    $max = max(grep { defined $_ } @reactivity);


    for (@reactivity) {

        next if (!isnumeric($_));

        if ($_ < 0.25) { ($oldmin, $oldmax, $newmin, $newmax) = (0, 0.25, 0, 0.35); }
        elsif ($_ >= 0.25 && $_ < 0.3) { ($oldmin, $oldmax, $newmin, $newmax) = (0.25, 0.3, 0.35, 0.55); }
        elsif ($_ >= 0.3 && $_ < 0.7) { ($oldmin, $oldmax, $newmin, $newmax) = (0.3, 0.7, 0.55, 0.85); }
        else { ($oldmin, $oldmax, $newmin, $newmax) = (0.7, $max, 0.85, 1); }

        $_ = maprange($oldmin, $oldmax, $newmin, $newmax, $_);

    }

    return(@reactivity);

}

sub help {

    print "\n  [!] Error: Invalid option. Please check the help\n" if ($_[0]);

    die <<HELP;

 RF Norm (v$Core::Utils::VERSION)
 RNA Framework [http://www.rnaframework.com]

 Author:  Danny Incarnato (dincarnato[at]rnaframework.com)
 Summary: Performs normalization of RNA structure probing data

 Usage:   rf-norm [Options]

 Options                                            Description
 -u  or --untreated           <string>              Untreated sample RC file (required by Ding/Siegfried scoring methods)
 -d  or --denatured           <string>              Denatured sample RC file (optional for Siegfried scoring method)
 -t  or --treated             <string>              Treated sample RC file
 -i  or --index               <string>              An RCI index file to be used for all input RC files
                                                    Note: If no RCI index is provided, RF Norm will look for files with .rci
                                                          extension in the same input folder as the RC files, named after the RC
                                                          files (e.g., Sample.rc will look for Sample.rc.rci).
                                                          If no RCI file is found, it will be created at runtime, and stored in
                                                          the same folder of the input RC files.
 -p  or --processors          <int>                 Number of processors to use (Default: 1)
 -o  or --output-dir          <string>              Output directory (Default: <treated>_vs_<untreated>/ for Ding/Siegfried methods,
                                                                               <treated>/ for Rouskin/Zubradt methods, or
                                                                               <treated>_vs_<untreated>_<denatured>/ for Siegfried method)
 -ow or --overwrite                                 Overwrites output directory (if the specified path already exists)
 -c  or --config-file         <string>              A configuration file with normalization parameters
                                                    Note: If the provided file exists, this will override any command-line
                                                          specified parameter. If the provided file doesn't exist, it will
                                                          be generated using command-line specified (or default) parameters
 -sm or --scoring-method      <int>                 Specifies the score calculation method (1-2, Default: 1): [1] Ding et al., 2014
                                                                                                              [2] Rouskin et al., 2014
                                                                                                              [3] Siegfried et al., 2014
                                                                                                              [4] Zubradt et al., 2016
 -nm or --norm-method         <int>                 Specifies the normalization method (1-3, Default: 1): [1] 2-8%
                                                                                                          [2] 90% Winsorizing
                                                                                                          [3] Box-plot
                                                                                                          [4] Mitchell et al., 2023
                                                    Note: 2-8% and Box-plot normalization methods cannot be used with Rouskin scoring method.
 -r  or --raw                                       Reports raw reactivities (skips normalization)
 -rm or --remap-reactivities                        Remaps reactivities to values ranging from 0 to 1 according to Zarringhalam et al., 2012
 -rb or --reactive-bases      <string>              Reactive bases to consider for signal normalization (Default: all)
                                                    Note: IUPAC codes are allowed
 -ni or --norm-independent                          Each reactive base will be normalized independently
 -nw or --norm-window         <int>                 Window for signal normalization (>=3, Default: whole transcript (1e9) [Ding|Siegfried|Zubradt], 50 [Rouskin])
 -wo or --window-offset       <int>                 Offset for sliding window during normalization (Default: none [Ding|Siegfried|Zubradt], 50 [Rouskin])
 -dw or --dynamic-window                            When enabled, the normalization window is dynamically resized to include at least that number of reactive bases
                                                    (e.g. "-rb AC -nw 50 -dw" instructs RF Norm to normalize reactivities in windows containing at least 50 A/C residues)
 -nf or --norm-factor         <float>[,<float>]     When provided, this will be used as the normalization factor for all transcripts (default behavior is to
                                                    calculate the normalization factor independently for each transcript)
                                                    Note: 90% Winsorizing requires 2 normalization factors, provided as a comma-separated list, respectively corresponding
                                                          to the 5th and 95th percentiles of the distribution of raw reactivities
 -mc or --mean-coverage       <float>               Discards any transcript with mean coverage below this threshold (>=0, Default: 0)
 -ec or --median-coverage     <float>               Discards any transcript with median coverage below this threshold (>=0, Default: 0)
 -D  or --decimals            <int>                 Number of decimals for reporting reactivities (1-10, Default: 3)
 -n  or --nan                 <int>                 Transcript positions with read coverage below this threshold will be reported as NaN in
                                                    the reactivity profile (>0, Default: 10)
 -g  or --img                                       Enables the generation of plots of raw reactivity data (RT-stops or mutation rates) 
                                                    for each transcript across all samples (requires R)
 -R  or --R-path              <string>              Path to R executable (Default: assumes R is in PATH)

 Scoring method #1 options (Ding et al., 2014) options
 -pc or --pseudocount         <float>               Pseudocount added to reactivities to avoid division by 0 (>0, Default: 1)
 -s  or --max-score           <float>               Score threshold for capping raw reactivities (>0, Default: 10)

 Scoring method #3 options (Siegfried et al., 2014) options
 -mu or --max-untreated-mut   <float>               Maximum per-base mutation rate in untreated sample (0<=r<=1, Default: 0.05 [5%])

 Scoring methods #1 and #3 (Ding et al., 2014 & Siegfried et al., 2014)
 -il or --ignore-lower-than-untreated               Bases having raw reactivity in the treated sample lower than the untreated control, will
                                                    be ignored (not used during reactivity normalization) and will be reported as NaNs

 Scoring methods #3 and #4 (mutational profiling) options
 -mm or --max-mutation-rate   <float>               Maximum per-base mutation rate (0<=r<=1, Default: 1 [100%])

HELP

}
