#!/usr/bin/env perl

##
# RF MMTools
# RNA Framework [http://www.rnaframework.com]
#
# Author:  Danny Incarnato (dincarnato[at]rnaframework.com)
# Summary: Easy manipulation of MM files
#
# This program is free software, and can be redistribute  and/or modified
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.
#
# Please see <http://www.gnu.org/licenses/> for more informations.
##

use strict;
use File::Basename;
use File::Copy;
use FindBin qw($Bin);
use Getopt::Long qw(:config no_ignore_case);

use lib $Bin . "/lib";

use Core::Mathematics qw(:all);
use Core::Statistics;
use Core::Utils;
use Data::Sequence::Utils;
use RF::Data::IO::MM;
use RF::Data::IO::RC;
use RF::Data::RC;
use Term::Table;

$|++;

my ($output, $overwrite, $command, $help,
    $tab, $keepBases, $minMutPerRead,
    $minReadLen, $randSubsample, $bedAnno, $whitelist, 
    $discardPos, $minReads, $minRate, $maxRate, 
    $spearman, @mm, %commands);

%commands = map { $_ => 1 } qw(index view merge extract alignids
                               stats torc correlate split);

do {

    local $SIG{__WARN__} = sub { };

    GetOptions( "h|help"                => \$help,
                "o|output=s"            => \$output,
                "ow|overwrite"          => \$overwrite,
                "kb|keepBases=s"        => \$keepBases,
                "mpr|minMutPerRead=s"   => \$minMutPerRead,
                "mrl|minReadLen=s"      => \$minReadLen,
                "rs|randSubsample=s"    => \$randSubsample,
                "a|annotation=s"        => \$bedAnno,
                "wl|whitelist=s"        => \$whitelist,
                "dp|discardPositions=s" => \$discardPos,
                "mr|minReads=s"         => \$minReads,
                "S|spearman"            => \$spearman,
                "nr|minRate=s"          => \$minRate,
                "xr|maxRate=s"          => \$maxRate ) or help(1);

    $command = shift(@ARGV);
    $command = lc($command);
    @mm = uniq(@ARGV);

};

help() if ($help);

# Default
$randSubsample ||= 1;
$minMutPerRead //= 1;
$keepBases ||= "ACGT";
$minReadLen ||= 0;
$minReads ||= 1000;
$minRate ||= 0;
$maxRate ||= 1;

##
# Input validation
##

die "\n  [!] Error: No command specified\n\n" if (!$command);
die "\n  [!] Error: Invalid command \"$command\"\n\n" if (!exists $commands{$command});
die "\n  [!] Error: No MM file specified\n\n" if (!@mm);

for (0 .. ($command !~ /^(?:merge|alignids)$/ ? 0 : ($command eq "correlate" ? 1 : $#mm))) { 
    
    die "\n  [!] Error: Provided MM file \"" . $mm[$_] . "\" does not exist\n\n" if (!-e $mm[$_]); 

    next if ($command eq "index" || ($command eq "view" && @mm == 1));

    die "\n  [!] Error: Missing MMI index for MM file \"" . $mm[$_] . "\"." .
        "\n             Run 'rf-mmtools index " . $mm[$_] . "' and try again\n\n" if (!-e $mm[$_] . ".mmi");

}

if ($command eq "index") {

    for (@mm) {

        my $mmIO = RF::Data::IO::MM->new( file  => $_,
                                          index => "$_.mmi",
                                          mode  => "r" );
        $mmIO->buildIndex();
        $mmIO->close();

    }

}
elsif ($command eq "view") {

    my ($file, $mmIO);
    $file = shift(@mm);
    $mmIO = RF::Data::IO::MM->new( file   => $file,
                                   index  => "$file.mmi" );
    @mm = $mmIO->ids() if (!@mm);

    foreach my $id (@mm) {

        my $sequence = $mmIO->sequence($id);

        die "\n  [!] Error: Invalid transcript ID \"$id\"\n\n" if (!defined $sequence);

        print "$id\n$sequence\n";

        while(my ($start, $end, $muts) = $mmIO->readStream($id)) { print join("\t", $start, $end, join(",", @$muts)) . "\n"; }

        print "\n";

    }

}
elsif ($command eq "merge") {

    $output ||= "merge.mm";
    $output =~ s/(?:\.mm)?$/.mm/;

    die "\n  [!] Error: Output MM file already exists." .
        "\n             Please use -ow (or --overwrite) to overwrite output file\n\n" if (-e $output && !$overwrite);
    die "\n  [!] Error: Only one MM file provided\n\n" if (@mm < 2);

    my ($mmOut, $mappedreads, @mmIn, @ids);
    $mmOut = RF::Data::IO::MM->new( file       => $output,
                                    index      => "$output.mmi",
                                    mode       => "w",
                                    overwrite  => $overwrite );
    @mmIn = map { RF::Data::IO::MM->new( file  => $mm[$_],
                                         index => $mm[$_] . ".mmi",
                                         mode  => "r" ) } 0 .. $#mm;
    @ids = uniq(sort map { $_->ids() } @mmIn);

    foreach my $id (@ids) {

        die "\n  [!] Error: Malformed MM file(s)\n\n" if (!defined $id);

        my ($sequence, $minPos, %withSeq, %buffer);

        for my $i (0 .. $#mmIn) {

            if (my $seq = $mmIn[$i]->sequence($id)) {

                die "\n  [!] Error: Sequence for transcript \"$id\" differs across MM files\n\n" if (defined $sequence && $sequence ne $seq);

                $sequence = $seq;
                $withSeq{$i} = 1;

            }

        }

        $mmOut->append_transcript($id, $sequence);

        $minPos = 0;

        while (keys %withSeq) {

            for my $i (keys %withSeq) {

                if (!exists $buffer{$i}) {

                    while (my @read = $mmIn[$i]->readStream($id)) {

                        if ($read[0] == $minPos) { $mmOut->append_read(@read[0, 1], scalar(@{$read[2]}), $read[2]); }
                        else { 
                            
                            $buffer{$i} = \@read; 

                            last;

                        }

                    }

                    delete($withSeq{$i}) if (!exists $buffer{$i});

                }

            }

            if (keys %buffer) {

                $minPos = min(map { $buffer{$_}->[0] } keys %buffer);

                for my $i (grep { $buffer{$_}->[0] == $minPos } keys %buffer) {

                    $mmOut->append_read($buffer{$i}->[0], $buffer{$i}->[1], scalar(@{$buffer{$i}->[2]}), $buffer{$i}->[2]);
                    delete($buffer{$i});

                }

            }

        }

    }

    $mmOut->close();

}
elsif ($command eq "extract") {

    if (!defined $output) {

        $output = fileparse($mm[0], ".mm");
        $output =~ s/\.mm$//;
        $output .= ".extracted.mm";

    }

    die "\n  [!] Error: Output MM file already exists." .
        "\n             Please use -ow (or --overwrite) to overwrite output file\n\n" if (-e $output && !$overwrite);
    die "\n  [!] Error: Invalid IUPAC code\n\n" if ($keepBases !~ /^all$/i && !isiupac($keepBases));
    die "\n  [!] Error: Minimum number of mutations per read must be a positive INT >= 1\n\n" if (!isint($minMutPerRead) || !ispositive($minMutPerRead));
    die "\n  [!] Error: Minimum read length must be a positive INT\n\n" if (!isint($minReadLen) || !ispositive($minReadLen));
    die "\n  [!] Error: Random subsample factor must be a positive INT >= 1\n\n" if (!isint($randSubsample) || !ispositive($randSubsample));
    die "\n  [!] Error: Minimum mutation rate must be comprised between 0 and 1\n\n" if (!inrange($minRate, [0, 1]));
    die "\n  [!] Error: Maximum mutation rate must be comprised between 0 and 1\n\n" if (!inrange($maxRate, [0, 1]));

    $keepBases = $keepBases =~ m/^all$/i ? "ACGT" : join("", sort(uniq(split("", join("", iupac2nt(rna2dna(uc($keepBases))))))));

    my ($mmIn, $mmOut, %whitelist, %discardPos, %bedAnno);

    if (defined $whitelist) {

        die "\n  [!] Error: Whitelist file does not exist\n\n" if (!-e $whitelist);

        open(my $wh, "<", $whitelist) or die "\n  [!] Error: Unable to read from whitelist file ($!)\n\n";
        while(<$wh>) {

            chomp();
            $whitelist{$_} = 1;

        }
        close($wh);

        warn "\n  [!] Warning: Empty whitelist file\n" if (!keys %whitelist);

    }

    if (defined $discardPos) {

        die "\n  [!] Error: Blacklisted positions' file does not exist\n\n" if (!-e $discardPos);

        open(my $wh, "<", $discardPos) or die "\n  [!] Error: Unable to read from blacklisted positions' file ($!)\n\n";
        while(<$wh>) {

            chomp();

            next if ($_ eq "");

            my @row = split /[;,]/;

            die "\n  [!] Error: Malformed line in blacklisted positions' file (\"$_\")\n\n" if (@row < 2);
            
            for my $pos (@row[1 .. $#row]) {

                if ($pos =~ /^(\d+)-(\d+)$/) {

                    my ($start, $end) = ($1, $2);

                    die "\n  [!] Error: Malformed range \"$pos\" for transcript \"" . $row[0] . "\" in blacklisted positions' file\n\n" if ($start > $end);

                    $discardPos{$row[0]}->{$_} = 1 for ($start .. $end);

                }
                elsif (ispositive($pos)) { $discardPos{$row[0]}->{$pos} = 1; }
                else { die "\n  [!] Error: Malformed range \"$pos\" for transcript \"" . $row[0] . "\" in blacklisted positions' file\n\n"; }

            }

        }
        close($wh);

        warn "\n  [!] Warning: Empty blacklisted positions' file\n" if (!keys %discardPos);

    }

    if ($bedAnno) {

        open(my $fh, "<", $bedAnno) or die "\n  [!] Error: Unable to read BED annotation ($!)\n\n";
        while(<$fh>) {

            chomp();
            my ($id, @row);
            @row = split /\t/;

            next if (@row < 3);
            next if (!isint($row[1]) || !ispositive($row[1]));
            next if (!isint($row[2]) || !ispositive($row[2]));

            die "\n  [!] Error: Duplicate entry ID \"" . $row[0] . "\" in BED annotation\n\n" if (exists $bedAnno{$row[0]});

            $bedAnno{$row[0]} = { start => $row[1],
                                  end   => $row[2] };

        }
        close($fh);

        die "\n  [!] Error: Empty BED annotation file\n\n" if (!keys %bedAnno);

    }

    $mmIn = RF::Data::IO::MM->new( file  => $mm[0],
                                   index => $mm[0] . ".mmi",
                                   mode  => "r" );
    $mmOut = RF::Data::IO::MM->new( file      => $output,
                                    index     => "$output.mmi",
                                    mode      => "w",
                                    overwrite => $overwrite );

    foreach my $id ($mmIn->ids()) {

        my ($realSeq, $sequence, $length, $start, 
            $end);
        $realSeq = $mmIn->sequence($id);
        $length = length($realSeq);

        next if ($whitelist && !exists $whitelist{$id});
        last if ($whitelist && !keys %whitelist);

        delete($whitelist{$id});

        if ($bedAnno) {

            next if (!exists $bedAnno{$id});

            $start = $bedAnno{$id}->{start};
            $end = $bedAnno{$id}->{end};

            if ($end > $length) {

                warn "\n  [!] Warning: End coordinate exceeds length for transcript \"$id\" (end: $end, length: $length)\n";

                next;

            }

            $length = $end - $start;

            next if ($length < $minReadLen);

            $sequence = substr($realSeq, $start, $length);

        }
        else {

            $start = 0;
            $end = $length;
            $sequence = $realSeq;

        }

        # Blacklist all positions with mutation rates > maxRate or < minRate
        if ($minRate > 0 || $maxRate < 1) {

            my ($nReads, $counts, $coverage, @mutRate);
            ($nReads, $counts, $coverage) = _countsAndCovFromReads($mmIn, $id);
            @mutRate = map { $coverage->[$_] ? $counts->[$_] / $coverage->[$_] : 0 } 0 .. $#{$coverage};
            $discardPos{$id}->{$_} = 1 for (grep { $_ < $minRate || $_ > $maxRate } @mutRate);

        }

        $mmOut->append_transcript($id, $sequence);

        while (my @read = $mmIn->readStream($id)) {

            my ($readStart, $readEnd, @muts);
            @muts = grep { substr($realSeq, $_, 1) =~ /^[$keepBases]$/ } @{$read[2]};
            @muts = grep { !exists $discardPos{$id}->{$_} } @muts;
            @muts = grep { inrange($_, [$start, $end - 1]) } @muts;

            next if (@muts < $minMutPerRead);
            next if ($read[1] < $start);
            last if ($read[0] >= $end);

            $readStart = max($read[0], $start) - $start;
            $readEnd = min($read[1], $end - 1) - $start;

            next if ($minReadLen && $readEnd - $readStart + 1 < $minReadLen);

            @muts = map { $_ - $start } @muts;
           
            next if ($randSubsample > 1 && int(rand($randSubsample)));

            $mmOut->append_read($readStart, $readEnd, scalar(@muts), \@muts);

        }        

    }

    $mmOut->close();
    $mmIn->close();

}
elsif ($command eq "torc") {

    $output ||= $mm[0];
    $output =~ s/(?:\.mm|\.rc)?$/.rc/;

    die "\n  [!] Error: Output RC file already exists." .
        "\n             Please use -ow (or --overwrite) to overwrite output file\n\n" if (-e $output && !$overwrite);

    my ($mmIO, $rcIO, $totReads);
    $totReads = 0;
    $mmIO = RF::Data::IO::MM->new( file  => $mm[0],
                                   index => $mm[0] . ".mmi",
                                   mode  => "r" );
    $rcIO = RF::Data::IO::RC->new( file       => $output,
                                   index      => "$output.rci",
                                   buildindex => 1,
                                   mode       => "w",
                                   overwrite  => $overwrite );

    foreach my $id (sort($mmIO->ids())) {

        my ($sequence, $nReads, $rc, $counts, 
            $coverage);
        $sequence = $mmIO->sequence($id);
        ($nReads, $counts, $coverage) = _countsAndCovFromReads($mmIO, $id);

        $rc = RF::Data::RC->new( id         => $id,
                                 sequence   => $sequence,
                                 counts     => $counts,
                                 coverage   => $coverage,
                                 readscount => $nReads );
        $rcIO->write($rc);
        $totReads += $nReads;

    }

    $rcIO->mappedreads($totReads);
    $rcIO->close();

}
elsif ($command eq "stats") {

    my ($mmIO, $total, %muts, %lengths, 
    %tsReads, %bases);
    %bases = map { $_ => 0 } qw(A C G T N);
    $mmIO = RF::Data::IO::MM->new( file  => $mm[0],
                                   index => $mm[0] . ".mmi",
                                   mode  => "r" );

    foreach my $id ($mmIO->ids()) {

        my $sequence = $mmIO->sequence($id);
        $tsReads{$id} = 0;

        while (my @read = $mmIO->readStream($id)) {

            $total++;
            $lengths{($read[1] - $read[0] + 1)}++;
            $muts{scalar(@{$read[2]})}++;
            $tsReads{$id}++;
            $bases{$_}++ for (map { substr($sequence, $_, 1) } @{$read[2]});

        }

    }

    if ($total) {

        print "\n";

        my ($min, $max, $n, $table);
        $table = Term::Table->new();
        $table->head("Transcript", "# of reads");
        $table->row($_, $tsReads{$_}) for (sort keys %tsReads);
        $table->print();

        print "\n\n";

        $min = min(keys %lengths);
        $max = max(keys %lengths);
        $table = Term::Table->new();
        $table->head("Read length", "# of reads", "\%");
        $table->row($_, $lengths{$_} || 0, sprintf("%.2f", ($lengths{$_} || 0) / $total * 100)) for ($min .. $max);
        $table->print();

        print "\n\n";

        $min = min(keys %muts);
        $max = max(keys %muts);
        $table = Term::Table->new();
        $table->head("Mutations per read", "# of reads", "\%");
        $table->row($_, $muts{$_} || 0, sprintf("%.2f", ($muts{$_} || 0) / $total * 100)) for ($min .. $max);
        $table->print();

        print "\n\n";

        $n = sum(map { $bases{$_} } keys %bases);
        $table = Term::Table->new();
        $table->head("Base", "%");
        $table->row($_, sprintf("%.2f", $bases{$_} / $n * 100)) for (qw(A C G T N));
        $table->print();

        print "\n\n";

    }

}
elsif ($command eq "correlate") {

    die "\n  [!] Error: Correlate requires two MM files\n\n" if (@mm < 2);

    $output ||= fileparse($mm[0], ".mm") . "_vs_" . fileparse($mm[1], ".mm");
    $output =~ s/(?:\.tsv)?$/.tsv/;
    
    die "\n  [!] Error: Output MM file already exists." .
        "\n             Please use -ow (or --overwrite) to overwrite output file\n\n" if (-e $output && !$overwrite);

    my ($r, $p, $corrFunc, @files, 
        @data, @ids, @io, %excluded);
    $corrFunc = $spearman ? \&spearman : \&pearson;
    @files = (shift(@mm), shift(@mm));
    @io = map { RF::Data::IO::MM->new( file  => $_,
                                       index => "$_.mmi" ) } @files;
    @ids = $io[0]->ids();

    open(my $wh, ">", $output) or die "\n  [!] Error: Unable to write output file ($!)\n\n";
    select((select($wh), $|=1)[0]);
    
    foreach my $id (@ids) {

        my ($r, $p, @keys, @data);
        @data = ({}, {});

        foreach my $n (0, 1) {

            next if (exists $excluded{$id});

            my ($mmIO, $readCount);
            $mmIO = $io[$n];
            $readCount = $mmIO->readCount($id) || 0;

            if ($readCount < $minReads) {

                $excluded{$id} = 1;
                next;

            }
        
            while(my ($start, $end, $muts) = $mmIO->readStream($id)) { 
                
                for my $i (0 .. $#{$muts}) {

                    for my $j ($i + 1 .. $#{$muts}) {

                        $data[$n]->{join("-", @$muts[$i, $j])}++;

                    }

                }
                
            }

        }

        @keys = uniq(map { keys %{$data[$_]} } 0, 1);

        next if (@keys < 3);

        ($r, $p) = $corrFunc->([map { exists $data[0]->{$_} ? $data[0]->{$_} : 0 } @keys],
                               [map { exists $data[1]->{$_} ? $data[1]->{$_} : 0 } @keys]);

        print $wh join("\t", $id, $r, $p) . "\n";

    }

    close($wh);

    unlink($output) if (!-s $output);

}
elsif ($command eq "alignids") {

    $output ||= "alignedIds/";
    $output =~ s/\/?$/\//;

    if (-e $output) {

        if (!$overwrite) {

            die "\n  [!] Error: Output directory \"$output\" already exists." .
                "\n             Please use -ow (or --overwrite) to overwrite it\n\n";

        }
        else {

            if (-d $output) { 

                my $error = rmtree($output);

                die "\n  [!] Error: Unable to overwrite output ($error)\n\n" if ($error);

            }
            else { unlink($output) or die "\n  [!] Error: Unable to overwrite output ($!)\n\n"; }

        }

    }

    die "\n  [!] Error: Only one MM file provided\n\n" if (@mm < 2);

    my (@mmOut, @mmIn, @ids, %ids);
    
    if (my $error = mktree($output, "755")) { die "\n  [!] Error: Unable to create output directory ($error)\n\n"; }

    @mmOut = map { my $filename = fileparse($mm[$_], ".mm");
                   RF::Data::IO::MM->new( file       => $output . $filename . ".mm",
                                          index      => $output . $filename . ".mm.mmi",
                                          mode       => "w",
                                          overwrite  => $overwrite ) } 0 .. $#mm;
    @mmIn = map { RF::Data::IO::MM->new( file  => $mm[$_],
                                         index => $mm[$_] . ".mmi",
                                         mode  => "r" ) } 0 .. $#mm;

    foreach my $mmIn (@mmIn) { $ids{$_}++ for ($mmIn->ids()); }
    
    @ids = grep { $ids{$_} == @mm } sort keys %ids;

    die "\n  [!] Error: No transcript IDs common to all MM files\n\n" if (!@ids);

    foreach my $id (@ids) {

        foreach my $i (0 .. $#mmOut) {

            my $mmOut = $mmOut[$i];
            $mmOut->append_transcript($id, $mmIn[$i]->sequence($id));
            while(my ($start, $end, $muts) = $mmIn[$i]->readStream($id)) { $mmOut->append_read($start, $end, scalar(@$muts), $muts); }

        }

    }

    $_->close() for (@mmOut, @mmIn);

}
elsif ($command eq "split") {

    if (!defined $output) {

        $output = fileparse($mm[0], ".mm");
        $output =~ s/(?:\.mm)?$/_split/;

    }

    if (-e $output) {

        if (!$overwrite) {

            die "\n  [!] Error: Output directory \"$output\" already exists." .
                "\n             Please use -ow (or --overwrite) to overwrite it\n\n";

        }
        else {

            if (-d $output) { 

                my $error = rmtree($output);

                die "\n  [!] Error: Unable to overwrite output ($error)\n\n" if ($error);

            }
            else { unlink($output) or die "\n  [!] Error: Unable to overwrite output ($!)\n\n"; }

        }

    }

    if (my $error = mktree($output, "755")) { die "\n  [!] Error: Unable to create output directory ($error)\n\n"; }

    my $mmIn = RF::Data::IO::MM->new( file  => $mm[0],
                                      index => $mm[0] . ".mmi",
                                      mode  => "r" );

    foreach my $id ($mmIn->ids()) {

        my $mmOut = RF::Data::IO::MM->new( file  => "$output/$id.mm",
                                           index => "$output/$id.mm.mmi",
                                           mode  => "w" );

        $mmOut->append_transcript($id, $mmIn->sequence($id));

        while(my ($start, $end, $muts) = $mmIn->readStream($id)) { $mmOut->append_read($start, $end, scalar(@$muts), $muts); }

        $mmOut->close();

    }

}

sub _countsAndCovFromReads {

    my ($mmIO, $id) = @_;

    my ($nReads, @counts, @coverage);
    $nReads = 0;
    @counts = (0) x $mmIO->length($id);
    @coverage = @counts;

    while (my @read = $mmIO->readStream($id)) {

        $nReads++;
        
        $coverage[$_]++ for ($read[0] .. $read[1]);
        $counts[$_]++ for (@{$read[2]});

    }

    return($nReads, \@counts, \@coverage);

}

sub help {

    print "\n  [!] Error: Invalid option. Please check the help\n" if ($_[0]);
    print "\n  [!] Error: Invalid command \"$command\"\n" if ($command && !exists $commands{$command});

    print <<HELP;

 RF MMTools (v$Core::Utils::VERSION)
 RNA Framework [http://www.rnaframework.com]

 Author:  Danny Incarnato (dincarnato[at]rnaframework.com)
 Summary: Easy manipulation of MM files
HELP

    if ($command eq "index") {

        die <<HELP;
 Description: Generates MMI index files for a set of input MM files
 Usage:       rf-mmtools index Sample1.mm Sample2.mm ... Samplen.mm

HELP

    }
    elsif ($command eq "stats") {

        die <<HELP;
 Description: Provides stats on length and mutation distribution per read
 Usage:       rf-mmtools stats Sample.mm

HELP

    }
    elsif ($command eq "view") {

        die <<HELP;

 Description: Displays the content of MM files in human-readable form
 Usage:       rf-mmtools view [options] Sample.mm [transcript#1,transcript#2,..,transcript#n]

HELP

    }
    elsif ($command eq "merge") {

        die <<HELP;

 Description: Combines multiple MM files into a single file
 Usage:       rf-mmtools merge [options] Sample1.mm Sample2.mm ... Samplen.mm

 Options                                     Description
 -o  or --output       <string>              Output MM filename (Default: merge.mm)
 -ow or --overwrite                          Overwrites output file if already exists

HELP

    }
    elsif ($command eq "torc") {

        die <<HELP;

 Description: Converts an MM file to RC format
 Usage:       rf-mmtools toRC [options] Sample.mm

 Options                                     Description
 -o  or --output       <string>              Output RC filename (Default: <input>.rc)
 -ow or --overwrite                          Overwrites output file if already exists

HELP

    }
    elsif ($command eq "split") {

        die <<HELP;

 Description: Splits an MM file into multiple files, each containing a single transcript
 Usage:       rf-mmtools split [options] Sample.mm

 Options                                     Description
 -o  or --output       <string>              Output folder (Default: <input>_split/)
 -ow or --overwrite                          Overwrites output folder if already exists

HELP

    }
    elsif ($command eq "extract") {

        die <<HELP;

 Description: Generates an RC file by extracting a user-defined set of regions from an input RC file
 Usage:       rf-rctools extract [Options] Sample.mm

 Options                                   Description
 -o   or --output             <string>     Output MM file (Default: <input>.extracted.mm)
 -ow  or --overwrite                       Overwrites output MM file (if the specified file already exists)
 -kb  or --keepBases          <string>     Only retains mutations on specified bases (Default: all)
                                           Note: IUPAC codes are allowed
 -mpr or --minMutPerRead      <int>        Reads with less than this number of mutations are discarded (>=0, Default: 1)
 -mrl or --minReadLen         <int>        Reads shorter than this length are discarded (>0, Default: 1)
 -rs  or --randSubsample      <int>        Randomly subsamples this fraction of reads (Default: keep all reads)
                                           Note: the provided value represents the denominator of the fraction (for example, 
                                                 if -rs 2, 1/2 of the reads will be subsampled)
 -a   or --annotation         <string>     Path to a file containing a list of regions (in BED format) to extract from 
                                           the MM file
                                           Note: only the portion of the read falling within the boundaries of the provided
                                                 BED intervals will be retained and subjected to the other filtering steps
 -wl  or --whitelist          <string>     Path to a file containing a list (one per line) of transcripts to be extracted 
                                           from the MM file
 -nr  or --minRate            <string>     Positions with mutation rate < this value are discarded (0-1, Default: 0 [no cutoff])
 -xr  or --maxRate            <string>     Positions with mutation rate > this value are discarded (0-1, Default: 1 [no cutoff])
 -dp  or --discardPositions   <string>     Path to a blacklist file containing a list of transcript positions to be filtered
                                           out of the output file (mutations occurring on these positions will be discarded)

HELP

    }
    elsif ($command eq "alignids") {

        die <<HELP;

 Description: Given multiple MM files, it rewrites them by only retaining the transcripts
              common to all of them, so that they appear in the same order in all the
              files (necessary for DRACO replicate analysis)
 Usage:       rf-mmtools alignids [options] Sample.mm

 Options                                     Description
 -o  or --output       <string>              Output folder (Default: alignedIds/)
 -ow or --overwrite                          Overwrites output folder if already exists

HELP

    }
    elsif ($command eq "correlate") {

        die <<HELP;

 Description: Calculates the correlation of co-mutation patterns between two MM files
 Usage:       rf-rctools correlate [Options] Sample.mm

 Options                           Description
 -o  or --output       <string>    Output TSV file (Default: <MM#1>_vs_<MM#2>.tsv)
 -ow or --overwrite                Overwrites output TSV file (if the specified file already exists)
 -mr or --minReads     <int>       Transcripts having less than these number of reads are excluded (>0, Default: 1000)
 -S  or --spearman                 Uses Spearman to calculate correlation (Default: Pearson)

HELP

    }
    else {

        die <<HELP;

 Usage:   rf-mmtools [command]

 Commands         Description
 view             Dumps to screen the content of the provided MM file
 merge            Combines multiple MM files
 extract          Filters reads in MM file
 split            Splits an MM file into multiple files, each containing a single transcript
 alignIds         Rewrites a set of input MM files, keeping only the common transcripts
 toRC             Converts an MM file to RC format
 correlate        Calculates the per-transcript correlation of co-mutation patterns between 2 MM files
 stats            Provides stats on length and mutation distribution per read
 index            Indexes an MM file

HELP

    }

}
