#!/usr/bin/env perl

##
# RF RCTools
# RNA Framework [http://www.rnaframework.com]
#
# Author:  Danny Incarnato (dincarnato[at]rnaframework.com)
# Summary: Easy manipulation of RC files
#
# This program is free software, and can be redistribute  and/or modified
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.
#
# Please see <http://www.gnu.org/licenses/> for more informations.
##

use strict;
use File::Basename;
use File::Copy;
use FindBin qw($Bin);
use Getopt::Long qw(:config no_ignore_case);

use lib $Bin . "/lib";

use Core::Mathematics qw(:all);
use Core::Utils;
use RF::Data::IO::RC;
use RF::Data::RC;

$|++;

my ($output, $overwrite, $command, $help,
    $index, $tab, $annoFile, $GTFfeature, 
    $GTFattribute, $blockSize, $whitelist, 
    @rc, %commands);

%commands = map { $_ => 1 } qw(index view merge stats extract);

do {

    local $SIG{__WARN__} = sub { };

    GetOptions( "h|help"           => \$help,
                "o|output=s"       => \$output,
                "ow|overwrite"     => \$overwrite,
                "i|index=s"        => \$index,
                "t|tab"            => \$tab,
                "a|annotation=s"   => \$annoFile,
                "f|GTFfeature=s"   => \$GTFfeature,
                "b|GTFattribute=s" => \$GTFattribute,
                "s|blockSize=s"    => \$blockSize,
                "wl|whitelist=s"   => \$whitelist ) or help(1);

    $command = shift(@ARGV);
    @rc = uniq(@ARGV);

};

help() if ($help);

# Default
$command = lc($command);
$GTFfeature ||= "exon";
$GTFattribute ||= "transcript_id";
undef($index) if ($command eq "index");

$GTFattribute = quotemeta($GTFattribute);

##
# Input validation
##

die "\n  [!] Error: No command specified\n\n" if (!$command);
die "\n  [!] Error: Invalid command \"" . $command . "\"\n\n" if (!exists $commands{$command});
die "\n  [!] Error: No RC file specified\n\n" if (!@rc);
die "\n  [!] Error: Provided RCI index file does not exist\n\n" if (defined $index && !-e $index);

if ($command eq "extract") {

    if ($rc[0] =~ m/\.rc$/) { die "\n  [!] Error: Provided RC file \"" . $rc[0] . "\" does not exist\n\n" if (!-e $rc[0]); }
    else { die "\n  [!] Error: RC file \"" . $rc[0] . ".plus.rc\" does not exist\n\n" if (!-e $rc[0] . ".plus.rc"); }

}
else { for (0 .. ($command =~ /^view|stats$/ ? 0 : $#rc)) { die "\n  [!] Error: Provided RC file \"" . $rc[$_] . "\" does not exist\n\n" if (!-e $rc[$_]); } }

if ($command eq "index") {

    for (@rc) {

        my $rc = RF::Data::IO::RC->new( file       => $rc[$_] =~ m/\.rc$/ ? $rc[$_] : $rc[$_] . ".plus.rc",
                                        buildindex => 1,
                                        index      => $rc[$_] . ".rci" );

    }

}
elsif ($command =~ m/^view|stats$/) {

    my ($file, $rcin);
    $file = shift(@rc);
    $index ||= "$file.rci" if (-e "$file.rci");
    $rcin = RF::Data::IO::RC->new( file           => $file,
                                   noPreloadIndex => @rc ? 0 : 1,
                                   index          => $index );

    if (@rc) {

        foreach my $region (@rc) {

            my ($length, @region, @coords);
            @region = split(/:/, $region);
            @coords = $command eq "stats" ? (0, 1) : split(/-/, $region[1]);
            $length = $rcin->length($region[0]);

            die "\n  [!] Error: Invalid sequence ID or target region \"$region\"\n\n" if (@region > 2 || @coords > 2 || !defined $length ||
                                                                                          (@coords && (!isint(@coords) || !ispositive(@coords) || $coords[1] < $coords[0] || $coords[1] >= $length)));

            my $entry = @coords ? $rcin->readBytewise($region[0], \@coords) : $rcin->read($region);

            if ($command eq "view") { formatentry($entry); }
            else { print $entry->id() . "\t" . $entry->readscount() . "\t" . $entry->mediancoverage() . "\n"; }

        }

    }
    else {

        while(my $entry = $rcin->read()) {

            if ($command eq "view") { formatentry($entry); }
            else { print $entry->id() . "\t" . $entry->readscount() . "\t" . $entry->mediancoverage() . "\n"; }

        }

    }

    print "\nTotal\t" . $rcin->mappedreads() . "\n\n" if ($command eq "stats");

}
elsif ($command eq "merge") {

    $output ||= "merge.rc";
    $output =~ s/(?:\.rc)?$/.rc/;
    $index ||= $rc[0] . ".rci" if (-e $rc[0] . ".rci");
    $blockSize ||= 1000000;

    die "\n  [!] Error: Block size must be a positive INT >= 1" if (!isint($blockSize) || $blockSize < 1);
    die "\n  [!] Error: Output RC file already exists." .
        "\n      Please use -ow (or --overwrite) to overwrite output filey\n\n" if (-e $output &&
                                                                                    !$overwrite);
    die "\n  [!] Error: Only one RC file provided\n\n" if (@rc < 2);

    # Easy check: RC files should have the same size, so if one of them differs, then it cannot be merged
    die "\n [!] Error: Provided RC files have different sizes\n\n" if (sum(map { (-s $rc[$_]) - (-s $rc[$_ - 1]) } 1 .. $#rc));

    my ($rcout, $mappedReads, @rcin);
    $mappedReads = 0;
    
    copy($rc[0], $output) or die "\n  [!] Error: Failed to copy temporary RC file ($!)\n\n";
    if (defined $index) { copy($index, "$output.rci") or die "\n  [!] Error: Failed to copy temporary RCI file ($!)\n\n"; }
    
    $rcout = RF::Data::IO::RC->new( file       => $output,
                                    index      => $output . ".rci",
                                    buildindex => defined $index ? 0 : 1,
                                    mode       => "w+" );
    @rcin = map { RF::Data::IO::RC->new( file           => $rc[$_],
                                         noPreloadIndex => 1 ) } 1 .. $#rc; # Speeds up, files to be merged should have the same base structure
    $_->copyIndexFromObject($rcout) for (@rcin); # Faster than actually loading indexes

    foreach my $id ($rcin[0]->ids()) {

        my $length = $rcin[0]->length($id);

        for (my $i = 0; $i < $length; $i += $blockSize) {

            my ($j, $end, $entry, $readCount, 
                $sequence, @counts, @coverage);
            $end = $i + $blockSize - 1;
            $end = $length - 1 if ($end >= $length);
            $entry = $rcout->readBytewise($id, [$i, $end]);
            $readCount = $entry->readscount() if ($end == $length - 1);
            @counts = $entry->counts();
            @coverage = $entry->coverage();

            foreach my $rcIn (@rcin) {

                my ($tmpEntry, @tmpCounts, @tmpCoverage);
                $tmpEntry = $rcIn->readBytewise($id, [$i, $end]);
                @tmpCounts = $tmpEntry->counts();
                @tmpCoverage = $tmpEntry->coverage();
                $readCount += $tmpEntry->readscount() if ($end == $length - 1);
                @counts = map { $counts[$_] + $tmpCounts[$_] } 0 .. $#counts;
                @coverage = map { $coverage[$_] + $tmpCoverage[$_] } 0 .. $#coverage;

            }

            $entry = $rcout->writeBytewise($id, $i, \@counts, \@coverage, $readCount);

        }

    }

    $mappedReads += $_->mappedreads() for ($rcout, @rcin);

    $rcout->mappedreads($mappedReads);
    $rcout->close();

}
elsif ($command eq "extract") {

    die "\n  [!] Error: No annotation or whitelist file provided\n\n" if (!defined $annoFile && !defined $whitelist);
    die "\n  [!] Error: Annotation and whitelist files are mutually exclusive\n\n" if (defined $annoFile && defined $whitelist);
    die "\n  [!] Error: Provided annotation file does not exist\n\n" if (defined $annoFile && !-e $annoFile);
    die "\n  [!] Error: Provided whitelist file does not exist\n\n" if (defined $whitelist && !-e $whitelist);

    if (!defined $index) {

        if ($rc[0] =~ /\.rc$/ && -e $rc[0] . ".rci") { $index = $rc[0] . ".rci"; }
        elsif ($rc[0] !~ /\.rc$/) { $index = (grep {-e $_ } ($rc[0] . ".rci", $rc[0] . ".plus.rc.rci", $rc[0] . ".minus.rc.rci"))[0]; }

        die "\n  [!] Error: No RCI index file provided\n\n" if (!defined $index);

    }

    $output = (fileparse($annoFile || $whitelist, qr/\.(?:bed|gtf|txt)/i))[0] . ".rc" if (!$output);
    $output =~ s/(?:\.rc)?$/.rc/;

    die "\n  [!] Error: Output RC file already exists." .
        "\n      Please use -ow (or --overwrite) to overwrite output filey\n\n" if (-e $output && !$overwrite);

    my ($rcout, $mappedreads, $annoType, $lastChr,
        %chr, %rcin, %ids);

    if (defined $annoFile) {

        ($annoType) = $annoFile =~ m/\.(gtf|bed)$/i;

        die "\n  [!] Error: Annotation must have .gtf or .bed extension\n\n" if (!defined $annoType);

        $annoType = lc($annoType);
    
    }

    $mappedreads = 0;
    $rcin{"+"} = RF::Data::IO::RC->new( file  => $rc[0] =~ m/\.rc$/ ? $rc[0] : $rc[0] . ".plus.rc",
                                        index => $index );

    if ($rc[0] !~ m/\.rc$/ && -e $rc[0] . ".minus.rc") {

        $rcin{"-"} = RF::Data::IO::RC->new( file  => $rc[0] . ".minus.rc",
                                            index => $index );

    }
    else { $rcin{"-"} = $rcin{"+"}; }

    %ids = map { $_ => $rcin{"+"}->length($_) } $rcin{"+"}->ids();

    $rcout = RF::Data::IO::RC->new( file       => $output,
                                    index      => $output . ".rci",
                                    buildindex => 1,
                                    mode       => "w",
                                    overwrite  => $overwrite );

    if (defined $annoFile) {

        open(my $fh, "<", $annoFile) or die "\n  [!] Error: Unable to read from annotation file (" . $! . ")\n\n";
        while(<$fh>) {

        chomp();

            next if ($_ =~ m/^\#/);

            my ($id, @row);
            @row = split /\t/;

            next if (!exists $ids{$row[0]});

            if ($annoType eq "gtf") {

                next if ($row[2] ne $GTFfeature);
                next if ($row[4] > $ids{$row[0]}); # GTF is 1-based

                $row[3]--;
                $row[4]--;

                if (defined $GTFattribute) { ($id) = $row[8] =~ m/$GTFattribute "(.+?)"/; }
                
                $id = $row[0] . "_" . $row[3] . "-" . $row[4] if (!defined $id);

            }
            else {

                next if ($row[2] > $ids{$row[0]});

                $row[2]--;
                $id = @row < 4 ? $row[0] . "_" . $row[1] . "-" . $row[2] : $row[3];

            }

            if ($lastChr ne $row[0]) {

                if (defined $lastChr) {

                    foreach my $strand (sort keys %chr) {

                        foreach my $transcript (sort keys %{$chr{$strand}}) {

                            my $rc = $rcin{$strand}->readBytewise($lastChr, @{$chr{$strand}->{$transcript}});
                            $rc->revcomp() if ($strand eq "-");
                            $rc->id($transcript);
                            $rcout->write($rc);

                        }

                    }

                }

                $lastChr = $row[0];
                undef(%chr);

            }

            if ($annoType eq "gtf") { push(@{$chr{$row[6] eq "-" ? "-" : "+"}->{$id}}, [$row[3], $row[4]]); }
            else {

                next if ($row[1] == $row[2]);

                my $strand = @row >= 6 && $row[5] eq "-" ? "-" : "+";

                if (@row < 12) { push(@{$chr{$strand}->{$id}}, [$row[1], $row[2]]); }
                else {

                    my (@exonSizes, @exonStarts);
                    @exonSizes = split ",", $row[10];
                    @exonStarts = split ",", $row[11];

                    push(@{$chr{$strand}->{$id}}, [$row[1] + $exonStarts[$_], $row[1] + $exonStarts[$_] + $exonSizes[$_] - 1]) for (0 .. $row[9] - 1);

                }

            }

        }
        close($fh);

        if (defined $lastChr) {

            foreach my $strand (sort keys %chr) {

                foreach my $transcript (sort keys %{$chr{$strand}}) {

                    my $rc = $rcin{$strand}->readBytewise($lastChr, @{$chr{$strand}->{$transcript}});
                    $rc->revcomp() if ($strand eq "-");
                    $rc->id($transcript);
                    $rcout->write($rc);

                }

            }

        }

        $rcout->mappedreads($rcin{"+"}->mappedreads());

    }
    else {

        my $nReads = 0;

        open(my $fh, "<", $whitelist) or die "\n  [!] Error: Unable to read from whitelist file (" . $! . ")\n\n";
        while(my $id = <$fh>) {

            chomp($id);

            if (my $entry = $rcin{"+"}->read($id)) { 
                
                $rcout->write($entry); 
                $nReads += $entry->readscount();
                
            }
            else { warn "\n  [!] Warning: Transcript \"$id\" does not exist\n"; }

        }
        close($fh);

        $rcout->mappedreads($nReads);

    }

    $rcout->close();

}

sub formatentry {

    my $entry = shift;

    return unless($entry);

    if ($tab) {

        my (@sequence, @counts, @coverage);
        @sequence = split(//, $entry->sequence());
        @counts = $entry->counts();
        @coverage = $entry->coverage();

        print $entry->id();
        print "\n" . join("\t", $sequence[$_], $counts[$_], $coverage[$_]) for (0 .. $#sequence);
        print "\n\n";

    }
    else { print join("\n", $entry->id(), $entry->sequence, join(",", $entry->counts()), join(",", $entry->coverage())) . "\n\n"; }

}

sub help {

    print "\n  [!] Error: Invalid option. Please check the help\n" if ($_[0]);
    print "\n  [!] Error: Invalid command \"" . $command . "\"\n" if ($command && !exists $commands{$command});

    print <<HELP;

 RF RCTools (v$Core::Utils::VERSION)
 RNA Framework [http://www.rnaframework.com]

 Author:  Danny Incarnato (dincarnato[at]rnaframework.com)
 Summary: Easy manipulation of RC files
HELP

    if ($command eq "index") {

        die <<HELP;
 Description: Generates RCI index files for a set of input RC files
 Usage:       rf-rctools index Sample1.rc Sample2.rc ... Samplen.rc

HELP

    }
    elsif ($command eq "view") {

        die <<HELP;

 Description: Displays the content of RC files in human-readable form
 Usage:       rf-rctools view [options] Sample.rc 
              rf-rctools view [options] Sample.rc transcript#1 transcript#2
              rf-rctools view [options] Sample.rc transcript#1:1000-2000

 Options                         Description
 -t  or --tab                    Switch to tabular output format
 -i  or --index     <string>     RCI index file
                                 Note: if an RCI index is not specified, the program will look in the same 
                                       directory of the input RC file for a file named after the RC file with 
                                       one of the following extensions: .rci, .plus.rc.rci, .minus.rc.rci

HELP

    }
    elsif ($command eq "merge") {

        die <<HELP;

 Description: Combines multiple RC files into a single file
 Usage:       rf-rctools merge [options] Sample1.rc Sample2.rc ... Samplen.rc

 Options                                     Description
 -o  or --output         <string>            Output RC filename (Default: merge.rc)
 -ow or --overwrite                          Overwrites output file if already exists
 -i  or --index          <string>            RCI index file
                                             Note: if an RCI index is not specified, the program will look in the same 
                                                   directory of the input RC file for a file named after the RC file with 
                                                   one of the following extensions: .rci, .plus.rc.rci, .minus.rc.rci
 -s  or --blockSize      <int>               Maximum size of the chromosome/transcript block to read from
                                             each RC file (>=1000, Default: 1000000)
                                             Note: this is particularly useful when merging genome-level RC
                                                   files, to prevent entire chromosomes from be kept in memory

HELP

    }
    elsif ($command eq "extract") {

        die <<HELP;

 Description: Generates an RC file by extracting a user-defined set of regions from an input RC file
 Usage:       rf-rctools extract -a annotation.gtf Sample.rc     # Will only use the specified file
              rf-rctools extract -a annotation.gtf Sample        # Will look for .plus.rc and .minus.rc files

 Options                            Description
 -o  or --output       <string>     Output RC filename (Default: <annotation>.rc)
 -ow or --overwrite                 Overwrites output file if already exists
 -i  or --index        <string>     RCI index file
                                    Note: if an RCI index is not specified, the program will look in the same 
                                          directory of the input RC file for a file named after the RC file with 
                                          one of the following extensions: .rci, .plus.rc.rci, .minus.rc.rci
 -a  or --annotation   <string>     BED/GTF file containing a list of regions to be extracted
 -wl or --whitelist    <string>     Whitelist containing a list of IDs of transcript to be extracted
                                    Note: either an annotation or a whitelist must be provided. The two are
                                          mutually exclusive.
 -f  or --GTFfeature   <string>     If a GTF file is provided, only entries corresponding to this feature type
                                    will be extracted (Default: exon)
 -b  or --GTFattribute <string>     If a GTF file is provided, this attribute will be used as the entry ID in
                                    the output RC file (Default: transcript_id)

HELP

    }
    else {

        die <<HELP;

 Usage:   rf-rctools [command]

 Commands         Description
 view             Dumps to screen the content of the provided RC file
 merge            Combines multiple RC files
 extract          Extracts a user-defined set of regions
 index            Generates RCI index
 stats            Prints per-transcript and global reads mapping statistics

HELP

    }

}
