#!/usr/bin/env perl

use strict;
use File::Basename;
use FindBin qw($Bin);
use Getopt::Long qw(:config no_ignore_case);
use List::Util qw(shuffle);
use POSIX qw(ceil SEEK_END SEEK_SET);
use Storable;
use threads;
use threads::shared;

use lib $Bin . "/lib";

use Align::SmithWaterman;
use Cluster::Kmeans;
use Core::Mathematics qw(:all);
use Core::Utils;
use Data::IO;
use Data::IO::Sequence;
use Data::Sequence::Utils;
use Graphics::Chart::Area;
use Graphics::Chart::Arcs;
use Graphics::Chart::Heatmap;
use Graphics::Image;
use RF::Utils;
use RNA::Utils;
use Term::Constants qw(:screen);
use Term::Table;

$|++;

my ($output, $overwrite, $minOverlapFrac, $minGap,
    $minReads, $minProb, $noUseVienna, $makeImage, $greyHeatmap,
    $makeClustersSam, $reference, $minPairedFrac, $makeConstraint,
    $minHalfLen, $minSampleSize, $maxReads, $requireBpSupport,
    $refIO, $help, $samtools, $table, $evalOnly, $heatmapOnly,
    $threads, $tmpDir, $logDir, $R, $heatmap, $logHeatmap,
    $whitelist, $suffix, $singleTranscript, $constraintThresh,
    $colorScale, $minChimeraFrac, $maxIter, $evalStructures,
    $dumpChimaeras, $dumpChimaerasOnly, $makeDotPlot, @colorScale, 
    @input, @colorLegend, %samples, %transcripts);

my @transcripts : shared;
my %sharedData : shared;
my %initials : shared;
my %stats : shared;

do {

    local $SIG{__WARN__} = sub { };

    GetOptions( "h|help"                    => \$help,
                "o|output=s"                => \$output,
                "ow|overwrite"              => \$overwrite,
                "p|processors=i"            => \$threads,
                "c|constraint"              => \$makeConstraint,
                "ct|constraint-threshold=s" => \$constraintThresh,
                "es|eval-structures=s"      => \$evalStructures,
                "eo|eval-only"              => \$evalOnly,
                "g|img"                     => \$makeImage,
                "cb|make-clusters-bam"      => \$makeClustersSam,
                "mh|min-half=i"             => \$minHalfLen,
                "mpf|min-paired-frac=s"     => \$minPairedFrac,
                "mo|min-overlap-frac=s"     => \$minOverlapFrac,
                "mg|min-gap=i"              => \$minGap,
                "mr|min-reads=s"            => \$minReads,
                "xr|max-reads=s"            => \$maxReads,
                "mp|min-prob=s"             => \$minProb,
                "mss|min-sample-size=i"     => \$minSampleSize,
                "f|fasta=s"                 => \$reference,
                "p|processors=i"            => \$threads,
                "wl|whitelist=s"            => \$whitelist,
                "nv|no-use-vienna"          => \$noUseVienna,
                "st|single-transcript"      => \$singleTranscript,
                "cs|color-scale=s"          => \$colorScale,
                "rmc|require-min-chimera"   => \$requireBpSupport,
                "mcf|min-chimera-frac=s"    => \$minChimeraFrac,
                "mi|max-iterations=i"       => \$maxIter,
                "dc|dump-chimaeras"         => \$dumpChimaeras,
                "do|dump-chimaeras-only"    => \$dumpChimaerasOnly,
                "R|R-path=s"                => \$R,
                "hm|heatmap"                => \$heatmap,
                "lhm|log-heatmap"           => \$logHeatmap,
                "ghm|grey-heatmap"          => \$greyHeatmap,
                "ho|heatmap-only"           => \$heatmapOnly,
                "dp|dot-plot"               => \$makeDotPlot ) or help(1);

    @input = @ARGV;

};

help() if ($help);

$suffix = 0;
$threads ||= 1;
$maxIter ||= 5;
$minHalfLen ||= 20;
$minGap ||= 1;
$minReads ||= 2;
$maxReads ||= 100000;
$minProb //= 0.01;
$minOverlapFrac ||= 0.01;
$minSampleSize ||= 1000;
$minPairedFrac //= 0.25;
$constraintThresh ||= 0.5;
$minChimeraFrac ||= 0.5;
$samtools ||= which("samtools");
$output ||= "rf_duplex/";
$R ||= checkRinstall($R) if ($makeImage);
@colorScale = $colorScale ? uniq(sort {$a <=> $b} split(/[:,;]/, $colorScale)) : (0.05, 0.1, 0.4, 0.7);
$output =~ s/\/?$/\//;
$evalStructures =~ s/\/?$/\// if (defined $evalStructures);
$tmpDir = $output . "tmp/";
$logDir = $output . "logs/";

if (!$noUseVienna) {

    die "\n  [!] Error: ViennaRNA package's Perl module RNA.pm is required." .
        "\n             Please ensure that ViennaRNA package v2.2.0 (or greater) is installed and try again\n\n" unless (eval { require RNA; 1; });

    $RNA::fold_constrained = 1;
    $RNA::noLonelyPairs = 1;

}

die "\n  [!] Error: No input file provided\n\n" if (!@input);
die "\n  [!] Error: No FASTA reference provided\n\n" if (!defined $reference);
die "\n  [!] Error: Provided FASTA reference does not exist\n\n" if (!-e $reference);
die "\n  [!] Error: Provided whitelist does not exist\n\n" if (defined $whitelist && !-e $whitelist);
die "\n  [!] Error: Provided structure folder does not exist\n\n" if (defined $evalStructures && !-d $evalStructures);
die "\n  [!] Error: Minimum overlap must be comprised between 0 and 1\n\n" if (!inrange($minOverlapFrac, [0, 1]));
die "\n  [!] Error: Minimum number of reads must be an INT > 0\n\n" if (!isint($minReads) || $minReads <= 0);
die "\n  [!] Error: Maximum number of reads must be an INT > 0\n\n" if (!isint($maxReads) || $maxReads <= 0);
die "\n  [!] Error: Minimum base-pair probability must be comprised between 0 and 1\n\n" if (!inrange($minProb, [0, 1]));
die "\n  [!] Error: Minimum base-paired fraction must be comprised between 0 and 1\n\n" if (!inrange($minPairedFrac, [0, 1]));
die "\n  [!] Error: Minimum chimera support must be comprised between 0.5 and 1\n\n" if ($requireBpSupport && !inrange($minChimeraFrac, [0.5, 1]));
die "\n  [!] Error: Constraint threshold must be comprised between 0.5 and 1\n\n" if ($makeConstraint && !inrange($constraintThresh, [0.5, 1]));
die "\n  [!] Error: Color scale must contain exactly 4 cut-points\n\n" if (@colorScale != 4);
die "\n  [!] Error: Parameters -do and -eo are mutually exclusive\n\n" if ($dumpChimaerasOnly && $evalOnly);
die "\n  [!] Error: Parameter -eo requires -es\n\n" if ($evalOnly && !$evalStructures);
die "\n  [!] Error: Parameter -do requires -dc\n\n" if ($dumpChimaerasOnly && !$dumpChimaeras);
die "\n  [!] Error: Parameter -ho requires -hm\n\n" if ($heatmapOnly && !$heatmap);
die "\n  [!] Error: Parameter -lhm requires -hm\n\n" if ($logHeatmap && !$heatmap);
die "\n  [!] Error: Parameter -ghm requires -hm\n\n" if ($greyHeatmap && !$heatmap);

if ($makeImage) {

    die "\n  [!] Error: Parameter -g requires -dp and/or -c\n\n" if (!$makeDotPlot && !$makeConstraint);

    for (1 .. $#colorScale) { 
        
        die "\n  [!] Error: Color scale's cut-point values must be comprised between 0 and 1\n\n" if (!isnumeric($colorScale[$_]) || !inrange($colorScale[$_], [0, 1])); 
        
        push(@colorLegend, sprintf("%.1f", $colorScale[$_-1] * 100) . "-" . sprintf("%.1f", $colorScale[$_] * 100) . "\%");

    }

    push(@colorLegend, sprintf("%.1f", $colorScale[-1] * 100) . "-100.0\%");

}

if (!defined $samtools) { die "\n  [!] Error: samtools is not in PATH\n\n"; }
elsif (!-e $samtools) { die "\n  [!] Error: samtools doesn't exist\n\n"; }
elsif (!-x $samtools) { die "\n  [!] Error: samtools is not executable\n\n"; }
else {

    my $ret = `$samtools 2>&1`;

    if ($ret =~ m/Version: ([\d\.]+)/) {

        my $version = $1;

        die "\n  [!] Error: Samtools v1 or greater is required (Detected: v" . $version . ")\n\n" if (substr($version, 0, 1) < 1);

    }
    else { warn "\n  [!] Warning: Unable to detect Samtools version\n"; }

}

print "\n[+] Checking input file type and sorting...\n\n";

$table = Term::Table->new(indent => 2);
$table->head("Sample", "File", "Type", "Sorted");

foreach my $sample (@input) {

    my ($basename, @sample, @files);
    @sample = split(":", $sample);
    @files = sort split(",", $sample[-1]);
    $basename = @sample == 1 ? (fileparse($files[0], qr/((\.[^.\s]+)+)$/))[0] : $sample[0];

    if (exists $samples{$basename}) {

        $suffix++;
        $basename .= "_" . $suffix;

    }

    foreach my $file (@files) {

        die "  [!] Error: File \"" . $file . "\" does not exist\n\n" if (!-e $file);

        my ($fileType, $sorted) = fileTypeAndSorting($file);

        die "  [!] Error: Too many input files for sample \"" . $sample . "\"." .
            "\n             Each sample must consist at most of a SAM/BAM file and a chimeric junction file\n\n" if (exists $samples{$fileType});

        $samples{$basename}->{$fileType} = { file   => $file,
                                             sorted => $sorted };
        $table->row($basename, $file, $fileType, $sorted ? "Yes" : "No");

    }

}

$table->print();

print "\n\n[+] Making output directory...";

if (-e $output) {

    if ($overwrite) {

        my $error = rmtree($output);

        die "\n\n  [!] Error: " . $error . "\n\n" if ($error);

    }
    else { die "\n\n  [!] Error: Output directory already exists." .
               "\n      Please use -ow (or --overwrite) to overwrite output directory\n\n"; }

}

if (my $error = mktree($tmpDir, "755")) { die "\n\n  [!] Error: Unable to create output directory ($error)\n\n"; }
mktree($logDir, "755");

print "\n[+] Importing transcripts from FASTA reference... ";

$refIO = Data::IO::Sequence->new(file => $reference);
while(my $entry = $refIO->read()) { $transcripts{$entry->id()} = $entry->length(); }

if ($whitelist) {

    my ($total, @whitelist);
    $total = scalar(keys %transcripts);

    open(my $wh, "<", $whitelist) or die "\n\n  [!] Error: Unable to open whitelist file (" . $! . ")\n\n";
    while(<$wh>) {

        chomp();
        push(@whitelist, $_);

    }
    close($wh);

    die "\n\n  [!] Error: Whitelist file is empty\n\n" if (!@whitelist);

    %transcripts = map { $_ => $transcripts{$_} } grep { exists $transcripts{$_} } @whitelist;

    die "\n\n  [!] Error: None of the whitelisted transcript IDs found in reference\n\n" if (!keys %transcripts);

    print $total . " imported, " . scalar(keys %transcripts) . " in whitelist.";

}
else { print scalar(keys %transcripts) . " imported."; }

foreach my $sample (sort keys %samples) {

    @transcripts = map { $_ } sort keys %transcripts;

    my ($chimericFile, $bamFile, @pool, %chimericIndex);
    $bamFile = exists $samples{$sample}->{BAM} ? $samples{$sample}->{BAM}->{file} : (exists $samples{$sample}->{SAM} ? $samples{$sample}->{SAM}->{file} : undef);
    $chimericFile = exists $samples{$sample}->{chimeric} ? $samples{$sample}->{chimeric}->{file} : undef;
    %stats = ( totTranscripts => 0,
               totChimeras    => 0 );

    print "\n[+] Processing sample \"" . $sample . "\"\n";

    mktree($output . $sample . "/dotplot/", "755") if ($makeDotPlot);
    mktree($output . $sample . "/clustered/", "755") if ($makeClustersSam);
    mktree($output . $sample . "/plots/", "755") if ($makeImage);
    mktree($output . $sample . "/constraints/", "755") if ($makeConstraint);
    mktree($output . $sample . "/bpcounts/", "755") if ($evalStructures);
    mktree($output . $sample . "/chimaeras/", "755") if ($dumpChimaeras);
    mktree($output . $sample . "/heatmaps/", "755") if ($heatmap);

    if (exists $samples{$sample}->{SAM} ||
        (exists $samples{$sample}->{BAM} && !$samples{$sample}->{BAM}->{sorted})) {

        print "\n  [*] " . (exists $samples{$sample}->{SAM} ? "Converting SAM file to BAM format, and sorting..." :
                                                              "Sorting BAM file...");

        my ($sortedFile, $logFile);
        $sortedFile = $tmpDir . $sample . "_sorted.bam";
        $logFile = $logDir . $sample . "_BAMsort.log";

        system($samtools . " sort --threads " . $threads . " -O BAM -T \"" . $tmpDir . $sample . "_tmp\" -o \"" . $sortedFile . "\" \"" . $bamFile . "\" >> \"". $logFile . "\" 2>\&1");

        if ($? == -1 || $? & 127) {

            print "\n  [!] Error: Sort failed, please check the logs. Skipping sample...\n";

            next;

        }

        $bamFile = $sortedFile;

    }

    if (defined $bamFile && !-e $bamFile . ".bai") {

        print "\n  [*] Indexing BAM file...";
        system($samtools . " index -@ " . $threads . " \"" . $bamFile . "\"");

    }

    if (defined $chimericFile) {

        if (!$samples{$sample}->{chimeric}->{sorted}) {

            print "\n  [*] Sorting chimeric junction file...";

            my ($sortedFile, $logFile);
            $sortedFile = $tmpDir . $sample . "_sorted.out";
            $logFile = $logDir . $sample . "_Chimericsort.log";
            system("LC_ALL=C sort -k 1,1 --parallel=" . $threads . " \"" . $samples{$sample}->{chimeric}->{file} . "\" 2>\"". $logFile . "\" > \"" . $sortedFile . "\"");

            if ($? == -1 || $? & 127) {

                print "\n  [!] Error: Sort failed, please check the logs. Skipping sample...\n";

                next;

            }

            $chimericFile = $sortedFile;

        }

        print "\n  [*] Indexing chimeric junction file...\n";

        if (open(my $ch, "<" , $chimericFile)) {

            my ($lastId, $lastOffset);
            $lastOffset = tell($ch);

            while(<$ch>) {

                my @row = split("\t", $_);

                if ($row[0] ne $lastId) {

                    $chimericIndex{$row[0]} = $lastOffset if (exists $transcripts{$row[0]});
                    $lastId = $row[0];

                }

                $lastOffset = tell($ch);

            }

            close($ch);

        }
        else {

            print "  [!] Error: Failed to open chimeric junction file (" . $! . "). Skipping sample...\n";

            next;

        }

    }

    @pool = map { threads->create(\&processTranscript, $sample, $bamFile, $chimericFile, \%chimericIndex) } 1 .. ($singleTranscript ? 1 : $threads);
    $_->join() for (@pool);

    print CLRRET . "  [*] Processing completed [" . $stats{totTranscripts} . " transcripts, " . $stats{totChimeras} . " unique chimeras analyzed]\n";

}

print "\n[+] Cleaning up temporary data...";

for ($logDir, $tmpDir) {

    unlink(glob($_ . "*"));
    rmtree($_);

}

print "\n[+] All done.\n\n";

sub processTranscript {

    my $sample = shift;
    my $bamFile = shift;
    my $chimericFile = shift;
    
    my ($plot, %chimericIndex);
    %chimericIndex = %{$_[0]};

    $plot = Graphics::Image->new( width  => 18,
                                  R      => $R,
                                  tmpdir => $tmpDir ) if ($makeImage);

    while (1) {

        my ($id);

        { lock(@transcripts);
          $id = shift(@transcripts) if (@transcripts);

          if (defined $id && !exists $initials{"${sample}_" . uc(substr($id, 0, 1))}) {

              my $initial = substr($id, 0, 1);
              $initials{"${sample}_$initial"} = 1;

              mktree($output . $sample . "/dotplot/$initial", "755") if ($makeDotPlot);
              mktree($output . $sample . "/clustered/$initial", "755") if ($makeClustersSam);
              mktree($output . $sample . "/plots/$initial", "755") if ($makeImage);
              mktree($output . $sample . "/constraints/$initial", "755") if ($makeConstraint);
              mktree($output . $sample . "/bpcounts/$initial", "755") if ($evalStructures);
              mktree($output . $sample . "/chimaeras/$initial", "755") if ($dumpChimaeras);
              mktree($output . $sample . "/heatmaps/$initial", "755") if ($heatmap);

          } }

        last unless(defined $id);

        my ($refinedCentroids, $refinedBasePairs, @ranges, @basePairs,
            @centroids, %probTable, %basePairs, %baseProbs);

        if (defined $bamFile) {

            print CLRRET . "  [*] Transcript " . $id . ": importing chimeric reads from BAM file (this might take a while)";
            @ranges = parseSam($bamFile, $id);

        }

        if (defined $chimericFile && exists $chimericIndex{$id}) {

            print CLRRET . "  [*] Transcript " . $id . ": importing chimeric reads from chimeric junction file (this might take a while)";
            push(@ranges, parseChimeric($chimericFile, $chimericIndex{$id}, $id));

        }

        @ranges = sort { $a->[0] <=> $b->[0] } (shuffle(@ranges))[0 .. min($#ranges, $maxReads - 1)] if ($maxReads);

        $refIO->forceReopenFh();
        $sharedData{$id} = shared_clone({ sequence   => $refIO->read($id)->sequence(),
                                          done       => 0,
                                          total      => scalar(@ranges) });

        print CLRRET . "  [*] Transcript " . $id . ": " . $sharedData{$id}->{total} . " chimeras imported";

        if (@ranges) {

            { lock(%stats);
              $stats{totChimeras} += $sharedData{$id}->{total};
              $stats{totTranscripts}++; }

            if ($dumpChimaerasOnly) {

                print CLRRET . "  [*] Transcript " . $id . ": dumping chimeras to file";

                open(my $th, ">", $output . $sample . "/chimaeras/" . uc(substr($id, 0, 1)) . "/" . $id . ".txt");
                select((select($th), $|=1)[0]);
                print $th "#" . join("\t", qw(start1 end1 start2 end2)) . "\n";
                print $th join("\t", @$_) . "\n" for (@ranges);
                close($th);

                next;

            }

            if ($heatmap) {

                print CLRRET . "  [*] Transcript " . $id . ": generating heatmap";

                my ($plot, $heatmap, $hLength, @matrix, 
                    @x, @y, @counts);
                $hLength = min(500, $transcripts{$id});

                foreach my $range (@ranges) {

                    my ($start1, $end1, $start2, $end2, 
                        $scale, $mStart, $mEnd, $nStart, $nEnd); 
                    ($start1, $end1, $start2, $end2) = @$range;
                    $scale = $hLength / $transcripts{$id};

                    # Compute index ranges directly
                    $mStart = int($start1 * $scale);
                    $mEnd   = int($end1   * $scale);
                    $nStart = int($start2 * $scale);
                    $nEnd   = int($end2   * $scale);

                    for my $m ($mStart .. $mEnd) {

                        for my $n ($nStart .. $nEnd) {
                            
                            $matrix[$m]->[$n]++;
                        
                        }
                    
                    }
                
                }

                for my $i (0 .. $hLength - 1) {

                    for my $j (0 .. $hLength - 1) {

                        next if ($i >= $j);
                        next if (!$matrix[$i]->[$j]);

                        push(@x, $i, $j);
                        push(@y, $j, $i);
                        push(@counts, ($matrix[$i]->[$j]) x 2);

                    }

                }

                @counts = map { log($_) } @counts if ($logHeatmap);

                $plot = Graphics::Image->new( file   => $output . $sample . "/heatmaps/" . uc(substr($id, 0, 1)) . "/$id.pdf",
                                              width  => 10,
                                              height => 9.2,
                                              R      => $R,
                                              tmpdir => $tmpDir );
                $heatmap = Graphics::Chart::Heatmap->new( dataLabels    => { x => \@x,
                                                                             y => \@y },
                                                          dataLabelSort => { x => [ uniq(sort { $a <=> $b } @x) ],
                                                                             y => [ uniq(sort { $a <=> $b } @y) ] },
                                                          data          => \@counts,
                                                          background    => 0,
                                                          grid          => 0,
                                                          legendTitle   => $logHeatmap ? "log(# reads)" : "# reads",
                                                          labelTextSize => 12,
                                                          axisTextSize  => 12,
                                                          axisTitleSize => 12,
                                                          valueTextSize => 3,
                                                          x             => "x",
                                                          y             => "y",
                                                          colorPalette  => $greyHeatmap ? "Greys" : "YlOrRd",
                                                          plotValues    => 0,
                                                          yTitle        => "Position (1-" . $transcripts{$id} . ")",
                                                          xTitle        => "Position (1-" . $transcripts{$id} . ")",
                                                          xTicks        => 0,
                                                          yTicks        => 0,
                                                          xLabels       => 0,
                                                          yLabels       => 0 );
                $plot->plot([$heatmap]);

                next if ($heatmapOnly);

            }

            # Splits ranges in chunks, that will be processed by each thread
            my @rangeChunks = splitRanges($id, $sample, @ranges);
            undef(@ranges);

            if ($evalStructures) {

                my $structureFile = -e $evalStructures . $id . ".db" ? $evalStructures . $id . ".db" :
                                                                       (-e $evalStructures . $id . ".ct" ? $evalStructures . $id . ".ct" : undef);

                if (defined $structureFile) {

                    print CLRRET . "  [*] Transcript " . $id . ": calculating chimeric read support for reference structure";

                    my ($eval, @allPairs);
                    $eval = do { local $@;
                                 eval {

                                     my ($structIO, $entry, @bp, @pk);
                                     $structIO = Data::IO::Sequence->new( file         => $structureFile,
                                                                          pseudoknots  => 1,
                                                                          lonelypairs  => 1,
                                                                          noncanonical => 1 );
                                     $entry = $structIO->read();
                                     @bp = $entry->basepairs();
                                     @pk = $entry->pkpairs();
                                     @allPairs = sort { $a->[0] <=> $b->[0] } (@bp, @pk);

                                 };
                                 $@; };

                    if (!$eval) {

                        my (@pool, %pairSupport);

                        @pool = map { threads->create(\&countChimeraSupport, \@allPairs, $_ ) } @rangeChunks;
                        @allPairs = map { $_->join() } @pool;
                        %pairSupport = %{mergeBpHashes(\%pairSupport, $_)} for (@allPairs);

                        my $bpcountIO = Data::IO->new( file => $output . $sample . "/bpcounts/" . uc(substr($id, 0, 1)) . "/" . $id . ".bpcount",
                                                       mode => "w" );
                        $bpcountIO->write(join("\t", $_, (keys %{$pairSupport{$_}})[0], $pairSupport{$_}->{(keys %{$pairSupport{$_}})[0]}) . "\n") for (sort { $a <=> $b } keys %pairSupport);
                        $bpcountIO->write(join("\t", "Total", undef, $sharedData{$id}->{total}) . "\n");
                        $bpcountIO->close();

                    }

                }

                next if ($evalOnly);

            }

            my ($structure, @pool, @clusters, @clusteredRanges);

            print CLRRET . "  [*] Transcript " . $id . ": clustering reads (this might take a while)";

            @clusters = clusterReads($id, @rangeChunks);

            print CLRRET . "  [*] Transcript " . $id . ": " . scalar(@clusters) . " clusters detected\n";

            unlink(glob($tmpDir . $sample . "_" . $id . "_ranges_*"));

            # Updates group number (for SAM file generation) in clustered reads
            # and create collapsed table
            foreach my $c (0 .. $#clusters) {

                push(@clusteredRanges, map { [ @{$clusters[$c]->[$_]}, 1, $c + 1 ] } 0 .. $#{$clusters[$c]});
                push(@centroids, [ collapse(\@{$clusters[$c]}, $transcripts{$id}), $c + 1 ]);

            }

            if ($dumpChimaeras) {

                print CLRRET . "  [*] Transcript " . $id . ": dumping chimeras to file";

                open(my $th, ">", $output . $sample . "/chimaeras/" . uc(substr($id, 0, 1)) . "/" . $id . ".txt");
                select((select($th), $|=1)[0]);
                print $th "#" . join("\t", qw(start1 end1 start2 end2 cluster)) . "\n";
                print $th join("\t", @$_[0..3,5]) . "\n" for (@clusteredRanges);
                close($th);

            }

            if ($makeClustersSam) {

                print CLRRET . "  [*] Transcript " . $id . ": generating clustered BAM file";
                clustersSam($sample, $id, \@clusteredRanges);

            }

            if ($makeDotPlot || $makeConstraint) {

                print CLRRET . "  [*] Transcript " . $id . ": identifying base-pairs";

                # Builds most likely base-pairs
                @rangeChunks = splitRanges($id, $sample, $requireBpSupport ? @clusteredRanges : @centroids);
                @pool = map { threads->create(\&foldChimeras, $id, $_) } @rangeChunks;
                @basePairs = map { $_->join() } @pool;
                %basePairs = %{mergeBpHashes(\%basePairs, $_)} for (@basePairs);

                # Merges basePairs from all threads
                print CLRRET . "  [*] Transcript " . $id . ": refining base-pairs and centroids";
                @centroids = refineBpAndCentroids(\@centroids, \%basePairs);

                print CLRRET . "  [*] Transcript " . $id . ": calculating probability table";
                %probTable = probTable(@centroids);

                print CLRRET . "  [*] Transcript " . $id . ": calculating base-pair probabilities";
                %baseProbs = calcBaseProbs(\%probTable, \%basePairs);

                dotPlot($sample, $id, \%baseProbs) if ($makeDotPlot);
                $structure = makeConstraint($sample, $id, \%baseProbs) if ($makeConstraint);
                plotImage($plot, $output . $sample . "/plots/" . uc(substr($id, 0, 1)), $id, $structure, \%baseProbs) if ($makeImage);

            }

        }

        delete($sharedData{$id});

    }

}

sub splitRanges {

    my $id = shift;
    my $sample = shift;
    my @ranges = @_;

    my ($nRanges, $baseName, @split);
    $nRanges = @ranges;
    $baseName = $tmpDir . $sample . "_" . $id . "_ranges_";
    @split = map { [ splice(@ranges, 0, $singleTranscript ? min(max(ceil($nRanges / $threads), $minSampleSize), scalar(@ranges)) : scalar(@ranges)) ] } 1 .. ($singleTranscript ? min(max(1, ceil($nRanges / $minSampleSize)), $threads) : 1);

    unlink(glob($baseName . "*"));
    store $split[$_], $baseName . $_ . ".tmp" for (0 .. $#split);

    return(map { $baseName . $_ . ".tmp" } 0 .. $#split);

}

sub countChimeraSupport {

    my @allPairs = @{$_[0]};
    my $ranges = retrieve($_[1]);

    my %pairs;

    foreach my $range (sort { $a->[3] <=> $b->[3] } @{$ranges}) {

        foreach my $pair (@allPairs) {

            if ($range->[3] < $pair->[0]) { last; }
            elsif ($range->[0] > $pair->[1]) { next; }
            elsif ($pair->[0] >= $range->[0] && $pair->[0] <= $range->[1] &&
                   $pair->[1] >= $range->[2] && $pair->[1] <= $range->[3]) { $pairs{$pair->[0]}->{$pair->[1]}++; }

        }

    }

    return(\%pairs);

}

sub mergeBpHashes {

    my ($hash1, $hash2) = @_;

    my $merge = {};

    foreach my $key (keys %{$hash1}) {

        if (ref($hash1->{$key}) eq "HASH") {

            my $h2 = exists $hash2->{$key} ? $hash2->{$key} : {};
            $merge->{$key} = mergeBpHashes($hash1->{$key}, $h2);

        }
        else { $merge->{$key} = $hash1->{$key} + $hash2->{$key}; }

        delete($hash2->{$key});

    }

    foreach my $key (keys %{$hash2}) { $merge->{$key} = $hash2->{$key}; }

    return($merge);

}

# Returns a table containing the probability of each cluster
# The probability is calculated as the n. of reads belonging to the cluster
# divided by the sum of the n. of reads of all the clsuters that are incompatible with
# the cluster in analysis
sub probTable {

    my @ranges = @_;

    my ($n, @weights, @compat, @denom,
        %probTable);
    @weights = map { $_->[4] } @ranges;
    $n = @ranges;

    # Precompute compatibility
    for my $i (0 .. $n - 1) {

        for my $j ($i .. $n - 1) {

            $compat[$i][$j] = $compat[$j][$i] = isChimeraCompatible($ranges[$i], $ranges[$j]);
        
        }
    
    }

    # Precompute denominators
    for my $i (0 .. $n - 1) {

        $denom[$i] = 0;

        for my $j (0 .. $n - 1) {

            $denom[$i] += $weights[$j] unless $compat[$i][$j];

        }

    }

    # Compute probabilities
    for my $i (0 .. $n-1) {

        my $prob = $weights[$i] / $denom[$i];

        next if ($weights[$i] < $minReads || $prob < $minProb);

        $probTable{$ranges[$i][-1]} = $prob;

    }

    return %probTable;

}


sub calcBaseProbs {

    my %probTable = %{$_[0]};
    my %basePairs = %{$_[1]};

    my (%baseProbs);

    foreach my $base1 (keys %basePairs) {

        foreach my $base2 (keys %{$basePairs{$base1}}) {

            for (keys %{$basePairs{$base1}->{$base2}}) { $baseProbs{$base1}->{$base2} += $probTable{$_} if (exists $probTable{$_}); }

        }

    }

    return(%baseProbs);

}

# Centroid start and end position are refined base on the first and last base-pair detected
sub refineBpAndCentroids {

    my @centroids = @{$_[0]};
    my $basePairs = $_[1];

    my (@newCentroids, %newBasePairs, %refined, %totReads);
    %totReads = map { $_->[5] => $_->[4] } @centroids;

    foreach my $base1 (keys %{$basePairs}) {

        foreach my $base2 (keys %{$basePairs->{$base1}}) {

            for (keys %{$basePairs->{$base1}->{$base2}}) {

                my $fracInCluster = $basePairs->{$base1}->{$base2}->{$_} / $totReads{$_};

                if ($fracInCluster <= $minChimeraFrac) { delete($basePairs->{$base1}->{$base2}->{$_}); }
                else {

                    $refined{$_}->{start1} = $base1 if (!exists $refined{$_}->{start1} || (exists $refined{$_}->{start1} && $refined{$_}->{start1} > $base1));
                    $refined{$_}->{end1} = $base1 if (!exists $refined{$_}->{end1} || (exists $refined{$_}->{end1} && $refined{$_}->{end1} < $base1));
                    $refined{$_}->{start2} = $base2 if (!exists $refined{$_}->{start2} || (exists $refined{$_}->{start2} && $refined{$_}->{start2} > $base2));
                    $refined{$_}->{end2} = $base2 if (!exists $refined{$_}->{end2} || (exists $refined{$_}->{end2} && $refined{$_}->{end2} < $base2));

                }

            }

            delete($basePairs->{$base1}->{$base2}) if (!keys %{$basePairs->{$base1}->{$base2}});

        }

        delete($basePairs->{$base1}) if (!keys %{$basePairs->{$base1}});

    }

    @newCentroids = map { [$refined{$_}->{start1}, $refined{$_}->{end1}, $refined{$_}->{start2}, $refined{$_}->{end2}, $totReads{$_}, $_] } sort {$a <=> $b} keys %refined;

    return(@newCentroids);

}

sub makeConstraint {

    my $sample = shift;
    my $id = shift;
    my %baseProbs = %{$_[0]};

    my ($path, @dotbracket);
    $path = $output . $sample . "/constraints/" . uc(substr($id, 0, 1)) . "/" . $id . ".db";
    @dotbracket = (".") x $transcripts{$id};

    foreach my $base1 (keys %baseProbs) {

        foreach my $base2 (keys %{$baseProbs{$base1}}) {

            if ($baseProbs{$base1}->{$base2} > $constraintThresh) {

                $dotbracket[$base1] = "(";
                $dotbracket[$base2] = ")";

            }

        }

    }

    return if (uniq(@dotbracket) == 1);

    open(my $ch, ">", $path);
    select((select($ch), $|=1)[0]);
    print $ch ">" . $id . "\n" .
              $sharedData{$id}->{sequence} . "\n" .
              join("", @dotbracket) . "\n";
    close($ch);

    return(join("", @dotbracket));

}

sub initCentroids {

    my @sortedRanges = sort { $a->[0] <=> $b->[0] ||
                              $a->[1] <=> $b->[1] } @_;

    my ($last, @centroids);
    $last = $sortedRanges[0];

    for (0 .. $#sortedRanges) {

        if ($sortedRanges[$_]->[0] > $last->[1]) {

            push(@centroids, $last);
            $last = $sortedRanges[$_];

        }


    }

    @centroids = ($last) unless (@centroids);

    return(uniq(@centroids));

}

sub clusterReads {

    my $id = shift;
    my @rangeChunks = @_;

    my (@pool, @allSamples, @i, @sampleClusters,
        @sampleCentroids);

    @pool = map { threads->create(\&clusterSample, $id, $_) } @rangeChunks;
    @allSamples = map { $_->join() } @pool;
    @sampleCentroids = map { @{$_->[0]} } @allSamples;
    @sampleClusters = map { @{$_->[1]} } @allSamples;

    if ($threads > 1 && $singleTranscript) {

        @i = sort { $sampleCentroids[$a]->[0] <=> $sampleCentroids[$b]->[0] } (0 .. $#sampleClusters);
        @sampleClusters = @sampleClusters[@i];
        @sampleCentroids = @sampleCentroids[@i];

        print CLRRET . "  [*] Transcript " . $id . ": merging from " . scalar(@sampleClusters) . " clusters [main thread]";

        for (my $i = 0; $i < $#sampleCentroids; $i++) {

            my (@overlap, @dists);

            for (my $j = $i + 1; $j < @sampleCentroids; $j++) {

                if ($sampleCentroids[$i]->[1] < $sampleCentroids[$j]->[0]) { last; }
                else { push(@overlap, $j); }

            }

            next if (!@overlap);

            @dists = map { rangeDist($sampleCentroids[$i], $sampleCentroids[$_]) } @overlap;

            if (min(@dists) < 1e308) {

                # to improve by iterating through all the argmin at the same moment, and pushing them in $i
                my $closeTo = $overlap[(argmin(@dists))[0]];
                push(@{$sampleClusters[$i]}, @{$sampleClusters[$closeTo]});
                $sampleCentroids[$i] = mergeRanges(@{$sampleClusters[$i]});
                splice(@sampleClusters, $closeTo, 1);
                splice(@sampleCentroids, $closeTo, 1);
                $i--;

            }

        }

    }

    return(sort { $a->[0]->[0] <=> $b->[0]->[0] } @sampleClusters);

}

sub clusterSample {

    my $id = $_[0];
    my $ranges = retrieve $_[1];

    return unless(@{$ranges});

    my (@centroids, @clusters, @i);

    while (@{$ranges}) {

        my ($kmin, $clusterer, @sample, @startCentroids);
        @sample = splice(@{$ranges}, 0, min(scalar(@{$ranges}), $minSampleSize));
        @startCentroids = initCentroids(@sample);
        $kmin = @startCentroids;
        $clusterer = Cluster::Kmeans->new( data          => \@sample,
                                           distfunc      => \&rangeDist,
                                           centroidfunc  => \&mergeRanges,
                                           maxiterations => $maxIter,
                                           refine        => 1 );
        $clusterer->cluster($kmin, \@startCentroids);
        push(@clusters, $clusterer->clusters());
        push(@centroids, $clusterer->centroids());

        { lock(%sharedData);
          $sharedData{$id}->{done} += @sample;

          print CLRRET . "  [*] Transcript " . $id . ": clustering, " . sprintf("%.2f", $sharedData{$id}->{done} / $sharedData{$id}->{total} * 100) . "\% done"; }

    }

    @i = sort { $centroids[$a]->[0] <=> $centroids[$b]->[0] } (0 .. $#clusters);
    @clusters = @clusters[@i];
    @centroids = @centroids[@i];

    print CLRRET . "  [*] Transcript " . $id . ": merging from " . scalar(@clusters) . " clusters [thread #" . threads->tid() . "]";

    for (my $i = 0; $i < $#centroids; $i++) {

        my (@overlap, @dists);

        for (my $j = $i + 1; $j < @centroids; $j++) {

            if ($centroids[$i]->[1] < $centroids[$j]->[0]) { last; }
            else { push(@overlap, $j); }

        }

        next if (!@overlap);

        @dists = map { rangeDist($centroids[$i], $centroids[$_]) } @overlap;

        if (min(@dists) < 1e308) {

            my $closeTo = $overlap[(argmin(@dists))[0]];
            push(@{$clusters[$i]}, @{$clusters[$closeTo]});
            $centroids[$i] = mergeRanges(@{$clusters[$i]});
            splice(@clusters, $closeTo, 1);
            splice(@centroids, $closeTo, 1);
            $i--;

        }

    }

    return([\@centroids, \@clusters]);

}

sub rangeDist {

    my ($x, $y) = @_;

    my ($r1x, $r2x, $r1y, $r2y,
        $i1, $i2);
    $r1x = abs(diff(@{$x}[0..1])) + 1;
    $r1y = abs(diff(@{$y}[0..1])) + 1;
    $r2x = abs(diff(@{$x}[2..3])) + 1;
    $r2y = abs(diff(@{$y}[2..3])) + 1;
    $i1 = intersect([@{$x}[0..1]], [@{$y}[0..1]]);
    $i2 = intersect([@{$x}[2..3]], [@{$y}[2..3]]);

    if ($i1 && $i2) {

        my ($ext1, $ext2, $over1, $over2);
        $ext1 = abs(diff(@{$i1})) + 1;
        $ext2 = abs(diff(@{$i2})) + 1;
        $over1 = $ext1 / min($r1x, $r1y);
        $over2 = $ext2 / min($r2x, $r2y);

        if ($over1 >= $minOverlapFrac && $over2 >= $minOverlapFrac) { return(1 - ($over1 * $over2)); }
        else { return(1e308); }

    }

    return(1e308);

}

sub isChimeraCompatible {

    my ($range1, $range2) = @_;

    return if (intersect([$range1->[0], $range1->[1]], [$range2->[0], $range2->[1]]) ||
               intersect([$range1->[2], $range1->[3]], [$range2->[2], $range2->[3]]) ||
               intersect([$range1->[2], $range1->[3]], [$range2->[0], $range2->[1]]) ||
               intersect([$range1->[0], $range1->[1]], [$range2->[2], $range2->[3]]) ||
               (intersect([$range1->[0], $range1->[3]], [$range2->[0], $range2->[1]]) && !intersect([$range1->[0], $range1->[3]], [$range2->[2], $range2->[3]])) ||
               (intersect([$range1->[0], $range1->[3]], [$range2->[2], $range2->[3]]) && !intersect([$range1->[0], $range1->[3]], [$range2->[0], $range2->[1]])));

    return(1);

}

sub clustersSam {

    my $sample = shift;
    my $id = shift;
    my @reads = @{$_[0]};

    my ($path, $tmpPath);
    $tmpPath = $tmpDir . $sample . "_" . $id . "_clustered";
    $path = $output . $sample . "/clustered/" . uc(substr($id, 0, 1)) . "/" . $id . ".bam";

    open(my $bh, ">", $tmpPath . ".sam") or die "\n  [!] Error: Unable to write temporary clustered SAM file (" . $! . ")\n\n";
    select((select($bh), $|=1)[0]);

    print $bh "\@HD\tVN:1.4\tSO:coordinate\n" .
              "\@SQ\tSN:" . $id . "\tLN:" . $transcripts{$id} . "\n";

    for my $i (0 .. $#reads) {

        my ($readgroup, $lenFirstHalf, $lenSecondHalf, $cigar, 
            @range1, @range2);
        @range1 = ($reads[$i]->[0] + 1, $reads[$i]->[1] + 1);
        @range2 = ($reads[$i]->[2] + 1, $reads[$i]->[3] + 1);
        $readgroup = $reads[$i]->[-1];
        $lenFirstHalf = $range1[1] - $range1[0] + 1;
        $lenSecondHalf = $range2[1] - $range2[0] + 1;
        $cigar = "${lenFirstHalf}M" . ($range2[0] - $range1[1] - 1) . "N${lenSecondHalf}M";

        print $bh "read#" . $i . "\t0\t" . $id . "\t" . $range1[0] . "\t255\t" . $cigar .
                  "\t*\t0\t0\t" . substr($sharedData{$id}->{sequence}, $range1[0] - 1, $lenFirstHalf) . substr($sharedData{$id}->{sequence}, $range2[0] - 1, $lenSecondHalf) . "\t" .
                  ("F" x (($range1[1] - $range1[0] + 1) + ($range2[1] - $range2[0] + 1))) . "\tXG:i:" . $readgroup . "\n";

    }

    close($bh);

    my ($cmd, $ret);
    $cmd = "$samtools sort --threads $threads -O BAM -o \"$path\" -T \"$tmpPath\" \"$tmpPath.sam\" 2>&1";
    $ret = `$cmd`;
    print "\n  [!] Error: Sort of clustered BAM file failed. Skipping transcript...\n" if ($? == -1 || $? & 127 || $ret =~ /(?:\[E::sam_parse|Aborting|Error)/i);

    $cmd = "$samtools index -@ $threads \"$path\" 2>&1";
    $ret = `$cmd`;
    print "\n  [!] Error: Indexing of clustered BAM file failed. Skipping transcript...\n" if ($? == -1 || $? & 127 || $ret =~ /(?:\[E::sam_parse|Aborting|Error)/i);

    unlink($tmpPath . ".sam");

}

sub dotPlot {

    my $sample = shift;
    my $id = shift;
    my %baseProbs = %{$_[0]};

    return if (!keys %baseProbs);

    my $path = $output . $sample . "/dotplot/" . uc(substr($id, 0, 1)) . "/" . $id . ".dp";

    open(my $dh, ">", $path);
    select((select($dh), $|=1)[0]);

    print $dh $transcripts{$id} . "\n" .
              "i\tj\t-log10(Probability)";

    foreach my $base1 (sort {$a <=> $b} keys %baseProbs) {

        foreach my $base2 (sort {$a <=> $b} keys %{$baseProbs{$base1}}) {

            print $dh "\n" . ($base1 + 1) . "\t" . ($base2 + 1) . "\t" . -logarithm($baseProbs{$base1}->{$base2}, 10);

        }

    }

    close($dh);

}

sub plotImage {

    my $plot = shift;
    my $dir = shift;
    my $id = shift;
    my $structure = shift;
    my %baseProbs = %{$_[0]};

    return if (!keys %baseProbs);

    my ($probsPlot, $maxProbBp, @pairs, @plots, 
        @heights);
    $plot->file("$dir/$id.pdf");

    foreach my $base1 (keys %baseProbs) {

        foreach my $base2 (keys %{$baseProbs{$base1}}) {

            my ($prob, $shannon, @rgb);
            $prob = $baseProbs{$base1}->{$base2};

            next if ($prob < $colorScale[0]);

            push(@pairs, [$base1, $base2, $prob]);

        }

    }

    @pairs = sort { $a->[2] <=> $b->[2] } @pairs;
    $maxProbBp = max(map { abs(diff(@$_)) } @pairs);
    $probsPlot = Graphics::Chart::Arcs->new( data            => [ map { [ @$_[0, 1] ] } @pairs ],
                                             dataLabels      => { "prob" => [ map { $_->[2] < $colorScale[1] ? $colorLegend[0] : ($_->[2] < $colorScale[2] ? $colorLegend[1] : ($_->[2] < $colorScale[3] ? $colorLegend[2] : $colorLegend[3])) } @pairs ] },
                                             flip            => "up",
                                             fill            => "prob",
                                             legendKeyWidth  => 14,
                                             legendKeyHeight => 4,
                                             legendColors    => { $colorLegend[0] => "#979797",
                                                                  $colorLegend[1] => "#ffcc07",
                                                                  $colorLegend[2] => "#478ecc",
                                                                  $colorLegend[3] => "#58b14a" },
                                             legendSort      => \@colorLegend,
                                             xLimit          => [0, $transcripts{$id} + 1],
                                             lineThickness   => 0.2,
                                             legendTitle     => "Probability",
                                             legend          => 1,
                                             background      => 0,
                                             grid            => 0,
                                             xTicks          => 1,
                                             yTicks          => 0,
                                             xLabels         => 1,
                                             yLabels         => 0,
                                             legendTextSize  => 12,
                                             legendTitleSize => 14,
                                             axisTitleSize   => 14,
                                             labelTextSize   => 12,
                                             xTitle          => "Position (nt)" );

    push(@plots, $probsPlot);
    push(@heights, $maxProbBp / $transcripts{$id});

    if (defined $structure && $structure !~ /^\.+$/) {

        my ($structPlot, $maxBp, @structPairs);
        @structPairs = listpairs($structure);
        $maxBp = max(map { abs(diff(@$_)) } @structPairs);
        $structPlot = Graphics::Chart::Arcs->new( data            => \@structPairs,
                                                  dataLabels      => { "fill" => [ ("black") x @structPairs ] },
                                                  flip            => "down",
                                                  fill            => "fill",
                                                  legendColors    => { "black" => "#000000" },
                                                  xLimit          => [0, $transcripts{$id} + 1],
                                                  lineThickness   => 0.2,
                                                  legend          => 0,
                                                  background      => 0,
                                                  grid            => 0,
                                                  xTicks          => 0,
                                                  yTicks          => 0,
                                                  xLabels         => 0,
                                                  yLabels         => 0,
                                                  legendTextSize  => 12,
                                                  legendTitleSize => 14,
                                                  axisTitleSize   => 14,
                                                  labelTextSize   => 12,
                                                  xTitle          => "Constraint" );

        push(@plots, $structPlot);
        push(@heights, $maxBp / $transcripts{$id});

    }

    $plot->height(max(2, sum(map { $_ * 8 } @heights)));
    $plot->plot([ map { [$_] } @plots ], { heights => \@heights});

}

sub foldChimeras {

    my $id = $_[0];
    my $ranges = retrieve($_[1]);

    my (%basePairs);

    while (my $range = shift(@{$ranges})) {

        my ($seq1, $seq2, $end1, $start2,
            @pairs);
        $end1 = $range->[2] <= $range->[1] ? $range->[2] : $range->[1];
        $start2 = $range->[2] <= $range->[1] ? $range->[1] : $range->[2];
        $seq1 = substr($sharedData{$id}->{sequence}, $range->[0], $end1 - $range->[0] + 1);
        $seq2 = substr($sharedData{$id}->{sequence}, $start2, $range->[3] - $start2 + 1);

        if (!$noUseVienna) {

            my @fold = RNA::fold(join("GAAA", $seq1, $seq2), join("xxxx", ("<" x length($seq1)), (">" x length($seq2))));
            RNA::free_arrays();
            @pairs = listpairs($fold[0]);

        }
        else { @pairs = basePairsFromAln($seq1, $seq2); }

        next if ((@pairs * 2) / length($seq1 . $seq2) < $minPairedFrac);

        foreach my $pair (@pairs) {

            $pair->[0] += $range->[0];
            $pair->[1] = $pair->[1] - 4 - length($seq1) + $start2; # 4 is the length of the fake GAAA tetraloop

            $basePairs{$pair->[0]}->{$pair->[1]}->{$range->[5]} += $range->[4]; # Saves the cluster that base-pair belongs to

        }

    }

    return(\%basePairs);

}

# Quick-and-dirty alternative to the ViennaRNA fold()
sub basePairsFromAln {

    my ($seq1, $seq2) = @_;

    my ($aligner, $aln, $qEnd, $rStart,
        $alnLen, @pairs);

    my $aligner = Align::SmithWaterman->new( reference => $seq1,
                                             query     => rev($seq2),
                                             scoring   => { AT => 2,
                                                            CG => 3,
                                                            GT => 1 },
                                             mismatch  => -1,
                                             gOpen     => 0,
                                             gExt      => -1 );
    $aligner->align();
    $aln = $aligner->read();
    $qEnd = length($seq2) - $aln->queryStart() - 1 + 4 + length($seq1); # the fake GAAA tetraloop + the first half of the chimera
    $rStart = $aln->refStart();
    $alnLen = length($aln->alignment());

    for my $i (0 .. $alnLen - 1) {

        if (substr($aln->alignment(), $i, 1) ne " ") {

         push(@pairs, [$rStart, $qEnd]) if (substr($aln->alignment(), $i, 1) eq "|");
         $qEnd--;
         $rStart++;

        }
        elsif (substr($aln->reference(), $i, 1) eq "-") { $qEnd--; }
        elsif (substr($aln->query(), $i, 1) eq "-") { $rStart++; }

    }

    return(@pairs);

}

sub parseSam {

    my $file = shift;
    my $id = shift;

    my (@ranges);

    if (open(my $fh, "-|", "samtools view " . $file . " " . $id . " 2>&1")) {

        while(<$fh>) {

            my ($range1, $range2, $range, @line,
                @cigar);

            chomp();
            @line = split /\t/;

            next if (($line[5] =~ tr/N/N/) != 1); # Skip reads with less or more than 1 gap
            next if ($line[1] & 16); # Skip reads aligning to reverse strand

            @cigar = parseCigar($line[5]);
            $range = [ $line[3] - 1, $line[3] - 1 + $cigar[0] - 1,
                       $line[3] - 1 + $cigar[0] + $cigar[1], $line[3] - 1 + $cigar[0] + $cigar[1] + $cigar[2] - 1 ];

            push(@ranges, $range) if (abs(diff($range->[1] + 1, $range->[2] - 1)) + 1 >= $minGap &&
                                      abs(diff(@{$range}[0..1])) + 1 >= $minHalfLen &&
                                      abs(diff(@{$range}[2..3])) + 1 >= $minHalfLen);

        }
        close($fh);

        @ranges = uniq(@ranges);

    }
    else { print "\n  [!] Error: Failed to read from BAM file (" . $! . ")"; }

    return(@ranges);

}

sub parseChimeric {

    my $file = shift;
    my $offset = shift;
    my $id = shift;

    my (@ranges);

    if (open(my $fh, "<", $file)) {

        seek($fh, $offset, SEEK_SET);

        while(<$fh>) {

            my ($range1, $range2, $range, @line);

            chomp();
            @line = split /\t/;

            last if ($line[0] ne $id);
            next if ($line[0] ne $line[3]); # Only keeps intramolecular base-pairs
            next if ($line[2] ne "+" ||
                     $line[5] ne "+");      # Only keeps reads mapping same strand of the transcript
            next if ($line[13] =~ m/N/);    # Skips chimeric reads with gaps

            # This should never happen, and its a bug in STAR (already reported to A. Dobin)
            next if (!$line[12]); 

            $range1 = [$line[10] - 1, $line[1] - 2];
            $range2 = [$line[12] - 1, $line[12] - 1 + parseCigar($line[13]) - 1];

            # Should never happen
            # next if (intersect($range1, $range2));

            ($range1, $range2) = ($range2, $range1) if ($range1->[0] > $range2->[1]); # Swap intervals if second interval comes first

            $range = [ @{$range1}, @{$range2} ];

            push(@ranges, $range) if (diff($range->[2] - 1, $range->[1] + 1) + 1 >= $minGap &&
                                      abs(diff(@{$range}[0..1])) + 1 >= $minHalfLen &&
                                      abs(diff(@{$range}[2..3])) + 1 >= $minHalfLen);

        }

        close($fh);

        @ranges = uniq(@ranges);

    }
    else { print "\n  [!] Error: Failed to read from chimeric junction file (" . $! . ")"; }

    return(@ranges);

}

sub parseCigar {

    my $cigar = shift;

    my ($gap, @cov, @cigar);
    $gap = 0;
    @cov = (0, 0);

    # Remove clipped bases
    $cigar =~ s/\d+[SH]//g;

    if ($cigar =~ m/(\d+)N/) {

        $gap = $1;
        @cigar = split($gap . "N", $cigar);

    }
    else { @cigar = ($cigar); }

    for (0 .. $#cigar) {

        while ($cigar[$_] =~ m/^(\d+)([MIDP=X])/) {

            my ($n, $op) = ($1, $2);
            $cov[$_] += $n if ($op =~ m/^[DNM=X]$/);

            $cigar[$_] =~ s/^$n$op//;

        }

    }

    return($gap ? ($cov[0], $gap, $cov[1]) : $cov[0]);

}

sub collapse {

    my ($ranges, $length) = @_;

    my $collapsed = mergeRanges(@{$ranges});
    $length--;

    # Fixes end if exceedes transcript length
    $collapsed->[1] = min($collapsed->[1], $length);
    $collapsed->[3] = min($collapsed->[3], $length);

    return(@{$collapsed}, scalar(@{$ranges}));

}

# Given a set of reads, this function finds the midpoint of maximum enrichment on both sides,
# then enlarges it up/down by 1/2 the median size of each read half
# Used for both collapsing of reads into read groups and centroid calculation in Kmeans
sub mergeRanges {

    my @ranges = @_;

    return($ranges[0]) if (@ranges == 1);

    my ($maxr1, $maxr2, $start1, $start2,
        $halfmedianr1, $halfmedianr2, @r1, @r2);
    $halfmedianr1 = round(median(map { abs(diff(@{$_}[0..1])) + 1 } @ranges) / 2);
    $halfmedianr2 = round(median(map { abs(diff(@{$_}[2..3])) + 1 } @ranges) / 2);
    $start1 = min(map { $_->[0] } @ranges);
    $start2 = min(map { $_->[2] } @ranges);

    foreach my $range (@ranges) {

        $r1[$_]++ for ($range->[0] - $start1 .. $range->[1] - $start1);
        $r2[$_]++ for ($range->[2] - $start2 .. $range->[3] - $start2);

    }

    @r1 = map { defined $_ ? $_ : 0 } @r1;
    @r2 = map { defined $_ ? $_ : 0 } @r2;

    $maxr1 = round(mean(argmax(@r1))) + $start1;
    $maxr2 = round(mean(argmax(@r2))) + $start2;

    return([ max(0, $maxr1 - $halfmedianr1), $maxr1 + $halfmedianr1,
             max(0, $maxr2 - $halfmedianr2), $maxr2 + $halfmedianr2 ]);

}

sub fileTypeAndSorting {

    my $file = shift;

    my ($type, $sorted, $fail);
    $sorted = 1;

    if ($file =~ m/\.([bs]am)$/i) { $type = uc($1); }
    else {

        my ($header, $eof, @data);
        $header = "\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00";
        $eof = "\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00";

        open(my $fh, "-|", $samtools . " view " . $file . " 2>&1") or die "  [!] Error: Unable to open sample \"" . $file . "\" (" . $! . ")\n\n";
        for (1 .. 10) {

            my ($row, @row);

            $row = <$fh>;
            @row = split("\t", $row);

            if (@row < 12 || !isint($row[3]) || !isdna($row[9])) {

                $fail = 1;

                last;

            }

        }
        close($fh);

        if (!$fail) {

            open($fh , "<:raw", $file);
            read($fh, $data[0], 16);
            seek($fh, -28, SEEK_END);
            read($fh, $data[1], 28);
            close($fh);

            if ($data[0] eq $header && $data[1] eq $eof) { $type = "BAM"; }
            else {

                $type = "SAM";
                $sorted = 0;

            }

        }

    }

    if ($type eq "BAM") {

        open(my $fh, "-|", $samtools . " view -H " . $file . " | grep '^\@HD'");

        if (my $row = <$fh>) { $sorted = 0 if ($row !~ m/SO:coordinate/); }
        else { $sorted = 0; }

        close($fh);

    }
    elsif ($fail) { # Not a SAM/BAM, let's try chimeric reads file

        my ($lastId);
        $fail = 0;

        open(my $fh, "<", $file) or die "  [!] Error: Unable to open sample \"" . $file . "\" (" . $! . ")\n\n";
        for (1 .. 100) {

            my ($row, @row);

            last if (eof($fh));

            $row = <$fh>;
            chomp($row);

            next if (!defined $row);

            @row = split("\t", $row);

            next if ($row[0] eq "chr_donorA");

            if (@row < 13 || !isint($row[1], $row[4]) || $row[2] !~ /^[\+-]$/) {

                $fail = 1;

                last;

            }

            if ($row[0] ne $lastId) {

                $lastId = $row[0] if (!defined $lastId || $lastId lt $row[0]);

                if ($lastId gt $row[0]) {

                    $sorted = 0;

                    last;

                }

            }

        }
        close($fh);

        die "  [!] Error: Sample file \"" . $file . "\" is not a SAM/BAM file, nor a chimeric junction file\n\n" if ($fail);

        $type = "chimeric";

    }

    return($type, $sorted);

}

sub help {

    print "\n  [!] Error: Invalid option. Please check the help\n" if ($_[0]);

    die <<HELP;

 RF Duplex (v$Core::Utils::VERSION)
 RNA Framework [http://www.rnaframework.com]

 Author:  Danny Incarnato (dincarnato[at]rnaframework.com)
 Summary: Analyzes direct RNA-RNA interaction capture experiments

 Usage:   rf-duplex [Options] Sample1.bam,Chimeric1.out.junction ... SampleN.bam,ChimericN.out.junction
          rf-duplex [Options] sample1:Sample1.bam,Chimeric1.out.junction ... sampleN:SampleN.bam,ChimericN.out.junction   # With sample labels

 Options                                        Description
 -p  or --processors             <int>          Number of processors to use (Default: 1)
 -st or --single-transcript                     Enables multithreading optimization for single trancript analysis
                                                Note: when active, a single transcript at a time will be analyzed, and multiple processors
                                                      will be used to speed-up time-comsuming operations such as read clustering and
                                                      base-pair analysis. This is useful, for example, for the analysis of COMRADES
                                                      experiments, in which a single target transcript is analyzed at a high sequencing depth
 -o  or --output-dir             <string>       Output directory (Default: rf_duplex/)
 -ow or --overwrite                             Overwrites output directory (if the specified path already exists)
 -f  or --fasta                  <string>       Path to a FASTA file containing the reference transcripts
                                                Note: Transcripts in this file must match transcripts in SAM/BAM file headers
 -wl or --whitelist              <string>       A whitelist containing transcript IDs (one per each row) to restrict the analysis to (optional)
 -mg or --min-gap                <int>          Minimum gap length (in nt) to consider a chimeric read (>0, Default: 1)
 -mh or --min-half               <int>          Minimum length (in nt) of each half of a chimeric read to consider it (>0, Default: 20)
 -xr or --max-reads              <int>          Maximum number of chimeric reads to analyze for each transcript (>0, Default: 100000)
                                                Note: all chimeric reads are first imported, then a random sample is extracted
 -c  or --constraint                            Generates a dot-bracket constraint file containing the inferred base-pairs whose probability
                                                exceeds a given threshold (controlled by -ct or --constraint-theshold)
 -ct or --constraint-theshold    <float>        Sets the probability threshold to include a base-pair in the constraint (0.5-1, Default: 0.5)
 -dp or --dot-plot                              Calculates 
 -es or --eval-structures        <string>       Path to a folder of structure files (in .db or .ct format), for which the number of chimeric reads supporting
                                                each base-pair will be calculated
                                                Note: structure files must be named after their respective reference transcript
 -eo or --eval-only                             Only calculates the chimeric read support for a reference structure (requires -es), and exits
 -dc or --dump-chimeras                         Dumps extracted chimeric reads to file
 -do or --dump-chimeras-only                    Only dumps chimeric reads to file (requires -dc), and exits

 |
 +- Base-pairs inference options (requires -dp or -c)
 -nv  or --no-use-vienna                        A modified Smith-Waterman local alignment algorithm is used instead of the ViennaRNA package to
                                                infer base-pairs from chimeric reads
 -mr  or --min-reads             <int>          Minimum number of chimeric reads to retain a cluster (>0, Default: 2)
 -mpf or --min-paired-frac       <float>        Minimum fraction of paired bases in a chimeric read to retain it (>0-1, Default: 0.25)
 -mp  or --min-prob              <float>        Minimum probability to retain an inferred base-pair (>0-1, Default: 0.01)
 -rmc or --require-min-chimera                  Instead of being inferred from the cluster's centroid, base-pairs are inferred from every single
                                                chimeric read belonging to the cluster, and only base-pairs common to a certain fraction of the
                                                reads in the cluster (controlled by -mcf or --min-chimera-frac), are retained
                                                Note: if no base-pair is supported by at least -mcf chimeric reads, then the cluster is discarded
 -mcf or --min-chimera-frac      <float>        Sets the minimum fraction of chimeric reads supporting a base-pair (requires -rmc; 0.5-1, Default: 0.5)

 |
 +- Plotting options
 -hm  or --heatmap                              Generates contact heatmaps from chimeras (requires R)
 -ho  or --heatmap-only                         Only generates heatmaps (requires -hm), and exits
 -lhm or --log-heatmap                          Read counts are log-transformed for heatmap generation (requires -hm)
 -ghm or --grey-heatmap                         Heatmap will be colored in grey scale
 -g   or --img                                  Generates plots of the inferred base-pairing probabilities (if -dp is passed) and of the inferred 
                                                structure/constraint (if -c is passed) (requires R)
 -cs  or --color-scale            <string>      Allows specifying 4 cut-points for the color scale used to report base-pairing probabilities
                                                (Default: 0.05,0.1,0.4,0.7 for 0.05-0.1:grey, 0.1-0.4:yellow, 0.4-0.7:blue, 0.7-1:green)
 -R   or --R-path                 <string>      Path to R executable (Default: assumes R is in PATH)

 |
 +- Clustering options
 -cb  or --make-clusters-bam                    Generates a BAM file for each transcript being analyzed, with chimeric reads belonging to the same
                                                cluster grouped together via the XG tag
 -mss or --min-sample-size       <int>          Minimum number of reads to be considered for each iteration of mini-batch clustering (>0, Default: 1000)
 -mo  or --min-overlap-frac      <float>        Minimum fractional overlap between the two halves of two chimeric reads to trigger clustering (>0-1,
                                                Default: 0.05)
 -mi  or --max-iterations        <int>          Maximum number of iterations for K-means (>0, Default: 5)

HELP

}
