#!/usr/bin/env perl
use 5.32.0;
use strict;
use warnings;
use Path::Tiny;
use File::Which qw"which";
use List::Util qw"min max";
use List::MoreUtils qw"uniq minmax";
use File::Spec; # still need for devnull()
use Data::Dumper;
use Storable qw"dclone";
use FindBin;

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# global variables

my $VERSION = "1.10.5";
my $EXE = $FindBin::RealScript;
my $DESC = "Rapid microbial RNA annotationn";
my $AUTHOR = 'Torsten Seemann';
my $URL = 'https://github.com/tseemann/barrnap';
my $DEFAULT_DBDIR = path("$FindBin::RealBin/../db")->realpath;
my $BUILD_DIR = path("$FindBin::RealBin/../build")->realpath;
my $DEVNULL = File::Spec->devnull;
my $DOT = '.';
my @KINGDOM = qw"bac arc fun";
my %KINGDOM = (map { $_ => 1 } @KINGDOM);
my %GCODE = ('bac'=>11, 'arc'=>25, 'fun'=>1);
my %SNAME = (
  'RF00001' => "5S",  'RF00002' => "5.8S",
  'RF00177' => "16S", 'RF01959' => "16S",
  'RF01960' => "18S", 'RF02540' => "23S",
  'RF02541' => "23S", 'RF02543' => "28S",
);
# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# constants for GFF3
use constant {
  SEQ_ID=>0, SOURCE=>1, FTYPE=>2,
  START=>3, FINISH=>4, SCORE=>5, STRAND=>6,
  PHASE=>7, ATTR=>8,
};
# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# command line options

my(@Options, 
   $quiet, $kingdom, $threads, 
   $legacy, $all,
   $fast, $debug, $gcode,
   $rrna, $trna, $ncrna, $mrna,
   $operon, , $opdist, $addids, 
   $listdb, $updatedb, $dbdir,
   $evalue, $lencutoff, $reject,
   $incseq, $incseqreg, $outseq, 
);
setOptions();

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# check all is well

#-d $dbdir or err("--dbdir '$dbdir' does not exist");

$listdb and list_dbs($dbdir);
$updatedb and update_dbs($dbdir);

@ARGV or err("No input sequence files provided");

$rrna || $trna || $ncrna || $mrna or err("Must choose at least one of --rrna --trna --ncrna --mrna");
$threads > 0 or err("Invalid --threads $threads");
$evalue > 0 or err("Invalid --evalue $evalue");
#$lencutoff > 0 or err("Invalid --lencutoff $lencutoff");
#$reject > 0 or err("Invalid --reject cutoff $reject");

exists $KINGDOM{$kingdom} or
  err("Imvalid --kingdom '$kingdom'. CHoose from: @KINGDOM");

if ($gcode eq 'auto' or not $gcode) {
  $gcode = $GCODE{$kingdom};
}
msg("Using genetic code table $gcode");

msg("This is $EXE $VERSION by $AUTHOR");
msg("Obtained from $URL");

if ($legacy) {
  msg("Switching to --legacy barrnap < 1.0 mode");
  $rrna=1;
  $trna = $ncrna = $operon = $mrna = 0;
}
if ($all) {
  msg("Switching on --all features!");
  $rrna = $trna = $ncrna = $operon = $mrna = 1;
}
msg("Checking dependencies:");
require_exe(
  'cmsearch', 'cmscan', 'bedtools', 'aragorn', 
  'any2fasta', 'seqkit', 'make', 'transterm',
  'ziggypep', 'pyrodigal', 'diamond', 'taxonkit'
);

# Check all input files and make
my @infile;
for my $infile (uniq @ARGV) {
  $infile = '/dev/stdin' if $infile eq '-';
  -r $infile or err("Can't read file: $infile");
  push @infile, quotemeta File::Spec->rel2abs($infile);
}
@infile or err("No input files provided.");
#dbg( Dumper(\@infile) );

# Make a temp folder for all intermediate files
my $tempdir = $debug ? $DOT : Path::Tiny->tempdir(CLEANUP=>1);
my $origdir = Path::Tiny->cwd;
chdir($tempdir);

# Write a single combined FASTA file
my $FASTA = File::Spec->rel2abs("$EXE.fasta");
my($a2f_opt,$sk_opt) = $quiet ? ('-q','--quiet') : ('','');
run_cmd("any2fasta -k $a2f_opt @infile > $FASTA.tmp");
run_cmd("seqkit seq $sk_opt -g --min-len 1 -u -w 60 $FASTA.tmp > $FASTA");
-s $FASTA or err("No sequences found in input");
run_cmd("rm -f $FASTA.fai"); # for --debug case of tmp=.
run_cmd("seqkit faidx $sk_opt $FASTA");

# Loop over the RNA types chosen to scan
my @gff;
if ($trna) {
  my $toolver = aragorn_version();
  my $ara_file = run_aragorn($FASTA, $gcode);
  push @gff, parse_aragorn($ara_file, $toolver);
}
if ($rrna) {
  my $tool = 'cmsearch';
  my $tbl_file = run_infernal($tool, $FASTA, $kingdom, 'rRNA');
  push @gff, parse_infernal($tool, $tbl_file, 'rRNA');;
}
#if ($operon) {
#  @gff = sort_gff(\@gff); # for bedtools
#  push @gff, find_operons(\@gff);
#}
if ($mrna) {
  push @gff, find_genes($FASTA, $kingdom, $gcode);
}
if ($ncrna) {
  my $exe = 'cmscan';
  my $tbl_file = run_infernal($exe, $FASTA, $kingdom, 'ncRNA');
  push @gff,parse_infernal($exe, $tbl_file, 'ncRNA');
}

@gff = sort_gff(\@gff);
gff_add_ids(\@gff) if $addids;

chdir($origdir);
print_gff(\*STDOUT, \@gff, $incseq, $incseqreg);

my %fcount;
map { $fcount{$_->[FTYPE]}++ } @gff;
msg("Found", 0+@gff, "features:");
for my $f (sort keys %fcount) {
  msg( $fcount{$f}, $f );
}

msg( motd() ); # final traditional witticism

msg("Done.");

#----------------------------------------------------------------------
sub rbs_spacer_length {
  my($s) = @_; # pyrodgial rbs_spacer
  return $s =~ m/(\d+)bp/ ? $1 : 0;
}
#----------------------------------------------------------------------
sub rbs_length {
  my($s) = @_; # pyrodigal rbs_motif
  return 0 if $s =~ /None/i;
  return $1 if $s =~ /^(\d+)/;
  $s =~ s|^.*/||;
  return length($s);
}
#----------------------------------------------------------------------
# this is naive code, ignores %XX encoding
# should use URI::Escape::unescape ?
sub attr_to_hash {
  my($attrs) = @_;
  my @kv = split m';', $attrs;
  return { map { split m/=/,$_,2 } @kv }
  ;
}
#----------------------------------------------------------------------
sub cds_to_mrna {
  my($cds) = @_;

  my $mrna = dclone($cds);
  $mrna->[SOURCE] = "$EXE:$VERSION";
  $mrna->[FTYPE] = 'mRNA';
  $mrna->[SCORE] = $DOT;
  $mrna->[PHASE] = $DOT; # no phase for mRNA

  my $attr = attr_to_hash( $mrna->[ATTR] );
  my $rbs_len = rbs_length( $$attr{'rbs_motif'} );
  my $rlen = $rbs_len + rbs_spacer_length( $$attr{'rbs_spacer'} );
  
  if ($mrna->[STRAND] eq '+') {
    $mrna->[START] -= $rlen;
  } else {
    $mrna->[FINISH] += $rlen;
  }
  $mrna->[ATTR] = 'product=messenger RNA';
  
  if ($rlen > 0) {
    $mrna->[ATTR] .= " with RBS";
  
    # add the RBS
    my $rbs = dclone($cds);
    $rbs->[FTYPE] = 'RBS';
    $rbs->[PHASE] = $DOT;
    if ($rbs->[STRAND] eq '+') {
      $rbs->[START] = $mrna->[START];
      $rbs->[FINISH] = $mrna->[START] + $rbs_len - 1;
    } else {
      $rbs->[FINISH] = $mrna->[FINISH] + $rbs_len - 1;
      $rbs->[START] = $mrna->[FINISH];
    }
    $rbs->[ATTR] = "product=ribosome binding site "
                 . $attr->{'rbs_motif'} ;
    return ($mrna, $rbs);
  }
  return ($mrna);
}
#----------------------------------------------------------------------
sub find_terminators {
  my($fasta, $cds) = @_;

  my $ttexe = 'transterm';
  my $termi = "$EXE.$ttexe.out";
  my $coords = "$EXE.$ttexe.coords";
  my$ttver = transterm_version();

  my $geneid = 0;
  my @coord = map { tsv(
    ++$geneid,
    $_->[STRAND] eq '+' ? $_->[START] : $_->[FINISH],
    $_->[STRAND] eq '+' ? $_->[FINISH] : $_->[START],
    $_->[SEQ_ID],
  ); } @$cds;
  path($coords)->spew(\@coord);
  
  my $datfile = path(which($ttexe))
                ->parent
                ->child("../data/expterm.dat")
                ->realpath;
  $datfile->is_file
    or err("Can't locate 'expterm.dat' for $ttexe");

  run_cmd("$ttexe -S -p $datfile $FASTA $coords 1> $termi 2> $termi.log");

  #       0     1     2 3      4      5     6
  #  TERM 19    15310 - 15327  -      F     99     - 12.7 -4.0 |bidir  
  #       (name)(start - end)  (sense)(loc) (conf) (h p) (tail) (notes)
  my @term;
  my %term;
  my $seqid;
  $geneid = undef;
  foreach ( path($termi)->lines({chomp=>1}) ) {
    if (m/^SEQUENCE\s+(\S+)/) {
      $seqid = $1;
    }
    elsif ($seqid and m/^(\d+)/) {
      $geneid = $1;
    }
    elsif (m/^\s*TERM\s+/) {
      # FIXME - we need to collate all the hits
      # and choose the best one in each section
      # they ar ein coord order not in score order
      my(undef,@x) = split ' ';
      my $term = [
        $seqid, $ttver, 'terminator',
        min($x[1], $x[3]),
        max($x[1], $x[3]),
        $x[6], # score
        $x[4], # straind
        $DOT,  # phase
        'product=Rho-independent terminator',      
      ];
      push $term{ $geneid }->@*, $term;
    }
  }
  # find best terminator in each block
  for my $id (keys %term) {
    my @t = sort { $b->[SCORE] <=> $a->[SCORE] } 
      $term{$id}->@*;
    push @term, $t[0];
  }
  msg("Found",0+@term,"Rho-independent terminator");
  return @term;
}
#----------------------------------------------------------------------
sub find_genes {
  my($fasta, $kdom, $ttable) = @_;
  
  # check if we have enough sequence
  if (-s $fasta < 19_000) {
    wrn("Skipping mRNA search - sequence too short");
    return;
  }
  
  # first find mRNA 
  my $gff = "$EXE.pyrodigal.gff";
  my $faa = "$EXE.pyrodigal.faa";
  run_cmd(
    "pyrodigal -g $ttable -c -p single -j $threads" 
   ." -i $fasta -f gff -o $gff -a $faa"
  );

  my @cds;
  my @mrna;
  my %idx_of;
  foreach ( path($gff)->lines({chomp=>1}) ) {
    next if m/^#/;
    my @orf = split m/\t/;
    $orf[SOURCE] =~ s/_v/:/;
    push @mrna, cds_to_mrna(\@orf);
    # keep track of the original
    if ( $orf[ATTR] =~ m/ID=([^;]+)/ ) {
      $idx_of{$1} = $#cds + 1;
    }
    $orf[ATTR] = "productr=hypothetical protein";
    push @cds, [ @orf ];
  }

  my @term = find_terminators($fasta, \@cds);
  
  # nake a CDS for each mRNA
  for my $i (0 .. $#cds) { 
    # vvv NFI why i did this originally
    #$mrna[$i][ATTR] = ''; # blank out mRNA attrs
    my $c = $cds[$i];
    ## $$c[SOURCE] = $cds_tool; # fix tool
    $c->[FTYPE] = 'CDS';     # ftype
    $c->[PHASE] = 0;         # phase=0
    $c->[ATTR]  = "product=hypothetical protein";
  }
  #dmp(\%idx_of);
  # Now blast the predicted proteins
  my $cds_tool = diamond_version();
  my $bls = "$EXE.diamond.tsv";
  run_cmd(
    "diamond blastp -e $evalue -k 1"
   ." --quiet -p $threads"
   ." -f 6 qseqid evalue stitle"
   ." -q $faa -o $bls -d $dbdir/$kdom/$kdom.dmnd"
  );
  # K12_1 (# 337 # 2799 # 1 # ID=1_1;partial=00;start_type=ATG;rbs_motif=GGAG/GAGG;rbs_spacer=5-10bp;gc_cont=0.531)
  # 2.62e-119
  # sP:P0ABQ4 Dihydrofolate reductasee
  foreach ( path($bls)->lines({chomp=>1}) ) {
    #dbg("DIAMOND: $_");
    my($qid,$score,$stitle) = split m/\t/;
    my $c = $cds[ $idx_of{$qid} ];
    my($acc,$prod) = split ' ', $stitle, 2;
    $prod =~ s/\s+$//; # why is there space at end?
    my $name = '';
    if ($prod =~ m/([A-Z][a-z][a-z][A-Z])$/) {
      $name = "Name=$1;";
    }
    $c->[ATTR] = "${name}product=$prod;Dbxref=$acc";
    $c->[SCORE] = $score;
    $c->[SOURCE] = $cds_tool;
    msg("Annotated CDS: $prod");
  }

  # FInd sig_pep in the predicted proteins
  my @sigpep;
  my $zig = "$EXE.ziggypep.tsv";
  my $zp_ver = ziggypep_version();
  my $zp_opt = $quiet ? '-q' : '';
  run_cmd("ziggypep $zp_opt -s $faa > $zig");
  # https://github.com/tseemann/ziggypep?tab=readme-ov-file#quick-start
  # SAMD00000344.contig00001_10     12      MILVSLFVGVIC
  # SAMD00000344.contig00001_11     23      MDNNGKSYNKVLLMLTLLLGTFS  
  #dmp(\%idx_of);
  for my $zp ( path($zig)->lines({chomp=>1}) ) {
    #dbd("ZIGGYPEP=$zp");
    my($qid,$len,$pep) = split m/\t/, $zp;
    my $sp = dclone( $cds[ $idx_of{$qid} ] );
    #dmp($sp);
    $sp->[SOURCE] = "ziggypep:$zp_ver";
    $sp->[FTYPE] = 'sig_pep';
    $sp->[FINISH] = $sp->[START] + 3*$len;
    $sp->[SCORE] = $DOT;
    $sp->[ATTR] = "product=signal peptide $pep";
    push @sigpep, $sp;
  }

  return (@mrna, @cds, @sigpep, @term);
}

#----------------------------------------------------------------------
sub gff_add_ids {
  my($feats) = @_;
  my %count;
  for my $f (@$feats) {
    my $nv = $f->[ATTR]; # tagsin last column
    my $t = $nv =~ m/Name=([^;]+)/ ? $1 : $f->[2];
    $t =~ s/\s/_/g;
    $count{$t}++; 
    $f->[ATTR] = join ';', "ID=$t-$count{$t}", $nv;
  }
}
#----------------------------------------------------------------------
sub find_operons {
  my($feats) = @_;
  if (@$feats < 1) {
    wrn("Skipping operon detection - no RNA features");
    return;
  }
  my @operons;
  my $outf = "$EXE.find_operon.gff";
  open my $fh, '>', $outf;
  print_gff($fh, $feats, 0, 0);
  my $inf = "$EXE.operon.bed";
  run_cmd("bedtools merge -s -d $opdist -i $outf" 
         ." -c 3 -o count,collapse -delim '-'"
         ." > $inf");
  open my $BED, '<', $inf;
  while (<$BED>) {
    chomp;
    my @bed = split m/\t/;
    next unless $bed[3] > 1; # not singletons
    # this makes tRNA-only operons too.
    my $name = $bed[4] =~ /rRNA/ ? 'rRNA' : 'tRNA';
    my($b,$e,$s) = bed2gff_coords($bed[1],$bed[2]);
    push @operons, [
      $bed[0],
      "$EXE:$VERSION",
      'operon',
      $b, $e, $DOT, $s, $DOT,
      "Name=$name operon;product=$name operon: $bed[4]"
    ];
  }
  return @operons;
}
#----------------------------------------------------------------------
sub write_bed {
  my($fh, $gff) = @_;
  # $gff =  arrayref of gff arrays
  foreach (@$gff) {
  # https://en.wikipedia.org/wiki/General_feature_format#GFF_general_structure
  # https://en.wikipedia.org/wiki/BED_(file_format)#Format
    print {$fh} tsv(
      $_->[SEQ_ID],   # chr
      $_->[START]-1, # begin 0-based
      $_->[FINISH],   # end (exclusive)
      $DOT,      # name
      1000,      # socre 1-1000
      $_->[STRAND],   # strand
    );
  }
}
#----------------------------------------------------------------------
sub bed2gff_coords {
  my($begin,$end) = @_;
  return (
    min($begin,$end) + 1,
    max($begin,$end),
    $begin > $end ? '-' : '+',
  )
}
#----------------------------------------------------------------------
sub gff2bed_coords {
  my($start,$finish,$strand) = @_;
  return $strand eq '+' 
         ? ($start-1,$finish)
         : ($finish-1,$start);
}
#----------------------------------------------------------------------
sub tsv {
  return join("\t",@_)."\n";
}
#----------------------------------------------------------------------
sub gff_cmp {
  return ($a->[SEQ_ID] cmp $b->[SEQ_ID])  
      || ($a->[START] <=> $b->[START])   
      || ($b->[FINISH] <=> $a->[FINISH]) ;
}
#----------------------------------------------------------------------
sub sort_gff {
  my($gff) = @_;
  return sort { &gff_cmp } @$gff; 
}
#----------------------------------------------------------------------
sub print_gff {
  my($fh, $feats, $add_seq, $add_seq_reg) = @_;
  select $fh;
  $fh->autoflush(1);
  print "##gff-version 3\n";
  if ($add_seq_reg) {
    run_cmd(
       "seqkit fx2tab --only-id --name --length $FASTA"
     ." | sed 's/\\t/ 1 /'"
     ." | sed 's/^/##sequence-region /'"
    );
  }
  # print each feature line (assymeed sorted)
  for my $row (@$feats) {
     print tsv(@$row);
  }
  if ($add_seq) {
    print "##FASTA\n";
    run_cmd("cat $FASTA");
  }
  $fh->close();

  if ($outseq) {
#   msg("Writing hit sequences to: $outseq");
    my $BED = "$EXE.bed";
    open my $bed, '>', $BED;
    write_bed($bed, \@gff); # write ALL features
    close $bed;
    my $bq = $quiet ? "2>$DEVNULL" : "";
    run_cmd("bedtools getfasta -s -fo \Q$outseq\E -fi $FASTA -bed $BED $bq");
  } 
}
#----------------------------------------------------------------------
sub run_aragorn {
  my($fasta, $code) = @_;
  my $outfile = "$fasta.aragorn.out";
  my $cmd = "aragorn -w -gc$code -o '$outfile' '$fasta'";
  run_cmd($cmd);
  return $outfile;
}
#----------------------------------------------------------------------
sub transterm_version {
  # TransTermHP v2.09 (built on Oct  8 2025)
  my($ver) = qx"transterm -h 2>&1";
  $ver =~ m/^(\S+)\s+v(\S+)/ or err("Can't parse: $ver");
  return "$1:$2";
}
#----------------------------------------------------------------------
sub ziggypep_version {
  # % ziggypep -v
  # ziggypep 0.3.1
  my($ver) = qx"ziggypep -v";
  chomp $ver;
  my @ver = split ' ', $ver;
  return $ver[1];
}
#----------------------------------------------------------------------
sub diamond_version {
  # diamond version 2.1.18
  my($ver) = qx"diamond --version";
  chomp $ver;
  $ver =~ s/\h+version\h+/:/;
  return $ver;
}
#----------------------------------------------------------------------
sub aragorn_version {
  #----------------------------
  #ARAGORN v1.2.41 Dean Laslett
  #----------[B------------------
  my $help = join '', qx"aragorn -h";
  $help =~ m/(ARAGORN) v([\d\.]+)/s or err("Can not parse aragorn version");;
  return lc($1).":$2";
}
#----------------------------------------------------------------------
sub parse_aragorn {
  my($fname, $toolver) = @_;
  $toolver ||= "$EXE:$VERSION";
  my %PROD = (
    'tRNA'   => 'trasnfer RNA',
    'tmRAN'  => 'transfer-meesenger RNA',
#    'tmRNA*' => 'circularly
  );
  my @gff;
  #>AL123456
  #54 genes found
  #1   tRNA-Ile    [10887,10961]   35     (gat)
  #2   tRNA-Ala    [11112,11185]   34     (tgc)
  #3   tRNA-Leu    [25644,25728]   34     (cag
  #4   tRNA-Ser   c[243100,243188] 35     (gga)
  #46  tmRNA       [753615,753977] 90,125 ANDENYALAA**
  open my $fh, '<', $fname;
  my $seqid;
  while (<$fh>) {
    chomp;
    if (m/\hgene/) {
      # 1 gene found
      # 8 genes found
      # >end    74 sequences 55 tRNA genes 1 tmRNA genes, nothing found in 60 sequences, (18.92% sensitivity)
      next;
    }
    elsif (m/^>(\S+)/) {
      # this rule in second to avoid '>end' line
      $seqid = $1;
    }
    else {
      my @x = split ' ', $_;
      $x[2] =~ m/(c?)\[(\d+),(\d+)\]/ 
        or err("Can not parse $toolver '$_'");
      my($begin,$end) = ($2,$3);
      my $strand = $1 ? '-' : '+';

      my $odd=0;
      if ($x[1] =~ m/\*$/) {
        $odd=1;
        chop $x[1];
      }
      my $name = $x[1];
      $name =~ m/^(\w+)/;
      my $type = $1;
      my $prod = $type eq 'tRNA' 
               ? 'transfer RNA' 
               : 'transfer-messenger RNA';
      $prod .= " (non-canonical)" if $odd;
      my $score = $DOT;

      push @gff, [
        $seqid, $toolver,
        $type, $begin, $end,
        $score, $strand,  $DOT,
        "Name=$name;product=$prod" . (defined $x[4] ? " $x[4]" : "")
      ];
      msg("Found $name - $prod");
    }
  }
  return @gff;
}
#----------------------------------------------------------------------
sub run_infernal {
  my($exe, $fasta, $king, $feat) = @_;
  my $outfile = "$fasta.$king.$feat.tbl";
  my $speed = $fast ? "--hmmonly" : "--rfam -g";
  my $cmd = "$exe $speed";
  $cmd .= " --cpu $threads -E $evalue";
  $cmd .= " --tblout $outfile";
  $cmd .= " --noali -o $DEVNULL";
  # the 2 positional parametrs
  $cmd .= " $dbdir/$king/$king.$feat.cm";
  $cmd .= " $fasta";
  run_cmd($cmd);
  return $outfile;
}
#----------------------------------------------------------------------
sub parse_infernal {
  my($exe, $fname, $ftype) = @_;
  my $toolver = "$EXE:$VERSION"; # will overide later
  my @gff;
  foreach ( path($fname)->lines({chomp=>1}) ) {
    # # Version:         1.1.5 (Sep 2023)
    if (m/Version:\s+([\d\.]+)/) {
      $toolver = "infernal:$1";
    }
    next if m/^#/;    # comment line
    next if m/^\s*$/; # empty line
    
    my @x = split ' ', $_;
    dbg( map { "[$_]='$x[$_]' " } 0 .. $#x );

    my $strand = $x[9];
    $strand =~ m/^[+-]$/ or err("Bad hit line: @x");
    my($begin,$end) = minmax($x[7], $x[8]);
    my $prod = join( ' ', @x[17..$#x] );
    my $score = $x[15] // $DOT;

    # label = $seqid  $seqaccd $model   $mode;_acc
    # rRMA  = E.coli  -        5S_rRNA  RF00001
    # mcRMA = AsrC    RF02746  E.coli   -
    if ($exe eq 'cmscan') {
      my @pair = splice @x, 2, 2;
      unshift @x, @pair;
    }
    else { # exe eq 'cmsearch'
      $x[2] =~ m/^([^_]+)/;
      $prod = "$1 ribosomal RNA";
    }
    my($seq_id, $model_id, $model_acc) = ($x[0], $x[2], $x[3]);

    my $names = "Name=$model_id";
    if ($ftype eq 'rRNA') {
      my $sname = $SNAME{$model_acc} or err("No 'S' name for $model_acc");
      $names = "Name=${sname}_rRNA;Alias=$model_id";
      $prod = "$sname ribosomal RNA";
    }

    msg("Found $ftype - $prod"); 

    push @gff, [
      $seq_id, 
      $DOT, # will patch in true version at end
      $ftype,
      $begin, $end, $score, $strand,$DOT, 
        $names
       .";Dbxref=Rfam:$model_acc"
       .";product=$prod"
    ];
  }
  # fill in the tool version (came after featreus)
  map { $_->[1] = $toolver } @gff;
  return @gff;
}
#----------------------------------------------------------------------
sub run_cmd {
  my($cmd) = @_;
  msg("Running: $cmd");
  system($cmd)==0 or err("COuld not run: $cmd");
}
#----------------------------------------------------------------------
sub enumerate_dbs {
  my($dir) = @_;
  $dir // err("Can't enumerate undefined folder");
  my $db = {};
  for my $cm (glob("$dir/*/*.cm")) {
    $cm =~ m=(\w+)\.(\w+)\.cm$= or err("Bad CM model: $cm");
    $db->{$1}{$2} = $cm;
  }
  for my $dmnd (glob("$dir/*/*.dmnd")) {
    $dmnd =~ m=(\w+)\.dmnd$= or err("Bad DMND model: $dmnd");
    $db->{$1}{'CDS'} = $dmnd;
  }
  return $db;
}
#----------------------------------------------------------------------
sub list_dbs {
  my $d = shift // $dbdir;
  my $dbs = enumerate_dbs($d);
  msg("Database home: $d");
  my $count=0;
  for my $k (sort keys %$dbs) {
    msg("--kingdom '$k' has: tRNA tmRNA",sort keys %{$dbs->{$k}});;
    $count++;
  }
  $count or err("You have no databases installed!"
   ."\nRun '$EXE --updatedb' to update/repair thos");
  exit(0);
}
#----------------------------------------------------------------------
sub update_dbs {
  my $d = shift // $dbdir;
  if (! -d $d) {
    msg("Making folder: $d");
    path($d)->mkpath; # mkdir in later versions
  }
  -d $d or err("Database folder: $d does not exist");
  -w $d or err("You don't have permissions to modify the database folder: $dbdir");
  my $make = "make -j $threads -C \Q$BUILD_DIR\E DBDIR=\Q$d\E";
  run_cmd("$make bigclean");
  run_cmd("$make install");  
  list_dbs($d);
  #msg("Finished updating databases.");
  exit(0);
}
#----------------------------------------------------------------------
sub require_exe {
  for my $exe (@_) {
    my $which = which($exe);
    $which or err("Can not find required '$exe' in PATH");
    msg("Found $exe - $which");
  }
}
#----------------------------------------------------------------------
sub msg {
  return if $quiet;
  my $line = "[$EXE] @_\n";
  print STDERR $line;
}
#----------------------------------------------------------------------
sub wrn {
  msg("WARNING:", @_);
}
#----------------------------------------------------------------------
sub dmp {
  dbg(Dumper(@_));
}
#----------------------------------------------------------------------
sub dbg {
  print STDERR "[$EXE] DEBUG:\n", @_,"\n" if $debug;
}
#----------------------------------------------------------------------
sub err {
  $quiet=0;
  chdir($origdir) if $origdir;
  msg("ERROR:", @_);
  exit(2);
}
#----------------------------------------------------------------------
sub version {
  print "$EXE $VERSION\n";
  exit;
}
#----------------------------------------------------------------------
sub show_citation {
  print STDERR << "EOCITE";
  
If you use Barrnap in your work, please cite:

    Seemann T
    $EXE $VERSION : $DESC
    $URL
    
Thank you.

EOCITE

  exit;
}
#----------------------------------------------------------------------
# Option setting routines

sub setOptions {
  use Getopt::Long;

  @Options = (
    'GENERAL',
    {OPT=>"help",    VAR=>\&usage,             DESC=>"This help"},
    {OPT=>"version", VAR=>\&version,           DESC=>"Print version and exit"},
    {OPT=>"citation",VAR=>\&show_citation,     DESC=>"Print citation for referencing $EXE"},
    {OPT=>"quiet!",  VAR=>\$quiet, DESC=>"No screen output"},
    {OPT=>"debug!",  VAR=>\$debug, DESC=>"Debug mode"},
    'DATABASE',
    {OPT=>"listdb", VAR=>\$listdb, DESC=>"Show installed database"},
    {OPT=>"updatedb", VAR=>\$updatedb, DESC=>"Update databases from internet"},
    {OPT=>"dbdir=s", VAR=>\$dbdir, DEFAULT=>$DEFAULT_DBDIR, DESC=>"Location of databases"},
    "MODE",
    {OPT=>"kingdom=s", VAR=>\$kingdom, DEFAULT=>'bac', DESC=>"Kingdom: @KINGDOM" },
    {OPT=>"gcode=s", VAR=>\$gcode, DEFAULT=>'auto', DESC=>"Genetic code table" },
#    {OPT=>"legacy!", VAR=>\$legacy,  DEFAULT=>0, DESC=>"Simulate barrnap<1.0 - only --rrna)" },
    {OPT=>"all!", VAR=>\$all,  DEFAULT=>0, DESC=>"FInd ALL RNA features" },
    {OPT=>"rrna!",  VAR=>\$rrna, DEFAULT=>1, DESC=>"Find rRNA, disable with --no-rrna" },
    {OPT=>"trna!",  VAR=>\$trna, DEFAULT=>0, DESC=>"Find tRNA/tmRNA, disable with --no-trna" },
    {OPT=>"ncrna!", VAR=>\$ncrna, DEFAULT=>0, DESC=>"Find ncRNA, disable with --no-ncrna" },
#    {OPT=>"operon!", VAR=>\$operon, DEFAULT=>1, DESC=>"Idenitfy RNA opersons, disanle wioth --no-operson" },
    {OPT=>"mrna!", VAR=>\$mrna, DEFAULT=>0, DESC=>"Idenitfy mRNA coding regions, disable with --no-mrna" },    
#    {OPT=>"opdist=i", VAR=>\$opdist, DEFAULT=>200,  DESC=>"Operon clustering threshold" },
    "SPEED",
    {OPT=>"threads=i", VAR=>\$threads, DEFAULT=>1,  DESC=>"CPU cores to use"},
    {OPT=>"fast!", VAR=>\$fast, DEFAULT=>undef,  DESC=>"Trade speed for accuracy"},
#    "FILTERING",
#    {OPT=>"lencutoff=f",VAR=>\$lencutoff, DEFAULT=>0.8, DESC=>"Proportional length threshold to label as partial"},
#    {OPT=>"reject=f",VAR=>\$reject, DEFAULT=>0.25, DESC=>"Proportional length threshold to reject prediction"},
    {OPT=>"evalue=f",VAR=>\$evalue, DEFAULT=>0.001, DESC=>"Similarity e-value cut-off"},
    "GFF OUTPUT",
    {OPT=>"addids!",  VAR=>\$addids,  DESC=>"Add unique ID tagsn" },
    {OPT=>"incseq!",  VAR=>\$incseq,  DESC=>"Include original input sequences"},
    {OPT=>"incseqreg!",  VAR=>\$incseqreg,  DESC=>"Include #sequence-region headers"},
    {OPT=>"outseq=s",  VAR=>\$outseq,  DESC=>"Save RNA hits to a FASTA file"},
  );

  my @opt = ( map { $_->{OPT}, $_->{VAR} } 
               grep { ref }  @Options );
  GetOptions(@opt) or exit(1);

  # Now setup default values.
  foreach (@Options) {
    if (ref $_ && defined($_->{DEFAULT}) && !defined(${$_->{VAR}})) {
      ${$_->{VAR}} = $_->{DEFAULT};
    }
  }
}

#----------------------------------------------------------------------
sub usage {
  my($exitcode) = @_;
  $exitcode = 0 if $exitcode eq 'help'; # what gets passed by getopt func ref
  $exitcode ||= 0;
  select STDERR if $exitcode; # write to STDERR if exitcode is error

  print "NAME\n";
  print "  $EXE $VERSION\n";
  print "SYNOPSIS\n";
  print "  $DESC\n";
  print "USAGE\n";
  print "  $EXE [opts] seq.{fa,gbk,fq}[.gz] ... > rna.gff\n";
  foreach (@Options) {
    if (ref $_) {
      my $def = defined $_->{DEFAULT} ? "[$_->{DEFAULT}]" : "";
      my $opt = $_->{OPT};
      if ($opt =~ s/!$//) {
        $def = $_->{DEFAULT} ? "[ON]" : "[OFF]";
      }
      $opt =~ s/=s$/ STR/;
      $opt =~ s/=i$/ INT/;
      $opt =~ s/=f$/ NUM/;
      printf "  --%-13s %s %s\n", $opt, $_->{DESC}, $def;
    }
    else {
      print "$_\n";
    }
  }
  print "HOMEPAGE\n";
  print "  $URL";
  exit($exitcode);
}

#---------------------------------------------------------------------
sub motd {
  my @motd = <DATA>;
  chomp @motd;
  if ($_[0]) { say "* $_" for @motd; exit(0); }
  return $motd[ int(rand(@motd)) ];
}
#---------------------------------------------------------------------

__DATA__
Two nuts walki into a bar. One is assaulted.
Barrnap - what you have when you fall asleep at the pub.
Found a bug? Tell me at b.com/tseemann/barrnap/issues
Did you know Barrnap can accepy Genbank files?
Fun fact - Barrnap will work with .gz compressed files
Barrnap does more than just rRNA; try the --all mode!
Running a bit slow? Use --threads 8 to speed it up.
Trade accuracy for speed using the --fast option.
Make a full GFF with sequences using --incseq --incseqreg
Barrnap also supports Fungi and Arachaea with the --kingdom option
Barrnap's first code was written in Berlin in 2013
You can use --fast but it is not recommended due to lower accurarcy
