bb.orffinder

#!/usr/bin/perl
#----------------------------------------------------------#
#        Author: Douglas Senalik dsenalik@wisc.edu         #
#----------------------------------------------------------#
# "Black Box" program series
=bb
Find open reading frames in a DNA or RNA sequence
=cut bb
use strict;
use warnings;
use Getopt::Long;      # for getting command line parameters
use Time::HiRes qw ( time );  # for --timer option
# 1.0.0 - Mar 9, 2011
# 1.1.0 - June 8, 2011 = Add --trimheader, fix major bug in sorting alpha instead of numeric
# 1.2.0 - Aug 28, 2012 = Bug fix in returned nucleotide sequence in table (length too long), added --origorder=s
# 1.3.0 - Apr 1, 2013 = Add gff output format, increase efficiency
# 1.4.0 - Dec 8, 2013 = New algorithm to pair start and stop codons that is much faster for long sequences


############################################################
# configuration variables
############################################################
my @strictstartcodons = ( "ATG" );
my @fullstartcodons = ( "ATG", "GTG", "CTG", "TTG" );
my @stopcodons = ( "[TU]AA", "[TU]AG", "[TU]GA" );
# thresholds for guessing orientation
my $guessthreshold = 0.10;  # "ambiguous" if both directions
my $guessminbp = 100;  # "tooshort" if below this
my %guessstats = ();
my %proteins = qw/
    UUU F UUC F UUA L UUG L UCU S UCC S UCA S UCG S UAU Y UAC Y UAA * UAG * UGU C UGC C UGA * UGG W
    CUU L CUC L CUA L CUG L CCU P CCC P CCA P CCG P CAU H CAC H CAA Q CAG Q CGU R CGC R CGA R CGG R
    AUU I AUC I AUA I AUG M ACU T ACC T ACA T ACG T AAU N AAC N AAA K AAG K AGU S AGC S AGA R AGG R
    GUU V GUC V GUA V GUG V GCU A GCC A GCA A GCG A GAU D GAC D GAA E GAG E GGU G GGC G GGA G GGG G
    /;
my $defaultfeatureid = "CDS";


############################################################
# global variables
############################################################
my $ansiup    = "\033[1A";  # terminal control
(my $prognopath = $0) =~ s/^.*[\/\\]//;
my $currhdr = "";
my $currseq = "";
my $guessdata = "";
my $numseq = 0;
my %orfperid = ();
my $outfilename2 = "";  # --nonorffasta file name
my @startcodons;
my $past = time();


############################################################
# command line parameters
############################################################
my $infilename  = "";   # input file name
my $outfilename = "";   # output file name
my $anystart    = 0;    # start also at beginning of sequence
my $anyend      = 0;    # end also at end of sequence
my $fullstart   = 0;    # use all 4 start codons
my $minlen      = 100;  # minimum orf length in b.p.
my $guessorient = 0;    # different output format: guess orientation
my $fasta       = 0;    # different output format: FASTA
my $nonorffasta = 0;    # second file with all sequence not in the first one
my $fastacollapse = 0;  # no duplication in --fasta output
my $fastalargest = 0;   # only larger of any overlapping orfs is kept
my $trimheader  = 0;    # keep header only up to first white space
my $featureid   = $defaultfeatureid;
my $non;
my $origorder;          # return in sequence order instead of sorted by size
my $grep;
my $nsequence   = 0;    # include column with nucleotide sequence
my $psequence   = 0;    # include column with protein sequence
my $gffformat   = 0;    # generate output in gff3 format
my $timer       = 0;    # print time stamps
my $help        = 0;    # print help and exit
my $quiet       = 0;    # only show errors
my $debug       = 0;    # print extra debugging information
GetOptions (
            "infile=s"         => \$infilename,        # string
            "outfile=s"        => \$outfilename,       # string
            "fullstart"        => \$fullstart,         # flag
            "anystart"         => \$anystart,          # flag
            "anyend"           => \$anyend,            # flag
            "minlen=i"         => \$minlen,            # integer
            "guessorientation" => \$guessorient,       # flag
            "fasta"            => \$fasta,             # flag
            "nonorffasta"      => \$nonorffasta,       # flag
            "fastacollapse"    => \$fastacollapse,     # flag
            "fastalargest"     => \$fastalargest,      # flag
            "trimheader"       => \$trimheader,        # flag
            "non:i"            => \$non,               # flag/integer
            "origorder:s"      => \$origorder,         # string
            "grep=s"           => \$grep,              # string
            "nsequence"        => \$nsequence,         # flag
            "psequence"        => \$psequence,         # flag
            "gffformat"        => \$gffformat,         # flag
            "featureid=s"      => \$featureid,         # string
            "timer"            => \$timer,             # flag
            "help"             => \$help,              # flag
            "quiet"            => \$quiet,             # flag
            "debug"            => \$debug);            # flag
# required parameters
unless ( $infilename ) { $help = 1; }
unless ( $outfilename ) { $help = 1; }
if ( ( defined $non ) and ( ! $non ) ) { $non = 1; }
if ( $gffformat ) { $trimheader = 1; }
# debug implies not quiet
if ( $debug ) { $quiet = 0; }


############################################################
# print help screen
############################################################
if ( $help )
  {
    print "$prognopath

This program will detect open reading frames in FASTA
DNA or RNA sequences. This is similar to the NCBI program at 
http://www.ncbi.nlm.nih.gov/gorf/orfig.cgi

Required parameters:
  --infile=xxx       input file name
  --outfile=xxx      output file name, use \"-\" for stdout
Optional parameters:
  --fullstart        use full set of start codons: ATG GTG CTG TTG
                     the default is to only use ATG
  --anystart         start of sequence is also a valid orf start
  --anyend           end of sequence is also a valid orf end
  --minlen=xxx       minimum orf length in b.p., default=$minlen
  --guessorientation guess orientation based on strand with the
                     most total orfs, this data will be be saved
                     instead of the list of orfs
  --fasta            output file is in FASTA format, each orf is
                     a separate sequence, information is in header
  --nonorffasta      second FASTA file with all sequence not in
                     the first one. File name is --outfile name
                     with \"nonorf\" inserted
  --fastacollapse    combine overlapping sequence in the FASTA file
  --fastalargest     if two orfs overlap, keep only the larger one
  --trimheader       remove any text in the FASTA header after 
                     the first occurrence of white space
  --origorder        return list in sequence order instead of
                     the default, which is sorted by size
  --origorder=s      same, but do + and - strands separately
  --grep=xxx         only do sequences with headers matching pattern
  --nsequence        include a column with nucleotide sequence
  --psequence        include a column with protein sequence
  --gffformat        generate output in gff3 format. This also
                     enables --trimheader
  --featureid=xxx    column 3 of gff file, default is \"$defaultfeatureid\"
  --non              do not allow any \"N\"s in an orf
  --help             print this screen
  --quiet            only print error messages
  --timer            print timestamps to evaluate each stage for debugging
  --debug            print extra debugging information
";
    exit 1;
  } # if ( $help )


############################################################
# perform orf detection
############################################################
if ( $fullstart )
  { @startcodons = @fullstartcodons }
else
  { @startcodons = @strictstartcodons }
my $OUTF = stdopen ( ">", $outfilename );
my $OUTF2;
if ( $nonorffasta )
  {
    ( $outfilename2 = $outfilename ) =~ s/(\.[^\.]*)$/.nonorf$1/;
    $OUTF2 = stdopen ( ">", $outfilename2 );
  }
unless ( ( $guessorient ) or ( $fasta ) or ( $gffformat ) )
  {
    print $OUTF join ( "\t", "#ID", "orf", "Frame", "Start", "Codon", "Stop", "Codon", "Length" );
    if ( $nsequence ) { print $OUTF "\tDNA Sequence"; }
    if ( $psequence ) { print $OUTF "\tProtein Sequence"; }
    print $OUTF "\n";
  }
if ( $gffformat )
  { print $OUTF "##gff-version 3\n##Index-subfeatures 1\n\n"; }

my $INF = stdopen ( "<", $infilename );
while ( my $aline = <$INF> )
  {
    $aline =~ s/[\r\n]//g;
    if ( $aline =~ m/^>(.*)$/ )
      {
        my $seqhdr = $1;
        if ( $trimheader ) { $seqhdr =~ s/\s.*$//; }  # Trim anything after first white space
        unless ( $seqhdr ) { die "Error, FASTA header is missing, or starts with white space: \"$aline\"\n"; }
        if ( length($currhdr) > 200 ) { die "Error, header is over 200 characters long \"$currhdr\"\n"; }
        if ( ( ! defined $grep ) or ( $currhdr =~ m/$grep/ ) )
          { process(); }
        $currhdr = $seqhdr;
        $currseq = "";
      }
    else
      {
        $currseq .= $aline;
      }
  } # while <$INF>
stdclose ( $INF );
if ( ( ! defined $grep ) or ( $currhdr =~ m/$grep/ ) )
  { process(); }

if ( $guessorient )
  {
    print $OUTF "#ID\tSeq Len\tbp Fwd\tbp Rev\tOrientation\n", $guessdata;
    print "Guess Orientation Summary:\n";
    while ( my ( $key, $value ) = each ( %guessstats ) )
      { print commify($value), "\t", $key, "\n"; }
  }

unless ( $quiet )
  { print commify($numseq), " sequences processed\n"; }

stdclose ( $OUTF );
if ( $nonorffasta )
  { stdclose ( $OUTF2 ); }


# Notes for correcting for reverse sequences, when not a multiple
# of 3, frame would be different, e.g.
#               length 9         length 10          length 11
# frame +3 [2]   1231231    [2]   12312312    [2]   123123123
# frame +2 [1]  12312312    [1]  123123123    [1]  1231231231
# frame +1 [0] 123123123    [0] 1231231231    [0] 12312312312
#              NNNNNNNNN        NNNNNNNNNN        NNNNNNNNNNN
# frame -1 [0] 321321321    [1] 3213213213    [2] 32132132132
# frame -2 [1]   3213213    [2]   32132132    [0]   321321321
# frame -3 [2]  32132132    [0]  321321321    [1]  3213213213
# correction needed prior to mod function
#          +0               +2                +1
# mod      -9%3=0           -10%3=2           -11%3=1
# this becomes $rf variable's value
############################################################
sub process {
############################################################
  if ( $currseq )  # first call will have no sequence
    {
      if ( $timer ) { print "TIMER: Start ".et()." \"$currhdr\" \n"; }
      $numseq++;
      # display progress for large input files
      if ( ( ( $numseq % 10 ) == 0 ) and ( ! $quiet ) )
        { print commify($numseq), "\n", $ansiup; }

      my @outdata = ();
      my $currseqrc = revcomp($currseq);
      my $seqlen = length($currseq);
      my @start = ();
      my @startrc = ();
      my @stop = ();
      my @stoprc = ();
      my @gap = ();
      my @gaprc = ();
      # reference frame fix for reverse sequence
      my $rf = (-$seqlen) % 3;
      # variables used for guessing the orientation
      my $bpplus = 0;
      my $bpminus = 0;

      # only use stop codon once, later hits will be for shorter orf
      my %stopused = ();
      my %stopusedrc = ();

      # to avoid column alignment problems, make sure header has no tabs in it
      $currhdr =~ s/\t/ /g;
      debugmsg ( "Processing \"$currhdr\", " . commify($seqlen) . " b.p." );

      # if --anystart, add start of sequence positions in all reading frames
      if ( $anystart )
        {
          for my $frame ( 1 .. 3 )
            {
              push ( @{$start[$frame % 3]}, $frame );
              push ( @{$startrc[$frame % 3]}, $frame );
            }
        }
      # if --anyend, add end of sequence positions in all reading frames
      if ( $anyend )
        {
          for my $frame ( $seqlen-2 .. $seqlen )
            {
              push ( @{$start[$frame % 3]}, $frame );
              push ( @{$startrc[$frame % 3]}, $frame );
            }
        }
      foreach my $acodon ( @startcodons )
        {
          while ( $currseq =~ m/($acodon)/ig )
            {
              # pos() returns the 0-based position of the first base after the codon
              my $p = pos($currseq)-3;  # 0-based position of the first base of the codon
              if ( $debug ) { debugmsg ( "Fwd: Start codon $1 at $p" ); }
              push ( @{$start[$p % 3]}, $p );
            }
          while ( $currseqrc =~ m/($acodon)/ig )
            {
              my $p = pos($currseqrc)-3;
              if ( $debug ) { debugmsg ( "Rev: Start codon $1 at $p" ); }
              push ( @{$startrc[($p+$rf) % 3]}, $p );
            }
        }
      if ( $timer ) { print "TIMER: Start codons found ".et()." \"$currhdr\" \n"; }

      foreach my $acodon ( @stopcodons )
        {
          while ( $currseq =~ m/($acodon)/ig )
            {
              my $p = pos($currseq)-3;
              if ( $debug ) { debugmsg ( "Fwd: Stop codon $1 at $p" ); }
              push ( @{$stop[$p % 3]}, $p );
            }
          while ( $currseqrc =~ m/($acodon)/ig )
            {
              my $p = pos($currseqrc)-3;
              if ( $debug ) { debugmsg ( "Rev: Stop codon $1 at $p" ); }
              push ( @{$stoprc[($p+$rf) % 3]}, $p );
            }
        }
      if ( $timer ) { print "TIMER: Stop codons found ".et()." \"$currhdr\" \n"; }

      if ( $non )
        {
          while ( $currseq =~ m/[Nn]{$non,}/g )
            {
              my $p = pos($currseq)-1; # end of the gap
              if ( $debug ) { debugmsg ( "Fwd: Gap ending at $p" ); }
              push ( @gap, $p );
            }
          while ( $currseqrc =~ m/[Nn]{$non,}/g )
            {
              my $p = pos($currseqrc)-1;
              if ( $debug ) { debugmsg ( "Rev: Gap ending at $p" ); }
              push ( @gaprc, $p );
            }
        }

      # sort arrays to put positions in increasing order
      for my $frame ( 0 .. 2 )
        {
          if ( ( @start ) and ( $start[$frame] ) )
            { @{$start[$frame]} = sort ( { $a <=> $b } @{$start[$frame]} ); }
          if ( ( @stop ) and ( $stop[$frame] ) )
            { @{$stop[$frame]} = sort ( { $a <=> $b } @{$stop[$frame]} ); }
          if ( ( @startrc ) and ( $startrc[$frame] ) )
            { @{$startrc[$frame]} = sort ( { $a <=> $b } @{$startrc[$frame]} ); }
          if ( ( @stoprc ) and ( $stoprc[$frame] ) )
            { @{$stoprc[$frame]} = sort ( { $a <=> $b } @{$stoprc[$frame]} ); }
        }
      if ( $timer ) { print "TIMER: Sorted ".et()." \"$currhdr\" \n"; }

      # Forward start-stop pairs
      for my $frame ( 0 .. 2 )
        {
          next unless ( ( @start ) and ( @stop ) and ( $start[$frame] ) and ( $stop[$frame] ) );
          my $s = 0;  # current start index
          my $e = 0;  # current stop (end) index
          while ( defined $start[$frame]->[$s] )
            {
              while ( ( defined $stop[$frame]->[$e] ) and ( $stop[$frame]->[$e] < $start[$frame]->[$s] ) )
                { $e++; }

              # ORF detected, unless end position not defined
              if ( defined $stop[$frame]->[$e] )
                {
                  # 0-based coordinates
                  my $astart = $start[$frame]->[$s];
                  my $astop = $stop[$frame]->[$e];
                  my $orflen = ( $astop - $astart + 3 );

                  # prohibit N's in orf if requested
                  my $gapok = 1;
                  if ( @gap )
                    {
                      foreach my $gapend ( @gap )
                        {
                          if ( $gapend >= $astart )
                            {
                              if ( $gapend <= $astop )
                                { $gapok = 0; }
                              last;
                            }
                        }
                    }

                  # length must be >= --minlen parameter
                  if ( $gapok )
                    {
                      if ( $orflen >= $minlen )
                        {
                          # [0]ID  [1]orf  [2]Frame  [3]Start  [4]Codon  [5]Stop  [6]Codon  [7]Length
                          $orfperid{$currhdr}++;
                          my @outline = ( $currhdr,
                                          $orfperid{$currhdr},
                                          "+".($frame+1),
                                          $astart+1,
                                          substr($currseq,$astart,3),
                                          ($astop+3),
                                          substr($currseq,$astop,3),
                                          $orflen );
                          if ( $nsequence ) { push ( @outline, substr ($currseq,$astart,$orflen) ); }
                          if ( $psequence ) { push ( @outline, dnatoprotein(substr ($currseq,$astart,$orflen)) ); }
                          push ( @outdata, \@outline );
                          $bpplus += $orflen;
                          if ( $debug ) { debugmsg ( "Fwd frame $frame, Valid orf at $astart to $astop, length=$orflen" ); }
                        }
                      else
                        { if ( $debug ) { debugmsg ( "Fwd frame $frame, Discarding too-short orf ( $orflen b.p. ) at $astart to $astop" ); } }
                    } # if $gapok
                  else
                    { if ( $debug ) { debugmsg ( "Fwd frame $frame, Discarding orf spanning gap ( $orflen b.p. ) at $astart to $astop" ); } }

                  # increment start codon position to at least current stop codon position
                  # so that we skip over other start codons in current ORF
                  while ( ( defined $start[$frame]->[$s] ) and ( $start[$frame]->[$s] < $stop[$frame]->[$e] ) )
                    { $s++; }
                } # if stop defined, i.e., orf detected
              else
                { last; }
            }
          if ( $timer ) { print "TIMER: Forward frame $frame done ".et()." \"$currhdr\" \n"; }
        }

      # Reverse start-stop pairs
      for my $frame ( 0 .. 2 )
        {
          next unless ( ( $startrc[$frame] ) and ( $stoprc[$frame] ) );
          my $s = 0;  # current start index
          my $e = 0;  # current stop (end) index
          while ( defined $startrc[$frame]->[$s] )
            {
              while ( ( defined $stoprc[$frame]->[$e] ) and ( $stoprc[$frame]->[$e] < $startrc[$frame]->[$s] ) )
                { $e++; }

              # ORF detected, unless end position not defined
              if ( defined $stoprc[$frame]->[$e] )
                {
                  # 0-based coordinates
                  my $astart = $startrc[$frame]->[$s];
                  my $astop = $stoprc[$frame]->[$e];
                  my $orflen = ( $astop - $astart + 3 );

                  # prohibit N's in orf if requested
                  my $gapok = 1;
                  if ( @gaprc )
                    {
                      foreach my $gapend ( @gaprc )
                        {
                          if ( $gapend >= $astart )
                            {
                              if ( $gapend <= $astop )
                                { $gapok = 0; }
                              last;
                            }
                        }
                    }

                  # length must be >= --minlen parameter
                  if ( $gapok )
                    {
                      if ( $orflen >= $minlen )
                        {
                          # [0]ID  [1]orf  [2]Frame  [3]Start  [4]Codon  [5]Stop  [6]Codon  [7]Length
                          $orfperid{$currhdr}++;
                          my @outline = ( $currhdr,
                                          $orfperid{$currhdr},
                                          "-".($frame+1),
                                          ($seqlen-$astart),
                                          substr($currseqrc,$astart,3),
                                          ($seqlen-($astop+2)),
                                          substr($currseqrc,$astop,3),
                                          $orflen );
                          if ( $nsequence ) { push ( @outline, substr ($currseqrc,$astart,$orflen) ); }
                          if ( $psequence ) { push ( @outline, dnatoprotein(substr ($currseqrc,$astart,$orflen)) ); }
                          push ( @outdata, \@outline );
                          $bpplus += $orflen;
                          if ( $debug ) { debugmsg ( "Rev frame $frame, Valid orf at $astart to $astop, length=$orflen" ); }
                        }
                      else
                        { if ( $debug ) { debugmsg ( "Rev frame $frame, Discarding too-short orf ( $orflen b.p. ) at $astart to $astop" ); } }
                    } # if $gapok
                  else
                    { if ( $debug ) { debugmsg ( "Rev frame $frame, Discarding orf spanning gap ( $orflen b.p. ) at $astart to $astop" ); } }

                  # increment start codon position to at least current stop codon position
                  # so that we skip over other start codons in current ORF
                  while ( ( defined $startrc[$frame]->[$s] ) and ( $startrc[$frame]->[$s] < $stoprc[$frame]->[$e] ) )
                    { $s++; }
                } # if stop defined, i.e., orf detected
              else
                { last; }
            }
          if ( $timer ) { print "TIMER: Reverse frame $frame done ".et()." \"$currhdr\" \n"; }
        }

      # save orf data for later output in appropriate format
      if ( defined $origorder )  # sort output data by original sequence order
        {
          @outdata = sort { $a->[3] <=> $b->[3] } @outdata;
          if ( $origorder =~ m/s/i )
            { @outdata = sort { substr($a->[2],0,1) cmp substr($b->[2],0,1) } @outdata; }
        }
      else  # sort output data by orf length, largest first
        { @outdata = sort { $b->[7] <=> $a->[7] } @outdata; }

      # save guess orientation data
      if ( $guessorient )
        {
          my $txt;
          if ( ( $bpplus + $bpminus ) < 1 )
            { $txt = "no orfs"; }
          elsif ( ( $bpplus + $bpminus ) < $guessminbp )
            { $txt = "orfs too short"; }
          elsif ( ( abs( $bpplus - $bpminus ) / ( $bpplus + $bpminus ) ) < $guessthreshold )
            { $txt = "ambiguous"; }
          elsif ( $bpplus > $bpminus )
            { $txt = "forward"; }
          else
            { $txt = "reverse"; }

          $guessdata .= join ( "\t", $currhdr, $seqlen, $bpplus, $bpminus, $txt ) . "\n";
          $guessstats{$txt}++;
        } # if ( $guessorient )

      elsif ( $fasta )
        {
          my %used = ();  # for --fastacollapse or --fastalargest only, key is start, value is end
          foreach my $rowref ( @outdata )
            {
              # [0]ID  [1]orf  [2]Frame  [3]Start  [4]Codon  [5]Stop  [6]Codon  [7]Length
              my $header = ">" . $rowref->[0].".".$rowref->[1] . " " .  join ( "; ", @{$rowref}[2..7] );
              my $len = $rowref->[7];
              my $seq = "error";
              if ( ( $fastacollapse ) or ( $fastalargest ) )
                {
                  # get range, start always the lower value
                  my $start = $rowref->[3];
                  my $end = $rowref->[5];
                  if ( $end < $start )
                    {
                      $start = $rowref->[5];
                      $end = $rowref->[3];
                    }

                  # see if this range overlaps an existing one
                  my $inserted = 0;
                  foreach my $key ( keys %used )
                    {
                      if ( ( $start >= $key ) and ( $end <= $used{$key} ) )
                        {
                          # full subset, can ignore it completely for both --fastacollapse and --fastalargest
                          debugmsg ( "Sequence \"$currhdr\": orf $start..$end is subset of orf $key..$used{$key}" );
                          $inserted = 1;
                          last;
                        } # if ( ( $start >= $key ) and ( $end <= $used{$key} ) )
                      elsif ( ( $end > $used{$key} ) and ( $start <= $used{$key} ) )
                        {
                          if ( $fastacollapse )
                            {
                              debugmsg ( "Sequence \"$currhdr\": orf $start..$end extends end of orf $key..$used{$key}" );
                              # extend end side
                              $used{$key} = $end;
                              # modify start to detect join events
                              if ( $start > $key ) { $start = $key }
                            } # if ( $fastacollapse )
                          if ( $fastalargest )
                            {
                              if ( ( $end - $start ) > ( $used{$key} - $key ) )  # if new orf larger
                                {
                                  debugmsg ( "Sequence \"$currhdr\": orf $start..$end larger than orf $key..$used{$key}" );
                                  $used{$start} = $end;
                                  delete ( $used{$key} );
                                }
                            } # if ( $fastalargest )
                          $inserted = 1;
                        } # elsif ( ( $end > $used{$key} ) and ( $start <= $used{$key} ) )
                      elsif ( ( $start < $key ) and ( $end >= $key ) )
                        {
                          if ( $fastacollapse )
                            {
                              debugmsg ( "Sequence \"$currhdr\": orf $start..$end extends start of orf $key..$used{$key}" );
                              # extend start side
                              $used{$start} = $used{$key};
                              delete ( $used{$key} );
                              # modify end to detect join events
                              if ( $end < $used{$start} ) { $end = $used{$start} }
                            } # if ( $fastacollapse )
                          if ( $fastalargest )
                            {
                              if ( ( $end - $start ) > ( $used{$key} - $key ) )  # if new orf larger
                                {
                                  debugmsg ( "Sequence \"$currhdr\": orf $start..$end larger than orf $key..$used{$key}" );
                                  $used{$start} = $end;
                                  delete ( $used{$key} );
                                }
                            } # if ( $fastalargest )
                          $inserted = 1;
                        } # elsif ( ( $start < $key ) and ( $end >= $key ) )
                    } # foreach

                  # no action taken above, so new non-overlapping range found
                  unless ( $inserted )
                    {
                      $used{$start} = $end; 
                      debugmsg ( "Sequence \"$currhdr\": orf $start..$end is new" );
                    } # unless ( $inserted )

                } # if ( ( $fastacollapse ) or ( $fastalargest ) )
              else  # do not collapse overlaps
                {
                  if ( $rowref->[2] =~ m/\-/ ) # if reverse complement
                    {
                      $seq = substr( $currseq, $rowref->[5]-1, $len );
                      $seq = revcomp ( $seq );
                      $header .= " [RC]";
                    }
                  else  # forward orientation
                    {
                      $seq = substr( $currseq, $rowref->[3]-1, $len );
                    }
                  print $OUTF $header, "\n";
                  print $OUTF $seq, "\n";
                } # else do not collapse overlaps
            } # foreach my $rowref ( @outdata )

          if ( ( $fastacollapse ) or ( $fastalargest ) )
            {
              debugmsg ( "Sequence \"$currhdr\" has ".scalar ( keys %used )." orf sequences to save" );
              my $numseq = 0;
              my $prevend = 0;
              foreach my $key ( sort { $a <=> $b } keys %used )
                {
                  $numseq++;
                  my $len = ( $used{$key} - $key + 1 );
                  my $header = ">$currhdr.$numseq $key..$used{$key} = $len b.p.";
                  my $seq = substr( $currseq, $key-1, $len );
                  debugmsg ( "Save FASTA of orf $key..$used{$key} length $len" );
                  print $OUTF $header, "\n";
                  print $OUTF $seq, "\n";
                  if ( $nonorffasta )
                    {
                      my $nolen = ( $key - $prevend - 1 );
                      my $noseq = substr( $currseq, $prevend, $nolen );
                      my $noheader = ">$currhdr.$numseq ".($prevend+1)."..".($key-1)." = $nolen b.p.";
                      debugmsg ( "Save FASTA of non-orf ".($prevend+1)."..".($key-1)." length $nolen" );
                      $prevend = $used{$key};
                      print $OUTF2 $noheader, "\n";
                      print $OUTF2 $noseq, "\n";
                    } # if ( $nonorffasta )
                } # foreach
              if ( $nonorffasta )
                {
                  $numseq++;
                  my $nolen = ( length($currseq) - $prevend );
                  my $noseq = substr( $currseq, $prevend, $nolen );
                  my $noheader = ">$currhdr.$numseq ".($prevend+1)."..".length($currseq)." = $nolen b.p.";
                  debugmsg ( "Save FASTA of last non-orf ".($prevend+1)."..".length($currseq)." length $nolen" );
                  print $OUTF2 $noheader, "\n";
                  print $OUTF2 $noseq, "\n";
                } # if ( $nonorffasta )
            } # if ( ( $fastacollapse ) or ( $fastalargest ) )

        } # if ( $fasta )
      elsif ( $gffformat )
        {
          # rowref columns: [0]ID [1]orf [2]Frame [3]Start [4]Codon [5]Stop [6]Codon [7]Length
          my $orfindex = 0;
          foreach my $rowref ( @outdata )
            {
              my $strand = substr ( $rowref->[2], 0, 1 );  # + or -
              my $start = ( $strand eq '+' )?$rowref->[3]:$rowref->[5];
              my $end   = ( $strand eq '+' )?$rowref->[5]:$rowref->[3];
              $orfindex++;
              print $OUTF join ( "\t", $currhdr, "bb.orffinder", $featureid,
                                       $start, $end, '.', $strand, 0,
                                       "ID=$currhdr.$orfindex"
                                       . ";Name=" . $rowref->[7] . $rowref->[2]
                               ), "\n";
            }
        } # if ( $gffformat )

      else  # normal output
        {
          foreach my $rowref ( @outdata )
            { print $OUTF join ( "\t", @{$rowref} ), "\n"; }
        }

      @outdata = ();
    }
  if ( $timer ) { print "TIMER: Saved ".et()." \"$currhdr\" \n"; }
} # sub process


############################################################
sub debugmsg { my ( $text, $noreturn, $nolinenum ) = @_;
############################################################
  if ( $debug )
    {
      my ($package, $filename, $line, $sub) = caller(0);
      unless ( $nolinenum ) { $text = "Line $line: " . $text; }
      if ( ! ( $noreturn ) ) { $text .= "\n"; }
      print $text;
    } # if ( $debug )
} # sub debugmsg


###############################################################
sub timestr {
###############################################################
  @_ = localtime(shift || time);
  return(sprintf("%04d/%02d/%02d %02d:%02d", $_[5]+1900, $_[4]+1, $_[3], @_[2,1]));
} # sub timestr


###############################################################
sub commify {
###############################################################
# http://perldoc.perl.org/perlfaq5.html#How-can-I-output-my-numbers-with-commas
  local $_ = shift;
  1 while s/^([-+]?\d+)(\d{3})/$1,$2/;
  return $_;
} # commify


############################################################
sub revcomp { my ( $dna ) = @_;
############################################################
# standard DNA reverse complement, including degenerate bases
  my $revcomp = reverse ( $dna );
  $revcomp =~ tr/AaCcTtGgMmRrYyKkVvHhDdBb/TtGgAaCcKkYyRrMmBbDdHhVv/;
  return $revcomp;
} # sub revcomp


############################################################
sub dnatoprotein { my ( $dna ) = @_;
############################################################
# translate DNA sequence into protein sequence
  $dna =~ tr/acgtT/ACGUU/;
  if ( ( length($dna) % 3 ) != 0 )
    { die "Error, DNA sequence length not a multiple of 3: \"$dna\"\n"; }
  my $protseq = "";
  foreach my $triplet ( $dna =~ m/(...)/g )
    {
      if ( defined $proteins{$triplet} )
        { $protseq .= $proteins{$triplet}; }
      else
        { $protseq .= "?"; }  # should only happen if any letter is not [ACTGU]
    }
  return $protseq;
} # sub dnatoprotein


###############################################################
sub stdopen { my ( $mode, $filename, $extratext ) = @_;
###############################################################
# a replacement for the three-parameter open which also allows
# the use of "-" as the file name to mean STDIN or STDOUT
  my $fh;  # the file handle
  if ( $filename eq "-" )  # only exact match to "-" has special meaning
    {
      if ( $mode =~ m/>/ )
        { $fh = *STDOUT }
      else
	{ $fh = *STDIN }
    }
  else
    {
      # supplemental passed text for error messages, need one more space
      if ( defined $extratext )
        { $extratext .= " " }
      else
	{ $extratext = "" }

      my $text;  # this is only used for error message
      if ( $mode =~ m/^\+?>>/ )  # ">>" or "+>>"
        { $text = "append" }
      elsif ( $mode =~ m/^\+?>/ )  # ">" or "+>"
        { $text = "output" }
      elsif ( $mode =~ m/^\+?</ )  # "<" or "+<"
        { $text = "input" }
      else
	{ die "Error, unsupported file mode \"$mode\" specified to stdopen( $mode, $filename, $extratext )\n"; }
      open ( $fh, $mode, $filename ) or die ( "Error opening ${extratext}file \"$filename\" for $text: $!\n" );
    }
  # return the opened file handle to the caller
  return $fh;
} # stdopen


###############################################################
sub stdclose { my ( $fh ) = @_;
###############################################################
# same as built-in close, except in case of STDIN or STDOUT,
# and in this case the file handle is not closed

  unless ( fileno($fh) <= 2 )  # if file number is this low, is stdin or stdout or stderr
    { close ( $fh ) or die ( "Error closing file handle: $!\n" ); }

} # sub stdclose


###############################################################
sub et {
###############################################################
  # elapsed time for --timer option
  my $now = time();
  my $et = $now - $past;
  $past = $now;
  return sprintf ( "%0.6f seconds", $et );
} # sub et


=test_data
>A1Contig1
TCACACTTTTAGCTAACAGATTTACATATCTCCGTTGGAGATGCTCTAACATGACTAGAATGAGATGGCCCTCGTTAAAA
TTTTAAAATCTCGGCAGAGAAGTTGTTAAAAAGAGCACAATTCATGAGTTCGATAACTGATGAGAAACTGAGTGAACATA
AAGCAACACAGAGGGCAGTACAACTCTTATTTAAGAACGAGATTAGAAATGTCCAACATCTTCACAAATGAAGGACCGAA
GTTTGATATGGAATTCTGACTTGATAGAGACATTAGAGTTGGAAAACTTGTTGATCAATGCAGCTATTACCATGCATTCA
GCTGAAGCAAGAAAAGAGAGCAGAGGAGCTCATGCTCGTGAAGATTTTACGAAAAGAGATGATGAGAATTGGATGAAGCA
TTCATTGGGATACTGGGAGAATGAGAAGGTACGGCTGGACTACAGGCCTGTTCACATGAACACATTAGACGATGAGATTG
AAACGTTCCCACCTAAAGCACGTGTGTACTGAGGTATGGTATTGAAGGATACATGTGGTTGGGGAAAAATATAATATTTT
CTTCTAAGAAGTCCGCAATAAATTTACTGAGACCTAGAAAATTTCTAAATAATGACGTCATTTGTCAAACTGCAGTCGGG
GTGAGTAAGCTTGTCTCTGTCATGAGCAAGGGTGTGAGAATACCTCACTTGTATTGATCATGTTCCTGGCAACAAATATT
TTATACTACTCATAAAAATGACTCCAAACTTTCAATTACTAGTAACAACTGGATTGGAAGTTAAATCAAAGTGTTATCTT
ATTATTGGTAGCTCAGTTGAACTTTGTTTTAGAACAAAATGCTTACTATAATTTCGTGAGCATTATGACTTTGGATGACA
TTTTGGTCATTGTATTTGTGTATCAAATGTGAGATTAGTTGCTGTCTACT

NCBI results from http://www.ncbi.nlm.nih.gov/gorf/orfig.cgi
Frame		from		to	Length
 +3		228	..	512	285
 +2		533	..	697	165
 -1		493	..	621	129
 +3		738	..	851	114
 -3		203	..	313	111
 +1		124	..	231	108

Results from this program
#ID	orf	Frame	Start	Codon	Stop	Codon	Length
A1Contig1	3	+3	228	ATG	512	TGA	285
A1Contig1	2	+2	533	ATG	697	TGA	165
A1Contig1	5	-1	621	ATG	493	TAG	129
A1Contig1	4	+3	738	ATG	851	TAA	114
A1Contig1	6	-3	313	ATG	203	TAA	111
A1Contig1	1	+1	124	ATG	231	TGA	108

=cut test_data
# eof