Program for finding the longest open reading frame among the 6 possible reading frames of each contig. The start can be either the first codon of a frame or a methionine codon (M) after a stop codon.
The input must have one sequence per line. An fsa file should first be converted with the following script: perl -ape 'chomp;if(/>/){s/^/\n/;s/$/ xx/;}' so that there are two x characters (xx) right in front of the sequence.
#!/usr/bin/perl -w
# findstop6frames.pl - find the longest start..stop stretch in each sequence.
#
# Usage:  findstop6frames.pl [3] < records
#
# Input:  one record per line, read from STDIN (command-line arguments are
#         never treated as file names).  The sequence is everything that
#         follows the first "xx" on the line.  A first argument of exactly
#         "3" restricts the search to the three forward frames; otherwise
#         all six frames (three forward, three on the reverse complement)
#         are scanned.
#
# Output: every record is printed back.  When a stretch was found, the text
#           " fr F START Query: S SC STOP E EC LEN l= L RATIO "
#         is inserted in front of "xx", where
#           F     frame number 1..3 (relative to the strand it was found on)
#           S/SC  1-based position and bases of the start codon
#           E/EC  1-based position and bases of the terminating codon
#           LEN   sequence length
#           L     distance in bases from start codon to terminating codon
#           RATIO L / LEN
#         For a hit on the reverse strand the word "reversed" is added and
#         the sequence after "xx" is replaced by its reverse complement.
#         Records without "xx" are printed unchanged.

use strict;
use warnings;

# Return the upper-cased reverse complement of $seq.  Only A, T, G and C are
# complemented; every other character is kept as it is.
sub reverse_complement {
    my ($seq) = @_;
    my $rc = reverse uc $seq;
    $rc =~ tr/ATGC/TACG/;
    return $rc;
}

# Scan the first $frame_count frames (3 = forward only, 6 = both strands) of
# $seq and return a hash ref describing the longest start..stop stretch, or
# undef when none exists.  Keys: frame (1..6), start and stop (0-based codon
# positions), start_codon, stop_codon, length, strand (the scanned sequence).
# On equal lengths the stretch found first (lowest frame, leftmost) wins.
sub longest_orf {
    my ($seq, $frame_count) = @_;
    my $len = length $seq;
    my $revcomp;    # built lazily, only when a reverse frame is scanned
    my $best;

    for my $frame (1 .. $frame_count) {
        my $on_reverse = $frame > 3;
        if ($on_reverse && !defined $revcomp) {
            $revcomp = reverse_complement($seq);
        }
        my $strand = $on_reverse ? $revcomp : $seq;
        my $offset = ($frame - 1) % 3;
        my $pos    = $offset;

        # "$pos < $len - 2" means a complete codon still fits at $pos.
        while ($pos < $len - 2) {

            # Look for a start: an ATG codon, one of the leading codons of
            # the frame, or any codon beginning within the last 8 bases.
            my ($start, $start_codon);
            while ($pos < $len - 2) {
                my $codon = substr $strand, $pos, 3;
                my $here  = $pos;
                $pos += 3;
                if (   $codon =~ /ATG/i
                    || $here > $len - 9
                    || $here < $offset + 4)
                {
                    ($start, $start_codon) = ($here, $codon);
                    last;
                }
            }
            last if !defined $start;

            # Look for the end: a stop codon, or the codon that finishes
            # exactly on the last base of the sequence.
            while ($pos < $len - 2) {
                my $codon = substr $strand, $pos, 3;
                my $here  = $pos;
                $pos += 3;
                next if $codon !~ /TAA|TGA|TAG/i && $here <= $len - 4;

                my $span = $here - $start;
                if (!defined $best || $span > $best->{length}) {
                    $best = {
                        frame       => $frame,
                        start       => $start,
                        start_codon => $start_codon,
                        stop        => $here,
                        stop_codon  => $codon,
                        length      => $span,
                        strand      => $strand,
                    };
                }
                last;
            }
        }
    }
    return $best;
}

# Return $record with the summary of its longest stretch inserted in front of
# the first "xx".  Records without "xx", or without any start..stop stretch,
# are returned unchanged (the original script reused the previous record's
# sequence for lines lacking "xx").
sub annotate_record {
    my ($record, $frame_count) = @_;

    my ($seq) = $record =~ /xx(.*)/;
    return $record if !defined $seq;
    my $head = substr $record, 0, $-[0];    # text in front of the first "xx"

    my $orf = longest_orf($seq, $frame_count);
    return $record if !defined $orf;

    my $len        = length $seq;
    my $ratio      = $orf->{length} / $len;
    my $on_reverse = $orf->{frame} > 3;
    # Reverse frames 4..6 are reported as 1..3 plus the word "reversed".
    my $frame = $on_reverse ? $orf->{frame} - 3 : $orf->{frame};
    my $start = $orf->{start} + 1;
    my $stop  = $orf->{stop} + 1;
    my $summary = " fr $frame START Query: $start $orf->{start_codon}"
                . " STOP $stop $orf->{stop_codon} $len l= $orf->{length} $ratio ";

    if ($on_reverse) {
        # The positions refer to the reverse complement, so that is the
        # sequence written to the record.
        return $head . $summary . 'reversed xx' . $orf->{strand} . "\n";
    }

    (my $annotated = $record) =~ s/xx/${summary}xx/;
    return $annotated;
}

# Read the records from STDIN and print each one with its annotation.
sub main {
    my @args = @ARGV;
    @ARGV = ();    # keep <> reading STDIN instead of opening the arguments
    my $frame_count = (defined $args[0] && $args[0] eq '3') ? 3 : 6;

    while (my $record = <>) {
        print annotate_record($record, $frame_count);
    }
    return;
}

main() unless caller;

1;
>Contig1072_32458_IMA8 GAAGGTTGGGTGCCCAGGGCTGGAGGAGGGGCTTGTGGAATTAGTGTCTGATGGGGACAGACTTTCAGTTTGGGAAGTGGAAAAGGTT CTGGAGACAGATGGTGGCGACAGCTGCCCAGCAGCGTGAAGGTACTTCATGCACTGAACTGTATACTCCGAAGCGCTTAAAACGGTGAATTACTGCTTCCCGTCAATATGCCGA CTTTAGACGCTCCAGAAGGGAGGCTGAGAAAATTCAAGTACCGGGGCAAAGATGCGTCTATCCGGCGGCACCAGCGCATGGCAGTCAGCCTGGAACTCCGCAAGGCCAAGAAAG ATGAGCAGGCCTTAAAGAGAAGAAATATCACCATTTTCTCCCCTGAACCAGCTTCTGGAGAGCTGACCAAAGGGGTCAGCCTCACCCTGCAAGAAATCATCAGTGGCGTGAATG CCTCAGATCCAGACCTGTGTTTCCAGGCCACCCAGGCAGCCAGGAAAATGCTGTCCCAAGAAAAGAACCCTCCTCTAAAATTGATTGTTGAAGCAGGCCTCATCCCCAGGCTGG TGGAGTTCCTGAAGTTGTCACCTCACCCCTGCTTGCAGTTTGAGGCAGCCTGGGCTCTGACCAACATCGCTTCTGGGACTTCAGAGCAGACTCAAGCTGTTGTGGAAGGTGGGG CCATCCCACCTTTGGTTGAGCTCCTGTCTTCCCCCCACATGACTGTGTGCGAACAGGCGGTGTGGGCTCTTGGTAATATCGCAGGTGATGGCCCAGAATTCAGAGATCTCGTTA TCTCGAGCAATGCTATTCCATATCTGCTGGCCCTCGTTTCATCAACCATACCAATCACGTTTCTACGGAACATCACGTGGACCTTGTCCAACTTGTGCCGAAACAAGAACCCTT ACCCTTCCGTGAAAGCCGTGAAGCAGATGTTGCCTGTCCTGTCCCACCTCCTGCAGCACCAAGACAGCGAAATTCTCTCGGACACCTGCTGG
>Contig1072_32458_IMA8 fr 1 START Query: 196 ATG STOP 975 TGG 978 l= 780 0.797546012269939 xxGAAGGTTGGGTGCCCAGGGCTGGA GGAGGGGCTTGTGGAATTAGTGTCTGATGGGGACAGACTTTCAGTTTGGGAAGTGGAAAAGGTTCTGGAGACAGATGGTGGCGACAGCTGCCCAGCAGCGTGAAGGTACTTCAT GCACTGAACTGTATACTCCGAAGCGCTTAAAACGGTGAATTACTGCTTCCCGTCAATATGCCGACTTTAGACGCTCCAGAAGGGAGGCTGAGAAAATTCAAGTACCGGGGCAAA GATGCGTCTATCCGGCGGCACCAGCGCATGGCAGTCAGCCTGGAACTCCGCAAGGCCAAGAAAGATGAGCAGGCCTTAAAGAGAAGAAATATCACCATTTTCTCCCCTGAACCA GCTTCTGGAGAGCTGACCAAAGGGGTCAGCCTCACCCTGCAAGAAATCATCAGTGGCGTGAATGCCTCAGATCCAGACCTGTGTTTCCAGGCCACCCAGGCAGCCAGGAAAATG CTGTCCCAAGAAAAGAACCCTCCTCTAAAATTGATTGTTGAAGCAGGCCTCATCCCCAGGCTGGTGGAGTTCCTGAAGTTGTCACCTCACCCCTGCTTGCAGTTTGAGGCAGCC TGGGCTCTGACCAACATCGCTTCTGGGACTTCAGAGCAGACTCAAGCTGTTGTGGAAGGTGGGGCCATCCCACCTTTGGTTGAGCTCCTGTCTTCCCCCCACATGACTGTGTGC GAACAGGCGGTGTGGGCTCTTGGTAATATCGCAGGTGATGGCCCAGAATTCAGAGATCTCGTTATCTCGAGCAATGCTATTCCATATCTGCTGGCCCTCGTTTCATCAACCATA CCAATCACGTTTCTACGGAACATCACGTGGACCTTGTCCAACTTGTGCCGAAACAAGAACCCTTACCCTTCCGTGAAAGCCGTGAAGCAGATGTTGCCTGTCCTGTCCCACCTC CTGCAGCACCAAGACAGCGAAATTCTCTCGGACACCTGCTGG