Linux sequence handling software - RNA translator

by Knud Christensen

return

This program produces the amino acid sequence of a nucleotide record, using the query position reported by BLAST as the starting point. If the query position is not 1 and the codon there is not the start codon ATG (M), it searches backward, codon by codon, until it finds a start codon, a stop codon, or the first codon of the sequence. It then translates from that point to the end of the sequence or to the next stop codon. If a stop codon is found within the first 120 bp after the query position, it steps over that stop codon and starts from the codon after it.
If 'Frame' has a negative value, the sequence is reverse-complemented and the Query start point is recalculated so that it counts from the other end.

Input should be one line per sequence; an fsa (FASTA) file can be converted with the following Perl one-liner: perl -ape 'chomp;if(/>/){s/^/\n/;s/$/ xx/;}'
so that there are two x's (xx) right in front of the sequence.

See example below



#!/usr/bin/perl -w
# Translate BLAST-anchored nucleotide records into protein sequence.
#
# Input: one record per line.  Each record must contain the BLAST field
# "Query: <start>" and the marker "xx" immediately in front of the
# nucleotide sequence; output of findstop6frames.pl can also be used.
# A record containing "Frame = -" is reverse-complemented first.
#
# Output: the same line with the annotations "www/ww1/ww2/ww3 start ..."
# and "stop <pos> <codon> <sequence length>" inserted in front of "xx",
# and " <residue count> yy<protein>" appended at the end of the line.
# Lines without both fields are passed through untranslated.

use strict;
use warnings;

# Codon table (DNA alphabet, upper case).
my %amino_acid_for = (
    "GCA" => "A", "GCC" => "A", "GCG" => "A", "GCT" => "A",    # Alanine
    "TGC" => "C", "TGT" => "C",                                # Cysteine
    "GAC" => "D", "GAT" => "D",                                # Aspartic acid
    "GAG" => "E", "GAA" => "E",                                # Glutamic acid
    "TTC" => "F", "TTT" => "F",                                # Phenylalanine
    "GGA" => "G", "GGT" => "G", "GGC" => "G", "GGG" => "G",    # Glycine
    "CAC" => "H", "CAT" => "H",                                # Histidine
    "ATA" => "I", "ATT" => "I", "ATC" => "I",                  # Isoleucine
    "AAA" => "K", "AAG" => "K",                                # Lysine
    "TTA" => "L", "TTG" => "L", "CTA" => "L", "CTT" => "L",
    "CTG" => "L", "CTC" => "L",                                # Leucine
    "ATG" => "M",                                              # Methionine (start codon)
    "AAC" => "N", "AAT" => "N",                                # Asparagine
    "CCA" => "P", "CCT" => "P", "CCG" => "P", "CCC" => "P",    # Proline
    "CAA" => "Q", "CAG" => "Q",                                # Glutamine
    "CGA" => "R", "CGT" => "R", "CGG" => "R", "CGC" => "R",
    "AGA" => "R", "AGG" => "R",                                # Arginine
    "TCA" => "S", "TCT" => "S", "TCG" => "S", "TCC" => "S",
    "AGC" => "S", "AGT" => "S",                                # Serine
    "ACA" => "T", "ACT" => "T", "ACG" => "T", "ACC" => "T",    # Threonine
    "GTA" => "V", "GTT" => "V", "GTG" => "V", "GTC" => "V",    # Valine
    "TGG" => "W",                                              # Tryptophan
    "TAC" => "Y", "TAT" => "Y",                                # Tyrosine
    "TAA" => "*", "TGA" => "*", "TAG" => "*",                  # Stop codons
);

my $STOP_CODON        = qr/TAA|TGA|TAG/i;
my $EARLY_STOP_WINDOW = 120;    # in-frame bases after the query start scanned for an early stop

# annotate_record($line)
#   Takes one input record (with or without trailing newline) and returns
#   the annotated, translated record.  All positions are 1-based.  Every
#   call starts from fresh state, so a malformed record can no longer
#   inherit coordinates from the record before it.
sub annotate_record {
    my ($line) = @_;

    $line =~ s/ $//;    # drop a single blank at the end of the line

    my ($seq)   = $line =~ /xx(.*)/;
    my ($query) = $line =~ /Query\: (\d+)/;
    if ( !defined $seq || !defined $query ) {
        # Nothing to anchor a translation on: pass the line through
        # instead of translating with left-over values.
        if ( $line =~ /\S/ ) {
            ( my $head = substr $line, 0, 60 ) =~ s/\s+\z//;
            warn "no 'Query: <n>' and 'xx<sequence>' fields, "
               . "passed through untranslated: $head\n";
        }
        return $line;
    }

    my $len = length $seq;

    # Negative frame: work on the reverse complement and mirror the
    # query position so it counts from the other end.
    if ( $line =~ /Frame \= \-/ ) {
        $seq = scalar reverse( uc $seq );
        $seq =~ tr/ACGT/TGCA/;
        $line =~ s/xx.*/reversed xx$seq/;
        $query = $len - $query + 1;
        $line =~ s/Query\: (\d+)/Query\: $query/;
    }

    my $start = $query;    # position where translation will begin
    my $codon = '';        # most recently inspected codon (also used in annotations)

    # Step over the first in-frame stop codon close to the query start.
    my $pos = $query;
    while ( $pos < $query + $EARLY_STOP_WINDOW && $pos < $len - 4 ) {
        $codon = substr $seq, $pos - 1, 3;
        if ( $codon =~ $STOP_CODON ) {
            $start = $pos + 3;
            $line =~ s/xx/www start $start xx/;
            last;
        }
        $pos += 3;
    }

    # No early stop: walk backward in frame to find where the reading
    # frame can start (ATG, just after a stop codon, or the first codon).
    if ( $start == $query ) {
        $pos = $query;
        while ( $pos > 0 ) {
            $codon = substr $seq, $pos - 1, 3;
            if ( $codon =~ /ATG/i && $pos > 4 ) {
                $line =~ s/xx/ww1 start $pos $codon xx/;
                $start = $pos;
                last;
            }
            elsif ( $codon =~ $STOP_CODON ) {
                $pos += 3;
                $codon = substr $seq, $pos - 1, 3;
                $line =~ s/xx/ww2 start $pos $codon xx/;
                # NOTE(review): translation begins one codon after the
                # position reported in the annotation; kept as in the
                # original script - confirm whether this is intended.
                $start = $pos + 3;
                last;
            }
            elsif ( $pos < 4 ) {
                $line =~ s/xx/ww3 start $pos $codon xx/;
                $start = $pos;
                last;
            }
            $pos -= 3;
        }
    }

    # Scan forward for the first in-frame stop codon.
    # NOTE(review): the bound "$len - 2" means a codon starting exactly at
    # $len - 2 (the last complete codon) is never examined or translated;
    # kept because the documented example output relies on it.
    my $stop_found = 0;
    $pos = $start;
    while ( $pos < $len - 2 ) {
        $codon = substr $seq, $pos - 1, 3;
        if ( $codon =~ $STOP_CODON ) {
            $stop_found = 1;
            last;
        }
        $pos += 3;
    }
    $line =~ s/xx/stop $pos $codon $len xx/;

    # With a stop codon the stop itself is translated (as "*") but not
    # counted; without one translation runs up to the scan bound.
    my $orf_end  = $stop_found ? $pos + 1 : $len - 2;
    my $residues = ( $pos - $start ) / 3;

    my $protein = '';
    for ( my $p = $start ; $p < $orf_end ; $p += 3 ) {
        # Upper-case the codon so lower-case input translates too, and
        # emit X for codons missing from the table (e.g. containing N)
        # rather than silently dropping the residue.
        my $residue = $amino_acid_for{ uc( substr( $seq, $p - 1, 3 ) ) };
        $protein .= defined $residue ? $residue : 'X';
    }

    $line =~ s/$/ $residues yy$protein/;    # goes in front of the newline, if any
    return $line;
}

# Run as a filter when executed directly; stay silent when loaded by
# another script (e.g. for testing).
unless (caller) {
    while ( my $line = <> ) {
        print annotate_record($line);
    }
}



Input (should be converted to one line):

>Contig1072_32458_IMA8 >sp|A9QM74|IMA8_HUMAN Importin subunit alpha-8 OS=Homo sapiens GN=KPNA7 PE=1 SV=1 > 424 bits (1091), Expect = e-119 ident of 263 (81%) Frame = +1 Query: 196 Sbjct: 1 3e-77 42  
GAAGGTTGGGTGCCCAGGGCTGGAGGAGGGGCTTGTGGAATTAGTGTCTGATGGGGACAGACTTTCAGTTTGGGAAGTGGAAAAGGTTCTGGAG
ACAGATGGTGGCGACAGCTGCCCAGCAGCGTGAAGGTACTTCATGCACTGAACTGTATACTCCGAAGCGCTTAAAACGGTGAATTACTGCTTCCCGTCAATATGCCGACTTTAGACGCTCCAGAAGGGAGGCTGAGAAAATTCAAGT
ACCGGGGCAAAGATGCGTCTATCCGGCGGCACCAGCGCATGGCAGTCAGCCTGGAACTCCGCAAGGCCAAGAAAGATGAGCAGGCCTTAAAGAGAAGAAATATCACCATTTTCTCCCCTGAACCAGCTTCTGGAGAGCTGACCAAAG
GGGTCAGCCTCACCCTGCAAGAAATCATCAGTGGCGTGAATGCCTCAGATCCAGACCTGTGTTTCCAGGCCACCCAGGCAGCCAGGAAAATGCTGTCCCAAGAAAAGAACCCTCCTCTAAAATTGATTGTTGAAGCAGGCCTCATCC
CCAGGCTGGTGGAGTTCCTGAAGTTGTCACCTCACCCCTGCTTGCAGTTTGAGGCAGCCTGGGCTCTGACCAACATCGCTTCTGGGACTTCAGAGCAGACTCAAGCTGTTGTGGAAGGTGGGGCCATCCCACCTTTGGTTGAGCTCC
TGTCTTCCCCCCACATGACTGTGTGCGAACAGGCGGTGTGGGCTCTTGGTAATATCGCAGGTGATGGCCCAGAATTCAGAGATCTCGTTATCTCGAGCAATGCTATTCCATATCTGCTGGCCCTCGTTTCATCAACCATACCAATCA
CGTTTCTACGGAACATCACGTGGACCTTGTCCAACTTGTGCCGAAACAAGAACCCTTACCCTTCCGTGAAAGCCGTGAAGCAGATGTTGCCTGTCCTGTCCCACCTCCTGCAGCACCAAGACAGCGAAATTCTCTCGGACACCTGCTGG

Output using program

The program has added the data shown in bold face to the input line. The last number in front of the two x's (xx) is the RNA sequence length, and the number in front of the two y's (yy) is the protein sequence length.
>Contig1072_32458_IMA8 >sp|A9QM74|IMA8_HUMAN Importin subunit alpha-8 OS=Homo sapiens GN=KPNA7 PE=1 SV=1 > 424 bits (1091), Expect = e-119 ident of
 263 (81%) Frame = +1 Query: 196 Sbjct: 1 3e-77 42 ww1 start 196 ATG stop 976 TGC 978 xxGAAGGTTGGGTGCCCAGGGCTGGAGGAGGGGCTTGTGGAATTAGTGTCTGATGGGGACA
GACTTTCAGTTTGGGAAGTGGAAAAGGTTCTGGAGACAGATGGTGGCGACAGCTGCCCAGCAGCGTGAAGGTACTTCATGCACTGAACTGTATACTCCGAAGCGCTTAAAACGGTGAATTACTGCTTCCCGTCAATATGCCGACTTT
AGACGCTCCAGAAGGGAGGCTGAGAAAATTCAAGTACCGGGGCAAAGATGCGTCTATCCGGCGGCACCAGCGCATGGCAGTCAGCCTGGAACTCCGCAAGGCCAAGAAAGATGAGCAGGCCTTAAAGAGAAGAAATATCACCATTTT
CTCCCCTGAACCAGCTTCTGGAGAGCTGACCAAAGGGGTCAGCCTCACCCTGCAAGAAATCATCAGTGGCGTGAATGCCTCAGATCCAGACCTGTGTTTCCAGGCCACCCAGGCAGCCAGGAAAATGCTGTCCCAAGAAAAGAACCC
TCCTCTAAAATTGATTGTTGAAGCAGGCCTCATCCCCAGGCTGGTGGAGTTCCTGAAGTTGTCACCTCACCCCTGCTTGCAGTTTGAGGCAGCCTGGGCTCTGACCAACATCGCTTCTGGGACTTCAGAGCAGACTCAAGCTGTTGT
GGAAGGTGGGGCCATCCCACCTTTGGTTGAGCTCCTGTCTTCCCCCCACATGACTGTGTGCGAACAGGCGGTGTGGGCTCTTGGTAATATCGCAGGTGATGGCCCAGAATTCAGAGATCTCGTTATCTCGAGCAATGCTATTCCATA
TCTGCTGGCCCTCGTTTCATCAACCATACCAATCACGTTTCTACGGAACATCACGTGGACCTTGTCCAACTTGTGCCGAAACAAGAACCCTTACCCTTCCGTGAAAGCCGTGAAGCAGATGTTGCCTGTCCTGTCCCACCTCCTGCA
GCACCAAGACAGCGAAATTCTCTCGGACACCTGCTGG 260 yyMPTLDAPEGRLRKFKYRGKDASIRRHQRMAVSLELRKAKKDEQALKRRNITIFSPEPASGELTKGVSLTLQEIISGVNASDPDLCFQATQAARKMLSQEKNPP
LKLIVEAGLIPRLVEFLKLSPHPCLQFEAAWALTNIASGTSEQTQAVVEGGAIPPLVELLSSPHMTVCEQAVWALGNIAGDGPEFRDLVISSNAIPYLLALVSSTIPITFLRNITWTLSNLCRNKNPYPSVKAVKQMLPVLSHLLQH
QDSEILSDTC