-- find longest ORF in a Fasta file, inspired by a question on BioStar -- (http://biostar.stackexchange.com/questions/5902/code-golf-finding-orf-and-corresponding-strand-in-a-dna-sequence/) import Bio.Sequence import System.IO import Data.List (maximumBy, tails) main :: IO () main = hReadFasta stdin >>= mapM_ (putStrLn . doit . castToNuc) doit s = format s . maximumBy comp_orflength . all_orfs $ s format :: Sequence Nuc -> (Frame,Length,Offset) -> String format s (frm, len, off) | frm > 0 = sq++"+"++show frm++" "++show (frm+off)++" "++show (frm+off+len'-1) | otherwise = sq++show frm++" "++show (sl+frm-off-len'+2)++" "++show (sl-off+frm+1) where len' = fromIntegral len sl = fromIntegral (seqlength s) sq = toStr (seqheader s) ++ ":\t" comp_orflength (_,l1,_) (_,l2,_) = compare l1 l2 type Frame = Offset -- -3..+3 type Length = Int -- | Put it all togehter, and generate all ORFs from a sequence. all_orfs :: Sequence Nuc -> [(Frame,Length,Offset)] all_orfs = map (findlength . truncateStp) . concat . map transtails . translations -- | Convert the translated sequence into the equivalent nucleotide length findlength :: (Frame,Offset,[Amino]) -> (Frame,Length,Offset) findlength (f,o,as) = (f,3*length as,o) -- | Truncate the translation if and when a stop codon is reached. truncateStp :: (Frame,Offset,[Amino]) -> (Frame,Offset,[Amino]) truncateStp (f,o,as) = (f,o,takeUntil (==STP) as) -- | Just like takeWhile, but keeps the final element (ie. STP) takeUntil p (x:xs) | p x = [x] | otherwise = x : takeUntil p xs takeUntil _ [] = [] -- | Analogous to the 'tails' function, this generates all suffixes -- of a translation. transtails :: (Frame,[Amino]) -> [(Frame,Offset,[Amino])] transtails (f,as) = [(f,i,ts) | (i,ts) <- zip [0,3..] (tails as)] -- | Generate all six-frame translations of the sequence, recording -- the frame as well. The order decides which one gets selected later -- on in the case of equally long ORFs. translations :: Sequence Nuc -> [(Frame,[Amino])] translations s = [(negate i,translate (revcompl s) (i-1)) | i <- [3,2,1]] ++ [(i,translate s (i-1)) | i <- [3,2,1]]