% $Id: Fasta.lhs 547 20040126 12:33:16Z ketil $
\documentclass[a4paper]{article}
\pagestyle{myheadings}
\usepackage{haskell}
\begin{document}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Reading EST Data}
Parses ESTs on FASTA format, and creates corresponding EST data
structures.
\begin{code}
module Fasta (parse, bmfparser, ugparser, FHParser,readDataLower) where
import EST (EST(..), End(..), mkEST)
import Gene
import Data.Char(isSpace)
readUfile :: FilePath -> IO [EST]
readUfile fn = do
inp <- readFile fn
return (parse ugparser inp)
\end{code}
\subsection {Parsing}
The FHParser (FASTA header parser) contains functions for extracting
the accession number (or whatever unique identifier you wish, really)
and clone end from the header.
\begin{code}
data FHParser = FHP {getSeqNo :: String -> String
,getCloneEnd :: String -> End
,getData :: String -> [Gene]
}
ugparser, bmfparser :: FHParser
ugparser = FHP (takeWhile (/=' ') . drop 1)
(\s -> case dropIncl "clone_end=" s of
('3':_) -> Three'
('5':_) -> Five'
_ -> Unknown)
readData
bmfparser = FHP (takeWhile (/=' ') . drop 1)
(\s -> case dropIncl "cDNA " s of
('3':_) -> Three'
('5':_) -> Five'
_ -> Unknown)
readDataLower
parse :: FHParser -> String -> [EST]
parse p inp = zipWith (readEST p) [1..]
((ests . filter (\x->head x/='#') . filter (/="") . lines) inp)
\end{code}
\subsection{Processing}
The functions that actually do the work.
\begin{code}
ests :: [String] -> [[String]]
ests [] = []
ests inp = let (xs,rest) = splitWhen '>' inp
in xs : ests rest
splitWhen :: Char -> [String] -> ([String],[String])
splitWhen _ [] = error "splitWhen can't split an empty list"
splitWhen x (l:ls)
| head l /= x =
error ("Incorrect data format (expecting "++show x++"):"++ show l)
| otherwise = (l:first, rest)
where
(first, rest) = mysplit [] ls
mysplit acc [] = (reverse acc, [])
mysplit acc ("":ms) = mysplit acc ms
mysplit acc (m:ms) = if head m == x then (reverse acc, m:ms)
else mysplit (m:acc) ms
readEST :: FHParser -> Int -> [String] -> EST
readEST _ _ [] = error "readEST cannot read empty data"
readEST p i (hd:seqs) =
if seqno==seqno && end==end
then mkEST (i,seqno, end, (getData p) $ concat seqs)
else error "Can't happen (it's just for strictness, dummy!)"
where seqno = (getSeqNo p) hd
end = (getCloneEnd p) hd
readDataLower, readData :: String -> [Gene]
readDataLower = map read1 . filter (not.isSpace)
where read1 x = case x of
'a' -> A
'c' -> C
'g' -> G
't' -> T
'A' -> A
'C' -> C
'G' -> G
'T' -> T
_ -> N
readData = map read1 . filter (not.isSpace)
where read1 x = case x of
'A' -> A
'C' -> C
'G' -> G
'T' -> T
_ -> N
dropUntil :: ([a] -> Bool) -> [a] -> [a]
dropUntil _ [] = []
dropUntil p xs = if p xs then xs else dropUntil p (tail xs)
dropIncl :: Eq a => [a] -> [a] -> [a]
dropIncl s = drop (length s) . dropUntil (==s)
\end{code}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{document}