module Biobase.SElab.HMM.Import where
import Control.Monad.IO.Class (liftIO, MonadIO)
import Data.ByteString.Char8 as BS
import Data.ByteString.Lex.Double as BS
import Data.Conduit as C
import Data.Conduit.Binary as CB
import Data.Conduit.List as CL
import Control.Monad (unless)
import Prelude as P
import Control.Arrow
import qualified Data.Map as M
import Data.Char (toLower)
import Biobase.SElab.HMM
import Biobase.SElab.Types
parseHMM3 = go where
go = do
hdr' <- CL.head
unless (legalHMM hdr') . error $ "no legal HMM at header: " ++ show hdr'
let Just hdr = hdr'
hs <- headerMap `fmap` headerLines
(sas,ths) <- sathLines
let asize = P.length sas
c <- compoLine
n0 <- parseBegin asize
ns <- parseNodes asize
Just "//" <- CL.head
return $ HMM3
{ _version = second (BS.dropWhile (==' ')) . BS.span (/=' ') $ hdr
, _idd = IDD $ hs M.! "NAME"
, _acc = fmap (ACC . readBS) $ "AC" `M.lookup` hs
, _description = "DESC" `M.lookup` hs
, _leng = readBS $ hs M.! "LENG"
, _alph = readAlph $ hs M.! "ALPH"
, _rf = readBoolean $ M.findWithDefault "no" "RF" hs
, _cs = readBoolean $ M.findWithDefault "no" "CS" hs
, _alignMap = readBoolean $ M.findWithDefault "no" "MAP" hs
, _date = M.findWithDefault "" "DATE" hs
, _symAlph = sas
, _transHeaders = ths
, _compo = c
, _nodes = n0:ns
}
legalHMM :: Maybe ByteString -> Bool
legalHMM (Just s)
| w == "HMMER3/f" = True
| w == "HMMER3/b" = True
where (w:_) = BS.words s
legalHMM _ = False
readBoolean = f . BS.map toLower where
f "no" = False
f "yes" = True
f x = error $ "unknown boolean: " ++ show x
readAlph = f . BS.map toLower where
f "dna" = DNA
f "rna" = RNA
f "coins" = Coins
f "dice" = Dice
f "amino" = Amino
f "custom" = Custom
f a = error $ "unknown alph: " ++ show a
readBS = read . BS.unpack
headerMap xs = M.fromList . P.map f $ xs where
f = second (BS.dropWhile (==' ')) . BS.span (/=' ')
parseBegin asize = do
Just i' <- CL.head
Just t' <- CL.head
return $ Node
0
[]
(P.map (readNLP . BS.unpack) $ BS.words i')
(P.map (readNLP . BS.unpack) $ BS.words t')
parseNodes asize = go [] where
go xs = do
p <- CL.peek
case p of
(Just "//") -> return $ P.reverse xs
_ -> do Just m' <- CL.head
Just i' <- CL.head
Just t' <- CL.head
let (nid:m) = BS.words m'
let n = Node
(read . BS.unpack $ nid)
(P.map (readNLP . BS.unpack) $ P.take asize m)
(P.map (readNLP . BS.unpack) $ BS.words i')
(P.map (readNLP . BS.unpack) $ BS.words t')
go (n:xs)
readNLP :: String -> NegLogProb
readNLP = go where
go "*" = NLP $ 1/0
go xs = NLP . read $ xs
compoLine = do
Just p <- CL.peek
case (BS.words p) of
("COMPO":xs) -> CL.head >>= \_ -> return $ P.map (NLP . read . BS.unpack) xs
_ -> return []
sathLines = do
Just sa' <- CL.head
Just th' <- CL.head
let (sa:sas) = BS.words sa'
let ths = BS.words th'
if sa == "HMM"
then return (sas,ths)
else error $ "NOT THE HMM symalph lines: " ++ show (sa:sas,ths)
headerLines = go [] where
go xs = do
p <- CL.peek
case p of
(Just x) | "HMM" `BS.isPrefixOf` x -> return $ P.reverse xs
| otherwise -> CL.drop 1 >> go (x:xs)
Nothing -> error $ "no more lines after: " ++ show (P.reverse xs)
test :: IO ()
test = do
xs <- runResourceT $ sourceFile "test.hmm" =$= CB.lines $= CL.sequence parseHMM3 $$ consume
print xs