{-# LANGUAGE DoAndIfThenElse #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE NoMonomorphismRestriction #-} -- | Import HMMER3 HMM models. module Biobase.SElab.HMM.Import where import Control.Monad.IO.Class (liftIO, MonadIO) import Data.ByteString.Char8 as BS import Data.ByteString.Lex.Double as BS import Data.Conduit as C import Data.Conduit.Binary as CB import Data.Conduit.List as CL import Control.Monad (unless) import Prelude as P import Control.Arrow import qualified Data.Map as M import Data.Char (toLower) import Biobase.SElab.HMM import Biobase.SElab.Types -- * Different HMMer parsers -- ** HMMER3 / b -- | -- -- TODO not everything is currently being parsed. Notably the rf,cs,alignmap -- annotations. -- parseHMM3 :: (Monad m, MonadIO m) => Conduit ByteString m HMM3 parseHMM3 = go where go = do hdr' <- CL.head unless (legalHMM hdr') . error $ "no legal HMM at header: " ++ show hdr' let Just hdr = hdr' hs <- headerMap `fmap` headerLines (sas,ths) <- sathLines let asize = P.length sas c <- compoLine n0 <- parseBegin asize ns <- parseNodes asize Just "//" <- CL.head return $ HMM3 { _version = second (BS.dropWhile (==' ')) . BS.span (/=' ') $ hdr , _idd = IDD $ hs M.! "NAME" , _acc = fmap (ACC . readBS) $ "AC" `M.lookup` hs , _description = "DESC" `M.lookup` hs , _leng = readBS $ hs M.! "LENG" , _alph = readAlph $ hs M.! "ALPH" , _rf = readBoolean $ M.findWithDefault "no" "RF" hs , _cs = readBoolean $ M.findWithDefault "no" "CS" hs , _alignMap = readBoolean $ M.findWithDefault "no" "MAP" hs , _date = M.findWithDefault "" "DATE" hs , _symAlph = sas , _transHeaders = ths , _compo = c , _nodes = n0:ns } -- | Check, if we have a legal HMMER3 model. legalHMM :: Maybe ByteString -> Bool legalHMM (Just s) | w == "HMMER3/f" = True | w == "HMMER3/b" = True where (w:_) = BS.words s legalHMM _ = False -- * Helper functions -- | Read boolean flags. readBoolean = f . BS.map toLower where f "no" = False f "yes" = True f x = error $ "unknown boolean: " ++ show x -- | Determine which alphabet is in use by the HMM. readAlph = f . BS.map toLower where f "dna" = DNA f "rna" = RNA f "coins" = Coins f "dice" = Dice f "amino" = Amino f "custom" = Custom f a = error $ "unknown alph: " ++ show a -- | Read from a bytestring into a structure. readBS = read . BS.unpack -- | create associative map of the key/value data. headerMap xs = M.fromList . P.map f $ xs where f = second (BS.dropWhile (==' ')) . BS.span (/=' ') -- | Parse the two beginning lines. parseBegin asize = do Just i' <- CL.head Just t' <- CL.head return $ Node 0 [] (P.map (readNLP . BS.unpack) $ BS.words i') (P.map (readNLP . BS.unpack) $ BS.words t') -- | Parse all individual nodes, except the first one, which uses 'parseBegin'. parseNodes asize = go [] where go xs = do p <- CL.peek case p of (Just "//") -> return $ P.reverse xs _ -> do Just m' <- CL.head Just i' <- CL.head Just t' <- CL.head let (nid:m) = BS.words m' let n = Node (read . BS.unpack $ nid) (P.map (readNLP . BS.unpack) $ P.take asize m) (P.map (readNLP . BS.unpack) $ BS.words i') (P.map (readNLP . BS.unpack) $ BS.words t') go (n:xs) -- | Read a HMMER negated log-probability. readNLP :: String -> NegLogProb readNLP = go where go "*" = NLP $ 1/0 go xs = NLP . read $ xs -- | Read the optional COMPO line. compoLine = do Just p <- CL.peek case (BS.words p) of ("COMPO":xs) -> CL.head >>= \_ -> return $ P.map (NLP . read . BS.unpack) xs _ -> return [] -- | Read the alphabet and transition lines. sathLines = do Just sa' <- CL.head Just th' <- CL.head let (sa:sas) = BS.words sa' let ths = BS.words th' if sa == "HMM" then return (sas,ths) else error $ "NOT THE HMM symalph lines: " ++ show (sa:sas,ths) -- | All the header lines until we see "HMM". headerLines = go [] where go xs = do p <- CL.peek case p of (Just x) | "HMM" `BS.isPrefixOf` x -> return $ P.reverse xs | otherwise -> CL.drop 1 >> go (x:xs) Nothing -> error $ "no more lines after: " ++ show (P.reverse xs) -- | Simple test for the HMMer parser. test :: IO () test = do xs <- runResourceT $ sourceFile "test.hmm" =$= CB.lines $= CL.sequence parseHMM3 $$ consume -- sinkHandle stdout print xs