module Biobase.SElab.CM.Import where
import Control.Applicative
import Control.Arrow
import Control.Lens
import Control.Monad.IO.Class
import Control.Monad.IO.Class (liftIO, MonadIO)
import Control.Monad (unless)
import Data.Attoparsec.ByteString as AB
import Data.ByteString.Char8 as BS
import Data.ByteString.Lex.Double as BS
import Data.Char (isSpace,isAlpha,isDigit)
import Data.Conduit as C
import Data.Conduit.Attoparsec
import Data.Conduit.Binary as CB
import Data.Conduit.List as CL
import Data.Map as M
import Data.Maybe as M
import Data.Tuple.Select
import Data.Vector.Unboxed as VU (fromList)
import Prelude as P
import System.IO (stdout)
import Data.PrimitiveArray
import Data.PrimitiveArray.Zero
import Biobase.SElab.CM
import Biobase.SElab.Types
import qualified Biobase.SElab.HMM as HMM
import qualified Biobase.SElab.HMM.Import as HMM
parseHeader = ($) <$ AB.string "INFERNAL" *> (Infernal10 <$ AB.string "-1" <|> Infernal11 <$ AB.string "1/a") <*> AB.takeByteString <?> "INFERNAL line"
lineParser p = CL.head >>= \x -> return . maybe (error "no more input") (either (\e -> error $ show (e,x)) id . AB.parseOnly p) $ x
parseCM1x :: (Monad m, MonadIO m) => Conduit ByteString m CM
parseCM1x = CB.lines =$= CL.sequence go where
go = do
hdr <- lineParser parseHeader
hs <- parseHeaders []
ns <- parseNodes hdr []
let nsMap = M.fromList . P.map (\n -> (sel2 n, (sel1 n, P.map (^. stateID) $ sel3 n))) $ ns
let ssMap = M.fromList . P.map ((^. stateID) &&& id) . P.concatMap (sel3) $ ns
lineParser $ (AB.string "//" <?> "model end")
pk <- CL.peek
hmm <- case HMM.legalHMM pk of
True -> Just `fmap` HMM.parseHMM3
False -> return Nothing
return CM
{ _name = IDD $ hs M.! "NAME"
, _accession = ACC . readAccession . P.head . M.catMaybes $ P.map (`M.lookup` hs) ["ACC", "ACCESSION"]
, _version = hdr
, _trustedCutoff = BitScore . readBS $ hs M.! "TC"
, _gathering = BitScore . readBS $ hs M.! "GA"
, _noiseCutoff = (BitScore . readBS) `fmap` (M.lookup "NC" hs)
, _nullModel = VU.fromList . P.map readBitScore . BS.words $ hs M.! "NULL"
, _nodes = nsMap
, _states = ssMap
, _localBegin = flip M.singleton (BitScore 0) . (^.stateID) . P.head . P.filter (\s -> s^.stateType == S && s^.nodeID == NodeID 0 ) . M.elems $ ssMap
, _localEnd = M.empty
, _unsorted = M.filter (not . flip P.elem ["NAME","ACCESSION","TC","GA","NC","NULL"]) hs
, _hmm = hmm
}
readBS = read . BS.unpack
readBitScore "*" = BitScore $ 1/0
readBitScore x = BitScore . readBS $ x
readAccession xs
| BS.length xs /= 7 = error $ "can't read accession: " ++ BS.unpack xs
| "RF" == hdr && P.all isDigit tl = read tl
| otherwise = error $ "readAccession: " ++ BS.unpack xs
where (hdr,tl) = second BS.unpack . BS.splitAt 2 $ xs
parseHeaders hs = do
p <- CL.head
case p of
(finishedHeader -> True) -> return . M.fromList
. P.map (second (BS.dropWhile isSpace)
. BS.break isSpace)
. P.reverse
$ hs
Nothing -> error $ "unexpected end of header, until here:" ++ (show $ P.reverse hs)
Just "" -> error "empty line"
Just l -> do ls <- if ("FT-" `isPrefixOf` l) then CL.take 2 else return []
let lls = BS.concat $ l:ls
parseHeaders (lls:hs)
finishedHeader :: Maybe ByteString -> Bool
finishedHeader (Just x) = go x where
go "MODEL:" = True
go "CM" = True
go _ = False
finishedHeader _ = False
parseNodes hdr ns = do
p <- CL.peek
case (BS.dropWhile isAlpha `fmap` p) of
Nothing -> error "unexpected empty line"
Just "//" -> return . P.reverse $ ns
(isNode -> Just (ntype,nid)) -> do _ <- CL.head
ss <- parseStates hdr ntype nid []
parseNodes hdr $ (ntype,nid,ss):ns
parseStates hdr ntype nid xs = do
p <- CL.peek
case (BS.dropWhile isSpace `fmap` p) of
Nothing -> error "unexpected empty state"
Just "//" -> return . P.reverse $ xs
(isNode -> Just _) -> return . P.reverse $ xs
_ -> do Just x <- CL.head
let psx = parseState hdr ntype nid x
parseStates hdr ntype nid (psx:xs)
parseState hdr ntype nid s
| P.null ws = error "parseState: no words"
| B == t = State { _stateID = StateID . readBS $ pn!!0
, _stateType = t
, _nodeID = nid
, _nodeType = ntype
, _transitions = [ ( StateID . readBS $ pn!!3, 0)
, ( StateID . readBS $ pn!!4, 0)
]
, _emits = EmitNothing
}
| otherwise = State { _stateID = StateID . readBS $ pn!!0
, _stateType = t
, _nodeID = nid
, _nodeType = ntype
, _transitions = [ (StateID (i+k), readBitScore $ ts!!k) | k <- [0..n1]]
, _emits = e
}
where
ws = BS.words s
numPN = case hdr of
Infernal10 _ -> 5
Infernal11 _ -> 9
numTS = readBS $ pn!!4
numES = case w of
"MP" -> 16
(flip P.elem ["ML","MR","IL","IR"] -> True) -> 4
_ -> 0
~([w],~(pn,~(ts,es))) = second (second (second (P.map readBitScore) . P.splitAt numTS) . P.splitAt numPN) . P.splitAt 1 $ ws
t = readBS w :: StateType
i = readBS $ pn!!3
n = readBS $ pn!!4
e = case t of
MP -> EmitsPair $ P.zipWith (\(c1,c2) k -> (c1,c2,k)) [ (c1,c2) | c1 <- "ACGU", c2 <- "ACGU" ] es
((flip P.elem [ML,MR,IL,IR]) -> True) -> EmitsSingle $ P.zip "ACGU" es
_ -> EmitNothing
isNode :: Maybe ByteString -> Maybe (NodeType, NodeID)
isNode (Just xs)
| BS.null xs = Nothing
| ("[":ntype:nid:"]":cm11) <- BS.words xs = Just (readBS ntype, NodeID . readBS $ nid)
isNode _ = Nothing
fromFile :: FilePath -> IO [CM]
fromFile fp = do
runResourceT $ sourceFile fp $= parseCM1x $$ consume
test :: IO ()
test = do
xs10 <- runResourceT $ sourceFile "test10.cm" $= parseCM1x $$ consume
xs11 <- runResourceT $ sourceFile "test11.cm" $= parseCM1x $$ consume
print xs10
print xs11
return ()