--
-- A parser for Infernal's Covariance Models (CM). Should work with version 1.0
-- of Infernal.
--

-- TODO use generics?
--
-- TODO importing the cmcalibrate calibration data is currently broken! (data is parsed as separate lines)

module Biobase.Infernal.CM.Import (fromFile,fromString) where

import Data.Maybe (fromJust)
import qualified Data.Array.IArray as A
import qualified Data.List as L
import Text.ParserCombinators.Parsec hiding (State)

import Biobase.Infernal.CM
import Biobase.RNA hiding (nucE)



-- {{{ Parsec

myChar = alphaNum <|> char '-' <|> char '_' <|> char '.'
headerChar = myChar <|> char '[' <|> char ']'
floating = many1 $ digit <|> char '.' <|> char 'e' <|> char '-'

headerLine = do
  key <- many1 headerChar
  --spaces
  many1 $ char ' '
  val <- many1 (headerChar <|> char ' ' <|> char '/' <|> char ':')
  newline
  return (key,val)

unknownNumbers = do
  many1 $ char ' '
  ns <- floating `sepEndBy` (many1 $ char ' ')
  newline
  return ("",concat ns)

theHeader = do
  hs <- many $ (try headerLine <|> try unknownNumbers)
  string "MODEL:"
  newline
  return hs

-- }}}

-- {{{ Node

node = do
  spaces
  string "[ "
  name <- many letter
  spaces
  num <- many digit >>= (return . read)
  spaces
  string "]"
  newline
  states <- many $ try (state num)
  return $ (Node
             { ntype = read name
             , nid = num
             , nstates = map sid states
             , nparents = []
             , nchildren = []
             , ntag = ()
             }
            , states)

-- }}}

-- {{{ stuff

-- TODO put some of this into HsTools/Parsec stuff

aNum = many1 $ char '-' <|> digit
addneginf :: String -> Double
addneginf "*" = (-1)/0 --"-1000000000.0"
addneginf x = read x

-- }}}

-- {{{ State

state nodeID = do
  spaces
  name <- many1 myChar
  spaces
  sid <- aNum >>= (return . read)
  spaces
  plast <- aNum >>= (return . read)
  spaces
  pnum <- aNum >>= (return . read)
  spaces
  cfirst <- aNum >>= (return . read)
  spaces
  cnum <- aNum >>= (return . read)
  probs <- ( (many $ myChar <|> char '*') `sepBy` (many1 $ char ' ') >>=
             return . map addneginf . filter ((/=) "") )
  newline
  let s = case name of
            "B" -> State
                    { stype = read name
                    , sid = sid
                    , snode = nodeID
                    , sparents = [plast-pnum+1 .. plast]
                    , schildren = [Branch cfirst, Branch cnum]
                    , semission = []
                    , stag = ()
                    }
            _   -> State
                    { stype = read name
                    , sid = sid
                    , snode = nodeID
                    , sparents = [plast-pnum+1 .. plast]
                    , schildren = zipWith Transition [cfirst .. cfirst+cnum-1] probs
                    , semission = let keep = drop cnum probs in
                        case length keep of
                          0  -> []
                          4  -> zipWith EmitS acgu keep
                          16 -> zipWith (\(k1,k2) v -> EmitP k1 k2 v) acguPairs keep
                          _  -> error $ "strange number of probabilities" ++ show (keep)
                    , stag = ()
                    }
  return s

-- }}}

-- {{{ Model

models = do
  ms <- many model
  eof
  return ms

model = do
  h <- theHeader
  ns <- many node
  let states = concatMap snd ns
  let nodes = map (addPCinfo states . fst) ns
  -- just add all the node parent / child info
  string "//"
  newline
  -- eof -- removed, we want to be able to read concatenated models!
  return $ CM
            { nodes      = A.array (0, length nodes -1)  $ zip (map nid nodes)  nodes
            , states     = A.array (0, length states -1) $ zip (map sid states) states
            , header     = h
            , localBegin = A.array (0, length states -1) $ zip [0 .. length states -1] (0.0 : repeat (-1/0))
            , localEnd   = A.array (0, length states -1) $ zip [0 .. length states -1] (repeat (-1/0))
            , cmType     = CMScore
            , nullModel  = A.array (nucA,nucU) $ zip acgu (map read . words . fromJust $ "NULL" `lookup` h) -- TODO circumvents the whole parsing stuff!
            }

-- }}}

-- {{{ Stuff

addPCinfo states n =
  let
    s = nstates n
    sp = L.nub $ L.sort $ concatMap (sparents . (states !!)) s
    sc = L.nub $ L.sort $ concatMap (transitionTargets . schildren . (states !!)) s
    np = (L.nub $ L.sort $ map (snode . (states !!)) sp) L.\\ [nid n]
    nc = (L.nub $ L.sort $ map (snode . (states !!)) sc) L.\\ [nid n]
  in
    n {nparents = np, nchildren = nc}



-- | Two types of parsing, once using a file and once by parsing a string.

fromFile f = parseFromFile models f
fromString s = parse models "(stdin)" s

-- }}}

-- | Helper function to remove impossible state transitions (those that have
-- -infty score).

-- TODO move to InfernalCM.hs and have it for Prob and Score both

{-
canonize cm = cm {states = A.amap f $ states cm} where
  f s = s {schildren = filter g $ schildren s}
  g (Branch _)    = True
  g (Transition _ v) = 
  h (_,s)
    | s == (-1)/0 = False
    | otherwise   = True
-}