-- | Parse a CT file.

module Biobase.DataSource.CT.Import where

import Control.Applicative
import Text.Parsec.Char
import Text.Parsec.Combinator hiding (optional)
import Text.Parsec.Error
import Text.Parsec hiding (many)
import Text.Parsec hiding ((<|>), many, optional)
import Text.Parsec.String
import qualified Data.Array.IArray as A

import Biobase.RNA
import Biobase.RNA.Pairs
import Biobase.DataSource.CT


-- | Parse a CT file.

fromFile :: FilePath -> IO (Either String CT)
fromFile fname = do
  x <- parseFromFile ctP fname
  return $ case x of
    Left err -> Left $ show err
    Right res -> res

-- * Parser

commentP :: GenParser Char st String
commentP = char '#' *> anyChar `manyTill` newline

commentsP = many commentP

lenlineP :: GenParser Char st (Int,String)
lenlineP = (,) <$> snumP <*> anyChar `manyTill` newline

-- | We parse six fields: position, nucleotide, next nuc, previous nuc, pairing
-- partner and (position again). CT8 adds two additional fields (unknown) and
-- (unknown). Fields in brackets () are not returned, just parsed.

nucLineP :: GenParser Char st (Int,Nuc)
nucLineP = f <$> snumP <*> scharP <*> snumP <*> snumP <*> snumP <* snumP <* choice [newline, snumP *> snumP *> newline] where
  f pos chr prv nxt partner = (pos, Nuc chr prv nxt partner)

nucLinesP :: GenParser Char st [(Int,Nuc)]
nucLinesP = many (try nucLineP)

-- | Parse a CT file with some error handling.

ctP :: GenParser Char st (Either String CT)
ctP = f <$> commentsP <*> lenlineP <*> nucLinesP <* spaces <* eof where
  f cs (l,ll) ns
    | l /= length ns = Left $ "CT inscribed length of: " ++ show l ++ " doesn't match read entries in: " ++ unlines cs
    | otherwise = Right . CT cs ll $ A.array (1,length ns) ns

scharP = spaces *> anyChar

snumP :: GenParser Char st Int
snumP = read <$ try spaces <*> numberP

numberP = many1 digit