{-# LANGUAGE NoMonomorphismRestriction #-}

-- | Turner file parser. Returns a Turner2004 data structure. We store data in
-- the same way it is stored in the ViennaRNA package. Pairs are tuples
-- however.
--
-- How is 'stack' data stored:
--
-- AX
-- UY   ->  ((A,U),(Y,X))
--
-- How 'iloop1x1' is stored:
--
--  X
-- A G
-- U C  -> ((A,U),(C,G),X,Y)
--  Y
--
--  Now 'iloop1x2' is stored:
--
--  X
-- A  G
-- U  C  -> ((A,U),(C,G),X,C,Y), single (X) first, then 5' to 3'
--  YC
--
--  'iloop2x2' is stored:
--
--  XY
-- A  G
-- U  C  -> ((A,U),(C,G),X,Y,y,x), X-->Y then x<--y
--  xy
--
--TODO not sure if dangle3/dangle5 are correctly split or if they should switch

module Biobase.Turner.Import
  ( parseTurner
  ) where

import Control.Applicative
import Control.Arrow (first)
import Control.Monad
import Data.Either
import Data.List.Split (splitEvery)
import Data.List (transpose)
import qualified Data.Map as M
import System.FilePath.Find
import Text.Parsec hiding ((<|>), many)
import Text.Parsec.String

import Biobase.RNA
import Data.Ix.Tuple
import Data.PrimitiveArray
import Data.PrimitiveArray.Ix
import Text.Parsec.Numbers

import Biobase.Turner.Tables



-- * Associate every file with a parse.

minp = (nucA,nucA)
maxp = (nucU,nucU)
minpb = (minp,nucA)
maxpb = (maxp,nucU)
minpp = (minp,minp)
maxpp = (maxp,maxp)
minppbb = (minp,minp,nucA,nucA)
maxppbb = (maxp,maxp,nucU,nucU)
minppbbb = (minp,minp,nucA,nucA,nucA)
maxppbbb = (maxp,maxp,nucU,nucU,nucU)
minppbbbb = (minp,minp,nucA,nucA,nucA,nucA)
maxppbbbb = (maxp,maxp,nucU,nucU,nucU,nucU)
minpbb = (minp,nucA,nucA)
maxpbb = (maxp,nucU,nucU)
-- (4,3) switched for vienna rna compatibility
keyspp = [((k1,k2),(k4,k3)) | k1 <- acgu, k3 <- acgu, k2 <- acgu, k4 <- acgu]
keyspb = [((k1,k2),k3) | k1 <- acgu, k2 <- acgu, k3 <- acgu]
keyspbb = [((k1,k2),k3,k4) | k1 <- acgu, k3 <- acgu, k2 <- acgu, k4 <- acgu]
plist11 = [(nucA,nucU),(nucC,nucG),(nucG,nucC),(nucU,nucA),(nucG,nucU),(nucU,nucG)]
plist22 = [(nucA,nucU),(nucC,nucG),(nucG,nucC),(nucG,nucU),(nucU,nucA),(nucU,nucG)]
-- (4,3) switched for vienna rna compatibility
keysppbb = [((k1,k2),(k4,k3),k5,k6) | (k1,k2) <- plist11, k5 <- acgu, (k3,k4) <- plist11, k6 <- acgu]
-- (4,3) switched for vienna rna compatibility
keysppbbb = [((k1,k2),(k4,k3),k5,k6,k7) | (k1,k2) <- plist11, k6 <- acgu, k5 <- acgu, (k3,k4) <- plist11, k7 <- acgu]
-- (4,3) switched for vienna rna compatibility, 5786 is 5'3' order top, bottom!
keysppbbbb = [((k1,k2),(k4,k3),k5,k6,k7,k8) | (k1,k2) <- plist22, (k3,k4) <- plist22, k5 <- acgu, k8 <- acgu, k6 <- acgu, k7 <- acgu]
z = 999999



-- | Given the base dir and a suffix (.dat/.dh most likely), parse the relevant
-- files. The prefix can be used for, eg., dna file

parseTurner prefix basedir fsuffix = do
  vstack <- grabB basedir fsuffix $ prefix ++ "stack"
  vdangles <- grabB basedir fsuffix $ prefix ++ "dangle"
  vlengths <- grabB basedir fsuffix $ prefix ++ "loop"
  vhairmm <- grabB basedir fsuffix $ prefix ++ "tstackh"
  viloopmm <- grabB basedir fsuffix $ prefix ++ "tstacki"
  viloop23mm <- grabB basedir fsuffix $ prefix ++ "tstacki23"
  viloop1nmm <- grabB basedir fsuffix $ prefix ++ "tstacki1n"
  vmultimm <- grabB basedir fsuffix $ prefix ++ "tstackm"
  vextmm <- grabB basedir fsuffix $ prefix ++ "tstack"
  viloop11 <- grabB basedir fsuffix $ prefix ++ "int11"
  viloop12 <- grabB basedir fsuffix $ prefix ++ "int21"
  viloop22 <- grabB basedir fsuffix $ prefix ++ "int22"
  vlookups <- fmap (map (first (map charToNucleotide)) . concat) $ mapM (\f -> grabA basedir fsuffix $ prefix ++ f) ["triloop","tloop","hexaloop"]
  vmisc <- grabB basedir fsuffix $ prefix ++ "miscloop"
  let (vdangle3,vdangle5) = splitAt (length vdangles `div` 2) vdangles
  let (_:viloopl:vbulgel:vhairpinl:[]) = transpose $ splitEvery 4 vlengths
  return $ Turner2004
    { stack = fromAssocs minpp maxpp z $ zip keyspp vstack
    , dangle3 = fromAssocs minpb maxpb z $ zip keyspb vdangle3
    , dangle5 = fromAssocs minpb maxpb z $ zip keyspb vdangle5
    , hairpinL = fromAssocs 0 30 z $ zip [1..] vhairpinl
    , bulgeL = fromAssocs 0 30 z $ zip [1..] vbulgel
    , iloopL = fromAssocs 0 30 z $ zip [1..] viloopl
    , hairpinMM = fromAssocs minpbb maxpbb z $ zip keyspbb vhairmm
    , iloopMM = fromAssocs minpbb maxpbb z $ zip keyspbb viloopmm
    , iloop2x3MM = fromAssocs minpbb maxpbb z $ zip keyspbb viloop23mm
    , iloop1xnMM = fromAssocs minpbb maxpbb z $ zip keyspbb viloop1nmm
    , multiMM = fromAssocs minpbb maxpbb z $ zip keyspbb vmultimm
    , extMM = fromAssocs minpbb maxpbb z $ zip keyspbb vextmm
    , hairpinLookup = M.fromList vlookups
    , iloop1x1 = fromAssocs minppbb   maxppbb   z $ zip keysppbb   viloop11
    , iloop1x2 = fromAssocs minppbbb  maxppbbb  z $ zip keysppbbb  viloop12
    , iloop2x2 = fromAssocs minppbbbb maxppbbbb z $ zip keysppbbbb viloop22
    , ninio = vmisc !! 2
    , maxNinio = vmisc !! 1
    , multiOffset = vmisc !! 6
    , multiNuc = vmisc !! 7
    , multiHelix = vmisc !! 8
    , largeLoop = vmisc !! 0
    , termAU = vmisc !! 14
    , intermolecularInit = vmisc !! 19
    }

grabB basedir fsuffix fname = do
  p <- parseFromFile pBlockFile (basedir ++ "/" ++ fname ++ fsuffix)
  case p of
    (Right ans) -> return ans
    (Left err)  -> error $ show err

grabA basedir fsuffix fname = do
  p <- parseFromFile pAssocFile (basedir ++ "/" ++ fname ++ fsuffix)
  case p of
    (Right ans) -> return ans
    (Left err)  -> error $ show err



-- * File parser: We do not want to understand the format, just extract all
-- data.

-- | Blocks of data.

pBlockFile = concat <$> many (try numberRow <|> infoRow) <* eof where
  -- a line with a least one number
  numberRow = many (char ' ') *> (number <* notFollowedBy (char '\'')) `sepEndBy1` (many $ char ' ') <* newline
  -- a number is a dot or some extended floating point number
  number = (try dotFloat) <|> (999999 <$ char '.') <|> parseExtFloat <?> "floating point number"

-- | A File with associations.

pAssocFile = concat <$> many1 (try assocRow <|> infoRow) where
  -- one assocs of string, value
  assocRow = (\a b -> [(a,b)]) <$ many (char ' ') <*> many1 (oneOf nuc) <* many (char ' ') <*> parseExtFloat <* many (char ' ')

-- a line with crap
infoRow = [] <$ anyChar `manyTill` newline

-- a rather stupid float
dotFloat = (\s a b -> read $ s:'0':a:b) <$> (char '-' <|> pure ' ') <*> char '.' <*> many1 digit

nuc = "ACGUT"