{-# LANGUAGE PatternGuards #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE NoMonomorphismRestriction #-}
{-# LANGUAGE OverloadedStrings #-}

-- | Turner file parser. Returns a Turner2004 data structure. Requires an
-- annoying amount of boilerplate.
--
-- How is 'stack' data stored:
--
-- AX
-- UY   ->  ((A,U),(Y,X))
--
-- How 'iloop1x1' is stored:
--
--  X
-- A G
-- U C  -> ((A,U),(C,G),X,Y)
--  Y
--
--  Now 'iloop1x2' is stored:
--
--  X
-- A  G
-- U  C  -> ((A,U),(C,G),X,C,Y), single (X) first, then 5' to 3'
--  YC
--
--  'iloop2x2' is stored:
--
--  XY
-- A  G
-- U  C  -> ((A,U),(C,G),X,Y,y,x), X-->Y then x<--y
--  xy
--
--TODO not sure if dangle3/dangle5 are correctly split or if they should switch

module Biobase.Turner.Import where

import Control.Arrow
import Data.Array.Repa.Index
import Data.ByteString.Char8 as BS
import Data.ByteString.Lex.Double
import Data.Char
import Data.Iteratee as I
import Data.Iteratee.Char as I
import Data.Iteratee.IO as I
import Data.List.Split
import Data.Map as M
import Data.Maybe (fromJust)
import qualified Data.List as L
import System.FilePath.Posix

import Biobase.Primary
import Biobase.Secondary
import Data.PrimitiveArray

import Biobase.Turner



-- *

-- | Given a directory, fill in the 'Turner2004' data structure

fromDir :: FilePath -> Prefix -> Suffix -> IO Turner2004
fromDir fp prefix suffix = do
  stack'     <- blockFromFile $ fp </> prefix ++ "stack"    <.> suffix
  dangle'    <- blockFromFile $ fp </> prefix ++ "dangle"   <.> suffix
  loop'      <- blockFromFile $ fp </> prefix ++ "loop"     <.> suffix
  hairpinMM' <- blockFromFile $ fp </> prefix ++ "tstackh"  <.> suffix
  hairpinLk3 <- tabFromFile   $ fp </> prefix ++ "triloop"  <.> suffix
  hairpinLk4 <- tabFromFile   $ fp </> prefix ++ "tloop"    <.> suffix
  hairpinLk6 <- tabFromFile   $ fp </> prefix ++ "hexaloop" <.> suffix
  let (dangle3',dangle5') = L.splitAt (L.length dangle' `div` 2) dangle'
  let (_:iloopL':bulgeL':hairpinL':[]) = L.transpose $ splitEvery 4 loop'
  iloop1x1'   <- blockFromFile $ fp </> prefix ++ "int11" <.> suffix
  iloop2x1'   <- blockFromFile $ fp </> prefix ++ "int21" <.> suffix
  iloop2x2'   <- blockFromFile $ fp </> prefix ++ "int22" <.> suffix
  iloopMM'    <- blockFromFile $ fp </> prefix ++ "tstacki" <.> suffix
  iloop2x3MM' <- blockFromFile $ fp </> prefix ++ "tstacki23" <.> suffix
  iloop1xnMM' <- blockFromFile $ fp </> prefix ++ "tstacki1n" <.> suffix
  multiMM'    <- blockFromFile $ fp </> prefix ++ "tstacki" <.> suffix
  imisc'      <- miscFromFile  $ fp </> prefix ++ "miscloop" <.> suffix
  extMM'      <- blockFromFile $ fp </> prefix ++ "tstack" <.> suffix
  coaxial'    <- blockFromFile $ fp </> prefix ++ "coaxial" <.> suffix
  cstack'     <- blockFromFile $ fp </> prefix ++ "coaxstack" <.> suffix
  tstack'     <- blockFromFile $ fp </> prefix ++ "tstackcoax" <.> suffix
  return Turner2004
    { stack         = fromAssocs minPP  maxPP   infE $ L.zip keysPP  stack'
    , dangle3       = fromAssocs minPB  maxPB   infE $ L.zip keysPB  dangle3'
    , dangle5       = fromAssocs minPB  maxPB   infE $ L.zip keysPB  dangle5'
    , hairpinL      = fromAssocs (Z:.0) (Z:.30) infE $ L.zip d1_30 hairpinL'
    , hairpinMM     = fromAssocs minPBB maxPBB infE $ L.zip keysPBB hairpinMM'
    , hairpinLookup = M.fromList $ hairpinLk3 ++ hairpinLk4 ++ hairpinLk6
    , hairpinGGG    = L.head $ imisc' !! 8
    , hairpinCslope = L.head $ imisc' !! 9
    , hairpinCintercept = L.head $ imisc' !! 10
    , hairpinC3     = L.head $ imisc' !! 11
    , bulgeL        = fromAssocs (Z:.0)      (Z:.30)     infE $ L.zip d1_30 bulgeL'
    , bulgeSingleC  = L.head $ imisc' !! 13
    , iloop1x1      = fromAssocs minPPBB   maxPPBB   infE $ L.zip keysPPBB   iloop1x1'
    , iloop2x1      = fromAssocs minPPBBB  maxPPBBB  infE $ L.zip keysPPBBB  iloop2x1'
    , iloop2x2      = fromAssocs minPPBBBB maxPPBBBB infE $ L.zip (if (prefix == "" || suffix == "dh") then keysPPBBBBrna else keysPPBBBBdna) iloop2x2'
    , iloopMM       = fromAssocs minPBB    maxPBB    infE $ L.zip keysPBB    iloopMM'
    , iloop2x3MM    = fromAssocs minPBB    maxPBB    infE $ L.zip keysPBB    iloop2x3MM'
    , iloop1xnMM    = fromAssocs minPBB    maxPBB    infE $ L.zip keysPBB    iloop1xnMM'
    , iloopL        = fromAssocs (Z:.0)    (Z:.30)   infE $ L.zip d1_30      iloopL'
    , multiMM       = fromAssocs minPBB    maxPBB    infE $ L.zip keysPBB    multiMM'
    , ninio = L.head $ imisc' !! 2
    , maxNinio = L.head $ imisc' !! 1
    , multiOffset = (imisc' !! 3) !! 0
    , multiNuc = (imisc' !! 3) !! 1
    , multiHelix = (imisc' !! 3) !! 2
    , multiAsym = L.head $ imisc' !! 5
    , multiStrain = L.head $ imisc' !! 6
    , extMM      = fromAssocs minPBB maxPBB infE $ L.zip keysPBB extMM'
    , coaxial    = fromAssocs minPP  maxPP  infE $ L.zip keysPP  coaxial'
    , coaxStack  = fromAssocs minPBB maxPBB infE $ L.zip keysPBB cstack'
    , tStackCoax = fromAssocs minPBB maxPBB infE $ L.zip keysPBB tstack'
    , largeLoop = L.head $ imisc' !! 0
    , termAU = L.head $ imisc' !! 7
    , intermolecularInit = L.head $ imisc' !! 12
    }

minPP     = Z:.nN:.nN:.nN:.nN -- (minP,minP)
maxPP     = Z:.nU:.nU:.nU:.nU -- (maxP,maxP)
minP      = Z:.nN:.nN -- (nN,nN)
maxP      = Z:.nU:.nU -- (nU,nU)
minPB     = minP:.nN -- (minP,nN)
maxPB     = maxP:.nU -- (maxP,nU)
minPBB    = minPB:.nN -- (minP,nN,nN)
maxPBB    = maxPB:.nU -- (maxP,nU,nU)
minPPBB   = minPP:.nN:.nN -- (minP,minP,(nN,nN))
maxPPBB   = maxPP:.nU:.nU -- (maxP,maxP,(nU,nU))
minPPBBB  = minPPBB:.nN -- (minP,minP,(nN,nN,nN))
maxPPBBB  = maxPPBB:.nU -- (maxP,maxP,(nU,nU,nU))
minPPBBBB = minPPBBB:.nN -- (minP,minP,(nN,nN,nN,nN))
maxPPBBBB = maxPPBBB:.nU -- (maxP,maxP,(nU,nU,nU,nU))

d1_30 = L.map (Z:.) [1..30]

keysPP     = [{- ((k1,k2),(k4,k3)) -} Z:.k1:.k2:.k4:.k3 | k1 <- acgu, k3 <- acgu, k2 <- acgu, k4 <- acgu]
keysPB     = [{- ((k1,k2),k3) -} Z:.k1:.k2:.k3 | k1 <- acgu, k2 <- acgu, k3 <- acgu]
keysPBB    = [ {- ((k1,k2),k3,k4) -} Z:.k1:.k2:.k3:.k4
             | k1 <- acgu, k3 <- acgu, k2 <- acgu, k4 <- acgu]
keysPPBB   = [ {- ((k1,k2),(k4,k3),(k5,k6)) -} Z:.k1:.k2:.k4:.k3:.k5:.k6
             | (k1,k2) <- plist11, k5 <- acgu, (k3,k4) <- plist11, k6 <- acgu]
keysPPBBB  = [ {- ((k1,k2),(k4,k3),(k5,k6,k7)) -} Z:.k1:.k2:.k4:.k3:.k5:.k6:.k7
             | (k1,k2) <- plist11, k6 <- acgu, k5 <- acgu, (k3,k4) <- plist11, k7 <- acgu]
keysPPBBBBrna = [ {- ((k1,k2),(k4,k3),(k5,k6,k7,k8)) -} Z:.k1:.k2:.k4:.k3:.k5:.k6:.k7:.k8
                | (k1,k2) <- plist22rna, (k3,k4) <- plist22rna, k5 <- acgu, k8 <- acgu, k6 <- acgu, k7 <- acgu]
keysPPBBBBdna = [ {- ((k1,k2),(k4,k3),(k5,k6,k7,k8)) -} Z:.k1:.k2:.k4:.k3:.k5:.k6:.k7:.k8
                | (k1,k2) <- plist22dna, (k3,k4) <- plist22dna, k5 <- acgu, k8 <- acgu, k6 <- acgu, k7 <- acgu]

plist11 = [(nA,nU),(nC,nG),(nG,nC),(nU,nA),(nG,nU),(nU,nG)]
plist22rna = [(nA,nU),(nC,nG),(nG,nC),(nG,nU),(nU,nA),(nU,nG)]
plist22dna = [(nA,nT),(nC,nG),(nG,nC),(nT,nA),(nG,nT),(nT,nG)]

infE = 999999 :: Double

-- * Iteratee stuff

-- | Transform input stream into list of list of doubles

eneeBlocks :: (Functor m, Monad m) => Enumeratee ByteString [[Double]] m a
eneeBlocks = enumLinesBS ><> mapStream f ><> I.filter (not . L.null) where
  f x
    | "5'" `isPrefixOf` y = []
    | "3'" `isPrefixOf` y = []
    | "." `isPrefixOf`  y = values y
    | Just (d,xs) <- readDouble y = values y
    | otherwise = [] -- error $ BS.unpack x
    where y = BS.dropWhile isSpace x

-- | extract values. "." - values are extracted as > 100k

values :: ByteString -> [Double]
values xs
  | BS.null ys
    = []
  | "." `isPrefixOf` ys
    = infE : values (BS.drop 1 ys)
  | Just (d,zs) <- readDouble ys
    = d : values zs
  where ys = BS.dropWhile isSpace xs

-- | Iteratee to parse tabulated loops (hairpins).

iTabulated :: (Functor m, Monad m) => Iteratee ByteString m [(ByteString,Double)]
iTabulated = joinI $ enumLinesBS ><> I.filter (BS.all isSpace) $ g where
  g = do
    I.drop 2
    joinI $ mapStream f stream2stream
  f x
    | Just (d,_) <- readDouble v = (k,d)
    | otherwise = error $ "tabulated: <" ++ BS.unpack x ++ ">"
    where (k,v) = second (BS.dropWhile isSpace) . BS.span (not . isSpace) . BS.dropWhile isSpace $ x

-- | Convenience function

blockFromFile :: FilePath -> IO [Double]
blockFromFile fp = do
  i <- enumFile 8192 fp . joinI $ eneeBlocks stream2list
  xs <- run i
  if (allEq $ L.map L.length xs)
    then return $ L.concat xs
    else error $ "in file: " ++ fp ++ " we have unequal line lengths"

-- | Parses the miscloop table
--
-- NOTO extra brownie points for miscloop.dat for providing data in a form that
-- does not conform to normal number encoding.

iMiscLoop :: (Functor m, Monad m) => Iteratee ByteString m [[Double]]
iMiscLoop = joinI $ enumLinesBS ><> I.groupBy (\x y -> not $ BS.null y) $ f where
  f = do
    I.drop 1
    xs <- fmap (L.map (L.map (readD) . BS.words . L.last)) $ stream2list
    return xs

-- | Parses stupidly encoded values like ".6" and "-.0".

readD :: ByteString -> Double
readD xs
  | BS.null xs              = error "readD: null"
  | BS.head xs == '.'       = readD $ BS.cons '0' xs
  | "-." `BS.isPrefixOf` xs = readD $ "-0." `BS.append` BS.drop 2 xs
  | Just (d,_) <- readDouble xs = d
  | otherwise = error $ BS.unpack xs

-- |

miscFromFile :: FilePath -> IO [[Double]]
miscFromFile fp = run =<< enumFile 8192 fp iMiscLoop

-- |

tabFromFile :: FilePath -> IO [(ByteString,Double)]
tabFromFile fp = run =<< enumFile 8192 fp iTabulated

allEq [] = True
allEq (x:xs) = L.all (==x) xs

type Prefix = FilePath
type Suffix = FilePath