{-# LANGUAGE ScopedTypeVariables #-}

-- | Importer for the NCM database.
--
-- NOTE This version reads only known single/double-stranded NCMs.
--
-- NOTE "infinity" is a large constant, given by RNA.Constants.eInf

module Biobase.DataSource.MCFold.Import
  ( parseDir
  ) where

import Control.Applicative
import Control.Arrow (first, second)
import Control.Monad (liftM)
import Data.Either.Unwrap (fromRight)
import Data.Maybe (fromJust)
import qualified Data.ByteString as B
import qualified Data.Map as M
import qualified Data.Vector.Unboxed as VU
import System.FilePath ((</>))
import Text.Parsec.ByteString
import Text.Parsec hiding (many, (<|>))
import Text.Parsec.Prim (parse)
import Text.Printf

import Text.Parsec.Numbers
import Data.PrimitiveArray

import Biobase.Constants
import Biobase.DataSource.MCFold
import Biobase.RNA
import Biobase.RNA.Hashes
import Biobase.RNA.NucBounds


parseDir :: FilePath -> IO MotifDB
parseDir dir = do
  let sList = map fst $ VU.toList knownSingleNCM
  let dList = map fst $ VU.toList knownDoubleNCM
  -- basic filenames
  let (spFiles :: [FilePath]) = map ((dir </>) . printf "1_strand_%d.db") sList
  let (dsFiles :: [FilePath]) = map ((dir </>) . uncurry (printf "2_strand_%d_%d.db")) dList
  -- parse cycle DB
  let parseWith parser suffix lst = liftM (zip lst) . mapM (liftM fromRight . parseFromFile parser . (++suffix))
  sCycles <- parseWith parseCycle ".cycle" sList spFiles
  dCycles <- parseWith parseCycle ".cycle" dList dsFiles
  -- parse hinge DB: parses the original counted data
  sHinges <- parseWith parseHinge ".hinge" sList spFiles
  dHinges <- parseWith parseHinge ".hinge" dList dsFiles
  -- parse the DB with individual pair contributions, dropping the 5 integer-based values first
  pairScores <- liftM (fromAssocs minExtended maxExtended eInf . zip [(n1,n2)|n1<-eacgu,n2<-eacgu] . drop 5 . fromRight) . parseFromFile parsePairScores $ (dir </> "hinge.db") -- strangely, the data is in hinge.db
  -- parse contributions by individual junctions
  junctionScores <- liftM (fromList (0,0) (21,21) . fromRight) . parseFromFile parseJunction $ (dir </> "junction.db")
  return $ MotifDB
    { sCycles = map (second (toNCMArray pairScores)) sCycles
    , dCycles = map (second (toNCMArray pairScores)) dCycles
    , dsConnect = [ mkDSConnect junctionScores pairScores d s | d <- dHinges, s <- sHinges ]
    , ddConnect = [ mkDDConnect junctionScores pairScores d d | d <- dHinges, d <- dHinges ]
    , rawPairScores = pairScores
    , rawJunctionScores = junctionScores
    , rawSHingeCounts = sHinges
    , rawDHingeCounts = dHinges
    }

-- | Create the connections array for a given doubleNCM known by (l,r) length
-- and singleNCM. Such a connection is then only defined by the connecting
-- pair.
--
-- TODO check if connect calculates correctly!

mkDSConnect :: JunctionArray -> PairScoreArray -> ((Int,Int),[RawHinge]) -> (Int,[RawHinge]) -> (((Int,Int),Int),ConnectArray)
mkDSConnect junction pair ((lenLeft,lenRight),ds) (len,ss) = (((lenLeft,lenRight),len), fromAssocs l u z xs) where
  l = minExtended
  u = maxExtended
  z = eInf
  xs = [ ((n1,n2), connect n1 n2) | n1<-acgu, n2<-acgu ]
  dsMap = M.map normalize . M.fromListWith (++) $ map (\((pnn,others),v) -> (pnn,[(others,v)])) $ ds
  ssMap = M.map normalize . M.fromListWith (++) $ map (\((pnn,others),v) -> (pnn,[(others,v)])) $ ss
  connect n1 n2 =
    let
      common = [ va*vb | (ka,va) <- M.findWithDefault [] (p3,n1,n2) dsMap, (kb,vb) <- M.findWithDefault [] (p5,n1,n2) ssMap, ka==kb ]
      c = if null common
            then eInf
            else junction ! (((lenLeft,lenRight) `lookup` knownDoubleNCM),(len `lookup` knownSingleNCM)) -- junction score between two NCMs
                 + (logScore $ sum common)
    in c
  lookup k = snd . fromJust . VU.find ((==k).fst)
  normalize xs
    | null xs   = xs
    | otherwise = map (second (/total)) xs
    where total = sum $ map snd xs

-- | Same as above. Some annoying code duplication.
--
-- TODO check if connect calculates correctly

mkDDConnect :: JunctionArray -> PairScoreArray -> ((Int,Int),[RawHinge]) -> ((Int,Int),[RawHinge]) -> (((Int,Int),(Int,Int)),ConnectArray)
mkDDConnect junction pair ((lenLA,lenRA),dsA) ((lenLB,lenRB),dsB) = (((lenLA,lenRA),(lenLB,lenRB)), fromAssocs l u z xs) where
  l = minExtended
  u = maxExtended
  z = eInf
  xs = [ ((n1,n2), connect n1 n2) | n1<-acgu, n2<-acgu ]
  dsAMap = M.map normalize . M.fromListWith (++) $ map (\((pnn,others),v) -> (pnn,[(others,v)])) $ dsA
  dsBMap = M.map normalize . M.fromListWith (++) $ map (\((pnn,others),v) -> (pnn,[(others,v)])) $ dsB
  connect n1 n2 =
    let
      common = [ va*vb | (ka,va) <- M.findWithDefault [] (p3,n1,n2) dsAMap, (kb,vb) <- M.findWithDefault [] (p5,n1,n2) dsBMap, ka==kb ]
      c = if null common
            then eInf
            else junction ! (((lenLA,lenRA) `lookup` knownDoubleNCM),((lenLB,lenRB) `lookup` knownDoubleNCM)) -- junction score between two NCMs
                 + (logScore $ sum common)
    in c
  lookup k = snd . fromJust . VU.find ((==k).fst)
  normalize xs
    | null xs   = xs
    | otherwise = map (second (/total)) xs
    where total = sum $ map snd xs

-- | Generates the array of all NCMs. We add the score for the closing pair
-- (hinge.db) directly here.

toNCMArray :: PairScoreArray -> [Cycle] -> PrimArray HashedPrimary Double
toNCMArray pairScores xs
  | null xs   = error "toNCMArray expects non-empty list of Cycles"
  | otherwise = fromAssocs l u z $ map f xs
  where
    lp = replicate (length . fst $ head xs) (minExtended :: Nucleotide)
    up = replicate (length . fst $ head xs) (maxExtended :: Nucleotide)
    l  = mkci $ mkPrimary lp
    u  = mkci $ mkPrimary up
    z  = eInf
    mkci s = mkHashedPrimary (minExtended,maxExtended) s
    f (s,v) =
      let
        p = mkPrimary s
        pS = pairScores ! (VU.head p, VU.last p)
      in (mkci p,v + pS)


-- * Parsers for different files

-- | parser for junction.db

parseJunction :: GenParser B.ByteString st [Double]
parseJunction = many1 (doubles <* spaces) <* eof where
  doubles = try (const eInf <$> string "+inf") <|> floatX

-- | parser for hinge.db Returns scores for individual pairs.

parsePairScores :: GenParser B.ByteString st [Double]
parsePairScores = (\a b -> a ++ b) <$> fl <*> ls where
  fl = many1 (read <$> many1 digit <* spaces) -- the 5 integers in the top line
  ls = many1 (doubles <* spaces) <* eof -- the 5x5 matrix of doubles
  doubles = try (const eInf <$> string "+inf") <|> floatX

-- | parser for NCM database (*.cycle)

parseCycle :: GenParser B.ByteString st [Cycle]
parseCycle = many1 (cycle <* newline) <* eof where
  cycle :: GenParser B.ByteString st Cycle
  cycle = (\a b -> (a,b)) <$> many1 letter <* ws <*> floatX

-- many ws characters

ws = many $ char ' '

-- | helper function for the strange double values
--
-- TODO move floatX into HsTools!

floatX :: GenParser B.ByteString st Double
floatX = toDouble <$> prefix <*> many digit <*> char '.' <*> many digit where
  prefix = char '+' <|> char '-'
  toDouble '+' ds1 dot ds2 = read $        ds1 ++ [dot] ++ ds2
  toDouble '-' ds1 dot ds2 = read $ "-" ++ ds1 ++ [dot] ++ ds2

-- | new hinge parser

parseHinge :: GenParser B.ByteString st [RawHinge]
parseHinge = concat <$> many hinge <* eof where
  mkap "anti" = anti
  mkap "para" = para
  mktc "trans" = trans
  mktc "cis" = cis
  mksp x = case x of
            'S' -> ssp
            'H' -> hsp
            'W' -> wsp
            'B' -> bsp
            'C' -> csp
            _   -> error $ "parseHinge: " ++ [x]
  hinge :: GenParser B.ByteString st [RawHinge]
  -- use monad format to check opening / closing equality
  hinge = do
    string "[ "
    n53 <- digit
    let primeType = if n53 == '5' then p5 else p3
    string "' "
    b1 <- oneOf "ACGU"
    string " "
    b2 <- oneOf "ACGU"
    string " ]"
    newline
    hts <- many (try hingetype)
    return $ map (\((s1,s2,ap,tr),count) -> (((primeType,mkNuc b1,mkNuc b2),(s1,s2,ap,tr)),count)) hts
  hingetype :: GenParser B.ByteString st ((Slash,Slash,AntiPara,TransCis),Double)
  hingetype =   mkht
            <$> hw
            <*  space
            <*> (try (string "anti") <|> string "para")
            <*  space
            <*> (try (string "trans") <|> string "cis")
            <*  space
            <*> many1 digit
            <*  newline
  mkht (a:_:b:[]) ap tc cnt = ((mksp a, mksp b, mkap ap, mktc tc), read cnt)
  hw = many1 (char '/' <|> letter)


-- * Types

type Cycle = (String,Double)