{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}

-- | The module implements /directed acyclic word graphs/ (DAWGs) internaly
-- represented as /minimal acyclic deterministic finite-state automata/.
--
-- In comparison to "Data.DAWG" module the automaton implemented here:
--
--   * Keeps all nodes in one array and therefore uses much less memory,
--
--   * When 'weigh'ed, it can be used to perform static hashing with
--     'hash' and 'unHash' functions,
--
--   * Doesn't provide insert/delete family of operations.

module Data.DAWG.Static
(
-- * DAWG type
  DAWG (..)
-- * Query
, lookup
, numStates
-- * Index
, index
, byIndex
-- * Hash
, hash
, unHash
-- * Construction
, empty
, fromList
, fromListWith
, fromLang
, freeze
-- * Weight
, Weight
, weigh
-- * Conversion
, assocs
, keys
, elems
-- , thaw
) where

import Prelude hiding (lookup)
import Control.Applicative ((<$), (<$>), (<|>))
import Control.Arrow (first)
import Data.Binary (Binary)
import Data.Vector.Binary ()
import Data.Vector.Unboxed (Unbox)
import qualified Data.IntMap as M
import qualified Data.Vector as V

import Data.DAWG.Node hiding (Node)
import qualified Data.DAWG.Node as N
import qualified Data.DAWG.Node.Specialized as NS
import qualified Data.DAWG.VMap as VM
import qualified Data.DAWG.Internal as I
import qualified Data.DAWG as D

type Node a b = N.Node (Maybe a) (Edge b)

-- | @DAWG a b c@ constitutes an automaton with alphabet symbols of type /a/,
-- node values of type /Maybe b/ and additional transition labels of type /c/.
-- Root is stored on the first position of the array.
newtype DAWG a b c = DAWG { unDAWG :: V.Vector (Node b c) }
    deriving (Show, Eq, Ord, Binary)

-- | Empty DAWG.
empty :: Unbox c => DAWG a b c
empty = DAWG $ V.fromList
    [ Branch 1 VM.empty
    , Leaf Nothing ]

-- | Number of states in the automaton.
numStates :: DAWG a b c -> Int
numStates = V.length . unDAWG

-- | Node with the given identifier.
nodeBy :: ID -> DAWG a b c -> Node b c
nodeBy i d = unDAWG d V.! i

-- | Value in leaf node with a given ID.
leafValue :: Node b c -> DAWG a b c -> Maybe b
leafValue n = value . nodeBy (eps n)

-- | Find value associated with the key.
lookup :: (Unbox c, Enum a) => [a] -> DAWG a b c -> Maybe b
lookup xs' =
    let xs = map fromEnum xs'
    in  lookup'I xs 0
{-# SPECIALIZE lookup :: Unbox c => String -> DAWG Char b c -> Maybe b #-}

lookup'I :: Unbox c => [Sym] -> ID -> DAWG a b c -> Maybe b
lookup'I []     i d = leafValue (nodeBy i d) d
lookup'I (x:xs) i d = case onSym x (nodeBy i d) of
    Just e  -> lookup'I xs (to e) d
    Nothing -> Nothing

-- | Return all key/value pairs in the DAWG in ascending key order.
assocs :: (Enum a, Unbox c) => DAWG a b c -> [([a], b)]
assocs d = map (first (map toEnum)) (assocs'I 0 d)
{-# SPECIALIZE assocs :: Unbox c => DAWG Char b c -> [(String, b)] #-}

assocs'I :: Unbox c => ID -> DAWG a b c -> [([Sym], b)]
assocs'I i d =
    here ++ concatMap there (trans n)
  where
    n = nodeBy i d
    here = case leafValue n d of
        Just x  -> [([], x)]
        Nothing -> []
    there (x, e) = map (first (x:)) (assocs'I (to e) d)

-- | Return all keys of the DAWG in ascending order.
keys :: (Unbox c, Enum a) => DAWG a b c -> [[a]]
keys = map fst . assocs
{-# SPECIALIZE keys :: Unbox c => DAWG Char b c -> [String] #-}

-- | Return all elements of the DAWG in the ascending order of their keys.
elems :: Unbox c => DAWG a b c -> [b]
elems = map snd . assocs'I 0

-- | Construct 'DAWG' from the list of (word, value) pairs.
-- First a 'D.DAWG' is created and then it is frozen using
-- the 'freeze' function.
fromList :: (Enum a, Ord b) => [([a], b)] -> DAWG a b ()
fromList = freeze . D.fromList
{-# SPECIALIZE fromList :: Ord b => [(String, b)] -> DAWG Char b () #-}

-- | Construct DAWG from the list of (word, value) pairs
-- with a combining function.  The combining function is
-- applied strictly. First a 'D.DAWG' is created and then
-- it is frozen using the 'freeze' function.
fromListWith :: (Enum a, Ord b) => (b -> b -> b) -> [([a], b)] -> DAWG a b ()
fromListWith f = freeze . D.fromListWith f
{-# SPECIALIZE fromListWith :: Ord b => (b -> b -> b)
        -> [(String, b)] -> DAWG Char b () #-}

-- | Make DAWG from the list of words.  Annotate each word with
-- the @()@ value.  First a 'D.DAWG' is created and then it is frozen
-- using the 'freeze' function.
fromLang :: Enum a => [[a]] -> DAWG a () ()
fromLang = freeze . D.fromLang
{-# SPECIALIZE fromLang :: [String] -> DAWG Char () () #-}

-- | Weight of a node corresponds to the number of final states
-- reachable from the node.  Weight of an edge is a sum of weights
-- of preceding nodes outgoing from the same parent node.
type Weight = Int

-- | Compute node weights and store corresponding values in transition labels.
weigh :: Unbox c => DAWG a b c -> DAWG a b Weight
weigh d = (DAWG . V.fromList)
    [ branch n (apply ws (trans n))
    | i <- [0 .. numStates d - 1]
    , let n  = nodeBy i d
    , let ws = accum (children n) ]
  where
    -- Branch with new edges.
    branch Branch{..} es    = Branch eps es
    branch Leaf{..}   _     = Leaf value
    -- In nodeWeight node weights are memoized.
    nodeWeight = ((V.!) . V.fromList) (map detWeight [0 .. numStates d - 1])
    -- Determine weight of the node.
    detWeight i = case nodeBy i d of
        Leaf w  -> maybe 0 (const 1) w
        n       -> sum . map nodeWeight $ allChildren n
    -- Weight for subsequent edges.
    accum = init . scanl (+) 0 . map nodeWeight
    -- Apply weight to edges. 
    apply ws ts = VM.fromList
        [ (x, annotate w e)
        | (w, (x, e)) <- zip ws ts ]
    -- Plain children and epsilon child. 
    allChildren n = eps n : children n
    -- IDs of plain children.
    children = map to . edges

-- | Construct immutable version of the automaton.
freeze :: D.DAWG a b -> DAWG a b ()
freeze d = DAWG . V.fromList $
    map (N.toGeneric . NS.reIdent newID . oldBy)
        (M.elems (inverse old2new))
  where
    -- Map from old to new identifiers.
    old2new = M.fromList $ (D.root d, 0) : zip (nodeIDs d) [1..]
    newID   = (M.!) old2new
    -- List of node IDs without the root ID.
    nodeIDs = filter (/= D.root d) . map fst . M.assocs . I.nodeMap . D.graph
    -- Non-frozen node by given identifier.
    oldBy i = I.nodeBy i (D.graph d)
        
-- | Inverse of the map.
inverse :: M.IntMap Int -> M.IntMap Int
inverse =
    let swap (x, y) = (y, x)
    in  M.fromList . map swap . M.toList

-- -- | Yield mutable version of the automaton.
-- thaw :: (Unbox c, Ord a) => DAWG a b c -> D.DAWG a b
-- thaw d =
--     D.fromNodes nodes 0
--   where
--     -- List of resulting nodes.
--     nodes = branchNodes ++ leafNodes
--     -- Branching nodes.
--     branchNodes =
--         [ 
--     -- Number of states used to shift new value IDs.
--     n = numStates d
--     -- New identifiers for value nodes.
--     valIDs = foldl' updID GM.empty (values d)
--     -- Values in the automaton.
--     values = map value . V.toList . unDAWG
--     -- Update ID map.
--     updID m v = case GM.lookup v m of
--         Just i  -> m
--         Nothing -> 
--             let j = GM.size m + n
--             in  j `seq` GM.insert v j

-- | Position in a set of all dictionary entries with respect
-- to the lexicographic order.
index :: Enum a => [a] -> DAWG a b Weight -> Maybe Int
index xs = index'I (map fromEnum xs) 0
{-# SPECIALIZE index :: String -> DAWG Char b Weight -> Maybe Int #-}

index'I :: [Sym] -> ID -> DAWG a b Weight -> Maybe Int
index'I []     i d = 0 <$ leafValue (nodeBy i d) d
index'I (x:xs) i d = do
    let n = nodeBy i d
        v = maybe 0 (const 1) (leafValue n d)
    e <- onSym x n
    w <- index'I xs (to e) d
    return (v + w + label e)

-- | Perfect hashing function for dictionary entries.
-- A synonym for the 'index' function.
hash :: Enum a => [a] -> DAWG a b Weight -> Maybe Int
hash = index
{-# INLINE hash #-}

-- | Find dictionary entry given its index with respect to the
-- lexicographic order.
byIndex :: Enum a => Int -> DAWG a b Weight -> Maybe [a]
byIndex ix d = map toEnum <$> byIndex'I ix 0 d
{-# SPECIALIZE byIndex :: Int -> DAWG Char b Weight -> Maybe String #-}

byIndex'I :: Int -> ID -> DAWG a b Weight -> Maybe [Sym]
byIndex'I ix i d
    | ix < 0    = Nothing
    | otherwise = here <|> there
  where
    n = nodeBy i d
    v = maybe 0 (const 1) (leafValue n d)
    here
        | ix == 0   = [] <$ leafValue (nodeBy i d) d
        | otherwise = Nothing
    there = do
        (x, e) <- VM.findLastLE cmp (edgeMap n)
        xs <- byIndex'I (ix - v - label e) (to e) d
        return (x:xs)
    cmp e = compare (label e) (ix - v)

-- | Inverse of the 'hash' function and a synonym for the 'byIndex' function.
unHash :: Enum a => Int -> DAWG a b Weight -> Maybe [a]
unHash = byIndex
{-# INLINE unHash #-}