{-# LANGUAGE BangPatterns          #-}
{-# LANGUAGE DeriveDataTypeable    #-}
{-# LANGUAGE FlexibleContexts      #-}
{-# LANGUAGE InstanceSigs          #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE OverloadedLists       #-}
{-# LANGUAGE RankNTypes            #-}
{-# LANGUAGE ScopedTypeVariables   #-}
{-# LANGUAGE TemplateHaskell       #-}
{-# LANGUAGE TypeFamilies          #-}
{-# LANGUAGE UndecidableInstances  #-}
module Data.RadixTree
  ( RadixTree (..)
  , RadixNode (..)
  , CompressedRadixTree
    -- * Construction
  , fromFoldable_
  , fromFoldable
  , compressBy
    -- * Parsing with radix trees
  , RadixParsing (..)
  , parse_
  , lookup_
  , search
  ) where
import           Control.Applicative
import           Control.DeepSeq
import           Data.Data               (Data, Typeable)
import           Data.Foldable           (asum, foldr', toList)
import           Data.Map.Strict         (Map)
import qualified Data.Map.Strict         as M
import           Data.Sequence           (Seq)
import qualified Data.Sequence           as Seq
import           Data.Text               (Text)
import qualified Data.Text               as T
import qualified Data.Text.Array         as TI (Array)
import qualified Data.Text.Internal      as TI (Text (..), text)
import           Data.Vector             (Vector)
import qualified Data.Vector             as V
import           Lens.Micro
import           Text.Parser.Char        (CharParsing (anyChar, text))
import           Text.Parser.Combinators (Parsing (try))

--------------------------------------------------------------------------------
-- Stuff to help construct RadixTrees
--
-- I'm not clever enough to write a function to go directly from a 'Foldable' to
-- a fully-optimised RadixTree. Instead, I generate a prefix-tree using a 'Map'
-- directly ('Trie'), and then gradually compress that ('CompressedTrie') before
-- packing the final result into an efficient structure using 'Text' nodes.
--
-- TODO:
-- - generate RadixTree directly, instead of going through 'Trie'/'CompressedTrie'
-- - use compact regions?

data PrefixNode a tree = Accept !Text a !tree | Skip !tree
  deriving (Show, Eq)

newtype Trie a = Trie (PrefixNode a (Map Char (Trie a)))
  deriving (Show, Eq)

newtype CompressedTrie a = CompressedTrie (PrefixNode a (Map (Seq Char) (CompressedTrie a)))
  deriving (Show, Eq)

{-# INLINE node #-}
node :: Lens (PrefixNode p a) (PrefixNode p b) a b
node = lens
  (\x -> case x of { Accept _ _ t -> t; Skip t -> t })
  (\x a -> case x of { Accept l p _ -> Accept l p a; Skip _ -> Skip a })

leaf :: Text -> Text -> a -> Trie a
leaf ft t v = go (T.unpack t)
  where
    go (x:xs) = Trie (Skip (M.singleton x (go xs)))
    go []     = Trie (Accept ft v M.empty)

insert :: Text -> Text -> a -> Trie a -> Trie a
insert ft text' a (Trie n) = case T.uncons text' of
  Just (c, cs) -> Trie ((node %~
    M.insertWith
    (\_ orig -> Data.RadixTree.insert ft cs a orig)
    c
    (leaf ft cs a)) n)
  Nothing ->
    Trie (n^.node.to (Accept ft a))

makeCompressable :: Trie a -> CompressedTrie a
makeCompressable (Trie n) = CompressedTrie (
  over node (M.map makeCompressable . M.mapKeysMonotonic Seq.singleton) n)

compress :: Trie a -> CompressedTrie a
compress = go . makeCompressable
  where
    go :: CompressedTrie a -> CompressedTrie a
    go (CompressedTrie n) = case n of
      Accept l p m -> CompressedTrie (Accept l p (M.map go m))
      Skip m       -> CompressedTrie (Skip (M.foldMapWithKey compress1 m))

    compress1 :: Seq Char -> CompressedTrie a -> Map (Seq Char) (CompressedTrie a)
    compress1 k c@(CompressedTrie n) =
      case M.size sm of
        0 -> M.singleton k c
        1 | Skip _ <- n -> compress1 (k <> k') sm'
          where (k', sm') = M.findMax sm
        _ -> M.singleton k (go (n & node .~ sm & CompressedTrie))
      where sm = n^.node

--------------------------------------------------------------------------------

-- | A node in a radixtree. To advance from here a parser must parse the 'Text'
-- (i.e., the prefix) value at this node.
data RadixNode a = RadixNode {-# UNPACK #-} !Text !(RadixTree a)
  deriving (Eq, Show, Typeable, Data)

-- | A radixtree. Construct with 'fromFoldable_, and use with 'parse'.
data RadixTree a
  = -- | Can terminate a parser successfully, returning the 'Text' value given.
    RadixAccept
    {-# UNPACK #-} !Text -- ^ text to return at this point
    {-# UNPACK #-} !(Vector (RadixNode a)) -- ^ possible subtrees beyond this point
    a -- ^ value to return at this point
  | RadixSkip
    {-# UNPACK #-} !(Vector (RadixNode a)) -- ^ possible subtrees beyond this point
  deriving (Eq, Show, Typeable, Data)

instance NFData a => NFData (RadixNode a) where
  {-# INLINE rnf #-}
  rnf (RadixNode l t) = rnf l `seq` rnf t

instance NFData a => NFData (RadixTree a) where
  {-# INLINE rnf #-}
  rnf (RadixAccept t v p) = t `seq` rnf p `seq` rnf v
  rnf (RadixSkip v)       = rnf v

-- | Compress a totally-unoptimised 'Trie' into a nice and easily-parsable
-- 'RadixTree'
fromTrie :: Trie a -> RadixTree a
fromTrie = go . compress
  where
    !z = V.empty

    radixNode :: Seq Char -> CompressedTrie a -> RadixNode a
    radixNode l t = RadixNode (T.pack (toList l)) (go t)

    mapToVector :: Map k a -> Vector (k, a)
    mapToVector m = case M.size m of
      0  -> z
      sz -> V.fromListN sz (M.toList m)

    go :: CompressedTrie a -> RadixTree a
    go (CompressedTrie n) = case n of
      Accept l p m -> RadixAccept l (V.map (uncurry radixNode) (mapToVector m)) p
      Skip m       -> RadixSkip . V.map (uncurry radixNode) . mapToVector $! m

data TextSlice = TextSlice
  { tsOffset16 :: {-# UNPACK #-} !Int -- ^ offset (in units of Word16)
  , tsLength16 :: {-# UNPACK #-} !Int -- ^ length (in units of Word16)
  }

-- | Probably dangerous magic
--
-- When the second argument is found to be within the first, we re-use the
-- 'Text' array of the first. This should allow the second argument to be
-- garbage collected. This is to improve locality and memory use.
magicallySaveSpaceSometimes :: Text -> Text -> Maybe TextSlice
magicallySaveSpaceSometimes full s@(TI.Text _ _ slen) =
  case T.breakOn s full of
    (TI.Text{}, r@(TI.Text _ remoffs _))
      | T.null r  -> Nothing
      | otherwise -> Just TextSlice{tsOffset16 = remoffs, tsLength16 = slen}

-- | A normal 'RadixTree' stores a new 'Text' at every node. In contrast, a
-- 'CompressedRadixTree' takes a single corpus 'Text' which is indexed into by
-- nodes. This can save a lot of memory (e.g., using the radix trees from the
-- parsing benchmarks in this package, the 'CompressedRadixTree' version is
-- 254032 bytes, whereas the ordinary 'RadixTree' is a rotund 709904 bytes) at
-- no runtime cost.
data CompressedRadixTree a
  = CompressedRadixTree {-# UNPACK #-} !TI.Array !(CompressedRadixTree1 a)

data CompressedRadixTree1 a
  = CompressedRadixAccept
    {-# UNPACK #-} !TextSlice
    {-# UNPACK #-} !(Vector (CompressedRadixNode a))
    a
  | CompressedRadixSkip {-# UNPACK #-} !(Vector (CompressedRadixNode a))

data CompressedRadixNode a
  = CompressedRadixNode {-# UNPACK #-} !TextSlice !(CompressedRadixTree1 a)

instance NFData a => NFData (CompressedRadixNode a) where
  {-# INLINE rnf #-}
  rnf (CompressedRadixNode ts t) = ts `seq` rnf t

instance NFData a => NFData (CompressedRadixTree a) where
  {-# INLINE rnf #-}
  rnf (CompressedRadixTree arr v) = arr `seq` rnf v

instance NFData a => NFData (CompressedRadixTree1 a) where
  {-# INLINE rnf #-}
  rnf (CompressedRadixAccept ts v a) = ts `seq` rnf a `seq` rnf v
  rnf (CompressedRadixSkip v)        = rnf v

-- | Compress a 'RadixTree' given a corpus. All values in the tree must be
-- findable within the corpus, though the corpus does not have to necessarily be
-- the direct source of the tree
compressBy :: Text -> RadixTree a -> Maybe (CompressedRadixTree a)
compressBy full@(TI.Text arr _ _) rt =
  CompressedRadixTree arr <$> recompressT rt

  where
    magic = magicallySaveSpaceSometimes full

    recompressN :: RadixNode a -> Maybe (CompressedRadixNode a)
    recompressN (RadixNode t tree) = CompressedRadixNode <$> magic t <*> recompressT tree

    recompressT :: RadixTree a -> Maybe (CompressedRadixTree1 a)
    recompressT (RadixSkip v)       = CompressedRadixSkip <$> V.mapM recompressN v
    recompressT (RadixAccept t v a) = CompressedRadixAccept <$> magic t <*> V.mapM recompressN v <*> pure a

-- | *Slow*. Same as 'fromFoldable', but you do not need to supply pairs of text
-- and values; they will default to '()'.
fromFoldable_ :: Foldable f => f Text -> RadixTree ()
fromFoldable_ =
  fromTrie . foldr' (\t -> insert t t ()) (Trie (Skip M.empty))

-- | *Slow*
fromFoldable :: Foldable f => f (Text, a) -> RadixTree a
fromFoldable =
  fromTrie . foldr' (\(t, a) -> insert t t a) (Trie (Skip M.empty))

--------------------------------------------------------------------------------
-- Parsers from 'RadixTree's

class RadixParsing radixtree where
  keys :: radixtree a -> [(Text, a)]
  parse :: CharParsing m => (Text -> a -> r) -> radixtree a -> m r
  lookup :: radixtree a -> Text -> Maybe (Text, a)

{-# INLINE search #-}
-- | Find all occurences of the terms in a 'RadixTree' from this point on. This
-- will consume the entire remaining input. Can lazily produce results (but this
-- depends on your parser).
search
  :: (Monad m, CharParsing m, RadixParsing radixtree)
  => radixtree a
  -> m [Text]
search r = go
  where
    go =
      (parse const r >>= \x -> (x:) <$> go) <|>
      (anyChar >> go) <|>
      return []

{-# INLINE parse_ #-}
parse_ :: (RadixParsing r, CharParsing m) => r a -> m Text
parse_ = Data.RadixTree.parse const

{-# INLINE lookup_ #-}
lookup_ :: RadixParsing r => r a -> Text -> Maybe Text
lookup_ r t = fst <$> Data.RadixTree.lookup r t

instance RadixParsing RadixTree where
  keys = go []
    where
      go nil (RadixAccept l xs a) = (l,a) : V.foldr (\(RadixNode _ x) xs' -> go xs' x) nil xs
      go nil (RadixSkip xs) = V.foldr (\(RadixNode _ x) xs' -> go xs' x) nil xs

  {-# INLINE parse #-}
  -- | Parse from a 'RadixTree'
  parse :: CharParsing m => (Text -> a -> r) -> RadixTree a -> m r
  parse constr = go
    where
      go r = case r of
        RadixAccept l nodes a
          | T.null l -> empty
          | otherwise -> asum (V.map parseRadixNode nodes) <|> pure (constr l a)
        RadixSkip nodes ->
          asum (V.map parseRadixNode nodes)

      {-# INLINE parseRadixNode #-}
      parseRadixNode (RadixNode prefix tree)
        | T.null prefix = go tree
        | otherwise     = try (text prefix *> go tree)

  lookup :: RadixTree a -> Text -> Maybe (Text, a)
  lookup rt0 t0
    | T.null t0 = case rt0 of
        RadixAccept v _ a -> Just (v, a)
        RadixSkip _       -> Nothing
    | otherwise = case rt0 of
        RadixAccept _ ns _ -> lookupRadixNodes t0 ns
        RadixSkip     ns   -> lookupRadixNodes t0 ns
    where
      lookupRadixNodes t v = go 0
        where
          !vlen = V.length v
          go !i
            | i < vlen  = case V.unsafeIndex v i of
                RadixNode pfix rt -> case T.commonPrefixes pfix t of
                  Just (_, remPfx, remSfx)
                    | T.null remPfx -> Data.RadixTree.lookup rt remSfx
                    | otherwise     -> Nothing
                  Nothing -> go (i + 1)
            | otherwise = Nothing

instance RadixParsing CompressedRadixTree where

  keys (CompressedRadixTree arr crt) = go [] crt
    where
      fromSlice (TextSlice offs len) = TI.text arr offs len

      go nil (CompressedRadixAccept l xs a) =
        (fromSlice l, a) : V.foldr (\(CompressedRadixNode _ x) xs' -> go xs' x) nil xs
      go nil (CompressedRadixSkip xs) =
        V.foldr (\(CompressedRadixNode _ x) xs' -> go xs' x) nil xs

  {-# INLINE parse #-}
  -- | Parse from a 'RadixTree'
  parse :: CharParsing m => (Text -> a -> r) -> CompressedRadixTree a -> m r
  parse constr (CompressedRadixTree arr crt) = go crt
    where
      fromSlice (TextSlice offs len) = TI.text arr offs len
      go r = case r of
        CompressedRadixAccept ts nodes a -> case fromSlice ts of
          l | T.null l -> empty
            | otherwise -> asum (V.map parseRadixNode nodes) <|> pure (constr l a)
        CompressedRadixSkip nodes -> asum (V.map parseRadixNode nodes)
      {-# INLINE parseRadixNode #-}
      parseRadixNode (CompressedRadixNode ts tree) = case fromSlice ts of
        prefix | T.null prefix -> go tree
               | otherwise     -> try (text prefix *> go tree)

  lookup :: CompressedRadixTree a -> Text -> Maybe (Text, a)
  lookup (CompressedRadixTree arr0 rt0) = lookup1 rt0
    where
      fromSlice (TextSlice offs16 len16) = TI.text arr0 offs16 len16

      lookup1 rt !t
        | T.null t = case rt of
            CompressedRadixAccept v _ a -> Just (fromSlice v, a)
            CompressedRadixSkip _       -> Nothing
        | otherwise = case rt of
            CompressedRadixAccept _ ns _ -> lookupCompressedRadixNodes t ns
            CompressedRadixSkip     ns   -> lookupCompressedRadixNodes t ns

      lookupCompressedRadixNodes !t v = go 0
        where
          !vlen = V.length v
          go !i
            | i < vlen  = case V.unsafeIndex v i of
                CompressedRadixNode pfix rt -> case T.commonPrefixes (fromSlice pfix) t of
                  Just (_, remPfx, remSfx)
                    | T.null remPfx -> lookup1 rt remSfx
                    | otherwise     -> Nothing
                  Nothing -> go (i + 1)
            | otherwise = Nothing