{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE MagicHash                  #-}
{-# LANGUAGE RankNTypes                 #-}
{-# LANGUAGE Trustworthy                #-}
{-# LANGUAGE TypeFamilies               #-}

-- |
-- Copyright: © 2017 Herbert Valerio Riedel
-- License: GPLv3
--
-- This module provides the 'TextSet' container for storing sets of text strings.
--
-- This module is intended to be imported @qualified@, e.g.
--
-- > import           Data.TextSet.Unboxed (TextSet)
-- > import qualified Data.TextSet.Unboxed as TextSet
--
module Data.TextSet.Unboxed
    ( TextSet
    , Key
      -- * Querying & lookup
    , size
    , null
    , member
    , lookupMin
    , lookupMax
    , lookupLE
    , lookupGE

    , (!?)
    , lookupIndex

      -- * Construction
    , empty
    , singleton
    , fromList
    , fromDistinctAscList
    , fromSet

      -- * Deconstruction
    , toList
    , toArray
    , toSet

    ) where

import           Control.DeepSeq
import           Data.Hashable                   (Hashable)
import qualified Data.Set                        as Set
import           Data.Text.Short                 (ShortText)
import qualified GHC.Exts                        as GHC
import           Prelude                         hiding (null)

import           Data.TextArray.Unboxed          (TextArray)
import qualified Data.TextArray.Unboxed.Internal as TA
import           Internal

-- | A set of unboxed 'ShortText' strings
--
-- The memory footprint of this data-structure is a single heap object (an unlifted 'ByteArray#') with the size expressed in words
--
-- \[
--  3 + n + \left\lceil \frac{1}{w} \sum_{i=0}^{n-1} len(s_i) \right\rceil
-- \]
--
-- where the word-size \(w\) is either \(w = 4\) or \(w = 8\) bytes; and where \(len(s_i)\) denotes the UTF-8 size in bytes of the \(i\)-th text string.
--
-- NOTE: Depending on whether you @UNPACK@ the 'TextSet' wrapper, you need at least one additional word for the pointer to the internal 'ByteArray#' heap object.
newtype TextSet = TS TextArray
                deriving (Eq,Ord,NFData,Hashable)

type Key = ShortText

instance Show TextSet where
  showsPrec p (TS ta) = showsPrec p ta
  show (TS ta)        = show ta

instance Read TextSet where
  readsPrec p = map (\(x,s) -> (TS x,s)) . readsPrec p

instance GHC.IsList TextSet where
  type Item TextSet = Key
  fromList = fromList
  toList   = toList

-- | \(\mathcal{O}(1)\). An empty 'TextSet'.
empty :: TextSet
empty = TS TA.empty

-- | \(\mathcal{O}(1)\). Report number of elements in 'TextSet'.
--
-- >>> size empty
-- 0
--
-- >>> size (singleton "")
-- 1
--
-- >>> size (fromList ["Hey","Hey","Jude"])
-- 2
--
size :: TextSet -> Int
size (TS ta) = TA.length ta

-- | \(\mathcal{O}(1)\). Check for 'empty' set.
--
-- >>> null empty
-- True
--
-- >>> null (singleton "")
-- False
--
null :: TextSet -> Bool
null (TS ta) = TA.null ta


-- | \(\mathcal{O}(1)\). Construct set containing one element.
--
-- >>> toList (singleton "alone")
-- ["alone"]
--
singleton :: Key -> TextSet
singleton = TS . TA.singleton

-- | \(\mathcal{O}(n \log n)\). Construct set from list of elements.
--
-- >>> toList (fromList ["Hey","Jude","Hey","Law","Hey",""])
-- ["","Hey","Jude","Law"]
--
fromList :: [Key] -> TextSet
fromList []  = empty
fromList [x] = singleton x
fromList xs  = fromSet (Set.fromList xs)

-- | \(\mathcal{O}(n)\). Construct set from list of distinct elements in ascending order.
--
-- __NOTE__: If the input list is not strictly ascending, an 'error' is thrown.
fromDistinctAscList :: [Key] -> TextSet
fromDistinctAscList [] = empty
fromDistinctAscList xs
  | isMonotonic xs = TS (TA.fromList xs)
  | otherwise      = error "fromDistinctAscList: invalid argument"

isMonotonic :: [Key] -> Bool
isMonotonic [] = True
isMonotonic [_] = True
isMonotonic (x:xs) = go x xs
  where
    go _ [] = True
    go y (y':ys)
      | y < y'    = go y' ys
      | otherwise = False

-- | \(\mathcal{O}(n)\). Convert 'Set' to 'TextSet'.
fromSet :: Set.Set Key -> TextSet
fromSet = TS . TA.fromList . Set.toAscList

-- | \(\mathcal{O}(n)\). Convert 'TextSet' to 'Set'.
toSet :: TextSet -> Set.Set Key
toSet = Set.fromDistinctAscList . toList

-- | \(\mathcal{O}(n)\). Convert 'TextSet' to list of 'ShortText' in ascending order.
toList :: TextSet -> [Key]
toList (TS ta) = TA.toList ta

-- | \(\mathcal{O}(1)\). Convert 'TextSet' to 'TextArray' of 'ShortText' in ascending order.
--
-- >>> toList (fromList ["Hey","Jude","Hey","Law","Hey",""])
-- ["","Hey","Jude","Law"]
--
toArray :: TextSet -> TextArray
toArray (TS ta) = ta

-- | \(\mathcal{O}(1)\). Extract minimal element from set.
--
-- >>> lookupMin empty
-- Nothing
--
-- >>> lookupMin (fromList ["a","b","c"])
-- Just "a"
--
lookupMin :: TextSet -> Maybe Key
lookupMin (TS ta) = ta TA.!? 0

-- | \(\mathcal{O}(1)\). Extract maximal element from set.
--
-- >>> lookupMax empty
-- Nothing
--
-- >>> lookupMax (fromList ["a","b","c"])
-- Just "c"
--
lookupMax :: TextSet -> Maybe Key
lookupMax (TS ta) = ta TA.!? (TA.length ta - 1)


-- TODO: isProperSubsetOf isSubsetOf union intersection difference

----------------------------------------------------------------------------
-- lookup

-- | \(\mathcal{O}(\log n)\). Test whether set contains a string.
--
-- >>> member "empty" empty
-- False
--
-- >>> member "a" (fromList ["a","b","c"])
-- True
--
-- >>> member "d" (fromList ["a","b","c"])
-- False
--
member :: Key -> TextSet -> Bool
member t ts = case lookupIndexNear LBelow t ts of
                LResExact _ -> True
                _           -> False


-- | \(\mathcal{O}(1)\). Retrieve \(i\)-th element in the sorted sequence of elements.
--
-- >>> fromList ["Hey","","Jude"] !? 0
-- Just ""
--
-- >>> fromList ["Hey","","Jude"] !? 1
-- Just "Hey"
--
-- >>> fromList ["Hey","","Jude"] !? 3
-- Nothing
--
-- See also 'lookupIndex'.
(!?) :: TextSet -> Int -> Maybe Key
(TS ta) !? i = ta TA.!? i

-- | \(\mathcal{O}(\log n)\). Look up element in set and report its zero-based index in the sorted sequence elements.
--
-- >>> lookupIndex "" (fromList ["Hey","","Jude"])
-- Just 0
--
-- >>> lookupIndex "Hey" (fromList ["Hey","","Jude"])
-- Just 1
--
-- >>> lookupIndex "Law" (fromList ["Hey","","Jude"])
-- Nothing
--
-- See also '!?'.
lookupIndex :: Key -> TextSet -> Maybe Int
lookupIndex t ts = case lookupIndexNear LBelow t ts of
                     LResExact (IdxOfsLen i _ _) -> Just i
                     _                           -> Nothing

mkist :: TextSet -> IdxOfsLen -> (Int, Key)
mkist (TS ta) (IdxOfsLen i ofs n) = (i, ba2st (sliceBA (TA.ta2ba ta) ofs n))

-- TODO: lookupLT lookupGT

-- | \(\mathcal{O}(\log n)\). Look up \"greatest\" string (together with its index) in set less or equal to given string.
--
-- >>> lookupLE "a" (fromList ["bb","cc"])
-- Nothing
--
-- >>> lookupLE "c" (fromList ["bb","cc"])
-- Just (0,"bb")
--
-- >>> lookupLE "cc" (fromList ["bb","cc"])
-- Just (1,"cc")
--
-- >>> lookupLE "z" (fromList ["bb","cc"])
-- Just (1,"cc")
--
lookupLE :: Key -> TextSet -> Maybe (Int,Key)
lookupLE t ts = case lookupIndexNear LAbove t ts of
                  LResAbove res -> Just (mkist ts res)
                  LResExact res -> Just (mkist ts res)
                  _             -> Nothing

-- | \(\mathcal{O}(\log n)\). Look up \"least\" string (together with its index) in set greater or equal to given string.
--
-- >>> lookupGE "a" (fromList ["bb","cc"])
-- Just (0,"bb")
--
-- >>> lookupGE "c" (fromList ["bb","cc"])
-- Just (1,"cc")
--
-- >>> lookupGE "cc" (fromList ["bb","cc"])
-- Just (1,"cc")
--
-- >>> lookupGE "z" (fromList ["bb","cc"])
-- Nothing
--
lookupGE :: Key -> TextSet -> Maybe (Int,Key)
lookupGE t ts = case lookupIndexNear LBelow t ts of
                  LResBelow res -> Just (mkist ts res)
                  LResExact res -> Just (mkist ts res)
                  _             -> Nothing

data LRes = LResEmpty              -- empty set
          | LResExact   !IdxOfsLen -- exact match
          | LResBelow   !IdxOfsLen -- inexact match, needle is /below/ IdxOfsLen entry
          | LResAbove   !IdxOfsLen -- inexact match, needle is /above/ IdxOfsLen entry
          deriving Show

-- | associativity of 'lookupIndexNear'
data LMode = LBelow -- if between two entries, return 'LResBelow'
           | LAbove -- if between two entries, return 'LResAbove'

-- Find near entry
--
-- NB: the implementation of 'lookupIndex{L,R}' differs only in two places, marked with (I) and (II) comments
lookupIndexNear :: LMode -> Key -> TextSet -> LRes
lookupIndexNear mode x (TS ta)
  | ui0 <   0 = LResEmpty
  | otherwise = case cmpBA lv0 of
      LT -> LResBelow lv0
      EQ -> LResExact lv0
      GT -> case cmpBA uv0 of
              GT -> LResAbove uv0
              EQ -> LResExact uv0
              LT -> case mode of
                      LAbove -> goL li0 ui0 -- invariant: lv0 < v < uv0   && li < ui
                      LBelow -> goR li0 ui0 -- invariant: lv0 < v < uv0   && li < ui
  where
    li0 = 0
    ui0 = TA.length ta - 1

    lv0, uv0 :: IdxOfsLen
    lv0 = TA.indexOfsLen' ta li0
    uv0 = TA.indexOfsLen' ta ui0

    cmpBA = cmpBA2OfsLen (st2ba x) (TA.ta2ba ta)

    -----------------------------------------------

    goL :: Int -> Int -> LRes
    goL li ui -- invariant: li < ui
      | mi == li  = LResAbove mv -- closest match -- (I)
      | otherwise = case cmpBA mv of
                      LT -> goL li mi -- go left
                      EQ -> LResExact mv -- exact match
                      GT -> goL mi ui -- go right
      where
        mi = mid li ui
        mv = TA.indexOfsLen' ta mi

        mid :: Int -> Int -> Int
        mid li' ui' = li' + quot (ui'-li') 2 -- (II)

    goR :: Int -> Int -> LRes
    goR li ui -- invariant: li < ui
      | mi == ui  = LResBelow mv -- closest match -- (I)
      | otherwise = case cmpBA mv of
                      LT -> goR li mi    -- go left
                      EQ -> LResExact mv -- exact match
                      GT -> goR mi ui    -- go right
      where
        mi = mid li ui
        mv = TA.indexOfsLen' ta mi

        mid :: Int -> Int -> Int
        mid li' ui' = ui' - quot (ui'-li') 2 -- (II)


-- $setup
-- >>> :set -XOverloadedStrings