{-# OPTIONS_GHC -fspec-constr #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE DeriveDataTypeable #-}
#if defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ >= 702
{-# LANGUAGE Trustworthy #-}
#endif
-----------------------------------------------------------------------------
-- |
-- Module      :  Data.CharSet
-- Copyright   :  (c) Edward Kmett 2010-2011
-- License     :  BSD3
-- Maintainer  :  ekmett@gmail.com
-- Stability   :  experimental
-- Portability :  portable
--
-- Fast set membership tests for 'Char' values
--
-- Stored as a (possibly negated) IntMap and a fast set used for the head byte.
--
-- The set of valid (possibly negated) head bytes is stored unboxed as a 32-byte
-- bytestring-based lookup table.
--
-- Designed to be imported qualified:
--
-- > import Data.CharSet (CharSet)
-- > import qualified Data.CharSet as CharSet
--
-------------------------------------------------------------------------------

module Data.CharSet
    (
    -- * Set type
      CharSet(..)
    -- * Operators
    , (\\)
    -- * Query
    , null
    , size
    , member
    , notMember
    , overlaps, isSubsetOf
    , isComplemented
    -- * Construction
    , build
    , empty
    , singleton
    , full
    , insert
    , delete
    , complement
    , range
    -- * Combine
    , union
    , intersection
    , difference
    -- * Filter
    , filter
    , partition
    -- * Map
    , map
    -- * Fold
    , fold
    -- * Conversion
    -- ** List
    , toList
    , fromList
    -- ** Ordered list
    , toAscList
    , fromAscList
    , fromDistinctAscList
    -- ** IntMaps
    , fromCharSet
    , toCharSet
    -- ** Array
    , toArray
    ) where

import Data.Array.Unboxed hiding (range)
import Data.Data
import Data.Function (on)
import Data.IntSet (IntSet)
import Data.CharSet.ByteSet (ByteSet)
import qualified Data.CharSet.ByteSet as ByteSet
import Data.Bits hiding (complement)
import Data.Word
import Data.ByteString.Internal (c2w)
import Data.Semigroup
import qualified Data.IntSet as I
import qualified Data.List as L
import Prelude hiding (filter, map, null)
import qualified Prelude as P
import Text.Read

data CharSet = CharSet !Bool {-# UNPACK #-} !ByteSet !IntSet
  deriving Typeable

charSet :: Bool -> IntSet -> CharSet
charSet b s = CharSet b (ByteSet.fromList (fmap headByte (I.toAscList s))) s

headByte :: Int -> Word8
headByte i
  | i <= 0x7f   = toEnum i
  | i <= 0x7ff  = toEnum $ 0x80 + (i `shiftR` 6)
  | i <= 0xffff = toEnum $ 0xe0 + (i `shiftR` 12)
  | otherwise   = toEnum $ 0xf0 + (i `shiftR` 18)

pos :: IntSet -> CharSet
pos = charSet True

neg :: IntSet -> CharSet
neg = charSet False

(\\) :: CharSet -> CharSet -> CharSet
(\\) = difference

build :: (Char -> Bool) -> CharSet
build p = fromDistinctAscList $ P.filter p [minBound .. maxBound]
{-# INLINE build #-}

map :: (Char -> Char) -> CharSet -> CharSet
map f (CharSet True _ i) = pos (I.map (fromEnum . f . toEnum) i)
map f (CharSet False _ i) = fromList $ P.map f $ P.filter (\x -> fromEnum x `I.notMember` i) [ul..uh]
{-# INLINE map #-}

isComplemented :: CharSet -> Bool
isComplemented (CharSet True _ _) = False
isComplemented (CharSet False _ _) = True
{-# INLINE isComplemented #-}

toList :: CharSet -> String
toList (CharSet True _ i) = P.map toEnum (I.toList i)
toList (CharSet False _ i) = P.filter (\x -> fromEnum x `I.notMember` i) [ul..uh]
{-# INLINE toList #-}

toAscList :: CharSet -> String
toAscList (CharSet True _ i) = P.map toEnum (I.toAscList i)
toAscList (CharSet False _ i) = P.filter (\x -> fromEnum x `I.notMember` i) [ul..uh]
{-# INLINE toAscList #-}

empty :: CharSet
empty = pos I.empty

singleton :: Char -> CharSet
singleton = pos . I.singleton . fromEnum
{-# INLINE singleton #-}

full :: CharSet
full = neg I.empty

-- | /O(n)/ worst case
null :: CharSet -> Bool
null (CharSet True _ i) = I.null i
null (CharSet False _ i) = I.size i == numChars
{-# INLINE null #-}

-- | /O(n)/
size :: CharSet -> Int
size (CharSet True _ i) = I.size i
size (CharSet False _ i) = numChars - I.size i
{-# INLINE size #-}

insert :: Char -> CharSet -> CharSet
insert c (CharSet True _ i) = pos (I.insert (fromEnum c) i)
insert c (CharSet False _ i) = neg (I.delete (fromEnum c) i)
{-# INLINE insert #-}

range :: Char -> Char -> CharSet
range a b
  | a <= b = fromDistinctAscList [a..b]
  | otherwise = empty

delete :: Char -> CharSet -> CharSet
delete c (CharSet True _ i) = pos (I.delete (fromEnum c) i)
delete c (CharSet False _ i) = neg (I.insert (fromEnum c) i)
{-# INLINE delete #-}

complement :: CharSet -> CharSet
complement (CharSet True s i) = CharSet False s i
complement (CharSet False s i) = CharSet True s i
{-# INLINE complement #-}

union :: CharSet -> CharSet -> CharSet
union (CharSet True _ i) (CharSet True _ j) = pos (I.union i j)
union (CharSet True _ i) (CharSet False _ j) = neg (I.difference j i)
union (CharSet False _ i) (CharSet True _ j) = neg (I.difference i j)
union (CharSet False _ i) (CharSet False _ j) = neg (I.intersection i j)
{-# INLINE union #-}

intersection :: CharSet -> CharSet -> CharSet
intersection (CharSet True _ i) (CharSet True _ j) = pos (I.intersection i j)
intersection (CharSet True _ i) (CharSet False _ j) = pos (I.difference i j)
intersection (CharSet False _ i) (CharSet True _ j) = pos (I.difference j i)
intersection (CharSet False _ i) (CharSet False _ j) = neg (I.union i j)
{-# INLINE intersection #-}

difference :: CharSet -> CharSet -> CharSet
difference (CharSet True _ i) (CharSet True _ j) = pos (I.difference i j)
difference (CharSet True _ i) (CharSet False _ j) = pos (I.intersection i j)
difference (CharSet False _ i) (CharSet True _ j) = neg (I.union i j)
difference (CharSet False _ i) (CharSet False _ j) = pos (I.difference j i)
{-# INLINE difference #-}

member :: Char -> CharSet -> Bool
member c (CharSet True b i)
  | c <= toEnum 0x7f = ByteSet.member (c2w c) b
  | otherwise        = I.member (fromEnum c) i
member c (CharSet False b i)
  | c <= toEnum 0x7f = not (ByteSet.member (c2w c) b)
  | otherwise        = I.notMember (fromEnum c) i
{-# INLINE member #-}

notMember :: Char -> CharSet -> Bool
notMember c s = not (member c s)
{-# INLINE notMember #-}

fold :: (Char -> b -> b) -> b -> CharSet -> b
fold f z (CharSet True _ i) = I.fold (f . toEnum) z i
fold f z (CharSet False _ i) = foldr f z $ P.filter (\x -> fromEnum x `I.notMember` i) [ul..uh]
{-# INLINE fold #-}

filter :: (Char -> Bool) -> CharSet -> CharSet
filter p (CharSet True _ i) = pos (I.filter (p . toEnum) i)
filter p (CharSet False _ i) = neg $ foldr (I.insert) i $ P.filter (\x -> (x `I.notMember` i) && not (p (toEnum x))) [ol..oh]
{-# INLINE filter #-}

partition :: (Char -> Bool) -> CharSet -> (CharSet, CharSet)
partition p (CharSet True _ i) = (pos l, pos r)
    where (l,r) = I.partition (p . toEnum) i
partition p (CharSet False _ i) = (neg (foldr I.insert i l), neg (foldr I.insert i r))
    where (l,r) = L.partition (p . toEnum) $ P.filter (\x -> x `I.notMember` i) [ol..oh]
{-# INLINE partition #-}

overlaps :: CharSet -> CharSet -> Bool
overlaps (CharSet True _ i) (CharSet True _ j) = not (I.null (I.intersection i j))
overlaps (CharSet True _ i) (CharSet False _ j) = not (I.isSubsetOf j i)
overlaps (CharSet False _ i) (CharSet True _ j) = not (I.isSubsetOf i j)
overlaps (CharSet False _ i) (CharSet False _ j) = any (\x -> I.notMember x i && I.notMember x j) [ol..oh] -- not likely
{-# INLINE overlaps #-}

isSubsetOf :: CharSet -> CharSet -> Bool
isSubsetOf (CharSet True _ i) (CharSet True _ j) = I.isSubsetOf i j
isSubsetOf (CharSet True _ i) (CharSet False _ j) = I.null (I.intersection i j)
isSubsetOf (CharSet False _ i) (CharSet True _ j) = all (\x -> I.member x i && I.member x j) [ol..oh] -- not bloody likely
isSubsetOf (CharSet False _ i) (CharSet False _ j) = I.isSubsetOf j i
{-# INLINE isSubsetOf #-}

fromList :: String -> CharSet
fromList = pos . I.fromList . P.map fromEnum
{-# INLINE fromList #-}

fromAscList :: String -> CharSet
fromAscList = pos . I.fromAscList . P.map fromEnum
{-# INLINE fromAscList #-}

fromDistinctAscList :: String -> CharSet
fromDistinctAscList = pos . I.fromDistinctAscList . P.map fromEnum
{-# INLINE fromDistinctAscList #-}

-- isProperSubsetOf :: CharSet -> CharSet -> Bool
-- isProperSubsetOf (P i) (P j) = I.isProperSubsetOf i j
-- isProperSubsetOf (P i) (N j) = null (I.intersection i j) && ...
-- isProperSubsetOf (N i) (N j) = I.isProperSubsetOf j i

ul, uh :: Char
ul = minBound
uh = maxBound
{-# INLINE ul #-}
{-# INLINE uh #-}

ol, oh :: Int
ol = fromEnum ul
oh = fromEnum uh
{-# INLINE ol #-}
{-# INLINE oh #-}

numChars :: Int
numChars = oh - ol + 1
{-# INLINE numChars #-}

instance Data CharSet where
  gfoldl k z set
    | isComplemented set = z complement `k` complement set
    | otherwise          = z fromList `k` toList set

  toConstr set
    | isComplemented set = complementConstr
    | otherwise = fromListConstr

  dataTypeOf _ = charSetDataType

  gunfold k z c = case constrIndex c of
    1 -> k (z fromList)
    2 -> k (z complement)
    _ -> error "gunfold"

fromListConstr :: Constr
fromListConstr   = mkConstr charSetDataType "fromList" [] Prefix
{-# NOINLINE fromListConstr #-}

complementConstr :: Constr
complementConstr = mkConstr charSetDataType "complement" [] Prefix
{-# NOINLINE complementConstr #-}

charSetDataType :: DataType
charSetDataType  = mkDataType "Data.CharSet.CharSet" [fromListConstr, complementConstr]
{-# NOINLINE charSetDataType #-}

-- returns an intset and if the charSet is positive
fromCharSet :: CharSet -> (Bool, IntSet)
fromCharSet (CharSet b _ i) = (b, i)
{-# INLINE fromCharSet #-}

toCharSet :: IntSet -> CharSet
toCharSet = pos
{-# INLINE toCharSet #-}

instance Eq CharSet where
  (==) = (==) `on` toAscList

instance Ord CharSet where
  compare = compare `on` toAscList

instance Bounded CharSet where
  minBound = empty
  maxBound = full

-- TODO return a tighter bounded array perhaps starting from the least element present to the last element present?
toArray :: CharSet -> UArray Char Bool
toArray set = array (minBound, maxBound) $ fmap (\x -> (x, x `member` set)) [minBound .. maxBound]

instance Show CharSet where
  showsPrec d i
    | isComplemented i = showParen (d > 10) $ showString "complement " . showsPrec 11 (complement i)
    | otherwise        = showParen (d > 10) $ showString "fromDistinctAscList " . showsPrec 11 (toAscList i)

instance Read CharSet where
  readPrec = parens $ complemented +++ normal
    where
      complemented = prec 10 $ do
        Ident "complement" <- lexP
        complement `fmap` step readPrec
      normal = prec 10 $ do
        Ident "fromDistinctAscList" <- lexP
        fromDistinctAscList `fmap` step readPrec

instance Semigroup CharSet where
  (<>) = union

instance Monoid CharSet where
  mempty = empty
  mappend = union