{-# OPTIONS_GHC -fspec-constr #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE DeriveDataTypeable #-}
#if defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ >= 702
{-# LANGUAGE Trustworthy #-}
#endif
-----------------------------------------------------------------------------
-- |
-- Module      :  Data.CharSet
-- Copyright   :  (c) Edward Kmett 2010-2011
-- License     :  BSD3
-- Maintainer  :  ekmett@gmail.com
-- Stability   :  experimental
-- Portability :  portable
--
-- A CharSet is an /efficient/ representation of a set of 'Char' values
-- designed for fast membership tests.
--
-- As an example @build isAlpha@ will create a set of alphabetic characters.
-- We can then use 'member' on the generated set to /efficiently/ test if a
-- given @Char@ represents an alphabetic character.
--
-- Designed to be imported qualified:
--
-- > import Data.CharSet (CharSet)
-- > import qualified Data.CharSet as CharSet
--
-------------------------------------------------------------------------------

module Data.CharSet
    (
    -- * Set type
      CharSet(..)
    -- * Operators
    , (\\)
    -- * Query
    , null
    , size
    , member
    , notMember
    , overlaps, isSubsetOf
    , isComplemented
    -- * Construction
    , build
    , empty
    , singleton
    , full
    , insert
    , delete
    , complement
    , range
    -- * Combine
    , union
    , intersection
    , difference
    -- * Filter
    , filter
    , partition
    -- * Map
    , map
    -- * Fold
    , fold
    -- * Conversion
    -- ** List
    , toList
    , fromList
    -- ** Ordered list
    , toAscList
    , fromAscList
    , fromDistinctAscList
    -- ** IntMaps
    , fromCharSet
    , toCharSet
    -- ** Array
    , toArray
    ) where

#if defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ >= 608
import Data.String (IsString(..))
-- <<< -XOverloadedStrings >>> was introduced by GHC 6.8.1
#endif

import Data.Array.Unboxed hiding (range)
import Data.Data
import Data.Function (on)
import Data.IntSet (IntSet)
import Data.CharSet.ByteSet (ByteSet)
import qualified Data.CharSet.ByteSet as ByteSet
import Data.Bits hiding (complement)
import Data.Word
import Data.ByteString.Internal (c2w)
import Data.Semigroup
import qualified Data.IntSet as I
import qualified Data.List as L
import Prelude hiding (filter, map, null)
import qualified Prelude as P
import Text.Read

-- | Stored as a (possibly negated) IntSet and a fast set used for the head byte.
--
-- The set of valid (possibly negated) head bytes is stored unboxed as a 32-byte
-- bytestring-based lookup table.
data CharSet = CharSet
    !Bool    -- Whether ByteSet and IntSet are negated
    !ByteSet -- Set of head bytes, unboxed
    !IntSet  -- Set of characters in the charset
  deriving Typeable

#if defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ >= 608
-- | @= CharSet.`fromList`@
instance IsString CharSet where
  fromString = fromList
#endif

charSet :: Bool -> IntSet -> CharSet
charSet b s = CharSet b (ByteSet.fromList (fmap headByte (I.toAscList s))) s

headByte :: Int -> Word8
headByte i
  | i <= 0x7f   = toEnum i
  | i <= 0x7ff  = toEnum $ 0x80 + (i `shiftR` 6)
  | i <= 0xffff = toEnum $ 0xe0 + (i `shiftR` 12)
  | otherwise   = toEnum $ 0xf0 + (i `shiftR` 18)

pos :: IntSet -> CharSet
pos = charSet True

neg :: IntSet -> CharSet
neg = charSet False

(\\) :: CharSet -> CharSet -> CharSet
(\\) = difference

-- | Applies a predicate across the whole range of possible character values
-- to create a set of only those characters which satisfy the predicate.
--
-- As an example @build isAlpha@ will generate a CharSet of all
-- alphabetic characters.
build :: (Char -> Bool) -> CharSet
build p = fromDistinctAscList $ P.filter p [minBound .. maxBound]
{-# INLINE build #-}

map :: (Char -> Char) -> CharSet -> CharSet
map f (CharSet True _ i) = pos (I.map (fromEnum . f . toEnum) i)
map f (CharSet False _ i) = fromList $ P.map f $ P.filter (\x -> fromEnum x `I.notMember` i) [ul..uh]
{-# INLINE map #-}

isComplemented :: CharSet -> Bool
isComplemented (CharSet True _ _) = False
isComplemented (CharSet False _ _) = True
{-# INLINE isComplemented #-}

toList :: CharSet -> String
toList (CharSet True _ i) = P.map toEnum (I.toList i)
toList (CharSet False _ i) = P.filter (\x -> fromEnum x `I.notMember` i) [ul..uh]
{-# INLINE toList #-}

toAscList :: CharSet -> String
toAscList (CharSet True _ i) = P.map toEnum (I.toAscList i)
toAscList (CharSet False _ i) = P.filter (\x -> fromEnum x `I.notMember` i) [ul..uh]
{-# INLINE toAscList #-}

empty :: CharSet
empty = pos I.empty

singleton :: Char -> CharSet
singleton = pos . I.singleton . fromEnum
{-# INLINE singleton #-}

full :: CharSet
full = neg I.empty

-- | /O(n)/ worst case
null :: CharSet -> Bool
null (CharSet True _ i) = I.null i
null (CharSet False _ i) = I.size i == numChars
{-# INLINE null #-}

-- | /O(n)/
size :: CharSet -> Int
size (CharSet True _ i) = I.size i
size (CharSet False _ i) = numChars - I.size i
{-# INLINE size #-}

insert :: Char -> CharSet -> CharSet
insert c (CharSet True _ i) = pos (I.insert (fromEnum c) i)
insert c (CharSet False _ i) = neg (I.delete (fromEnum c) i)
{-# INLINE insert #-}

range :: Char -> Char -> CharSet
range a b
  | a <= b = fromDistinctAscList [a..b]
  | otherwise = empty

delete :: Char -> CharSet -> CharSet
delete c (CharSet True _ i) = pos (I.delete (fromEnum c) i)
delete c (CharSet False _ i) = neg (I.insert (fromEnum c) i)
{-# INLINE delete #-}

complement :: CharSet -> CharSet
complement (CharSet True s i) = CharSet False s i
complement (CharSet False s i) = CharSet True s i
{-# INLINE complement #-}

union :: CharSet -> CharSet -> CharSet
union (CharSet True _ i) (CharSet True _ j) = pos (I.union i j)
union (CharSet True _ i) (CharSet False _ j) = neg (I.difference j i)
union (CharSet False _ i) (CharSet True _ j) = neg (I.difference i j)
union (CharSet False _ i) (CharSet False _ j) = neg (I.intersection i j)
{-# INLINE union #-}

intersection :: CharSet -> CharSet -> CharSet
intersection (CharSet True _ i) (CharSet True _ j) = pos (I.intersection i j)
intersection (CharSet True _ i) (CharSet False _ j) = pos (I.difference i j)
intersection (CharSet False _ i) (CharSet True _ j) = pos (I.difference j i)
intersection (CharSet False _ i) (CharSet False _ j) = neg (I.union i j)
{-# INLINE intersection #-}

difference :: CharSet -> CharSet -> CharSet
difference (CharSet True _ i) (CharSet True _ j) = pos (I.difference i j)
difference (CharSet True _ i) (CharSet False _ j) = pos (I.intersection i j)
difference (CharSet False _ i) (CharSet True _ j) = neg (I.union i j)
difference (CharSet False _ i) (CharSet False _ j) = pos (I.difference j i)
{-# INLINE difference #-}

member :: Char -> CharSet -> Bool
member c (CharSet True b i)
  | c <= toEnum 0x7f = ByteSet.member (c2w c) b
  | otherwise        = I.member (fromEnum c) i
member c (CharSet False b i)
  | c <= toEnum 0x7f = not (ByteSet.member (c2w c) b)
  | otherwise        = I.notMember (fromEnum c) i
{-# INLINE member #-}

notMember :: Char -> CharSet -> Bool
notMember c s = not (member c s)
{-# INLINE notMember #-}

fold :: (Char -> b -> b) -> b -> CharSet -> b
fold f z (CharSet True _ i) = I.fold (f . toEnum) z i
fold f z (CharSet False _ i) = foldr f z $ P.filter (\x -> fromEnum x `I.notMember` i) [ul..uh]
{-# INLINE fold #-}

filter :: (Char -> Bool) -> CharSet -> CharSet
filter p (CharSet True _ i) = pos (I.filter (p . toEnum) i)
filter p (CharSet False _ i) = neg $ foldr (I.insert) i $ P.filter (\x -> (x `I.notMember` i) && not (p (toEnum x))) [ol..oh]
{-# INLINE filter #-}

partition :: (Char -> Bool) -> CharSet -> (CharSet, CharSet)
partition p (CharSet True _ i) = (pos l, pos r)
    where (l,r) = I.partition (p . toEnum) i
partition p (CharSet False _ i) = (neg (foldr I.insert i l), neg (foldr I.insert i r))
    where (l,r) = L.partition (p . toEnum) $ P.filter (\x -> x `I.notMember` i) [ol..oh]
{-# INLINE partition #-}

overlaps :: CharSet -> CharSet -> Bool
overlaps (CharSet True _ i) (CharSet True _ j) = not (I.null (I.intersection i j))
overlaps (CharSet True _ i) (CharSet False _ j) = not (I.isSubsetOf j i)
overlaps (CharSet False _ i) (CharSet True _ j) = not (I.isSubsetOf i j)
overlaps (CharSet False _ i) (CharSet False _ j) = any (\x -> I.notMember x i && I.notMember x j) [ol..oh] -- not likely
{-# INLINE overlaps #-}

isSubsetOf :: CharSet -> CharSet -> Bool
isSubsetOf (CharSet True _ i) (CharSet True _ j) = I.isSubsetOf i j
isSubsetOf (CharSet True _ i) (CharSet False _ j) = I.null (I.intersection i j)
isSubsetOf (CharSet False _ i) (CharSet True _ j) = all (\x -> I.member x i && I.member x j) [ol..oh] -- not bloody likely
isSubsetOf (CharSet False _ i) (CharSet False _ j) = I.isSubsetOf j i
{-# INLINE isSubsetOf #-}

fromList :: String -> CharSet
fromList = pos . I.fromList . P.map fromEnum
{-# INLINE fromList #-}

fromAscList :: String -> CharSet
fromAscList = pos . I.fromAscList . P.map fromEnum
{-# INLINE fromAscList #-}

fromDistinctAscList :: String -> CharSet
fromDistinctAscList = pos . I.fromDistinctAscList . P.map fromEnum
{-# INLINE fromDistinctAscList #-}

-- isProperSubsetOf :: CharSet -> CharSet -> Bool
-- isProperSubsetOf (P i) (P j) = I.isProperSubsetOf i j
-- isProperSubsetOf (P i) (N j) = null (I.intersection i j) && ...
-- isProperSubsetOf (N i) (N j) = I.isProperSubsetOf j i

ul, uh :: Char
ul = minBound
uh = maxBound
{-# INLINE ul #-}
{-# INLINE uh #-}

ol, oh :: Int
ol = fromEnum ul
oh = fromEnum uh
{-# INLINE ol #-}
{-# INLINE oh #-}

numChars :: Int
numChars = oh - ol + 1
{-# INLINE numChars #-}

instance Data CharSet where
  gfoldl k z set
    | isComplemented set = z complement `k` complement set
    | otherwise          = z fromList `k` toList set

  toConstr set
    | isComplemented set = complementConstr
    | otherwise = fromListConstr

  dataTypeOf _ = charSetDataType

  gunfold k z c = case constrIndex c of
    1 -> k (z fromList)
    2 -> k (z complement)
    _ -> error "gunfold"

fromListConstr :: Constr
fromListConstr   = mkConstr charSetDataType "fromList" [] Prefix
{-# NOINLINE fromListConstr #-}

complementConstr :: Constr
complementConstr = mkConstr charSetDataType "complement" [] Prefix
{-# NOINLINE complementConstr #-}

charSetDataType :: DataType
charSetDataType  = mkDataType "Data.CharSet.CharSet" [fromListConstr, complementConstr]
{-# NOINLINE charSetDataType #-}

-- returns an intset and if the charSet is positive
fromCharSet :: CharSet -> (Bool, IntSet)
fromCharSet (CharSet b _ i) = (b, i)
{-# INLINE fromCharSet #-}

toCharSet :: IntSet -> CharSet
toCharSet = pos
{-# INLINE toCharSet #-}

instance Eq CharSet where
  (==) = (==) `on` toAscList

instance Ord CharSet where
  compare = compare `on` toAscList

instance Bounded CharSet where
  minBound = empty
  maxBound = full

-- TODO return a tighter bounded array perhaps starting from the least element present to the last element present?
toArray :: CharSet -> UArray Char Bool
toArray set = array (minBound, maxBound) $ fmap (\x -> (x, x `member` set)) [minBound .. maxBound]

instance Show CharSet where
  showsPrec d i
    | isComplemented i = showParen (d > 10) $ showString "complement " . showsPrec 11 (complement i)
    | otherwise        = showParen (d > 10) $ showString "fromDistinctAscList " . showsPrec 11 (toAscList i)

instance Read CharSet where
  readPrec = parens $ complemented +++ normal
    where
      complemented = prec 10 $ do
        Ident "complement" <- lexP
        complement `fmap` step readPrec
      normal = prec 10 $ do
        Ident "fromDistinctAscList" <- lexP
        fromDistinctAscList `fmap` step readPrec

instance Semigroup CharSet where
  (<>) = union

instance Monoid CharSet where
  mempty = empty
#if !(MIN_VERSION_base(4,11,0))
  mappend = union
#endif