----------------------------------------------------------------------------- -- | -- Module : Text.Trifecta.CharSet -- Copyright : (c) Edward Kmett 2010-2011 -- License : BSD3 -- Maintainer : ekmett@gmail.com -- Stability : experimental -- Portability : non-portable (Data, BangPatterns, MagicHash) -- -- Fast set membership tests for 'Char' values -- -- Stored as a (possibly negated IntMap) and a fast set for the ASCII range. -- -- The ASCII range is unboxed for efficiency. For small (complemented) sets, we -- test for membership using a binary search. For larger (complemented) sets -- we use a lookup table. -- -- Designed to be imported qualified: -- -- > import Text.Trifecta.CharSet.Prim (CharSet) -- > import qualified Text.Trifecta.CharSet.Prim as CharSet -- ------------------------------------------------------------------------------- module Text.Trifecta.CharSet ( -- * Set type CharSet -- * Operators , (\\) -- * Query , null , size , member , notMember , overlaps, isSubsetOf , isComplemented -- * Construction , build , empty , singleton , full , insert , delete , complement , range -- * Combine , union , intersection , difference -- * Filter , filter , partition -- * Map , map -- * Fold , fold -- * Conversion -- ** List , toList , fromList -- ** Ordered list , toAscList , fromAscList , fromDistinctAscList -- ** IntMaps , fromCharSet , toCharSet -- ** Array , toArray ) where import Data.Array.Unboxed hiding (range) import Data.Data import Data.Function (on) import Data.IntSet (IntSet) import Text.Trifecta.CharSet.AsciiSet (AsciiSet) import qualified Text.Trifecta.CharSet.AsciiSet as AsciiSet import Data.Monoid (Monoid(..)) import qualified Data.IntSet as I import qualified Data.List as L import Prelude hiding (filter, map, null) import qualified Prelude as P import Text.Read data CharSet = Pos AsciiSet !IntSet | Neg AsciiSet !IntSet pos :: IntSet -> CharSet pos s = Pos (AsciiSet.fromList (fmap toEnum (takeWhile (<= 0x7f) (I.toAscList s)))) s neg :: IntSet -> CharSet neg s = Neg (AsciiSet.fromList (fmap toEnum (takeWhile (<= 0x7f) (I.toAscList s)))) s (\\) :: CharSet -> CharSet -> CharSet (\\) = difference build :: (Char -> Bool) -> CharSet build p = fromDistinctAscList $ P.filter p [minBound .. maxBound] {-# INLINE build #-} map :: (Char -> Char) -> CharSet -> CharSet map f (Pos _ i) = pos (I.map (fromEnum . f . toEnum) i) map f (Neg _ i) = fromList $ P.map f $ P.filter (\x -> fromEnum x `I.notMember` i) [ul..uh] {-# INLINE map #-} isComplemented :: CharSet -> Bool isComplemented (Pos _ _) = False isComplemented (Neg _ _) = True {-# INLINE isComplemented #-} toList :: CharSet -> String toList (Pos _ i) = P.map toEnum (I.toList i) toList (Neg _ i) = P.filter (\x -> fromEnum x `I.notMember` i) [ul..uh] {-# INLINE toList #-} toAscList :: CharSet -> String toAscList (Pos _ i) = P.map toEnum (I.toAscList i) toAscList (Neg _ i) = P.filter (\x -> fromEnum x `I.notMember` i) [ul..uh] {-# INLINE toAscList #-} empty :: CharSet empty = pos I.empty singleton :: Char -> CharSet singleton = pos . I.singleton . fromEnum {-# INLINE singleton #-} full :: CharSet full = neg I.empty -- | /O(n)/ worst case null :: CharSet -> Bool null (Pos _ i) = I.null i null (Neg _ i) = I.size i == numChars {-# INLINE null #-} -- | /O(n)/ size :: CharSet -> Int size (Pos _ i) = I.size i size (Neg _ i) = numChars - I.size i {-# INLINE size #-} insert :: Char -> CharSet -> CharSet insert c (Pos _ i) = pos (I.insert (fromEnum c) i) insert c (Neg _ i) = neg (I.delete (fromEnum c) i) {-# INLINE insert #-} range :: Char -> Char -> CharSet range a b | a <= b = fromDistinctAscList [a..b] | otherwise = empty delete :: Char -> CharSet -> CharSet delete c (Pos _ i) = pos (I.delete (fromEnum c) i) delete c (Neg _ i) = neg (I.insert (fromEnum c) i) {-# INLINE delete #-} complement :: CharSet -> CharSet complement (Pos s i) = Neg s i complement (Neg s i) = Pos s i {-# INLINE complement #-} union :: CharSet -> CharSet -> CharSet union (Pos _ i) (Pos _ j) = pos (I.union i j) union (Pos _ i) (Neg _ j) = neg (I.difference j i) union (Neg _ i) (Pos _ j) = neg (I.difference i j) union (Neg _ i) (Neg _ j) = neg (I.intersection i j) {-# INLINE union #-} intersection :: CharSet -> CharSet -> CharSet intersection (Pos _ i) (Pos _ j) = pos (I.intersection i j) intersection (Pos _ i) (Neg _ j) = pos (I.difference i j) intersection (Neg _ i) (Pos _ j) = pos (I.difference j i) intersection (Neg _ i) (Neg _ j) = neg (I.union i j) {-# INLINE intersection #-} difference :: CharSet -> CharSet -> CharSet difference (Pos _ i) (Pos _ j) = pos (I.difference i j) difference (Pos _ i) (Neg _ j) = pos (I.intersection i j) difference (Neg _ i) (Pos _ j) = neg (I.union i j) difference (Neg _ i) (Neg _ j) = pos (I.difference j i) {-# INLINE difference #-} member :: Char -> CharSet -> Bool member c (Pos b i) | c <= toEnum 0x7f = AsciiSet.member c b | otherwise = I.member (fromEnum c) i member c (Neg b i) | c <= toEnum 0x7f = not (AsciiSet.member c b) | otherwise = I.notMember (fromEnum c) i {-# INLINE member #-} notMember :: Char -> CharSet -> Bool notMember c s = not (member c s) {-# INLINE notMember #-} fold :: (Char -> b -> b) -> b -> CharSet -> b fold f z (Pos _ i) = I.fold (f . toEnum) z i fold f z (Neg _ i) = foldr f z $ P.filter (\x -> fromEnum x `I.notMember` i) [ul..uh] {-# INLINE fold #-} filter :: (Char -> Bool) -> CharSet -> CharSet filter p (Pos _ i) = pos (I.filter (p . toEnum) i) filter p (Neg _ i) = neg $ foldr (I.insert) i $ P.filter (\x -> (x `I.notMember` i) && not (p (toEnum x))) [ol..oh] {-# INLINE filter #-} partition :: (Char -> Bool) -> CharSet -> (CharSet, CharSet) partition p (Pos _ i) = (pos l, pos r) where (l,r) = I.partition (p . toEnum) i partition p (Neg _ i) = (neg (foldr I.insert i l), neg (foldr I.insert i r)) where (l,r) = L.partition (p . toEnum) $ P.filter (\x -> x `I.notMember` i) [ol..oh] {-# INLINE partition #-} overlaps :: CharSet -> CharSet -> Bool overlaps (Pos _ i) (Pos _ j) = not (I.null (I.intersection i j)) overlaps (Pos _ i) (Neg _ j) = not (I.isSubsetOf j i) overlaps (Neg _ i) (Pos _ j) = not (I.isSubsetOf i j) overlaps (Neg _ i) (Neg _ j) = any (\x -> I.notMember x i && I.notMember x j) [ol..oh] -- not likely {-# INLINE overlaps #-} isSubsetOf :: CharSet -> CharSet -> Bool isSubsetOf (Pos _ i) (Pos _ j) = I.isSubsetOf i j isSubsetOf (Pos _ i) (Neg _ j) = I.null (I.intersection i j) isSubsetOf (Neg _ i) (Pos _ j) = all (\x -> I.member x i && I.member x j) [ol..oh] -- not bloody likely isSubsetOf (Neg _ i) (Neg _ j) = I.isSubsetOf j i {-# INLINE isSubsetOf #-} fromList :: String -> CharSet fromList = pos . I.fromList . P.map fromEnum {-# INLINE fromList #-} fromAscList :: String -> CharSet fromAscList = pos . I.fromAscList . P.map fromEnum {-# INLINE fromAscList #-} fromDistinctAscList :: String -> CharSet fromDistinctAscList = pos . I.fromDistinctAscList . P.map fromEnum {-# INLINE fromDistinctAscList #-} -- isProperSubsetOf :: CharSet -> CharSet -> Bool -- isProperSubsetOf (P i) (P j) = I.isProperSubsetOf i j -- isProperSubsetOf (P i) (N j) = null (I.intersection i j) && ... -- isProperSubsetOf (N i) (N j) = I.isProperSubsetOf j i ul, uh :: Char ul = minBound uh = maxBound {-# INLINE ul #-} {-# INLINE uh #-} ol, oh :: Int ol = fromEnum ul oh = fromEnum uh {-# INLINE ol #-} {-# INLINE oh #-} numChars :: Int numChars = oh - ol + 1 {-# INLINE numChars #-} instance Typeable CharSet where typeOf _ = mkTyConApp charSetTyCon [] charSetTyCon :: TyCon charSetTyCon = mkTyCon "Text.Trifecta.CharSet.CharSet" {-# NOINLINE charSetTyCon #-} instance Data CharSet where gfoldl k z set | isComplemented set = z complement `k` complement set | otherwise = z fromList `k` toList set toConstr set | isComplemented set = complementConstr | otherwise = fromListConstr dataTypeOf _ = charSetDataType gunfold k z c = case constrIndex c of 1 -> k (z fromList) 2 -> k (z complement) _ -> error "gunfold" fromListConstr :: Constr fromListConstr = mkConstr charSetDataType "fromList" [] Prefix {-# NOINLINE fromListConstr #-} complementConstr :: Constr complementConstr = mkConstr charSetDataType "complement" [] Prefix {-# NOINLINE complementConstr #-} charSetDataType :: DataType charSetDataType = mkDataType "Text.Trifecta.CharSet.CharSet" [fromListConstr, complementConstr] {-# NOINLINE charSetDataType #-} -- returns an intset and if the intset should be complemented to obtain the contents of the CharSet fromCharSet :: CharSet -> (Bool, IntSet) fromCharSet (Pos _ i) = (False, i) fromCharSet (Neg _ i) = (True, i) {-# INLINE fromCharSet #-} toCharSet :: IntSet -> CharSet toCharSet = pos {-# INLINE toCharSet #-} instance Eq CharSet where (==) = (==) `on` toAscList instance Ord CharSet where compare = compare `on` toAscList instance Bounded CharSet where minBound = empty maxBound = full -- TODO return a tighter bounded array perhaps starting from the least element present to the last element present? toArray :: CharSet -> UArray Char Bool toArray set = array (minBound, maxBound) $ fmap (\x -> (x, x `member` set)) [minBound .. maxBound] instance Show CharSet where showsPrec d i | isComplemented i = showParen (d > 10) $ showString "complement " . showsPrec 11 (complement i) | otherwise = showParen (d > 10) $ showString "fromDistinctAscList " . showsPrec 11 (toAscList i) instance Read CharSet where readPrec = parens $ complemented +++ normal where complemented = prec 10 $ do Ident "complement" <- lexP complement `fmap` step readPrec normal = prec 10 $ do Ident "fromDistinctAscList" <- lexP fromDistinctAscList `fmap` step readPrec instance Monoid CharSet where mempty = empty mappend = union