{-# LANGUAGE BangPatterns #-} {-# LANGUAGE CPP #-} -- | Sets of characters. -- -- Using this is more efficint than 'RE.Type.Alt':ng individual characters. module Distribution.Utils.CharSet ( -- * Set of characters CharSet, -- * Construction empty, universe, singleton, insert, union, intersection, complement, difference, -- * Query size, null, member, -- * Conversions fromList, toList, fromIntervalList, toIntervalList, -- * Special lists alpha, alphanum, upper, ) where import Data.Char (chr, isAlpha, isAlphaNum, isUpper, ord) import Data.List (foldl', sortBy) import Data.Monoid (Monoid (..)) import Data.String (IsString (..)) import Distribution.Compat.Semigroup (Semigroup (..)) import Prelude (Bool (..), Bounded (..), Char, Enum (..), Eq (..), Int, Maybe (..), Num (..), Ord (..), Show (..), String, concatMap, flip, fst, otherwise, showParen, showString, uncurry, ($), (.)) #if MIN_VERSION_containers(0,5,0) import qualified Data.IntMap.Strict as IM #else import qualified Data.IntMap as IM #endif -- | A set of 'Char's. -- -- We use range set, which works great with 'Char'. newtype CharSet = CS { unCS :: IM.IntMap Int } deriving (Eq, Ord) instance IsString CharSet where fromString = fromList instance Show CharSet where showsPrec d cs | size cs < 20 = showsPrec d (toList cs) | otherwise = showParen (d > 10) $ showString "CS " . showsPrec 11 (unCS cs) instance Semigroup CharSet where (<>) = union instance Monoid CharSet where mempty = empty mappend = (<>) -- | Empty character set. empty :: CharSet empty = CS IM.empty -- | universe universe :: CharSet universe = CS $ IM.singleton 0 0x10ffff -- | Check whether 'CharSet' is 'empty'. null :: CharSet -> Bool null (CS cs) = IM.null cs -- | Size of 'CharSet' -- -- >>> size $ fromIntervalList [('a','f'), ('0','9')] -- 16 -- -- >>> length $ toList $ fromIntervalList [('a','f'), ('0','9')] -- 16 -- size :: CharSet -> Int size (CS m) = foldl' (\ !acc (lo, hi) -> acc + (hi - lo) + 1) 0 (IM.toList m) -- | Singleton character set. singleton :: Char -> CharSet singleton c = CS (IM.singleton (ord c) (ord c)) -- | Test whether character is in the set. member :: Char -> CharSet -> Bool #if MIN_VERSION_containers(0,5,0) member c (CS m) = case IM.lookupLE i m of Nothing -> False Just (_, hi) -> i <= hi where #else member c (CS m) = go (IM.toList m) where go [] = False go ((x,y):zs) = (x <= i && i <= y) || go zs #endif i = ord c -- | Insert 'Char' into 'CharSet'. insert :: Char -> CharSet -> CharSet insert c (CS m) = normalise (IM.insert (ord c) (ord c) m) -- | Union of two 'CharSet's. union :: CharSet -> CharSet -> CharSet union (CS xs) (CS ys) = normalise (IM.unionWith max xs ys) -- | Intersection of two 'CharSet's intersection :: CharSet -> CharSet -> CharSet intersection (CS xs) (CS ys) = CS $ IM.fromList (intersectRangeList (IM.toList xs) (IM.toList ys)) -- | Compute the intersection. intersectRangeList :: Ord a => [(a, a)] -> [(a, a)] -> [(a, a)] intersectRangeList aset@((x,y):as) bset@((u,v):bs) | y < u = intersectRangeList as bset | v < x = intersectRangeList aset bs | y < v = (max x u, y) : intersectRangeList as bset | otherwise = (max x u, v) : intersectRangeList aset bs intersectRangeList _ [] = [] intersectRangeList [] _ = [] -- | Complement of a CharSet complement :: CharSet -> CharSet complement (CS xs) = CS $ IM.fromList $ complementRangeList (IM.toList xs) -- | Compute the complement intersected with @[x,)@ assuming @x [(Int, Int)] -> [(Int, Int)] complementRangeList' x ((u,v):s) = (x,pred u) : complementRangeList'' v s complementRangeList' x [] = [(x,0x10ffff)] -- | Compute the complement intersected with @(x,)@. complementRangeList'' :: Int -> [(Int, Int)] -> [(Int, Int)] complementRangeList'' x s | x == 0x10ffff = [] | otherwise = complementRangeList' (succ x) s -- | Compute the complement. -- -- Note: we treat Ints as codepoints, i.e minBound is 0, and maxBound is 0x10ffff complementRangeList :: [(Int, Int)] -> [(Int, Int)] complementRangeList s@((x,y):s') | x == 0 = complementRangeList'' y s' | otherwise = complementRangeList' 0 s complementRangeList [] = [(0, 0x10ffff)] -- | Difference of two 'CharSet's. difference :: CharSet -> CharSet -> CharSet difference xs ys = intersection xs (complement ys) -- | Make 'CharSet' from a list of characters, i.e. 'String'. fromList :: String -> CharSet fromList = normalise . foldl' (\ acc c -> IM.insert (ord c) (ord c) acc) IM.empty -- | Convert 'CharSet' to a list of characters i.e. 'String'. toList :: CharSet -> String toList = concatMap (uncurry enumFromTo) . toIntervalList -- | Convert to interval list -- -- >>> toIntervalList $ union "01234" "56789" -- [('0','9')] -- toIntervalList :: CharSet -> [(Char, Char)] toIntervalList (CS m) = [ (chr lo, chr hi) | (lo, hi) <- IM.toList m ] -- | Convert from interval pairs. -- -- >>> fromIntervalList [] -- "" -- -- >>> fromIntervalList [('a','f'), ('0','9')] -- "0123456789abcdef" -- -- >>> fromIntervalList [('Z','A')] -- "" -- fromIntervalList :: [(Char,Char)] -> CharSet fromIntervalList xs = normalise' $ sortBy (\a b -> compare (fst a) (fst b)) [ (ord lo, ord hi) | (lo, hi) <- xs , lo <= hi ] ------------------------------------------------------------------------------- -- Normalisation ------------------------------------------------------------------------------- normalise :: IM.IntMap Int -> CharSet normalise = normalise'. IM.toList normalise' :: [(Int,Int)] -> CharSet normalise' = CS . IM.fromList . go where go :: [(Int,Int)] -> [(Int,Int)] go [] = [] go ((x,y):zs) = go' x y zs go' :: Int -> Int -> [(Int, Int)] -> [(Int, Int)] go' lo hi [] = [(lo, hi)] go' lo hi ws0@((u,v):ws) | u <= succ hi = go' lo (max v hi) ws | otherwise = (lo,hi) : go ws0 ------------------------------------------------------------------------------- -- Alpha Numeric character list ------------------------------------------------------------------------------- -- Computing this takes some time, -- but they are not used in-non testing in Cabal's normal operation. -- | Note: this set varies depending on @base@ version. -- alpha :: CharSet alpha = foldl' (flip insert) empty [ c | c <- [ minBound .. maxBound ], isAlpha c ] {-# NOINLINE alpha #-} -- | Note: this set varies depending on @base@ version. -- alphanum :: CharSet alphanum = foldl' (flip insert) empty [ c | c <- [ minBound .. maxBound ], isAlphaNum c ] {-# NOINLINE alphanum #-} -- | Note: this set varies depending on @base@ version. -- upper :: CharSet upper = foldl' (flip insert) empty [ c | c <- [ minBound .. maxBound ], isUpper c ] {-# NOINLINE upper #-}