{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP          #-}
#if __GLASGOW_HASKELL__ >=704
{-# LANGUAGE Safe         #-}
#elif __GLASGOW_HASKELL__ >=702
{-# LANGUAGE Trustworthy  #-}
#endif
-- | Sets of characters.
--
-- Using this is more efficint than 'RE.Type.Alt':ng individual characters.
module RERE.CharSet (
    -- * Set of characters
    CharSet,
    -- * Construction
    empty,
    universe,
    singleton,
    insert,
    union,
    intersection,
    complement,
    difference,
    -- * Query
    size,
    null,
    member,
    -- * Conversions
    fromList,
    toList,
    fromIntervalList,
    toIntervalList,
    ) where

import Prelude hiding (null)

import Data.Char   (chr, ord)
import Data.List   (foldl', sortBy)
import Data.String (IsString (..))

#if MIN_VERSION_containers(0,5,0)
import qualified Data.IntMap.Strict as IM
#else
import qualified Data.IntMap as IM
#endif

-- | A set of 'Char's.
--
-- We use range set, which works great with 'Char'.
newtype CharSet = CS { CharSet -> IntMap Int
unCS :: IM.IntMap Int }
  deriving (CharSet -> CharSet -> Bool
(CharSet -> CharSet -> Bool)
-> (CharSet -> CharSet -> Bool) -> Eq CharSet
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: CharSet -> CharSet -> Bool
$c/= :: CharSet -> CharSet -> Bool
== :: CharSet -> CharSet -> Bool
$c== :: CharSet -> CharSet -> Bool
Eq, Eq CharSet
Eq CharSet
-> (CharSet -> CharSet -> Ordering)
-> (CharSet -> CharSet -> Bool)
-> (CharSet -> CharSet -> Bool)
-> (CharSet -> CharSet -> Bool)
-> (CharSet -> CharSet -> Bool)
-> (CharSet -> CharSet -> CharSet)
-> (CharSet -> CharSet -> CharSet)
-> Ord CharSet
CharSet -> CharSet -> Bool
CharSet -> CharSet -> Ordering
CharSet -> CharSet -> CharSet
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: CharSet -> CharSet -> CharSet
$cmin :: CharSet -> CharSet -> CharSet
max :: CharSet -> CharSet -> CharSet
$cmax :: CharSet -> CharSet -> CharSet
>= :: CharSet -> CharSet -> Bool
$c>= :: CharSet -> CharSet -> Bool
> :: CharSet -> CharSet -> Bool
$c> :: CharSet -> CharSet -> Bool
<= :: CharSet -> CharSet -> Bool
$c<= :: CharSet -> CharSet -> Bool
< :: CharSet -> CharSet -> Bool
$c< :: CharSet -> CharSet -> Bool
compare :: CharSet -> CharSet -> Ordering
$ccompare :: CharSet -> CharSet -> Ordering
$cp1Ord :: Eq CharSet
Ord)

-- | 
--
-- >>> "foobar" :: CharSet
-- "abfor"
--
instance IsString CharSet where
    fromString :: String -> CharSet
fromString = String -> CharSet
fromList

instance Show CharSet where
    showsPrec :: Int -> CharSet -> ShowS
showsPrec Int
d CharSet
cs
        | CharSet -> Int
size CharSet
cs Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
20
        = Int -> String -> ShowS
forall a. Show a => Int -> a -> ShowS
showsPrec Int
d (CharSet -> String
toList CharSet
cs)
        | Bool
otherwise
        = Bool -> ShowS -> ShowS
showParen (Int
d Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
10)
        (ShowS -> ShowS) -> ShowS -> ShowS
forall a b. (a -> b) -> a -> b
$ String -> ShowS
showString String
"CS.fromIntervalList "
        ShowS -> ShowS -> ShowS
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> [(Char, Char)] -> ShowS
forall a. Show a => Int -> a -> ShowS
showsPrec Int
11 (CharSet -> [(Char, Char)]
toIntervalList CharSet
cs)

-- | Empty character set.
empty :: CharSet
empty :: CharSet
empty = IntMap Int -> CharSet
CS IntMap Int
forall a. IntMap a
IM.empty

-- | universe
--
-- >>> size universe
-- 1114112
--
-- >>> universe
-- CS.fromIntervalList [('\NUL','\1114111')]
--
universe :: CharSet
universe :: CharSet
universe = IntMap Int -> CharSet
CS (IntMap Int -> CharSet) -> IntMap Int -> CharSet
forall a b. (a -> b) -> a -> b
$ Int -> Int -> IntMap Int
forall a. Int -> a -> IntMap a
IM.singleton Int
0 Int
0x10ffff

-- | Check whether 'CharSet' is 'empty'.
null :: CharSet -> Bool
null :: CharSet -> Bool
null (CS IntMap Int
cs) = IntMap Int -> Bool
forall a. IntMap a -> Bool
IM.null IntMap Int
cs

-- | Size of 'CharSet'
--
-- >>> size $ fromIntervalList [('a','f'), ('0','9')]
-- 16
--
-- >>> length $ toList $ fromIntervalList [('a','f'), ('0','9')]
-- 16
--
size :: CharSet -> Int
size :: CharSet -> Int
size (CS IntMap Int
m) = (Int -> (Int, Int) -> Int) -> Int -> [(Int, Int)] -> Int
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' (\ !Int
acc (Int
lo, Int
hi) -> Int
acc Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
hi Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
lo) Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) Int
0 (IntMap Int -> [(Int, Int)]
forall a. IntMap a -> [(Int, a)]
IM.toList IntMap Int
m)

-- | Singleton character set.
singleton :: Char -> CharSet
singleton :: Char -> CharSet
singleton Char
c = IntMap Int -> CharSet
CS (Int -> Int -> IntMap Int
forall a. Int -> a -> IntMap a
IM.singleton (Char -> Int
ord Char
c) (Char -> Int
ord Char
c))

-- | Test whether character is in the set.
member :: Char -> CharSet -> Bool
#if MIN_VERSION_containers(0,5,0)
member :: Char -> CharSet -> Bool
member Char
c (CS IntMap Int
m) = case Int -> IntMap Int -> Maybe (Int, Int)
forall a. Int -> IntMap a -> Maybe (Int, a)
IM.lookupLE Int
i IntMap Int
m of
    Maybe (Int, Int)
Nothing      -> Bool
False
    Just (Int
_, Int
hi) -> Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
hi
  where
#else
member c (CS m) = go (IM.toList m)
  where
    go [] = False
    go ((x,y):zs) = (x <= i && i <= y) || go zs
#endif
    i :: Int
i = Char -> Int
ord Char
c

-- | Insert 'Char' into 'CharSet'.
insert :: Char -> CharSet -> CharSet
insert :: Char -> CharSet -> CharSet
insert Char
c (CS IntMap Int
m) = IntMap Int -> CharSet
normalise (Int -> Int -> IntMap Int -> IntMap Int
forall a. Int -> a -> IntMap a -> IntMap a
IM.insert (Char -> Int
ord Char
c) (Char -> Int
ord Char
c) IntMap Int
m)

-- | Union of two 'CharSet's.
union :: CharSet -> CharSet -> CharSet
union :: CharSet -> CharSet -> CharSet
union (CS IntMap Int
xs) (CS IntMap Int
ys) = IntMap Int -> CharSet
normalise ((Int -> Int -> Int) -> IntMap Int -> IntMap Int -> IntMap Int
forall a. (a -> a -> a) -> IntMap a -> IntMap a -> IntMap a
IM.unionWith Int -> Int -> Int
forall a. Ord a => a -> a -> a
max IntMap Int
xs IntMap Int
ys)

-- | Intersection of two 'CharSet's
intersection :: CharSet -> CharSet -> CharSet
intersection :: CharSet -> CharSet -> CharSet
intersection (CS IntMap Int
xs) (CS IntMap Int
ys) = IntMap Int -> CharSet
CS (IntMap Int -> CharSet) -> IntMap Int -> CharSet
forall a b. (a -> b) -> a -> b
$
    [(Int, Int)] -> IntMap Int
forall a. [(Int, a)] -> IntMap a
IM.fromList ([(Int, Int)] -> [(Int, Int)] -> [(Int, Int)]
forall a. Ord a => [(a, a)] -> [(a, a)] -> [(a, a)]
intersectRangeList (IntMap Int -> [(Int, Int)]
forall a. IntMap a -> [(Int, a)]
IM.toList IntMap Int
xs) (IntMap Int -> [(Int, Int)]
forall a. IntMap a -> [(Int, a)]
IM.toList IntMap Int
ys))

-- | Compute the intersection.
intersectRangeList :: Ord a => [(a, a)] -> [(a, a)] -> [(a, a)]
intersectRangeList :: [(a, a)] -> [(a, a)] -> [(a, a)]
intersectRangeList aset :: [(a, a)]
aset@((a
x,a
y):[(a, a)]
as) bset :: [(a, a)]
bset@((a
u,a
v):[(a, a)]
bs)
   | a
y a -> a -> Bool
forall a. Ord a => a -> a -> Bool
< a
u     = [(a, a)] -> [(a, a)] -> [(a, a)]
forall a. Ord a => [(a, a)] -> [(a, a)] -> [(a, a)]
intersectRangeList [(a, a)]
as [(a, a)]
bset
   | a
v a -> a -> Bool
forall a. Ord a => a -> a -> Bool
< a
x     = [(a, a)] -> [(a, a)] -> [(a, a)]
forall a. Ord a => [(a, a)] -> [(a, a)] -> [(a, a)]
intersectRangeList [(a, a)]
aset [(a, a)]
bs
   | a
y a -> a -> Bool
forall a. Ord a => a -> a -> Bool
< a
v     = (a -> a -> a
forall a. Ord a => a -> a -> a
max a
x a
u, a
y) (a, a) -> [(a, a)] -> [(a, a)]
forall a. a -> [a] -> [a]
: [(a, a)] -> [(a, a)] -> [(a, a)]
forall a. Ord a => [(a, a)] -> [(a, a)] -> [(a, a)]
intersectRangeList [(a, a)]
as [(a, a)]
bset
   | Bool
otherwise = (a -> a -> a
forall a. Ord a => a -> a -> a
max a
x a
u, a
v) (a, a) -> [(a, a)] -> [(a, a)]
forall a. a -> [a] -> [a]
: [(a, a)] -> [(a, a)] -> [(a, a)]
forall a. Ord a => [(a, a)] -> [(a, a)] -> [(a, a)]
intersectRangeList [(a, a)]
aset [(a, a)]
bs
intersectRangeList [(a, a)]
_ [] = []
intersectRangeList [] [(a, a)]
_ = []

-- | Complement of a CharSet
complement :: CharSet -> CharSet
complement :: CharSet -> CharSet
complement (CS IntMap Int
xs) = IntMap Int -> CharSet
CS (IntMap Int -> CharSet) -> IntMap Int -> CharSet
forall a b. (a -> b) -> a -> b
$ [(Int, Int)] -> IntMap Int
forall a. [(Int, a)] -> IntMap a
IM.fromList ([(Int, Int)] -> IntMap Int) -> [(Int, Int)] -> IntMap Int
forall a b. (a -> b) -> a -> b
$ [(Int, Int)] -> [(Int, Int)]
complementRangeList (IntMap Int -> [(Int, Int)]
forall a. IntMap a -> [(Int, a)]
IM.toList IntMap Int
xs)

-- | Compute the complement intersected with @[x,)@ assuming @x<u@.
complementRangeList' :: Int -> [(Int, Int)] -> [(Int, Int)]
complementRangeList' :: Int -> [(Int, Int)] -> [(Int, Int)]
complementRangeList' Int
x ((Int
u,Int
v):[(Int, Int)]
s) = (Int
x,Int -> Int
forall a. Enum a => a -> a
pred Int
u) (Int, Int) -> [(Int, Int)] -> [(Int, Int)]
forall a. a -> [a] -> [a]
: Int -> [(Int, Int)] -> [(Int, Int)]
complementRangeList'' Int
v [(Int, Int)]
s
complementRangeList' Int
x []        = [(Int
x,Int
0x10ffff)]

-- | Compute the complement intersected with @(x,)@.
complementRangeList'' :: Int -> [(Int, Int)] -> [(Int, Int)]
complementRangeList'' :: Int -> [(Int, Int)] -> [(Int, Int)]
complementRangeList'' Int
x [(Int, Int)]
s
    | Int
x Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0x10ffff = []
    | Bool
otherwise     = Int -> [(Int, Int)] -> [(Int, Int)]
complementRangeList' (Int -> Int
forall a. Enum a => a -> a
succ Int
x) [(Int, Int)]
s

-- | Compute the complement.
--
-- Note: we treat Ints as codepoints, i.e minBound is 0, and maxBound is 0x10ffff
complementRangeList :: [(Int, Int)] -> [(Int, Int)]
complementRangeList :: [(Int, Int)] -> [(Int, Int)]
complementRangeList s :: [(Int, Int)]
s@((Int
x,Int
y):[(Int, Int)]
s')
    | Int
x Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0    = Int -> [(Int, Int)] -> [(Int, Int)]
complementRangeList'' Int
y [(Int, Int)]
s'
    | Bool
otherwise = Int -> [(Int, Int)] -> [(Int, Int)]
complementRangeList' Int
0 [(Int, Int)]
s
complementRangeList [] = [(Int
0, Int
0x10ffff)]

-- | Difference of two 'CharSet's.
difference :: CharSet -> CharSet -> CharSet
difference :: CharSet -> CharSet -> CharSet
difference CharSet
xs CharSet
ys = CharSet -> CharSet -> CharSet
intersection CharSet
xs (CharSet -> CharSet
complement CharSet
ys)

-- | Make 'CharSet' from a list of characters, i.e. 'String'.
fromList :: String -> CharSet
fromList :: String -> CharSet
fromList = IntMap Int -> CharSet
normalise (IntMap Int -> CharSet)
-> (String -> IntMap Int) -> String -> CharSet
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (IntMap Int -> Char -> IntMap Int)
-> IntMap Int -> String -> IntMap Int
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' (\ IntMap Int
acc Char
c -> Int -> Int -> IntMap Int -> IntMap Int
forall a. Int -> a -> IntMap a -> IntMap a
IM.insert (Char -> Int
ord Char
c) (Char -> Int
ord Char
c) IntMap Int
acc) IntMap Int
forall a. IntMap a
IM.empty

-- | Convert 'CharSet' to a list of characters i.e. 'String'.
toList :: CharSet -> String
toList :: CharSet -> String
toList = ((Char, Char) -> String) -> [(Char, Char)] -> String
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap ((Char -> Char -> String) -> (Char, Char) -> String
forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry Char -> Char -> String
forall a. Enum a => a -> a -> [a]
enumFromTo) ([(Char, Char)] -> String)
-> (CharSet -> [(Char, Char)]) -> CharSet -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. CharSet -> [(Char, Char)]
toIntervalList

-- | Convert to interval list
--
-- >>> toIntervalList $ union "01234" "56789"
-- [('0','9')]
--
toIntervalList :: CharSet -> [(Char, Char)]
toIntervalList :: CharSet -> [(Char, Char)]
toIntervalList (CS IntMap Int
m) = [ (Int -> Char
chr Int
lo, Int -> Char
chr Int
hi) | (Int
lo, Int
hi) <- IntMap Int -> [(Int, Int)]
forall a. IntMap a -> [(Int, a)]
IM.toList IntMap Int
m ]

-- | Convert from interval pairs.
--
-- >>> fromIntervalList []
-- ""
--
-- >>> fromIntervalList [('a','f'), ('0','9')]
-- "0123456789abcdef"
--
-- >>> fromIntervalList [('Z','A')]
-- ""
--
fromIntervalList :: [(Char,Char)] -> CharSet
fromIntervalList :: [(Char, Char)] -> CharSet
fromIntervalList [(Char, Char)]
xs = [(Int, Int)] -> CharSet
normalise' ([(Int, Int)] -> CharSet) -> [(Int, Int)] -> CharSet
forall a b. (a -> b) -> a -> b
$ ((Int, Int) -> (Int, Int) -> Ordering)
-> [(Int, Int)] -> [(Int, Int)]
forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (\(Int, Int)
a (Int, Int)
b -> Int -> Int -> Ordering
forall a. Ord a => a -> a -> Ordering
compare ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (Int, Int)
a) ((Int, Int) -> Int
forall a b. (a, b) -> a
fst (Int, Int)
b))
    [ (Char -> Int
ord Char
lo, Char -> Int
ord Char
hi)
    | (Char
lo, Char
hi) <- [(Char, Char)]
xs
    , Char
lo Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
hi
    ]

-------------------------------------------------------------------------------
-- Normalisation
-------------------------------------------------------------------------------

normalise :: IM.IntMap Int -> CharSet
normalise :: IntMap Int -> CharSet
normalise = [(Int, Int)] -> CharSet
normalise'([(Int, Int)] -> CharSet)
-> (IntMap Int -> [(Int, Int)]) -> IntMap Int -> CharSet
forall b c a. (b -> c) -> (a -> b) -> a -> c
. IntMap Int -> [(Int, Int)]
forall a. IntMap a -> [(Int, a)]
IM.toList

normalise' :: [(Int,Int)] -> CharSet
normalise' :: [(Int, Int)] -> CharSet
normalise' = IntMap Int -> CharSet
CS (IntMap Int -> CharSet)
-> ([(Int, Int)] -> IntMap Int) -> [(Int, Int)] -> CharSet
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [(Int, Int)] -> IntMap Int
forall a. [(Int, a)] -> IntMap a
IM.fromList ([(Int, Int)] -> IntMap Int)
-> ([(Int, Int)] -> [(Int, Int)]) -> [(Int, Int)] -> IntMap Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [(Int, Int)] -> [(Int, Int)]
go where
    go :: [(Int,Int)] -> [(Int,Int)]
    go :: [(Int, Int)] -> [(Int, Int)]
go []         = []
    go ((Int
x,Int
y):[(Int, Int)]
zs) = Int -> Int -> [(Int, Int)] -> [(Int, Int)]
go' Int
x Int
y [(Int, Int)]
zs

    go' :: Int -> Int -> [(Int, Int)] -> [(Int, Int)]
    go' :: Int -> Int -> [(Int, Int)] -> [(Int, Int)]
go' Int
lo Int
hi [] = [(Int
lo, Int
hi)]
    go' Int
lo Int
hi ws0 :: [(Int, Int)]
ws0@((Int
u,Int
v):[(Int, Int)]
ws)
        | Int
u Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int -> Int
forall a. Enum a => a -> a
succ Int
hi = Int -> Int -> [(Int, Int)] -> [(Int, Int)]
go' Int
lo (Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
v Int
hi) [(Int, Int)]
ws
        | Bool
otherwise    = (Int
lo,Int
hi) (Int, Int) -> [(Int, Int)] -> [(Int, Int)]
forall a. a -> [a] -> [a]
: [(Int, Int)] -> [(Int, Int)]
go [(Int, Int)]
ws0