-- ----------------------------------------------------------------------------- -- -- CharSet.hs, part of Alex -- -- (c) Chris Dornan 1995-2000, Simon Marlow 2003 -- -- An abstract CharSet type for Alex. To begin with we'll use Alex's -- original definition of sets as functions, then later will -- transition to something that will work better with Unicode. -- -- ----------------------------------------------------------------------------} module CharSet ( setSingleton, Encoding(..), Byte, ByteSet, byteSetSingleton, byteRanges, byteSetRange, CharSet, -- abstract emptyCharSet, charSetSingleton, charSet, charSetMinus, charSetComplement, charSetRange, charSetUnion, charSetQuote, setUnions, byteSetToArray, byteSetElems, byteSetElem ) where import Data.Array import Data.Ranged import Data.Word import Data.Maybe (catMaybes) import Data.Char (chr,ord) import UTF8 type Byte = Word8 -- Implementation as functions type CharSet = RSet Char type ByteSet = RSet Byte -- type Utf8Set = RSet [Byte] type Utf8Range = Span [Byte] data Encoding = Latin1 | UTF8 emptyCharSet :: CharSet emptyCharSet = rSetEmpty byteSetElem :: ByteSet -> Byte -> Bool byteSetElem = rSetHas charSetSingleton :: Char -> CharSet charSetSingleton = rSingleton setSingleton :: DiscreteOrdered a => a -> RSet a setSingleton = rSingleton charSet :: [Char] -> CharSet charSet = setUnions . fmap charSetSingleton charSetMinus :: CharSet -> CharSet -> CharSet charSetMinus = rSetDifference charSetUnion :: CharSet -> CharSet -> CharSet charSetUnion = rSetUnion setUnions :: DiscreteOrdered a => [RSet a] -> RSet a setUnions = foldr rSetUnion rSetEmpty charSetComplement :: CharSet -> CharSet charSetComplement = rSetNegation charSetRange :: Char -> Char -> CharSet charSetRange c1 c2 = makeRangedSet [Range (BoundaryBelow c1) (BoundaryAbove c2)] byteSetToArray :: ByteSet -> Array Byte Bool byteSetToArray set = array (fst (head ass), fst (last ass)) ass where ass = [(c,rSetHas set c) | c <- [0..0xff]] byteSetElems :: ByteSet -> [Byte] byteSetElems set = [c | c <- [0 .. 0xff], rSetHas set c] charToRanges :: Encoding -> CharSet -> [Utf8Range] charToRanges Latin1 = map (fmap ((: []).fromIntegral.ord)) -- Span [Byte] . catMaybes . fmap (charRangeToCharSpan False) . rSetRanges charToRanges UTF8 = concat -- Span [Byte] . fmap toUtfRange -- [Span [Byte]] . fmap (fmap UTF8.encode) -- Span [Byte] . catMaybes . fmap (charRangeToCharSpan True) . rSetRanges -- | Turns a range of characters expressed as a pair of UTF-8 byte sequences into a set of ranges, in which each range of the resulting set is between pairs of sequences of the same length toUtfRange :: Span [Byte] -> [Span [Byte]] toUtfRange (Span x y) = fix x y fix :: [Byte] -> [Byte] -> [Span [Byte]] fix x y | length x == length y = [Span x y] | length x == 1 = Span x [0x7F] : fix [0xC2,0x80] y | length x == 2 = Span x [0xDF,0xBF] : fix [0xE0,0x80,0x80] y | length x == 3 = Span x [0xEF,0xBF,0xBF] : fix [0xF0,0x80,0x80,0x80] y | otherwise = error "fix: incorrect input given" byteRangeToBytePair :: Span [Byte] -> ([Byte],[Byte]) byteRangeToBytePair (Span x y) = (x,y) data Span a = Span a a -- lower bound inclusive, higher bound exclusive -- (SDM: upper bound inclusive, surely??) instance Functor Span where fmap f (Span x y) = Span (f x) (f y) charRangeToCharSpan :: Bool -> Range Char -> Maybe (Span Char) charRangeToCharSpan _ (Range BoundaryAboveAll _) = Nothing charRangeToCharSpan _ (Range _ BoundaryBelowAll) = Nothing charRangeToCharSpan uni (Range x y) = Just (Span (l x) (h y)) where l b = case b of BoundaryBelowAll -> '\0' BoundaryBelow a -> a BoundaryAbove a -> succ a BoundaryAboveAll -> error "panic: charRangeToCharSpan" h b = case b of BoundaryBelowAll -> error "panic: charRangeToCharSpan" BoundaryBelow a -> pred a BoundaryAbove a -> a BoundaryAboveAll | uni -> chr 0x10ffff | otherwise -> chr 0xff byteRanges :: Encoding -> CharSet -> [([Byte],[Byte])] byteRanges enc = fmap byteRangeToBytePair . charToRanges enc byteSetRange :: Byte -> Byte -> ByteSet byteSetRange c1 c2 = makeRangedSet [Range (BoundaryBelow c1) (BoundaryAbove c2)] byteSetSingleton :: Byte -> ByteSet byteSetSingleton = rSingleton instance DiscreteOrdered Word8 where adjacent x y = x + 1 == y adjacentBelow 0 = Nothing adjacentBelow x = Just (x-1) -- TODO: More efficient generated code! charSetQuote :: CharSet -> String charSetQuote s = "(\\c -> " ++ foldr (\x y -> x ++ " || " ++ y) "False" (map quoteRange (rSetRanges s)) ++ ")" where quoteRange (Range l h) = quoteL l ++ " && " ++ quoteH h quoteL (BoundaryAbove a) = "c > " ++ show a quoteL (BoundaryBelow a) = "c >= " ++ show a quoteL (BoundaryAboveAll) = "False" quoteL (BoundaryBelowAll) = "True" quoteH (BoundaryAbove a) = "c <= " ++ show a quoteH (BoundaryBelow a) = "c < " ++ show a quoteH (BoundaryAboveAll) = "True" quoteH (BoundaryBelowAll) = "False"