-- -----------------------------------------------------------------------------
-- 
-- CharSet.hs, part of Alex
--
-- (c) Chris Dornan 1995-2000, Simon Marlow 2003
--
-- An abstract CharSet type for Alex.  To begin with we'll use Alex's
-- original definition of sets as functions, then later will
-- transition to something that will work better with Unicode.
--
-- ----------------------------------------------------------------------------}

module CharSet (
  setSingleton,

  Encoding(..),

  Byte,
  ByteSet,
  byteSetSingleton,
  byteRanges,
  byteSetRange,

  CharSet, -- abstract
  emptyCharSet,
  charSetSingleton,
  charSet,
  charSetMinus,
  charSetComplement,
  charSetRange,
  charSetUnion,
  charSetQuote,
  setUnions,
  byteSetToArray,
  byteSetElems,
  byteSetElem
  ) where

import Data.Array
import Data.Ranged
import Data.Word
import Data.Maybe (catMaybes)
import Data.Char (chr,ord)
import UTF8

type Byte = Word8
-- Implementation as functions
type CharSet = RSet Char
type ByteSet = RSet Byte
-- type Utf8Set = RSet [Byte]
type Utf8Range = Span [Byte]

data Encoding = Latin1 | UTF8

emptyCharSet :: CharSet
emptyCharSet :: CharSet
emptyCharSet = CharSet
forall a. DiscreteOrdered a => RSet a
rSetEmpty

byteSetElem :: ByteSet -> Byte -> Bool
byteSetElem :: ByteSet -> Byte -> Bool
byteSetElem = ByteSet -> Byte -> Bool
forall v. DiscreteOrdered v => RSet v -> v -> Bool
rSetHas

charSetSingleton :: Char -> CharSet
charSetSingleton :: Char -> CharSet
charSetSingleton = Char -> CharSet
forall v. DiscreteOrdered v => v -> RSet v
rSingleton

setSingleton :: DiscreteOrdered a => a -> RSet a
setSingleton :: forall v. DiscreteOrdered v => v -> RSet v
setSingleton = a -> RSet a
forall v. DiscreteOrdered v => v -> RSet v
rSingleton

charSet :: [Char] -> CharSet
charSet :: [Char] -> CharSet
charSet = [CharSet] -> CharSet
forall a. DiscreteOrdered a => [RSet a] -> RSet a
setUnions ([CharSet] -> CharSet)
-> ([Char] -> [CharSet]) -> [Char] -> CharSet
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> CharSet) -> [Char] -> [CharSet]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Char -> CharSet
charSetSingleton

charSetMinus :: CharSet -> CharSet -> CharSet
charSetMinus :: CharSet -> CharSet -> CharSet
charSetMinus = CharSet -> CharSet -> CharSet
forall v. DiscreteOrdered v => RSet v -> RSet v -> RSet v
rSetDifference

charSetUnion :: CharSet -> CharSet -> CharSet
charSetUnion :: CharSet -> CharSet -> CharSet
charSetUnion = CharSet -> CharSet -> CharSet
forall v. DiscreteOrdered v => RSet v -> RSet v -> RSet v
rSetUnion

setUnions :: DiscreteOrdered a => [RSet a] -> RSet a
setUnions :: forall a. DiscreteOrdered a => [RSet a] -> RSet a
setUnions = (RSet a -> RSet a -> RSet a) -> RSet a -> [RSet a] -> RSet a
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr RSet a -> RSet a -> RSet a
forall v. DiscreteOrdered v => RSet v -> RSet v -> RSet v
rSetUnion RSet a
forall a. DiscreteOrdered a => RSet a
rSetEmpty

charSetComplement :: CharSet -> CharSet
charSetComplement :: CharSet -> CharSet
charSetComplement = CharSet -> CharSet
forall a. DiscreteOrdered a => RSet a -> RSet a
rSetNegation

charSetRange :: Char -> Char -> CharSet
charSetRange :: Char -> Char -> CharSet
charSetRange Char
c1 Char
c2 = [Range Char] -> CharSet
forall v. DiscreteOrdered v => [Range v] -> RSet v
makeRangedSet [Boundary Char -> Boundary Char -> Range Char
forall v. Boundary v -> Boundary v -> Range v
Range (Char -> Boundary Char
forall a. a -> Boundary a
BoundaryBelow Char
c1) (Char -> Boundary Char
forall a. a -> Boundary a
BoundaryAbove Char
c2)]

byteSetToArray :: ByteSet -> Array Byte Bool
byteSetToArray :: ByteSet -> Array Byte Bool
byteSetToArray ByteSet
set = (Byte, Byte) -> [(Byte, Bool)] -> Array Byte Bool
forall i e. Ix i => (i, i) -> [(i, e)] -> Array i e
array ((Byte, Bool) -> Byte
forall a b. (a, b) -> a
fst ([(Byte, Bool)] -> (Byte, Bool)
forall a. [a] -> a
head [(Byte, Bool)]
ass), (Byte, Bool) -> Byte
forall a b. (a, b) -> a
fst ([(Byte, Bool)] -> (Byte, Bool)
forall a. [a] -> a
last [(Byte, Bool)]
ass)) [(Byte, Bool)]
ass
  where ass :: [(Byte, Bool)]
ass = [(Byte
c,ByteSet -> Byte -> Bool
forall v. DiscreteOrdered v => RSet v -> v -> Bool
rSetHas ByteSet
set Byte
c) | Byte
c <- [Byte
0..Byte
0xff]]

byteSetElems :: ByteSet -> [Byte]
byteSetElems :: ByteSet -> [Byte]
byteSetElems ByteSet
set = [Byte
c | Byte
c <- [Byte
0 .. Byte
0xff], ByteSet -> Byte -> Bool
forall v. DiscreteOrdered v => RSet v -> v -> Bool
rSetHas ByteSet
set Byte
c]

charToRanges :: Encoding -> CharSet -> [Utf8Range]
charToRanges :: Encoding -> CharSet -> [Utf8Range]
charToRanges Encoding
Latin1 =
    (Span Char -> Utf8Range) -> [Span Char] -> [Utf8Range]
forall a b. (a -> b) -> [a] -> [b]
map ((Char -> [Byte]) -> Span Char -> Utf8Range
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap ((Byte -> [Byte] -> [Byte]
forall a. a -> [a] -> [a]
: [])(Byte -> [Byte]) -> (Char -> Byte) -> Char -> [Byte]
forall b c a. (b -> c) -> (a -> b) -> a -> c
.Int -> Byte
forall a b. (Integral a, Num b) => a -> b
fromIntegral(Int -> Byte) -> (Char -> Int) -> Char -> Byte
forall b c a. (b -> c) -> (a -> b) -> a -> c
.Char -> Int
ord)) -- Span [Byte]
  ([Span Char] -> [Utf8Range])
-> (CharSet -> [Span Char]) -> CharSet -> [Utf8Range]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Maybe (Span Char)] -> [Span Char]
forall a. [Maybe a] -> [a]
catMaybes
  ([Maybe (Span Char)] -> [Span Char])
-> (CharSet -> [Maybe (Span Char)]) -> CharSet -> [Span Char]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Range Char -> Maybe (Span Char))
-> [Range Char] -> [Maybe (Span Char)]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (Bool -> Range Char -> Maybe (Span Char)
charRangeToCharSpan Bool
False)
  ([Range Char] -> [Maybe (Span Char)])
-> (CharSet -> [Range Char]) -> CharSet -> [Maybe (Span Char)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. CharSet -> [Range Char]
forall v. RSet v -> [Range v]
rSetRanges
charToRanges Encoding
UTF8 =
    [[Utf8Range]] -> [Utf8Range]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat                  -- Span [Byte]
  ([[Utf8Range]] -> [Utf8Range])
-> (CharSet -> [[Utf8Range]]) -> CharSet -> [Utf8Range]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Utf8Range -> [Utf8Range]) -> [Utf8Range] -> [[Utf8Range]]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Utf8Range -> [Utf8Range]
toUtfRange         -- [Span [Byte]]
  ([Utf8Range] -> [[Utf8Range]])
-> (CharSet -> [Utf8Range]) -> CharSet -> [[Utf8Range]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Span Char -> Utf8Range) -> [Span Char] -> [Utf8Range]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap ((Char -> [Byte]) -> Span Char -> Utf8Range
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Char -> [Byte]
UTF8.encode) -- Span [Byte]
  ([Span Char] -> [Utf8Range])
-> (CharSet -> [Span Char]) -> CharSet -> [Utf8Range]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Maybe (Span Char)] -> [Span Char]
forall a. [Maybe a] -> [a]
catMaybes
  ([Maybe (Span Char)] -> [Span Char])
-> (CharSet -> [Maybe (Span Char)]) -> CharSet -> [Span Char]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Range Char -> Maybe (Span Char))
-> [Range Char] -> [Maybe (Span Char)]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (Bool -> Range Char -> Maybe (Span Char)
charRangeToCharSpan Bool
True)
  ([Range Char] -> [Maybe (Span Char)])
-> (CharSet -> [Range Char]) -> CharSet -> [Maybe (Span Char)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. CharSet -> [Range Char]
forall v. RSet v -> [Range v]
rSetRanges

-- | Turns a range of characters expressed as a pair of UTF-8 byte sequences into a set of ranges, in which each range of the resulting set is between pairs of sequences of the same length
toUtfRange :: Span [Byte] -> [Span [Byte]]
toUtfRange :: Utf8Range -> [Utf8Range]
toUtfRange (Span [Byte]
x [Byte]
y) = [Byte] -> [Byte] -> [Utf8Range]
fix [Byte]
x [Byte]
y

fix :: [Byte] -> [Byte] -> [Span [Byte]]
fix :: [Byte] -> [Byte] -> [Utf8Range]
fix [Byte]
x [Byte]
y 
    | [Byte] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Byte]
x Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== [Byte] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Byte]
y = [[Byte] -> [Byte] -> Utf8Range
forall a. a -> a -> Span a
Span [Byte]
x [Byte]
y]
    | [Byte] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Byte]
x Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
1 = [Byte] -> [Byte] -> Utf8Range
forall a. a -> a -> Span a
Span [Byte]
x [Byte
0x7F] Utf8Range -> [Utf8Range] -> [Utf8Range]
forall a. a -> [a] -> [a]
: [Byte] -> [Byte] -> [Utf8Range]
fix [Byte
0xC2,Byte
0x80] [Byte]
y    
    | [Byte] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Byte]
x Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
2 = [Byte] -> [Byte] -> Utf8Range
forall a. a -> a -> Span a
Span [Byte]
x [Byte
0xDF,Byte
0xBF] Utf8Range -> [Utf8Range] -> [Utf8Range]
forall a. a -> [a] -> [a]
: [Byte] -> [Byte] -> [Utf8Range]
fix [Byte
0xE0,Byte
0x80,Byte
0x80] [Byte]
y
    | [Byte] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Byte]
x Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
3 = [Byte] -> [Byte] -> Utf8Range
forall a. a -> a -> Span a
Span [Byte]
x [Byte
0xEF,Byte
0xBF,Byte
0xBF] Utf8Range -> [Utf8Range] -> [Utf8Range]
forall a. a -> [a] -> [a]
: [Byte] -> [Byte] -> [Utf8Range]
fix [Byte
0xF0,Byte
0x80,Byte
0x80,Byte
0x80] [Byte]
y
    | Bool
otherwise = [Char] -> [Utf8Range]
forall a. HasCallStack => [Char] -> a
error [Char]
"fix: incorrect input given"


byteRangeToBytePair :: Span [Byte] -> ([Byte],[Byte])
byteRangeToBytePair :: Utf8Range -> ([Byte], [Byte])
byteRangeToBytePair (Span [Byte]
x [Byte]
y) = ([Byte]
x,[Byte]
y)

data Span a = Span a a -- lower bound inclusive, higher bound exclusive
                       -- (SDM: upper bound inclusive, surely??)
instance Functor Span where
    fmap :: forall a b. (a -> b) -> Span a -> Span b
fmap a -> b
f (Span a
x a
y) = b -> b -> Span b
forall a. a -> a -> Span a
Span (a -> b
f a
x) (a -> b
f a
y)

charRangeToCharSpan :: Bool -> Range Char -> Maybe (Span Char)
charRangeToCharSpan :: Bool -> Range Char -> Maybe (Span Char)
charRangeToCharSpan Bool
_ (Range Boundary Char
BoundaryAboveAll Boundary Char
_) = Maybe (Span Char)
forall a. Maybe a
Nothing
charRangeToCharSpan Bool
_ (Range Boundary Char
_ Boundary Char
BoundaryBelowAll) = Maybe (Span Char)
forall a. Maybe a
Nothing
charRangeToCharSpan Bool
uni (Range Boundary Char
x Boundary Char
y) = Span Char -> Maybe (Span Char)
forall a. a -> Maybe a
Just (Char -> Char -> Span Char
forall a. a -> a -> Span a
Span (Boundary Char -> Char
l Boundary Char
x) (Boundary Char -> Char
h Boundary Char
y))
    where l :: Boundary Char -> Char
l Boundary Char
b = case Boundary Char
b of
            Boundary Char
BoundaryBelowAll -> Char
'\0'
            BoundaryBelow Char
a  -> Char
a
            BoundaryAbove Char
a  -> Char -> Char
forall a. Enum a => a -> a
succ Char
a
            Boundary Char
BoundaryAboveAll -> [Char] -> Char
forall a. HasCallStack => [Char] -> a
error [Char]
"panic: charRangeToCharSpan"
          h :: Boundary Char -> Char
h Boundary Char
b = case Boundary Char
b of
            Boundary Char
BoundaryBelowAll -> [Char] -> Char
forall a. HasCallStack => [Char] -> a
error [Char]
"panic: charRangeToCharSpan"
            BoundaryBelow Char
a  -> Char -> Char
forall a. Enum a => a -> a
pred Char
a
            BoundaryAbove Char
a  -> Char
a
            Boundary Char
BoundaryAboveAll | Bool
uni -> Int -> Char
chr Int
0x10ffff
                             | Bool
otherwise -> Int -> Char
chr Int
0xff

byteRanges :: Encoding -> CharSet -> [([Byte],[Byte])]
byteRanges :: Encoding -> CharSet -> [([Byte], [Byte])]
byteRanges Encoding
enc =  (Utf8Range -> ([Byte], [Byte]))
-> [Utf8Range] -> [([Byte], [Byte])]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Utf8Range -> ([Byte], [Byte])
byteRangeToBytePair ([Utf8Range] -> [([Byte], [Byte])])
-> (CharSet -> [Utf8Range]) -> CharSet -> [([Byte], [Byte])]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Encoding -> CharSet -> [Utf8Range]
charToRanges Encoding
enc

byteSetRange :: Byte -> Byte -> ByteSet
byteSetRange :: Byte -> Byte -> ByteSet
byteSetRange Byte
c1 Byte
c2 = [Range Byte] -> ByteSet
forall v. DiscreteOrdered v => [Range v] -> RSet v
makeRangedSet [Boundary Byte -> Boundary Byte -> Range Byte
forall v. Boundary v -> Boundary v -> Range v
Range (Byte -> Boundary Byte
forall a. a -> Boundary a
BoundaryBelow Byte
c1) (Byte -> Boundary Byte
forall a. a -> Boundary a
BoundaryAbove Byte
c2)]

byteSetSingleton :: Byte -> ByteSet
byteSetSingleton :: Byte -> ByteSet
byteSetSingleton = Byte -> ByteSet
forall v. DiscreteOrdered v => v -> RSet v
rSingleton

instance DiscreteOrdered Word8 where
    adjacent :: Byte -> Byte -> Bool
adjacent Byte
x Byte
y = Byte
x Byte -> Byte -> Byte
forall a. Num a => a -> a -> a
+ Byte
1 Byte -> Byte -> Bool
forall a. Eq a => a -> a -> Bool
== Byte
y
    adjacentBelow :: Byte -> Maybe Byte
adjacentBelow Byte
0 = Maybe Byte
forall a. Maybe a
Nothing
    adjacentBelow Byte
x = Byte -> Maybe Byte
forall a. a -> Maybe a
Just (Byte
xByte -> Byte -> Byte
forall a. Num a => a -> a -> a
-Byte
1)

-- TODO: More efficient generated code!
charSetQuote :: CharSet -> String
charSetQuote :: CharSet -> [Char]
charSetQuote CharSet
s = [Char]
"(\\c -> " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ ([Char] -> [Char] -> [Char]) -> [Char] -> [[Char]] -> [Char]
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr (\[Char]
x [Char]
y -> [Char]
x [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Char]
" || " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Char]
y) [Char]
"False" ((Range Char -> [Char]) -> [Range Char] -> [[Char]]
forall a b. (a -> b) -> [a] -> [b]
map Range Char -> [Char]
forall {a}. Show a => Range a -> [Char]
quoteRange (CharSet -> [Range Char]
forall v. RSet v -> [Range v]
rSetRanges CharSet
s)) [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Char]
")" 
    where quoteRange :: Range a -> [Char]
quoteRange (Range Boundary a
l Boundary a
h) = Boundary a -> [Char]
forall {a}. Show a => Boundary a -> [Char]
quoteL Boundary a
l [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Char]
" && " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ Boundary a -> [Char]
forall {a}. Show a => Boundary a -> [Char]
quoteH Boundary a
h
          quoteL :: Boundary a -> [Char]
quoteL (BoundaryAbove a
a) = [Char]
"c > " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ a -> [Char]
forall a. Show a => a -> [Char]
show a
a
          quoteL (BoundaryBelow a
a) = [Char]
"c >= " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ a -> [Char]
forall a. Show a => a -> [Char]
show a
a
          quoteL (Boundary a
BoundaryAboveAll) = [Char]
"False"
          quoteL (Boundary a
BoundaryBelowAll) = [Char]
"True"
          quoteH :: Boundary a -> [Char]
quoteH (BoundaryAbove a
a) = [Char]
"c <= " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ a -> [Char]
forall a. Show a => a -> [Char]
show a
a
          quoteH (BoundaryBelow a
a) = [Char]
"c < " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ a -> [Char]
forall a. Show a => a -> [Char]
show a
a
          quoteH (Boundary a
BoundaryAboveAll) = [Char]
"True"
          quoteH (Boundary a
BoundaryBelowAll) = [Char]
"False"