module Data.Monoid.Lexical.UTF8.Decoder
( module Data.Monoid.Reducer.Char
, UTF8
, runUTF8
) where
import Data.Bits (shiftL,(.&.),(.|.))
import Data.Word (Word8)
import Control.Functor.Pointed
import Data.Monoid.Reducer.Char
data H = H0
| H2_1 !Word8
| H3_1 !Word8
| H3_2 !Word8 !Word8
| H4_1 !Word8
| H4_2 !Word8 !Word8
| H4_3 !Word8 !Word8 !Word8
type T = [Word8]
data UTF8 m = S T m !H
| T T
flushH :: CharReducer m => H -> m
flushH (H0) = mempty
flushH (H2_1 x) = invalidChar [x]
flushH (H3_1 x) = invalidChar [x]
flushH (H3_2 x y) = invalidChar [x,y]
flushH (H4_1 x) = invalidChar [x]
flushH (H4_2 x y) = invalidChar [x,y]
flushH (H4_3 x y z) = invalidChar [x,y,z]
flushT :: CharReducer m => [Word8] -> m
flushT = invalidChar
snocH :: CharReducer m => H -> Word8 -> (m -> H -> UTF8 m) -> m -> UTF8 m
snocH H0 c k m
| c < 0x80 = k (m `mappend` b1 c) H0
| c < 0xc0 = k (m `mappend` invalidChar [c]) H0
| c < 0xe0 = k m (H2_1 c)
| c < 0xf0 = k m (H3_1 c)
| c < 0xf5 = k m (H4_1 c)
| otherwise = k (m `mappend` invalidChar [c]) H0
snocH (H2_1 c) d k m
| d >= 0x80 && d < 0xc0 = k (m `mappend` b2 c d) H0
| otherwise = k (m `mappend` invalidChar [c]) H0
snocH (H3_1 c) d k m
| d >= 0x80 && d < 0xc0 = k m (H3_2 c d)
| otherwise = k (m `mappend` invalidChar [c]) H0
snocH (H3_2 c d) e k m
| d >= 0x80 && d < 0xc0 = k (m `mappend` b3 c d e) H0
| otherwise = k (m `mappend` invalidChar [c,d]) H0
snocH (H4_1 c) d k m
| d >= 0x80 && d < 0xc0 = k m (H4_2 c d)
| otherwise = k (m `mappend` invalidChar [c,d]) H0
snocH (H4_2 c d) e k m
| d >= 0x80 && d < 0xc0 = k m (H4_3 c d e)
| otherwise = k (m `mappend` invalidChar [c,d,e]) H0
snocH (H4_3 c d e) f k m
| d >= 0x80 && d < 0xc0 = k (m `mappend` b4 c d e f) H0
| otherwise = k (m `mappend` invalidChar [c,d,e,f]) H0
mask :: Word8 -> Word8 -> Int
mask c m = fromEnum (c .&. m)
combine :: Int -> Word8 -> Int
combine a r = shiftL a 6 .|. fromEnum (r .&. 0x3f)
b1 :: CharReducer m => Word8 -> m
b1 c | c < 0x80 = fromChar . toEnum $ fromEnum c
| otherwise = invalidChar [c]
b2 :: CharReducer m => Word8 -> Word8 -> m
b2 c d | valid_b2 c d = fromChar (toEnum (combine (mask c 0x1f) d))
| otherwise = invalidChar [c,d]
b3 :: CharReducer m => Word8 -> Word8 -> Word8 -> m
b3 c d e | valid_b3 c d e = fromChar (toEnum (combine (combine (mask c 0x0f) d) e))
| otherwise = invalidChar [c,d,e]
b4 :: CharReducer m => Word8 -> Word8 -> Word8 -> Word8 -> m
b4 c d e f | valid_b4 c d e f = fromChar (toEnum (combine (combine (combine (mask c 0x07) d) e) f))
| otherwise = invalidChar [c,d,e,f]
valid_b2 :: Word8 -> Word8 -> Bool
valid_b2 c d = (c >= 0xc2 && c <= 0xdf && d >= 0x80 && d <= 0xbf)
valid_b3 :: Word8 -> Word8 -> Word8 -> Bool
valid_b3 c d e = (c == 0xe0 && d >= 0xa0 && d <= 0xbf && e >= 0x80 && e <= 0xbf) ||
(c >= 0xe1 && c <= 0xef && d >= 0x80 && d <= 0xbf && e >= 0x80 && e <= 0xbf)
valid_b4 :: Word8 -> Word8 -> Word8 -> Word8 -> Bool
valid_b4 c d e f = (c == 0xf0 && d >= 0x90 && d <= 0xbf && e >= 0x80 && e <= 0xbf && f >= 0x80 && f <= 0xbf) ||
(c >= 0xf1 && c <= 0xf3 && d >= 0x80 && d <= 0xbf && e >= 0x80 && e <= 0xbf && f >= 0x80 && f <= 0xbf) ||
(c == 0xf4 && d >= 0x80 && d <= 0x8f && e >= 0x80 && e <= 0xbf && f >= 0x80 && f <= 0xbf)
consT :: CharReducer m => Word8 -> T -> (H -> UTF8 m) -> (m -> UTF8 m) -> (T -> UTF8 m) -> UTF8 m
consT c cs h m t
| c < 0x80 = m $ b1 c `mappend` invalidChars cs
| c < 0xc0 = t (c:cs)
| c < 0xe0 = case cs of
[] -> h $ H2_1 c
(d:ds) -> m $ b2 c d `mappend` invalidChars ds
| c < 0xf0 = case cs of
[] -> h $ H3_1 c
[d] -> h $ H3_2 c d
(d:e:es) -> m $ b3 c d e `mappend` invalidChars es
| c < 0xf5 = case cs of
[] -> h $ H4_1 c
[d] -> h $ H4_2 c d
[d,e] -> h $ H4_3 c d e
(d:e:f:fs) -> m $ b4 c d e f `mappend` invalidChars fs
| otherwise = mempty
invalidChars :: CharReducer m => [Word8] -> m
invalidChars = foldr (mappend . invalidChar . return) mempty
merge :: CharReducer m => H -> T -> (m -> a) -> (H -> a) -> a
merge H0 cs k _ = k $ invalidChars cs
merge (H2_1 c) [] _ p = p $ H2_1 c
merge (H2_1 c) (d:ds) k _ = k $ b2 c d `mappend` invalidChars ds
merge (H3_1 c) [] _ p = p $ H3_1 c
merge (H3_1 c) [d] _ p = p $ H3_2 c d
merge (H3_1 c) (d:e:es) k _ = k $ b3 c d e `mappend` invalidChars es
merge (H3_2 c d) [] _ p = p $ H3_2 c d
merge (H3_2 c d) (e:es) k _ = k $ b3 c d e `mappend` invalidChars es
merge (H4_1 c) [] _ p = p $ H4_1 c
merge (H4_1 c) [d] _ p = p $ H4_2 c d
merge (H4_1 c) [d,e] _ p = p $ H4_3 c d e
merge (H4_1 c) (d:e:f:fs) k _ = k $ b4 c d e f `mappend` invalidChars fs
merge (H4_2 c d) [] _ p = p $ H4_2 c d
merge (H4_2 c d) [e] _ p = p $ H4_3 c d e
merge (H4_2 c d) (e:f:fs) k _ = k $ b4 c d e f `mappend` invalidChars fs
merge (H4_3 c d e) [] _ p = p $ H4_3 c d e
merge (H4_3 c d e) (f:fs) k _ = k $ b4 c d e f `mappend` invalidChars fs
instance CharReducer m => Monoid (UTF8 m) where
mempty = T []
T c `mappend` T d = T (c ++ d)
T c `mappend` S l m r = S (c ++ l) m r
S l m c `mappend` S c' m' r = S l (m `mappend` merge c c' id flushH `mappend` m') r
s@(S _ _ _) `mappend` T [] = s
S l m c `mappend` T c' = merge c c' k (S l m) where
k m' = S l (m `mappend` m') H0
instance CharReducer m => Reducer Word8 (UTF8 m) where
unit c | c >= 0x80 && c < 0xc0 = T [c]
| otherwise = snocH H0 c (S []) mempty
S t m h `snoc` c = snocH h c (S t) m
T t `snoc` c | c >= 0x80 && c < 0xc0 = T (t ++ [c])
| otherwise = snocH H0 c (S t) mempty
c `cons` T cs = consT c cs (S [] mempty) (flip (S []) H0) T
c `cons` S cs m h = consT c cs k1 k2 k3 where
k1 h' = S [] (flushH h' `mappend` m) h
k2 m' = S [] (m' `mappend` m) h
k3 t' = S t' m h
instance Functor UTF8 where
fmap f (S t x h) = S t (f x) h
fmap _ (T t) = T t
instance Pointed UTF8 where
point f = S [] f H0
runUTF8 :: CharReducer m => UTF8 m -> m
runUTF8 (T t) = flushT t
runUTF8 (S t m h) = flushT t `mappend` m `mappend` flushH h