module Distribution.Superdoc.UTF8 (
Token (..),
parse_utf8,
) where
import Data.Bits
import Data.Char
data Token = Unicode Char | Invalid Char
tok0 :: Char -> Bool
tok0 c = '\x00' <= c && c < '\x80'
tok1 :: Char -> Bool
tok1 c = '\x80' <= c && c < '\xc0'
tok2 :: Char -> Bool
tok2 c = '\xc0' <= c && c < '\xe0'
tok3 :: Char -> Bool
tok3 c = '\xe0' <= c && c < '\xf0'
tok4 :: Char -> Bool
tok4 c = '\xf0' <= c && c < '\xf8'
parse_utf8 :: String -> [Token]
parse_utf8 [] = []
parse_utf8 (c1:cs)
| tok0 c1
= Unicode c1 : parse_utf8 cs
parse_utf8 (c1:c2:cs)
| tok2 c1 && tok1 c2 &&
val >= 0x80
= Unicode (chr val) : parse_utf8 cs
where
val = ((ord c1) .&. 0x1f) `shift` 6 .|. ((ord c2) .&. 0x3f)
parse_utf8 (c1:c2:c3:cs)
| tok3 c1 && tok1 c2 && tok1 c3
&& val >= 0x800
= Unicode (chr val) : parse_utf8 cs
where
val = ((ord c1) .&. 0x0f) `shift` 12 .|. ((ord c2) .&. 0x3f) `shift` 6 .|. ((ord c3) .&. 0x3f)
parse_utf8 (c1:c2:c3:c4:cs)
| tok4 c1 && tok1 c2 && tok1 c3 && tok1 c4
&& val >= 0x10000 && val <= 0x10ffff
= Unicode (chr val) : parse_utf8 cs
where
val = ((ord c1) .&. 0x07) `shift` 18 .|. ((ord c2) .&. 0x3f) `shift` 12 .|. ((ord c3) .&. 0x3f) `shift` 6 .|. ((ord c4) .&. 0x3f)
parse_utf8 (c:cs) = Invalid c : parse_utf8 cs