module Text.Utf8Validator
( State
, initialState
, isAccepting
, isRejected
, feed
, validateBS
, validateBS'
) where
import Data.Bits
import qualified Data.ByteString as BS
import Data.Word
data CharClass
= C0
| C1
| C2
| C3
| C4
| C5
| C6
| C7
| C8
| C9
| C10
| C11
deriving (Eq, Ord, Enum, Bounded, Show)
data S
= S0
| S1
| S2
| S3
| S4
| S5
| S6
| S7
| S8
deriving (Eq, Ord, Enum, Bounded, Show)
charClass :: Word8 -> CharClass
charClass i
| i <= 0x7f = C0
| i <= 0x8f = C1
| i <= 0x9f = C9
| i <= 0xbf = C7
| i <= 0xc1 = C8
| i <= 0xdf = C2
| i <= 0xe0 = C10
| i <= 0xec = C3
| i <= 0xed = C4
| i <= 0xef = C3
| i <= 0xf0 = C11
| i <= 0xf3 = C6
| i <= 0xf4 = C5
| otherwise = C8
ccMask :: CharClass -> Int
ccMask c = shift 0xff (-(fromEnum c))
transitions :: CharClass -> S -> S
transitions cc st =
case (st, cc) of
(S0, C0) -> S0
(S0, C2) -> S2
(S0, C3) -> S3
(S0, C4) -> S5
(S0, C5) -> S8
(S0, C6) -> S7
(S0, C10) -> S4
(S0, C11) -> S6
(S2, C1) -> S0
(S2, C7) -> S0
(S2, C9) -> S0
(S3, C1) -> S2
(S3, C7) -> S2
(S3, C9) -> S2
(S4, C7) -> S2
(S5, C1) -> S2
(S5, C9) -> S2
(S6, C7) -> S3
(S6, C9) -> S3
(S7, C1) -> S3
(S7, C7) -> S3
(S7, C9) -> S3
(S8, C1) -> S3
_ -> S1
data State = State
{ codepoint :: !Int
, state :: !S
} deriving (Eq, Ord, Show)
initialState :: State
initialState = State 0 S0
isAccepting :: State -> Bool
isAccepting (State _ S0) = True
isAccepting (State _ _) = False
isRejected :: State -> Bool
isRejected (State _ S1) = True
isRejected (State _ _) = False
feed :: Word8 -> State -> State
feed inp (State cp st) = State cp' st'
where
typ = charClass inp
chr = fromIntegral inp
cp' =
case st of
S0 -> ccMask typ .&. chr
_ -> chr .&. 0x3f .|. shift cp 6
st' = transitions typ st
validateBS' :: BS.ByteString -> State -> State
validateBS' bs st = BS.foldl' (flip feed) st bs
validateBS :: BS.ByteString -> Bool
validateBS bs = isAccepting $ validateBS' bs initialState