{-|

Haskell implementation of http://bjoern.hoehrmann.de/utf-8/decoder/charClasses/

Validate a UTF8 'ByteString' in constant-space without building a 'Text' value.
-}
module Text.Utf8Validator
  ( State
  , initialState
  , isAccepting
  , isRejected
  , feed
  , validateBS
  , validateBS'
  ) where
import           Data.Bits
import qualified Data.ByteString as BS
import           Data.Word

data CharClass
  = C0
  | C1
  | C2
  | C3
  | C4
  | C5
  | C6
  | C7
  | C8
  | C9
  | C10
  | C11
  deriving (Eq, Ord, Enum, Bounded, Show)

data S
  = S0
  | S1
  | S2
  | S3
  | S4
  | S5
  | S6
  | S7
  | S8
  deriving (Eq, Ord, Enum, Bounded, Show)

charClass :: Word8 -> CharClass
charClass i
  | i <= 0x7f = C0
  | i <= 0x8f = C1
  | i <= 0x9f = C9
  | i <= 0xbf = C7
  | i <= 0xc1 = C8
  | i <= 0xdf = C2
  | i <= 0xe0 = C10
  | i <= 0xec = C3
  | i <= 0xed = C4
  | i <= 0xef = C3
  | i <= 0xf0 = C11
  | i <= 0xf3 = C6
  | i <= 0xf4 = C5
  | otherwise = C8

ccMask :: CharClass -> Int
ccMask c = shift 0xff (-(fromEnum c))

transitions :: CharClass -> S -> S
transitions cc st =
  case (st, cc) of
    (S0, C0) -> S0
    (S0, C2) -> S2
    (S0, C3) -> S3
    (S0, C4) -> S5
    (S0, C5) -> S8
    (S0, C6) -> S7
    (S0, C10) -> S4
    (S0, C11) -> S6
    (S2, C1) -> S0
    (S2, C7) -> S0
    (S2, C9) -> S0
    (S3, C1) -> S2
    (S3, C7) -> S2
    (S3, C9) -> S2
    (S4, C7) -> S2
    (S5, C1) -> S2
    (S5, C9) -> S2
    (S6, C7) -> S3
    (S6, C9) -> S3
    (S7, C1) -> S3
    (S7, C7) -> S3
    (S7, C9) -> S3
    (S8, C1) -> S3
    _ -> S1

data State = State
  { codepoint :: !Int
  , state :: !S
  } deriving (Eq, Ord, Show)

initialState :: State
initialState = State 0 S0

isAccepting :: State -> Bool
isAccepting (State _ S0) = True
isAccepting (State _ _) = False

isRejected :: State -> Bool
isRejected (State _ S1) = True
isRejected (State _ _) = False

feed :: Word8 -> State -> State
feed inp (State cp st) = State cp' st'
  where
    typ = charClass inp
    chr = fromIntegral inp
    cp' =
      case st of
        S0 -> ccMask typ .&. chr
        _ -> chr .&. 0x3f .|. shift cp 6
    st' = transitions typ st

validateBS' :: BS.ByteString -> State -> State
validateBS' bs st = BS.foldl' (flip feed) st bs

validateBS :: BS.ByteString -> Bool
validateBS bs = isAccepting $ validateBS' bs initialState