{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE UnboxedTuples #-}
module FastTags.LexerM
( AlexState(..)
, mkAlexState
, alexEnterBirdLiterateEnv
, alexEnterLiterateLatexEnv
, alexExitLiterateEnv
, pushContext
, modifyCommentDepth
, modifyQuasiquoterDepth
, modifyPreprocessorDepth
, addIndentationSize
, checkQuasiQuoteEndPresent
, AlexM
, runAlexM
, alexSetInput
, alexSetNextCode
, AlexInput(..)
, aiLineL
, takeText
, countInputSpace
, extractDefineOrLetName
, dropUntilNL
, dropUntilUnescapedNL
, dropUntilNLOr
, dropUntilNLOrEither
, unsafeTextHeadAscii
, unsafeTextHeadOfTailAscii
, unsafeTextHead
, utf8BS
, asCodeL
, asCommentDepthL
, asQuasiquoterDepthL
, asIndentationSizeL
, asPreprocessorDepthL
, asLiterateLocL
, asHaveQQEndL
, alexInputPrevChar
, alexGetByte
) where
import Control.Applicative as A
import Control.DeepSeq
import Control.Monad.ST
import Control.Monad.State.Strict
import Control.Monad.Writer.Strict
import Data.Char
import Data.Int
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import Data.Void (Void, vacuous)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Char8 as C8
import qualified Data.ByteString.Internal as BSI
import qualified Data.Vector.Unboxed as U
import qualified Data.Vector.Unboxed.Mutable as UM
import Foreign.ForeignPtr
import Foreign.Ptr
import GHC.Base
import GHC.Ptr
import GHC.Word
import Text.Printf
import FastTags.LensBlaze
import FastTags.LexerTypes
import FastTags.Token
data AlexState = AlexState
{ asInput :: {-# UNPACK #-} !AlexInput
, asIntStore :: {-# UNPACK #-} !Word64
, asContextStack :: [Context]
} deriving (Show, Eq, Ord)
{-# INLINE asIntStoreL #-}
asIntStoreL :: Lens' AlexState Word64
asIntStoreL = lens asIntStore (\b s -> s { asIntStore = b })
{-# INLINE maybeBoolToInt #-}
maybeBoolToInt :: Maybe Bool -> Int
maybeBoolToInt = \case
Nothing -> 0
Just False -> 1
Just True -> 2
{-# INLINE intToMaybeBool #-}
intToMaybeBool :: Int -> Maybe Bool
intToMaybeBool = \case
0 -> Nothing
1 -> Just False
2 -> Just True
x -> error $ "Invalid integer representation of 'Maybe Bool': " ++ show x
{-# INLINE asCodeL #-}
{-# INLINE asCommentDepthL #-}
{-# INLINE asQuasiquoterDepthL #-}
{-# INLINE asIndentationSizeL #-}
{-# INLINE asPreprocessorDepthL #-}
{-# INLINE asLiterateLocL #-}
{-# INLINE asHaveQQEndL #-}
asCodeL :: Lens' AlexState AlexCode
asCommentDepthL, asQuasiquoterDepthL, asIndentationSizeL :: Lens' AlexState Int16
asPreprocessorDepthL :: Lens' AlexState Int16
asLiterateLocL :: Lens' AlexState (LitMode LitStyle)
asHaveQQEndL :: Lens' AlexState (Maybe Bool)
asCodeL = asIntStoreL . intL 0 0x000f
asCommentDepthL = asIntStoreL . intL 4 0x03ff
asQuasiquoterDepthL = asIntStoreL . intL 14 0x03ff
asIndentationSizeL = asIntStoreL . int16L 24
asPreprocessorDepthL = asIntStoreL . int16L 40
asLiterateLocL = \f -> asIntStoreL (intL 56 0x0003 (fmap litLocToInt . f . intToLitLoc))
asHaveQQEndL = \f -> asIntStoreL (intL 58 0x0003 (fmap maybeBoolToInt . f . intToMaybeBool))
{-# INLINE litLocToInt #-}
litLocToInt :: LitMode LitStyle -> Int
litLocToInt = \case
LitVanilla -> 0
LitOutside -> 1
LitInside Bird -> 2
LitInside Latex -> 3
{-# INLINE intToLitLoc #-}
intToLitLoc :: Int -> LitMode LitStyle
intToLitLoc = \case
0 -> LitVanilla
1 -> LitOutside
2 -> LitInside Bird
3 -> LitInside Latex
x -> error $ "Invalid literate location representation: " ++ show x
mkAlexState :: LitMode Void -> AlexCode -> AlexInput -> AlexState
mkAlexState litLoc startCode input =
set asCodeL startCode $
set asLiterateLocL (vacuous litLoc) AlexState
{ asInput = input
, asIntStore = 0
, asContextStack = []
}
{-# INLINE alexEnterBirdLiterateEnv #-}
alexEnterBirdLiterateEnv :: MonadState AlexState m => m ()
alexEnterBirdLiterateEnv =
modify $ set asLiterateLocL (LitInside Bird)
{-# INLINE alexEnterLiterateLatexEnv #-}
alexEnterLiterateLatexEnv :: MonadState AlexState m => m ()
alexEnterLiterateLatexEnv =
modify $ set asLiterateLocL (LitInside Latex)
{-# INLINE alexExitLiterateEnv #-}
alexExitLiterateEnv :: MonadState AlexState m => m ()
alexExitLiterateEnv =
modify $ set asLiterateLocL LitOutside
{-# INLINE pushContext #-}
pushContext :: MonadState AlexState m => Context -> m ()
pushContext ctx =
modify (\s -> s { asContextStack = ctx : asContextStack s })
{-# INLINE modifyCommentDepth #-}
modifyCommentDepth :: MonadState AlexState m => (Int16 -> Int16) -> m Int16
modifyCommentDepth f = do
depth <- gets (view asCommentDepthL)
let !depth' = f depth
modify $ \s -> set asCommentDepthL depth' s
return depth'
{-# INLINE modifyQuasiquoterDepth #-}
modifyQuasiquoterDepth :: MonadState AlexState m => (Int16 -> Int16) -> m Int16
modifyQuasiquoterDepth f = do
depth <- gets (view asQuasiquoterDepthL)
let !depth' = f depth
modify $ \s -> set asQuasiquoterDepthL depth' s
return depth'
{-# INLINE modifyPreprocessorDepth #-}
modifyPreprocessorDepth :: MonadState AlexState m => (Int16 -> Int16) -> m Int16
modifyPreprocessorDepth f = do
depth <- gets (view asPreprocessorDepthL)
let !depth' = f depth
modify $ \s -> set asPreprocessorDepthL depth' s
return depth'
{-# INLINE alexSetInput #-}
alexSetInput :: MonadState AlexState m => AlexInput -> m ()
alexSetInput input = modify $ \s -> s { asInput = input }
{-# INLINE alexSetNextCode #-}
alexSetNextCode :: MonadState AlexState m => AlexCode -> m ()
alexSetNextCode code = modify $ set asCodeL code
{-# INLINE addIndentationSize #-}
addIndentationSize :: MonadState AlexState m => Int16 -> m ()
addIndentationSize x =
modify (over asIndentationSizeL (+ x))
data QQEndsState = QQEndsState
{ qqessPresent :: !Int#
, qqessPrevChar :: !Char#
}
checkQuasiQuoteEndPresent :: Ptr Word8 -> Bool
checkQuasiQuoteEndPresent
= (\x -> isTrue# (qqessPresent x))
. utf8Foldl' combine (QQEndsState 0# '\n'#)
where
combine :: QQEndsState -> Char# -> QQEndsState
combine QQEndsState{qqessPresent, qqessPrevChar} c# = QQEndsState
{ qqessPresent =
qqessPresent `orI#`
case (# qqessPrevChar, c# #) of
(# '|'#, ']'# #) -> 1#
(# _, '⟧'# #) -> 1#
_ -> 0#
, qqessPrevChar = c#
}
type AlexM = WriterT [(AlexInput, TokenVal)] (State AlexState)
{-# INLINE runAlexM #-}
runAlexM
:: FilePath
-> Bool
-> LitMode Void
-> AlexCode
-> C8.ByteString
-> AlexM a
-> (a, [Token])
runAlexM filepath trackPrefixesAndOffsets litLoc startCode input action =
withAlexInput input $ \input' inputSize ->
let (a, xs) = evalState (runWriterT action)
$ mkAlexState litLoc startCode input'
in if trackPrefixesAndOffsets
then
let !ptr = aiPtr input' `plusPtr` 1
!size = inputSize - 1
!idx = positionsIndex ptr size
res =
map (\(x, y) -> Pos (mkSrcPos filepath idx ptr x) y) xs
in res `deepseq` (a, res)
else
(a, map (\(x, y) -> Pos (mkSrcPosNoPrefix filepath x) y) xs)
mkSrcPosNoPrefix :: FilePath -> AlexInput -> SrcPos
mkSrcPosNoPrefix filename input =
SrcPos { posFile = filename
, posLine = view aiLineL input
, posOffset = Offset 0
, posPrefix = mempty
, posSuffix = mempty
}
mkSrcPos :: FilePath -> U.Vector Int -> Ptr Word8 -> AlexInput -> SrcPos
mkSrcPos filename bytesToCharsMap start (input@AlexInput {aiPtr}) =
SrcPos { posFile = filename
, posLine = view aiLineL input
, posOffset
, posPrefix
, posSuffix
}
where
lineLen = view aiLineLengthL input
posPrefix = TE.decodeUtf8 $ bytesToUtf8BS lineLen $ plusPtr aiPtr $ negate lineLen
posSuffix = TE.decodeUtf8 $ regionToUtf8BS aiPtr $ dropUntilNL# aiPtr
posOffset = Offset $ U.unsafeIndex bytesToCharsMap $ minusPtr aiPtr start
positionsIndex :: Ptr Word8 -> Int -> U.Vector Int
positionsIndex (Ptr start#) len =
U.create $ do
(vec :: UM.MVector s Int) <- UM.new len
let assignAfter :: Int -> Int -> Int -> ST s ()
assignAfter start n item = go' n start
where
go' :: Int -> Int -> ST s ()
go' 0 !i = UM.unsafeWrite vec i item
go' !k !i = UM.unsafeWrite vec i item *> go' (k - 1) (i + 1)
go :: Int# -> Int -> ST s ()
go bytes# !nChars =
case utf8SizeChar# (start# `plusAddr#` bytes#) of
0# -> pure ()
nBytes# -> do
assignAfter (I# bytes#) (I# nBytes#) nChars
go (bytes# +# nBytes#) $ nChars + 1
go 0# 0
A.pure vec
data AlexInput = AlexInput
{ aiPtr :: {-# UNPACK #-} !(Ptr Word8)
, aiIntStore :: {-# UNPACK #-} !Word64
} deriving (Eq, Ord)
instance Show AlexInput where
show AlexInput{aiPtr, aiIntStore} =
printf "AlexInput 0x%08x 0x%08x" ptr aiIntStore
where
ptr :: Word
ptr = fromIntegral $ ptrToWordPtr aiPtr
{-# INLINE aiIntStoreL #-}
aiIntStoreL :: Lens' AlexInput Word64
aiIntStoreL = lens aiIntStore (\b s -> s { aiIntStore = b })
lineInt32L :: Lens' Int32 Line
lineInt32L = lens (Line . fromIntegral) (\(Line x) _ -> fromIntegral x)
int2Int32L :: Lens' Int32 Int
int2Int32L = lens fromIntegral (\x _ -> fromIntegral x)
{-# INLINE aiLineL #-}
{-# INLINE aiLineLengthL #-}
aiLineL :: Lens' AlexInput Line
aiLineLengthL :: Lens' AlexInput Int
aiLineL = aiIntStoreL . int32L 0 . lineInt32L
aiLineLengthL = aiIntStoreL . int32L 32 . int2Int32L
{-# INLINE takeText #-}
takeText :: AlexInput -> Int -> T.Text
takeText AlexInput{aiPtr} len =
TE.decodeUtf8 $ utf8BS len aiPtr
countInputSpace :: AlexInput -> Int -> Int
countInputSpace AlexInput{aiPtr} len =
utf8FoldlBounded len inc 0 aiPtr
where
inc !acc ' '# = acc + 1
inc !acc '\t'# = acc + 8
inc !acc c# = case fixChar c# of
1## -> acc + 1
_ -> acc
{-# INLINE withAlexInput #-}
withAlexInput :: C8.ByteString -> (AlexInput -> Int -> a) -> a
withAlexInput s f =
case s' of
BSI.PS ptr offset len ->
BSI.accursedUnutterablePerformIO $ withForeignPtr ptr $ \ptr' -> do
let !input =
set aiLineL initLine $
AlexInput
{ aiPtr = ptr' `plusPtr` offset
, aiIntStore = 0
}
!res = f input $ len - offset
touchForeignPtr ptr
pure res
where
initLine = Line 0
s' = C8.cons '\n' $ C8.snoc (C8.snoc (stripBOM s) '\n') '\0'
stripBOM :: C8.ByteString -> C8.ByteString
stripBOM xs
| "\xEF\xBB\xBF" `C8.isPrefixOf` xs
= C8.drop 3 xs
| otherwise
= xs
{-# INLINE extractDefineOrLetName #-}
extractDefineOrLetName :: AlexInput -> Int -> T.Text
extractDefineOrLetName AlexInput{aiPtr} n =
TE.decodeUtf8 $ regionToUtf8BS (Ptr start#) end
where
!end = aiPtr `plusPtr` n
!(Ptr end#) = end
start# = (goBack# (end# `plusAddr#` -1#)) `plusAddr#` 1#
goBack# :: Addr# -> Addr#
goBack# ptr# = case indexWord8OffAddr# ptr# 0# of
0## -> ptr#
9## -> ptr#
10## -> ptr#
13## -> ptr#
32## -> ptr#
92## -> ptr#
_ -> goBack# (ptr# `plusAddr#` -1#)
{-# INLINE dropUntilNL #-}
dropUntilNL :: AlexInput -> AlexInput
dropUntilNL input@AlexInput{aiPtr} =
input { aiPtr = dropUntilNL# aiPtr }
{-# INLINE dropUntilUnescapedNL #-}
dropUntilUnescapedNL :: AlexInput -> AlexInput
dropUntilUnescapedNL input@AlexInput{aiPtr = start} =
case dropUntilUnescapedNL# start of
(# seenNewlines, end #) ->
over aiLineL (\(Line n) -> Line (n + seenNewlines)) $
input { aiPtr = end }
{-# INLINE dropUntilNLOr #-}
dropUntilNLOr :: Word8 -> AlexInput -> AlexInput
dropUntilNLOr w input@AlexInput{aiPtr} =
input { aiPtr = dropUntilNLOr# w aiPtr }
{-# INLINE dropUntilNLOrEither #-}
dropUntilNLOrEither :: Word8 -> Word8 -> AlexInput -> AlexInput
dropUntilNLOrEither w1 w2 input@AlexInput{aiPtr} =
input { aiPtr = dropUntilNLOrEither# w1 w2 aiPtr }
{-# INLINE alexInputPrevChar #-}
alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar AlexInput{ aiPtr = Ptr ptr# } =
case base# `minusAddr#` start# of
0# -> C# (chr# ch0)
1# -> let !(# x, _ #) = readChar1# start# ch0 in C# x
2# -> let !(# x, _ #) = readChar2# start# ch0 in C# x
3# -> let !(# x, _ #) = readChar3# start# ch0 in C# x
_ -> '\0'
where
ch0 :: Int#
!ch0 = word2Int# (indexWord8OffAddr# start# 0#)
base# = findCharStart ptr# `plusAddr#` -1#
start# = findCharStart base#
findCharStart :: Addr# -> Addr#
findCharStart p#
| startsWith10# w#
= findCharStart (p# `plusAddr#` -1#)
| otherwise
= p#
where
w# = word2Int# (indexWord8OffAddr# p# 0#)
{-# INLINE alexGetByte #-}
alexGetByte :: AlexInput -> Maybe (Word8, AlexInput)
alexGetByte input@AlexInput{aiPtr} =
case nextChar aiPtr of
(# c#, n, cs #) ->
case fixChar c# of
0## -> Nothing
10## -> Just (10, input')
where
!input' =
over aiLineL increaseLine $
set aiLineLengthL 0 $
input { aiPtr = cs }
c -> Just (b, input')
where
!b = W8# c
!input' =
over aiLineLengthL (+ I# n) $
input { aiPtr = cs }
{-# INLINE fixChar #-}
fixChar :: Char# -> Word#
fixChar = \case
'→'# -> reservedSym
'∷'# -> reservedSym
'⇒'# -> reservedSym
'∀'# -> reservedSym
'⦇'# -> reservedSym
'⦈'# -> reservedSym
'⟦'# -> reservedSym
'⟧'# -> reservedSym
'\x00'# -> fullStop
'\x01'# -> fullStop
'\x02'# -> fullStop
'\x03'# -> fullStop
'\x04'# -> fullStop
'\x05'# -> fullStop
'\x06'# -> fullStop
'\x07'# -> fullStop
'\x08'# -> other
c# -> case ord# c# of
c2# | isTrue# (c2# <=# 0x7f#) ->
int2Word# c2#
| otherwise ->
case generalCategory (C# c#) of
UppercaseLetter -> upper
LowercaseLetter -> lower
TitlecaseLetter -> upper
ModifierLetter -> suffix
OtherLetter -> lower
NonSpacingMark -> suffix
DecimalNumber -> digit
OtherNumber -> digit
Space -> space
ConnectorPunctuation -> symbol
DashPunctuation -> symbol
OtherPunctuation -> symbol
MathSymbol -> symbol
CurrencySymbol -> symbol
ModifierSymbol -> symbol
OtherSymbol -> symbol
SpacingCombiningMark -> space
EnclosingMark -> other
LetterNumber -> symbol
OpenPunctuation -> symbol
ClosePunctuation -> symbol
InitialQuote -> symbol
FinalQuote -> symbol
LineSeparator -> space
ParagraphSeparator -> space
Control -> other
Format -> other
Surrogate -> other
PrivateUse -> other
NotAssigned -> other
where
fullStop, space, upper, lower, symbol :: Word#
digit, suffix, reservedSym, other :: Word#
fullStop = 0x00##
space = 0x01##
upper = 0x02##
lower = 0x03##
symbol = 0x04##
digit = 0x05##
suffix = 0x06##
reservedSym = 0x07##
other = 0x08##
{-# INLINE unsafeTextHeadAscii #-}
unsafeTextHeadAscii :: Ptr Word8 -> Word8
unsafeTextHeadAscii (Ptr ptr#) = W8# (indexWord8OffAddr# ptr# 0#)
{-# INLINE unsafeTextHeadOfTailAscii #-}
unsafeTextHeadOfTailAscii :: Ptr Word8 -> Word8
unsafeTextHeadOfTailAscii (Ptr ptr#) = W8# (indexWord8OffAddr# ptr# 1#)
{-# INLINE unsafeTextHead #-}
unsafeTextHead :: Ptr Word8 -> Char
unsafeTextHead x =
case nextChar x of
(# c#, _, _ #) -> C# c#
{-# INLINE nextChar #-}
nextChar :: Ptr Word8 -> (# Char#, Int#, Ptr Word8 #)
nextChar (Ptr ptr#) =
case utf8DecodeChar# ptr# of
(# c#, nBytes# #) -> (# c#, nBytes#, Ptr (ptr# `plusAddr#` nBytes#) #)
{-# INLINE dropUntilNL# #-}
dropUntilNL# :: Ptr Word8 -> Ptr Word8
dropUntilNL# (Ptr start#) = Ptr (go start#)
where
go :: Addr# -> Addr#
go ptr# = case indexWord8OffAddr# ptr# 0# of
0## -> ptr#
10## -> ptr#
_ -> go (ptr# `plusAddr#` 1#)
{-# INLINE dropUntilUnescapedNL# #-}
dropUntilUnescapedNL# :: Ptr Word8 -> (# Int, Ptr Word8 #)
dropUntilUnescapedNL# (Ptr start#) = go 0 start#
where
go :: Int -> Addr# -> (# Int, Ptr Word8 #)
go !n ptr# = case indexWord8OffAddr# ptr# 0# of
0## -> (# n, Ptr ptr# #)
10## -> (# n, Ptr ptr# #)
92## ->
case indexWord8OffAddr# ptr# 1# of
0## -> (# n, Ptr (ptr# `plusAddr#` 1#) #)
10## -> go (n + 1) (ptr# `plusAddr#` 2#)
_ -> go n (ptr# `plusAddr#` 2#)
_ -> go n (ptr# `plusAddr#` 1#)
{-# INLINE dropUntilNLOr# #-}
dropUntilNLOr# :: Word8 -> Ptr Word8 -> Ptr Word8
dropUntilNLOr# (W8# w#) (Ptr start#) = Ptr (go start#)
where
go :: Addr# -> Addr#
go ptr# = case indexWord8OffAddr# ptr# 0# of
0## -> ptr#
10## -> ptr#
c# | isTrue# (c# `eqWord#` w#) -> ptr#
| otherwise -> go (ptr# `plusAddr#` 1#)
{-# INLINE dropUntilNLOrEither# #-}
dropUntilNLOrEither# :: Word8 -> Word8 -> Ptr Word8 -> Ptr Word8
dropUntilNLOrEither# (W8# w1#) (W8# w2#) (Ptr start#) = Ptr (go start#)
where
go :: Addr# -> Addr#
go ptr# = case indexWord8OffAddr# ptr# 0# of
0## -> ptr#
10## -> ptr#
c# | isTrue# ((c# `eqWord#` w1#) `orI#` (c# `eqWord#` w2#))
-> ptr#
| otherwise
-> go (ptr# `plusAddr#` 1#)
{-# INLINE utf8Foldl' #-}
utf8Foldl' :: forall a. (a -> Char# -> a) -> a -> Ptr Word8 -> a
utf8Foldl' f x0 (Ptr ptr#) =
go x0 ptr#
where
go :: a -> Addr# -> a
go !acc addr# =
case utf8DecodeChar# addr# of
(# _, 0# #) -> acc
(# c#, nBytes# #) -> go (acc `f` c#) (addr# `plusAddr#` nBytes#)
{-# INLINE utf8FoldlBounded #-}
utf8FoldlBounded :: forall a. Int -> (a -> Char# -> a) -> a -> Ptr Word8 -> a
utf8FoldlBounded (I# len#) f x0 (Ptr ptr#) =
go len# x0 ptr#
where
go :: Int#-> a -> Addr# -> a
go 0# !acc _ = acc
go n# !acc addr# =
case utf8DecodeChar# addr# of
(# _, 0# #) -> acc
(# c#, nBytes# #) ->
go (n# -# 1#) (acc `f` c#) (addr# `plusAddr#` nBytes#)
{-# INLINE utf8BS #-}
utf8BS :: Int -> Ptr Word8 -> BS.ByteString
utf8BS (I# nChars#) (Ptr start#) =
BSI.PS (BSI.accursedUnutterablePerformIO (newForeignPtr_ (Ptr start#))) 0 (I# (go nChars# 0#))
where
go :: Int# -> Int# -> Int#
go 0# bytes# = bytes#
go k# bytes# =
case utf8SizeChar# (start# `plusAddr#` bytes#) of
0# -> bytes#
nBytes# -> go (k# -# 1#) (bytes# +# nBytes#)
{-# INLINE bytesToUtf8BS #-}
bytesToUtf8BS :: Int -> Ptr Word8 -> BS.ByteString
bytesToUtf8BS (I# nbytes#) (Ptr start#) =
BSI.PS (BSI.accursedUnutterablePerformIO (newForeignPtr_ (Ptr start#))) 0 (I# nbytes#)
{-# INLINE regionToUtf8BS #-}
regionToUtf8BS :: Ptr Word8 -> Ptr Word8 -> BS.ByteString
regionToUtf8BS start end =
BSI.PS (BSI.accursedUnutterablePerformIO (newForeignPtr_ start)) 0 (minusPtr end start)
{-# INLINE utf8DecodeChar# #-}
utf8DecodeChar# :: Addr# -> (# Char#, Int# #)
utf8DecodeChar# a# =
case indexWord8OffAddr# a# 0# of
0## -> (# '\0'#, 0# #)
!x# ->
let !ch0 = word2Int# x# in
if | startsWith0# ch0 -> (# chr# ch0, 1# #)
| startsWith110# ch0 -> readChar1# a# ch0
| startsWith1110# ch0 -> readChar2# a# ch0
| startsWith11110# ch0 -> readChar3# a# ch0
| otherwise -> invalid# 1#
{-# INLINE invalid# #-}
invalid# :: Int# -> (# Char#, Int# #)
invalid# nBytes# = (# '\8'#, nBytes# #)
{-# INLINE readChar1# #-}
readChar1# :: Addr# -> Int# -> (# Char#, Int# #)
readChar1# a# ch0 =
let !ch1 = word2Int# (indexWord8OffAddr# a# 1#) in
if noValidUtf8Cont# ch1 then invalid# 1# else
(# chr# (((ch0 `andI#` 0x3F#) `uncheckedIShiftL#` 6#) `orI#`
(ch1 `andI#` 0x7F#)),
2# #)
{-# INLINE readChar2# #-}
readChar2# :: Addr# -> Int# -> (# Char#, Int# #)
readChar2# a# ch0 =
let !ch1 = word2Int# (indexWord8OffAddr# a# 1#) in
if noValidUtf8Cont# ch1 then invalid# 1# else
let !ch2 = word2Int# (indexWord8OffAddr# a# 2#) in
if noValidUtf8Cont# ch2 then invalid# 2# else
(# chr# (((ch0 `andI#` 0x1F#) `uncheckedIShiftL#` 12#) `orI#`
((ch1 `andI#` 0x7F#) `uncheckedIShiftL#` 6#) `orI#`
(ch2 `andI#` 0x7F#)),
3# #)
{-# INLINE readChar3# #-}
readChar3# :: Addr# -> Int# -> (# Char#, Int# #)
readChar3# a# ch0 =
let !ch1 = word2Int# (indexWord8OffAddr# a# 1#) in
if noValidUtf8Cont# ch1 then invalid# 1# else
let !ch2 = word2Int# (indexWord8OffAddr# a# 2#) in
if noValidUtf8Cont# ch2 then invalid# 2# else
let !ch3 = word2Int# (indexWord8OffAddr# a# 3#) in
if noValidUtf8Cont# ch3 then invalid# 3# else
(# chr# (((ch0 `andI#` 0x0F#) `uncheckedIShiftL#` 18#) `orI#`
((ch1 `andI#` 0x7F#) `uncheckedIShiftL#` 12#) `orI#`
((ch2 `andI#` 0x7F#) `uncheckedIShiftL#` 6#) `orI#`
(ch3 `andI#` 0x7F#)),
4# #)
{-# INLINE noValidUtf8Cont# #-}
noValidUtf8Cont# :: Int# -> Bool
noValidUtf8Cont# x = isTrue# ((x <# 0x80#) `orI#` (x ># 0xBF#))
{-# INLINE startsWith0# #-}
startsWith0# :: Int# -> Bool
startsWith0# x = isTrue# ((x `andI#` 0x80#) ==# 0#)
{-# INLINE startsWith10# #-}
startsWith10# :: Int# -> Bool
startsWith10# x = isTrue# ((x `andI#` 0xC0#) ==# 0x80#)
{-# INLINE startsWith110# #-}
startsWith110# :: Int# -> Bool
startsWith110# x = isTrue# ((x `andI#` 0xE0#) ==# 0xC0#)
{-# INLINE startsWith1110# #-}
startsWith1110# :: Int# -> Bool
startsWith1110# x = isTrue# ((x `andI#` 0xF0#) ==# 0xE0#)
{-# INLINE startsWith11110# #-}
startsWith11110# :: Int# -> Bool
startsWith11110# x = isTrue# ((x `andI#` 0xF8#) ==# 0xF0#)
{-# INLINE utf8SizeChar# #-}
utf8SizeChar# :: Addr# -> Int#
utf8SizeChar# a# =
case indexWord8OffAddr# a# 0# of
0## -> 0#
!x# ->
let !ch0 = word2Int# x# in
if | startsWith0# ch0 -> 1#
| startsWith110# ch0 -> 2#
| startsWith1110# ch0 -> 3#
| startsWith11110# ch0 -> 4#
| otherwise -> 1#