{-# LANGUAGE MagicHash #-}

{- |
This module defines the types and functions necessary for an Alex-generated
lexer.

https://haskell-alex.readthedocs.io/en/latest/api.html#
-}
module GHC.Parser.Lexer.Interface (
  AlexInput (..),
  alexGetByte,
  alexInputPrevChar,

  -- * Helpers
  alexGetChar,
  adjustChar,
) where

import GHC.Prelude

import Data.Char (GeneralCategory (..), generalCategory, ord)
import Data.Word (Word8)
import GHC.Data.StringBuffer (StringBuffer, atEnd, nextChar, prevChar)
import GHC.Exts
import GHC.Types.SrcLoc (PsLoc, advancePsLoc)

-- | The input handed to the Alex-generated lexer: a source location
-- ('PsLoc') together with the 'StringBuffer' being read. Both fields are
-- strict.
data AlexInput = AI !PsLoc !StringBuffer
  deriving (Show)

-- | Take one character off the front of the input.
--
-- Returns 'Nothing' when the buffer is at its end. Otherwise returns the
-- character squashed into a single byte by 'adjustChar', paired with the
-- input advanced past that character. The byte, the new location and the
-- new buffer are all forced before the result is handed back.
--
-- See Note [Unicode in Alex]
alexGetByte :: AlexInput -> Maybe (Word8, AlexInput)
alexGetByte (AI loc buf)
  | atEnd buf = Nothing
  | otherwise =
      case nextChar buf of
        (ch, rest) ->
          let w = adjustChar ch
              newLoc = advancePsLoc loc ch
           in w `seq` newLoc `seq` rest `seq` Just (w, AI newLoc rest)

-- | The character preceding the current input position, in the same
-- squashed form that 'alexGetByte' produces. '\n' is the default passed to
-- 'prevChar'.
--
-- Getting the previous 'Char' isn't enough here - we need to convert it into
-- the same format that 'alexGetByte' would have produced.
--
-- See Note [Unicode in Alex] and #13986.
alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar (AI _ buf) =
  unsafeChr . fromIntegral . adjustChar $ prevChar buf '\n'

-- | Like 'alexGetByte', but with the byte converted back to a 'Char'.
-- The character is forced before the result is returned.
--
-- Kept for backwards compatibility with Alex 2.x.
alexGetChar :: AlexInput -> Maybe (Char, AlexInput)
alexGetChar inp = do
  (b, rest) <- alexGetByte inp
  let ch = unsafeChr (fromIntegral b)
  ch `seq` Just (ch, rest)

-- | Convert an 'Int' to a 'Char' via the 'chr#' primop, with no range check
-- on the argument.
unsafeChr :: Int -> Char
unsafeChr (I# i#) = C# (chr# i#)

{-
Note [Unicode in Alex]
~~~~~~~~~~~~~~~~~~~~~~
Although newer versions of Alex support unicode, this grammar is processed with
the old style '--latin1' behaviour. This means that when implementing the
functions

    alexGetByte       :: AlexInput -> Maybe (Word8,AlexInput)
    alexInputPrevChar :: AlexInput -> Char

which Alex uses to take apart our 'AlexInput', we must

  * return a latin1 character in the 'Word8' that 'alexGetByte' expects
  * return a latin1 character in 'alexInputPrevChar'.

We handle this in 'adjustChar' by squishing entire classes of unicode
characters into single bytes.
-}

-- | Squash a 'Char' into the single byte that the lexer works with.
--
-- * Code points up to @0x07@ become the non-graphic class byte; those byte
--   values are reused below as class codes.
-- * Remaining code points up to @0x7f@ are passed through as their own code.
-- * Everything else is replaced by a class byte chosen from its Unicode
--   general category.
--
-- See Note [Unicode in Alex]
{-# INLINE adjustChar #-}
adjustChar :: Char -> Word8
adjustChar c
  | c <= '\x07' = nonGraphic
  | c <= '\x7f' = fromIntegral (ord c)
  -- Alex doesn't handle Unicode, so when Unicode
  -- character is encountered we output these values
  -- with the actual character value hidden in the state.
  | otherwise =
      -- NB: The logic behind these definitions is also reflected
      -- in "GHC.Utils.Lexeme"
      -- Any changes here should likely be reflected there.
      case generalCategory c of
        -- letters
        UppercaseLetter -> upper
        TitlecaseLetter -> upper
        LowercaseLetter -> lower
        OtherLetter -> lower -- see #1103
        ModifierLetter -> uniIdChar -- see #10196
        -- marks
        NonSpacingMark -> uniIdChar -- see #7650
        SpacingCombiningMark -> otherGraphic
        EnclosingMark -> otherGraphic
        -- numbers
        DecimalNumber -> digit
        LetterNumber -> digit
        OtherNumber -> digit -- see #4373
        -- punctuation
        ConnectorPunctuation -> symbol
        DashPunctuation -> symbol
        OtherPunctuation -> symbol
        OpenPunctuation -> otherGraphic
        ClosePunctuation -> otherGraphic
        InitialQuote -> otherGraphic
        FinalQuote -> otherGraphic
        -- symbols
        MathSymbol -> symbol
        CurrencySymbol -> symbol
        ModifierSymbol -> symbol
        OtherSymbol -> symbol
        -- separators and everything else
        Space -> space
        _other -> nonGraphic
  where
    -- Class bytes emitted in place of the actual character.
    nonGraphic, upper, lower, digit, symbol, space, otherGraphic, uniIdChar :: Word8
    nonGraphic = 0x00
    upper = 0x01
    lower = 0x02
    digit = 0x03
    symbol = 0x04
    space = 0x05
    otherGraphic = 0x06
    uniIdChar = 0x07