----------------------------------------------------------------------------- -- Module : Lexer.x -- Copyright : (c) [1999..2004] Manuel M T Chakravarty -- (c) 2005 Duncan Coutts -- (c) 2008 Benedikt Huber -- License : BSD-style -- Maintainer : benedikt.huber@gmail.com -- Portability : portable -- -- Lexer for C files, after being processed by the C preprocessor -- -- We assume that the input already went through cpp. Thus, we do not handle -- comments and preprocessor directives here. It supports the -- C99 `restrict' extension: as -- well as inline functions. -- -- Comments: -- -- * Universal character names and multi-character character constants, -- as well as trigraphs are unsupported. They are lexed, but yield an error. -- -- * We add `typedef-name' (K&R 8.9) as a token, as proposed in K&R A13. -- However, as these tokens cannot be recognized lexically, but require a -- context analysis, they are never produced by the lexer, but instead have -- to be introduced in a later phase (by converting the corresponding -- identifiers). -- -- * We also recognize GNU C `__attribute__', `__extension__', `__complex__', -- `__const', `__const__', `__imag', `__imag__', `__inline', `__inline__', -- `__real', `__real__, `__restrict', and `__restrict__'. -- -- * Any line starting with `#pragma' is ignored. -- -- With K&R we refer to ``The C Programming Language'', second edition, Brain -- W. Kernighan and Dennis M. Ritchie, Prentice Hall, 1988. -- -- With C99 we refer to ``ISO/IEC 9899:TC3'', -- available online at http://www.open-std.org/JTC1/SC22/WG14/www/docs/n1256.pdf. -- --- TODO ---------------------------------------------------------------------- -- -- * There are more GNU C specific keywords. Add them and change `Parser.y' -- correspondingly (in particular, most tokens within __attribute ((...)) -- expressions are actually keywords, but we handle them as identifiers at -- the moment). -- -- * Add support for bytestrings { module Language.C.Parser.Lexer (lexC, parseError) where import Data.Char (isDigit) import Control.Monad (liftM) import Language.C.Data.InputStream import Language.C.Data.Position (Position(..),posOf) import Language.C.Data.Ident (mkIdent) import Language.C.Syntax.Constants import Language.C.Parser.Tokens import Language.C.Parser.ParserMonad } $space = [ \ \t ] -- horizontal white space $eol = \n -- end of line $letter = [a-zA-Z] $identletter = [a-zA-Z_\$] -- GNU extension: allow $ in variable names $octdigit = 0-7 $digit = 0-9 $digitNZ = 1-9 $hexdigit = [0-9a-fA-F] $inchar = \0-\255 # [ \\ \' \n \r ] -- valid character in char constant $instr = \0-\255 # [ \\ \" \n \r ] -- valid character in a string literal $anyButNL = \0-\255 # \n $infname = \ -\127 # [ \\ \" ] -- valid character in a filename @sp = $space* -- character escape sequence (follows K&R A2.5.2) -- -- * also used for strings -- * C99: 6.4.4.4 @charesc = \\([ntvbrfaeE\\\?\'\"]|$octdigit{1,3}|x$hexdigit+) @ucn = \\u$hexdigit{4}|\\U$hexdigit{8} -- components of integer constants -- -- * C99: 6.4.4.1 @int = $digitNZ$digit* -- integer suffixes @llsuffix = ll|LL @gnusuffix = [ij]? @intsuffix = [uU][lL]?|[uU]@llsuffix|[lL][uU]?|@llsuffix[uU]? @intgnusuffix = @intsuffix@gnusuffix?|@gnusuffix@intsuffix? -- components of float constants (follows K&R A2.5.3) -- -- * C99: 6.4.4.2 @digits = $digit+ @intpart = @digits @fractpart = @digits @mantpart = @intpart?\.@fractpart|@intpart\. @exppart = [eE][\+\-]?@digits @hexprefix = 0x @hexdigits = $hexdigit+ @hexmant = @hexdigits?\.@hexdigits|@hexdigits\. @binexp = [pP][\+\-]?@digits @floatsuffix = [fFlL] @floatgnusuffix = @floatsuffix@gnusuffix?|@gnusuffix@floatsuffix? tokens :- -- whitespace (follows K&R A2.1) -- -- * horizontal and vertical tabs, newlines, and form feeds are filter out by -- `Lexers.ctrlLexer' -- -- * comments are not handled, as we assume the input already went through cpp -- $white+ ; -- #line directive (K&R A12.6) -- -- * allows further ints after the file name a la GCC; as the GCC CPP docu -- doesn't say how many ints there can be, we allow an unbound number -- \#$space*@int$space*(\"($infname|@charesc)*\"$space*)?(@int$space*)*\r?$eol { \pos len str -> setPos (linePragmaAdjust (takeChars len str) pos) >> lexToken } -- #pragma directive (K&R A12.8) -- -- * we simply ignore any #pragma (but take care to update the position -- information) -- \#$space*pragma$anyButNL*$eol ; -- #ident directive, eg used by rcs/cvs -- -- * we simply ignore any #ident (but take care to update the position -- information) -- \#$space*ident$anyButNL*$eol ; -- identifiers and keywords (follows K&R A2.3 and A2.4) -- $identletter($identletter|$digit)* { \pos len str -> idkwtok (takeChars len str) pos } -- constants (follows K&R A2.5) -- -- * K&R,C99 explicitly mention `enumeration-constants'; however, as they are -- lexically identifiers, we do not have an extra case for them -- -- integer constants (follows K&R A2.5.1, C99 6.4.4.1) -- NOTE: 0 is lexed as octal integer constant, and readCOctal takes care of this 0$octdigit*@intgnusuffix? { token_plus CTokILit readCOctal } $digitNZ$digit*@intgnusuffix? { token_plus CTokILit (readCInteger DecRepr) } 0[xX]$hexdigit+@intgnusuffix? { token_plus CTokILit (readCInteger HexRepr . drop 2) } (0$octdigit*|$digitNZ$digit*|0[xX]$hexdigit+)[uUlL]+ { token_fail "Invalid integer constant suffix" } -- character constants (follows K&R A2.5.2, C99 6.4.4.4) -- -- * Universal Character Names are unsupported and cause an error. \'($inchar|@charesc)\' { token CTokCLit (cChar . fst . unescapeChar . tail) } L\'($inchar|@charesc)\' { token CTokCLit (cChar_w . fst . unescapeChar . tail . tail) } \'($inchar|@charesc){2,}\' { token CTokCLit (flip cChars False . unescapeMultiChars .tail) } L\'($inchar|@charesc){2,}\' { token CTokCLit (flip cChars True . unescapeMultiChars . tail . tail) } -- float constants (follows K&R A2.5.3. C99 6.4.4.2) -- -- * NOTE: Hexadecimal floating constants without binary exponents are forbidden. -- They generate a lexer error, because they are hard to recognize in the parser. (@mantpart@exppart?|@intpart@exppart)@floatgnusuffix? { token CTokFLit readCFloat } @hexprefix(@hexmant|@hexdigits)@binexp@floatgnusuffix? { token CTokFLit readCFloat } @hexprefix@hexmant { token_fail "Hexadecimal floating constant requires an exponent" } -- string literal (follows K&R A2.6) -- C99: 6.4.5. \"($instr|@charesc)*\" { token CTokSLit (cString . unescapeString . init . tail) } L\"($instr|@charesc)*\" { token CTokSLit (cString_w . unescapeString . init . tail . tail) } L?\'@ucn\' { token_fail "Universal character names are unsupported" } L?\'\\[^0-7'\"\?\\abfnrtvuUx]\' { token_fail "Invalid escape sequence" } L?\"($inchar|@charesc)*@ucn($inchar|@charesc|@ucn)*\" { token_fail "Universal character names in string literals are unsupported"} -- operators and separators -- "(" { token_ CTokLParen } ")" { token_ CTokRParen } "[" { token_ CTokLBracket } "]" { token_ CTokRBracket } "->" { token_ CTokArrow } "." { token_ CTokDot } "!" { token_ CTokExclam } "~" { token_ CTokTilde } "++" { token_ CTokInc } "--" { token_ CTokDec } "+" { token_ CTokPlus } "-" { token_ CTokMinus } "*" { token_ CTokStar } "/" { token_ CTokSlash } "%" { token_ CTokPercent } "&" { token_ CTokAmper } "<<" { token_ CTokShiftL } ">>" { token_ CTokShiftR } "<" { token_ CTokLess } "<=" { token_ CTokLessEq } ">" { token_ CTokHigh } ">=" { token_ CTokHighEq } "==" { token_ CTokEqual } "!=" { token_ CTokUnequal } "^" { token_ CTokHat } "|" { token_ CTokBar } "&&" { token_ CTokAnd } "||" { token_ CTokOr } "?" { token_ CTokQuest } ":" { token_ CTokColon } "=" { token_ CTokAssign } "+=" { token_ CTokPlusAss } "-=" { token_ CTokMinusAss } "*=" { token_ CTokStarAss } "/=" { token_ CTokSlashAss } "%=" { token_ CTokPercAss } "&=" { token_ CTokAmpAss } "^=" { token_ CTokHatAss } "|=" { token_ CTokBarAss } "<<=" { token_ CTokSLAss } ">>=" { token_ CTokSRAss } "," { token_ CTokComma } \; { token_ CTokSemic } "{" { token_ CTokLBrace } "}" { token_ CTokRBrace } "..." { token_ CTokEllipsis } { -- Fix the 'octal' lexing of '0' readCOctal :: String -> Either String CInteger readCOctal s@('0':r) = case r of (c:_) | isDigit c -> readCInteger OctalRepr r _ -> readCInteger DecRepr s -- We use the odd looking list of string patterns here rather than normal -- string literals since GHC converts the latter into a sequence of string -- comparisons (ie a linear search) but it translates the former using its -- effecient pattern matching which gives us the expected radix-style search. -- This gives change makes a significant performance difference. -- idkwtok :: String -> Position -> P CToken idkwtok ('a':'l':'i':'g':'n':'o':'f':[]) = tok CTokAlignof idkwtok ('_':'_':'a':'l':'i':'g':'n':'o':'f':[]) = tok CTokAlignof idkwtok ('_':'_':'a':'l':'i':'g':'n':'o':'f':'_':'_':[]) = tok CTokAlignof idkwtok ('a':'s':'m':[]) = tok CTokAsm idkwtok ('_':'_':'a':'s':'m':[]) = tok CTokAsm idkwtok ('_':'_':'a':'s':'m':'_':'_':[]) = tok CTokAsm idkwtok ('a':'u':'t':'o':[]) = tok CTokAuto idkwtok ('b':'r':'e':'a':'k':[]) = tok CTokBreak idkwtok ('_':'B':'o':'o':'l':[]) = tok CTokBool idkwtok ('c':'a':'s':'e':[]) = tok CTokCase idkwtok ('c':'h':'a':'r':[]) = tok CTokChar idkwtok ('c':'o':'n':'s':'t':[]) = tok CTokConst idkwtok ('_':'_':'c':'o':'n':'s':'t':[]) = tok CTokConst idkwtok ('_':'_':'c':'o':'n':'s':'t':'_':'_':[]) = tok CTokConst idkwtok ('c':'o':'n':'t':'i':'n':'u':'e':[]) = tok CTokContinue idkwtok ('_':'C':'o':'m':'p':'l':'e':'x':[]) = tok CTokComplex idkwtok ('_':'_':'c':'o':'m':'p':'l':'e':'x':'_':'_':[]) = tok CTokComplex idkwtok ('d':'e':'f':'a':'u':'l':'t':[]) = tok CTokDefault idkwtok ('d':'o':[]) = tok CTokDo idkwtok ('d':'o':'u':'b':'l':'e':[]) = tok CTokDouble idkwtok ('e':'l':'s':'e':[]) = tok CTokElse idkwtok ('e':'n':'u':'m':[]) = tok CTokEnum idkwtok ('e':'x':'t':'e':'r':'n':[]) = tok CTokExtern idkwtok ('f':'l':'o':'a':'t':[]) = tok CTokFloat idkwtok ('f':'o':'r':[]) = tok CTokFor idkwtok ('g':'o':'t':'o':[]) = tok CTokGoto idkwtok ('i':'f':[]) = tok CTokIf idkwtok ('i':'n':'l':'i':'n':'e':[]) = tok CTokInline idkwtok ('_':'_':'i':'n':'l':'i':'n':'e':[]) = tok CTokInline idkwtok ('_':'_':'i':'n':'l':'i':'n':'e':'_':'_':[]) = tok CTokInline idkwtok ('i':'n':'t':[]) = tok CTokInt idkwtok ('l':'o':'n':'g':[]) = tok CTokLong idkwtok ('r':'e':'g':'i':'s':'t':'e':'r':[]) = tok CTokRegister idkwtok ('r':'e':'s':'t':'r':'i':'c':'t':[]) = tok CTokRestrict idkwtok ('_':'_':'r':'e':'s':'t':'r':'i':'c':'t':[]) = tok CTokRestrict idkwtok ('_':'_':'r':'e':'s':'t':'r':'i':'c':'t':'_':'_':[]) = tok CTokRestrict idkwtok ('r':'e':'t':'u':'r':'n':[]) = tok CTokReturn idkwtok ('s':'h':'o':'r':'t':[]) = tok CTokShort idkwtok ('s':'i':'g':'n':'e':'d':[]) = tok CTokSigned idkwtok ('_':'_':'s':'i':'g':'n':'e':'d':[]) = tok CTokSigned idkwtok ('_':'_':'s':'i':'g':'n':'e':'d':'_':'_':[]) = tok CTokSigned idkwtok ('s':'i':'z':'e':'o':'f':[]) = tok CTokSizeof idkwtok ('s':'t':'a':'t':'i':'c':[]) = tok CTokStatic idkwtok ('s':'t':'r':'u':'c':'t':[]) = tok CTokStruct idkwtok ('s':'w':'i':'t':'c':'h':[]) = tok CTokSwitch idkwtok ('t':'y':'p':'e':'d':'e':'f':[]) = tok CTokTypedef idkwtok ('t':'y':'p':'e':'o':'f':[]) = tok CTokTypeof idkwtok ('_':'_':'t':'y':'p':'e':'o':'f':[]) = tok CTokTypeof idkwtok ('_':'_':'t':'y':'p':'e':'o':'f':'_':'_':[]) = tok CTokTypeof idkwtok ('_':'_':'t':'h':'r':'e':'a':'d':[]) = tok CTokThread idkwtok ('u':'n':'i':'o':'n':[]) = tok CTokUnion idkwtok ('u':'n':'s':'i':'g':'n':'e':'d':[]) = tok CTokUnsigned idkwtok ('v':'o':'i':'d':[]) = tok CTokVoid idkwtok ('v':'o':'l':'a':'t':'i':'l':'e':[]) = tok CTokVolatile idkwtok ('_':'_':'v':'o':'l':'a':'t':'i':'l':'e':[]) = tok CTokVolatile idkwtok ('_':'_':'v':'o':'l':'a':'t':'i':'l':'e':'_':'_':[]) = tok CTokVolatile idkwtok ('w':'h':'i':'l':'e':[]) = tok CTokWhile idkwtok ('_':'_':'l':'a':'b':'e':'l':'_':'_':[]) = tok CTokLabel idkwtok ('_':'_':'a':'t':'t':'r':'i':'b':'u':'t':'e':[]) = tok (CTokGnuC GnuCAttrTok) idkwtok ('_':'_':'a':'t':'t':'r':'i':'b':'u':'t':'e':'_':'_':[]) = tok (CTokGnuC GnuCAttrTok) idkwtok ('_':'_':'e':'x':'t':'e':'n':'s':'i':'o':'n':'_':'_':[]) = tok (CTokGnuC GnuCExtTok) idkwtok ('_':'_':'r':'e':'a':'l':[]) = tok (CTokGnuC GnuCComplexReal) idkwtok ('_':'_':'r':'e':'a':'l':'_':'_':[]) = tok (CTokGnuC GnuCComplexReal) idkwtok ('_':'_':'i':'m':'a':'g':[]) = tok (CTokGnuC GnuCComplexImag) idkwtok ('_':'_':'i':'m':'a':'g':'_':'_':[]) = tok (CTokGnuC GnuCComplexImag) idkwtok ('_':'_':'b':'u':'i':'l':'t':'i':'n':'_':rest) | rest == "va_arg" = tok (CTokGnuC GnuCVaArg) | rest == "offsetof" = tok (CTokGnuC GnuCOffsetof) | rest == "types_compatible_p" = tok (CTokGnuC GnuCTyCompat) idkwtok cs = \pos -> do name <- getNewName let ident = mkIdent pos cs name tyident <- isTypeIdent ident if tyident then return (CTokTyIdent pos ident) else return (CTokIdent pos ident) ignoreAttribute :: P () ignoreAttribute = skipTokens (0::Int) where skipTokens :: Int -> P () skipTokens n = do tok <- lexToken case tok of CTokRParen _ | n == 1 -> return () | otherwise -> skipTokens (n-1) CTokLParen _ -> skipTokens (n+1) _ -> skipTokens n tok :: (Position -> CToken) -> Position -> P CToken tok tc pos = return (tc pos) linePragmaAdjust :: String -> Position -> Position linePragmaAdjust str p@(Position fname row _) = Position fname' row' 1 where str' = dropWhite . drop 1 $ str (rowStr, str'') = span isDigit str' row' = read rowStr str''' = dropWhite str'' fnameStr = takeWhile (/= '"') . drop 1 $ str''' fname' | null str''' || head str''' /= '"' = fname -- try and get more sharing of file name strings | fnameStr == fname = fname | otherwise = fnameStr -- dropWhite = dropWhile (\c -> c == ' ' || c == '\t') -- special utility for the lexer unescapeMultiChars :: String -> [Char] unescapeMultiChars cs@(_ : _ : _) = case unescapeChar cs of (c,cs') -> c : unescapeMultiChars cs' unescapeMultiChars ('\'' : []) = [] unescapeMultiChars _ = error "Unexpected end of multi-char constant" {-# INLINE token_ #-} -- token that ignores the string token_ :: (Position -> CToken) -> Position -> Int -> InputStream -> P CToken token_ tok pos _ _ = return (tok pos) {-# INLINE token_fail #-} -- error token token_fail :: String -> Position -> Int -> InputStream -> P CToken token_fail errmsg pos _ _ = failP pos [ "Lexical Error !", errmsg ] {-# INLINE token #-} -- token that uses the string token :: (Position -> a -> CToken) -> (String -> a) -> Position -> Int -> InputStream -> P CToken token tok read pos len str = return (tok pos (read $ takeChars len str)) {-# INLINE token_plus #-} -- token that may fail token_plus :: (Position -> a -> CToken) -> (String -> Either String a) -> Position -> Int -> InputStream -> P CToken token_plus tok read pos len str = case read (takeChars len str) of Left err -> failP pos [ "Lexical error ! ", err ] Right ok -> return $! tok pos ok -- ----------------------------------------------------------------------------- -- The input type type AlexInput = (Position, -- current position, InputStream) -- current input string alexInputPrevChar :: AlexInput -> Char alexInputPrevChar _ = error "alexInputPrevChar not used" alexGetChar :: AlexInput -> Maybe (Char,AlexInput) alexGetChar (p,is) | inputStreamEmpty is = Nothing | otherwise = let (c,s) = takeChar is in let p' = alexMove p c in p' `seq` Just (c, (p', s)) alexMove :: Position -> Char -> Position alexMove (Position f l c) ' ' = Position f l (c+1) alexMove (Position f l c) '\n' = Position f (l+1) 1 alexMove (Position f l c) '\t' = Position f l (((c+7) `div` 8)*8+1) alexMove p '\r' = p alexMove (Position f l c) _ = Position f l (c+1) lexicalError :: P a lexicalError = do pos <- getPos (c,cs) <- liftM takeChar getInput failP pos ["Lexical error !", "The character " ++ show c ++ " does not fit here."] parseError :: P a parseError = do tok <- getLastToken failP (posOf tok) ["Syntax error !", "The symbol `" ++ show tok ++ "' does not fit here."] lexToken :: P CToken lexToken = do pos <- getPos inp <- getInput case alexScan (pos, inp) 0 of AlexEOF -> return CTokEof AlexError inp' -> lexicalError AlexSkip (pos', inp') len -> do setPos pos' setInput inp' lexToken AlexToken (pos', inp') len action -> do setPos pos' setInput inp' tok <- action pos len inp setLastToken tok return tok lexC :: (CToken -> P a) -> P a lexC cont = do tok <- lexToken cont tok }