--  C -> Haskell Compiler: Lexer for C Header Files
--
--  Author : Manuel M T Chakravarty, Duncan Coutts
--  Created: 24 May 2005
--
--  Copyright (c) [1999..2004] Manuel M T Chakravarty
--  Copyright (c) 2005 Duncan Coutts
--
--  This file is free software; you can redistribute it and/or modify
--  it under the terms of the GNU General Public License as published by
--  the Free Software Foundation; either version 2 of the License, or
--  (at your option) any later version.
--
--  This file is distributed in the hope that it will be useful,
--  but WITHOUT ANY WARRANTY; without even the implied warranty of
--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
--  GNU General Public License for more details.
--
--- DESCRIPTION ---------------------------------------------------------------
--
--  Lexer for C header files after they have been processed by the C
--  preprocessor
--
--- DOCU ----------------------------------------------------------------------
--
--  language: Haskell 98
--
--  We assume that the input already went through cpp.  Thus, we do not handle
--  comments and preprocessor directives here.  The lexer recognizes all tokens
--  of ANSI C except those occurring only in function bodies.  It supports the
--  C99 `restrict' extension as well as inline functions.
--
--  Comments:
--
--  * There is no support for the optional feature of extended characters (see
--    K&R A2.5.2) or the corresponding strings (A2.6).
--
--  * We add `typedef-name' (K&R 8.9) as a token, as proposed in K&R A13.
--    However, as these tokens cannot be recognized lexically, but require a
--    context analysis, they are never produced by the lexer; instead they have
--    to be introduced in a later phase (by converting the corresponding
--    identifiers).
--
--  * We also recognize GNU C `__attribute__', `__extension__', `__const',
--    `__const__', `__inline', `__inline__', `__restrict', and `__restrict__'.
--
--  * Any line starting with `#pragma' is ignored.
--
--  With K&R we refer to ``The C Programming Language'', second edition, Brian
--  W. Kernighan and Dennis M. Ritchie, Prentice Hall, 1988.
--
--- TODO ----------------------------------------------------------------------
--
--  * `showsPrec' of `CTokCLit' should produce K&R-conforming escapes;
--    same for `CTokSLit'
--
--  * There are more GNU C specific keywords.  Add them and change `CParser'
--    correspondingly (in particular, most tokens within `__attribute__ ((...))'
--    expressions are actually keywords, but we handle them as identifiers at
--    the moment).
--

{
module CLexer (lexC, parseError) where

import Data.Char (isDigit)
import Numeric   (readDec, readOct, readHex)

import Position  (Position(..), Pos(posOf))
import Idents    (lexemeToIdent)

import CTokens
import CParserMonad
}

$space = [ \ \t ]       -- horizontal white space
$eol   = \n

$letter   = [a-zA-Z_]
$octdigit = 0-7
$digit    = 0-9
$digitNZ  = 1-9
$hexdigit = [0-9a-fA-F]

$inchar   = \0-\255 # [ \\ \' \n \r ]
$instr    = \0-\255 # [ \\ \" \n \r ]
$anyButNL = \0-\255 # \n
$infname  = \ -\127 # [ \\ \" ]
$visible  = \ -\127

@int = $digitNZ$digit*
@sp  = $space*

-- character escape sequence (follows K&R A2.5.2)
--
-- * also used for strings
--
@charesc = \\([ntvbrfae\\\?\'\"]|$octdigit{1,3}|x$hexdigit+)

-- components of float constants (follows K&R A2.5.3)
--
@digits    = $digit+
@intpart   = @digits
@fractpart = @digits
@mantpart  = @intpart?\.@fractpart|@intpart\.
@exppart   = [eE][\+\-]?@digits
@suffix    = [fFlL]
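-- For orientation, a few example lexemes the constant macros above accept
-- (purely illustrative; derived from the definitions themselves):
--
--   @charesc   matches  \n   \?   \012   \x1B
--   @mantpart  matches  1.5  .5   1.     (integer part or fraction may be
--                                         omitted, but not both)
--   @exppart   matches  e10  E+3  e-7    (combined with @mantpart or @intpart
--                                         in the float-constant rule below)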
tokens :-

-- whitespace (follows K&R A2.1)
--
-- * horizontal and vertical tabs, newlines, and form feeds are filtered out by
--   `Lexers.ctrlLexer'
--
-- * comments are not handled, as we assume the input already went through cpp
--
$white+         ;

-- #line directive (K&R A12.6)
--
-- * allows further ints after the file name, a la GCC; as the GCC cpp
--   documentation doesn't say how many ints there can be, we allow an
--   unbounded number
--
\#$space*@int$space*(\"($infname|@charesc)*\"$space*)?(@int$space*)*$eol
  { \pos len str -> setPos (adjustPos (take len str) pos) >> lexToken }

-- #pragma directive (K&R A12.8)
--
-- * we simply ignore any #pragma (but take care to update the position
--   information)
--
\#$space*pragma$anyButNL*$eol   ;

-- #ident directive, e.g. used by rcs/cvs
--
-- * we simply ignore any #ident (but take care to update the position
--   information)
--
\#$space*ident$anyButNL*$eol    ;

-- identifiers and keywords (follows K&R A2.3 and A2.4)
--
$letter($letter|$digit)*        { \pos len str -> idkwtok (take len str) pos }

-- constants (follows K&R A2.5)
--
-- * K&R explicitly mentions `enumeration-constants'; however, as they are
--   lexically identifiers, we do not have an extra case for them
--

-- integer constants (follows K&R A2.5.1)
--
0$octdigit*[uUlL]{0,3}          { token CTokILit (fst . head . readOct) }
$digitNZ$digit*[uUlL]{0,3}      { token CTokILit (fst . head . readDec) }
0[xX]$hexdigit*[uUlL]{0,3}      { token CTokILit (fst . head . readHex . drop 2) }

-- character constants (follows K&R A2.5.2)
--
\'($inchar|@charesc)\'          { token CTokCLit (fst . oneChar . tail) }
L\'($inchar|@charesc)\'         { token CTokCLit (fst . oneChar . tail . tail) }

-- float constants (follows K&R A2.5.3)
--
(@mantpart@exppart?|@intpart@exppart)@suffix?  { token CTokFLit id }

-- string literal (follows K&R A2.6)
--
\"($instr|@charesc)*\"          { token CTokSLit normalizeEscapes }
L\"($instr|@charesc)*\"         { token CTokSLit (normalizeEscapes . tail) }

-- operators and separators
--
"("     { token_ CTokLParen }
")"     { token_ CTokRParen }
"["     { token_ CTokLBracket }
"]"     { token_ CTokRBracket }
"->"    { token_ CTokArrow }
"."     { token_ CTokDot }
"!"     { token_ CTokExclam }
"~"     { token_ CTokTilde }
"++"    { token_ CTokInc }
"--"    { token_ CTokDec }
"+"     { token_ CTokPlus }
"-"     { token_ CTokMinus }
"*"     { token_ CTokStar }
"/"     { token_ CTokSlash }
"%"     { token_ CTokPercent }
"&"     { token_ CTokAmper }
"<<"    { token_ CTokShiftL }
">>"    { token_ CTokShiftR }
"<"     { token_ CTokLess }
"<="    { token_ CTokLessEq }
">"     { token_ CTokHigh }
">="    { token_ CTokHighEq }
"=="    { token_ CTokEqual }
"!="    { token_ CTokUnequal }
"^"     { token_ CTokHat }
"|"     { token_ CTokBar }
"&&"    { token_ CTokAnd }
"||"    { token_ CTokOr }
"?"     { token_ CTokQuest }
":"     { token_ CTokColon }
"="     { token_ CTokAssign }
"+="    { token_ CTokPlusAss }
"-="    { token_ CTokMinusAss }
"*="    { token_ CTokStarAss }
"/="    { token_ CTokSlashAss }
"%="    { token_ CTokPercAss }
"&="    { token_ CTokAmpAss }
"^="    { token_ CTokHatAss }
"|="    { token_ CTokBarAss }
"<<="   { token_ CTokSLAss }
">>="   { token_ CTokSRAss }
","     { token_ CTokComma }
\;      { token_ CTokSemic }
"{"     { token_ CTokLBrace }
"}"     { token_ CTokRBrace }
"..."   { token_ CTokEllipsis }


{
-- We use the odd-looking list of string patterns here rather than normal
-- string literals since GHC converts the latter into a sequence of string
-- comparisons (i.e. a linear search), but it translates the former using its
-- efficient pattern matching, which gives us the expected radix-style search.
-- This change makes a significant performance difference.
--
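-- For illustration, a minimal sketch of the difference (a hypothetical helper,
-- not used anywhere in this module):
--
--   kw :: String -> Bool
--   kw "while" = True
--   kw "void"  = True
--   kw _       = False
--
-- compiles to a chain of string comparisons, whereas
--
--   kw ('w':'h':'i':'l':'e':[]) = True
--   kw ('v':'o':'i':'d':[])     = True
--   kw _                        = False
--
-- compiles to a case tree over successive characters, so common prefixes are
-- tested only once.
--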
idkwtok :: String -> Position -> P CToken
idkwtok ('a':'l':'i':'g':'n':'o':'f':[]) = tok CTokAlignof
idkwtok ('_':'_':'a':'l':'i':'g':'n':'o':'f':[]) = tok CTokAlignof
idkwtok ('_':'_':'a':'l':'i':'g':'n':'o':'f':'_':'_':[]) = tok CTokAlignof
idkwtok ('a':'s':'m':[]) = tok CTokAsm
idkwtok ('_':'_':'a':'s':'m':[]) = tok CTokAsm
idkwtok ('_':'_':'a':'s':'m':'_':'_':[]) = tok CTokAsm
idkwtok ('a':'u':'t':'o':[]) = tok CTokAuto
idkwtok ('b':'r':'e':'a':'k':[]) = tok CTokBreak
idkwtok ('_':'B':'o':'o':'l':[]) = tok CTokBool
idkwtok ('c':'a':'s':'e':[]) = tok CTokCase
idkwtok ('c':'h':'a':'r':[]) = tok CTokChar
idkwtok ('c':'o':'n':'s':'t':[]) = tok CTokConst
idkwtok ('_':'_':'c':'o':'n':'s':'t':[]) = tok CTokConst
idkwtok ('_':'_':'c':'o':'n':'s':'t':'_':'_':[]) = tok CTokConst
idkwtok ('c':'o':'n':'t':'i':'n':'u':'e':[]) = tok CTokContinue
idkwtok ('_':'C':'o':'m':'p':'l':'e':'x':[]) = tok CTokComplex
idkwtok ('d':'e':'f':'a':'u':'l':'t':[]) = tok CTokDefault
idkwtok ('d':'o':[]) = tok CTokDo
idkwtok ('d':'o':'u':'b':'l':'e':[]) = tok CTokDouble
idkwtok ('e':'l':'s':'e':[]) = tok CTokElse
idkwtok ('e':'n':'u':'m':[]) = tok CTokEnum
idkwtok ('e':'x':'t':'e':'r':'n':[]) = tok CTokExtern
idkwtok ('f':'l':'o':'a':'t':[]) = tok CTokFloat
idkwtok ('f':'o':'r':[]) = tok CTokFor
idkwtok ('g':'o':'t':'o':[]) = tok CTokGoto
idkwtok ('i':'f':[]) = tok CTokIf
idkwtok ('i':'n':'l':'i':'n':'e':[]) = tok CTokInline
idkwtok ('_':'_':'i':'n':'l':'i':'n':'e':[]) = tok CTokInline
idkwtok ('_':'_':'i':'n':'l':'i':'n':'e':'_':'_':[]) = tok CTokInline
idkwtok ('i':'n':'t':[]) = tok CTokInt
idkwtok ('l':'o':'n':'g':[]) = tok CTokLong
idkwtok ('r':'e':'g':'i':'s':'t':'e':'r':[]) = tok CTokRegister
idkwtok ('r':'e':'s':'t':'r':'i':'c':'t':[]) = tok CTokRestrict
idkwtok ('_':'_':'r':'e':'s':'t':'r':'i':'c':'t':[]) = tok CTokRestrict
idkwtok ('_':'_':'r':'e':'s':'t':'r':'i':'c':'t':'_':'_':[]) = tok CTokRestrict
idkwtok ('r':'e':'t':'u':'r':'n':[]) = tok CTokReturn
idkwtok ('s':'h':'o':'r':'t':[]) = tok CTokShort
idkwtok ('s':'i':'g':'n':'e':'d':[]) = tok CTokSigned
idkwtok ('_':'_':'s':'i':'g':'n':'e':'d':[]) = tok CTokSigned
idkwtok ('_':'_':'s':'i':'g':'n':'e':'d':'_':'_':[]) = tok CTokSigned
idkwtok ('s':'i':'z':'e':'o':'f':[]) = tok CTokSizeof
idkwtok ('s':'t':'a':'t':'i':'c':[]) = tok CTokStatic
idkwtok ('s':'t':'r':'u':'c':'t':[]) = tok CTokStruct
idkwtok ('s':'w':'i':'t':'c':'h':[]) = tok CTokSwitch
idkwtok ('t':'y':'p':'e':'d':'e':'f':[]) = tok CTokTypedef
idkwtok ('t':'y':'p':'e':'o':'f':[]) = tok CTokTypeof
idkwtok ('_':'_':'t':'y':'p':'e':'o':'f':[]) = tok CTokTypeof
idkwtok ('_':'_':'t':'y':'p':'e':'o':'f':'_':'_':[]) = tok CTokTypeof
idkwtok ('_':'_':'t':'h':'r':'e':'a':'d':[]) = tok CTokThread
idkwtok ('u':'n':'i':'o':'n':[]) = tok CTokUnion
idkwtok ('u':'n':'s':'i':'g':'n':'e':'d':[]) = tok CTokUnsigned
idkwtok ('v':'o':'i':'d':[]) = tok CTokVoid
idkwtok ('v':'o':'l':'a':'t':'i':'l':'e':[]) = tok CTokVolatile
idkwtok ('_':'_':'v':'o':'l':'a':'t':'i':'l':'e':[]) = tok CTokVolatile
idkwtok ('_':'_':'v':'o':'l':'a':'t':'i':'l':'e':'_':'_':[]) = tok CTokVolatile
idkwtok ('w':'h':'i':'l':'e':[]) = tok CTokWhile
idkwtok ('_':'_':'l':'a':'b':'e':'l':'_':'_':[]) = tok CTokLabel
idkwtok ('_':'_':'a':'t':'t':'r':'i':'b':'u':'t':'e':[]) =
  tok (CTokGnuC GnuCAttrTok) -- ignoreAttribute >> lexToken
idkwtok ('_':'_':'a':'t':'t':'r':'i':'b':'u':'t':'e':'_':'_':[]) =
  tok (CTokGnuC GnuCAttrTok) -- ignoreAttribute >> lexToken
idkwtok ('_':'_':'e':'x':'t':'e':'n':'s':'i':'o':'n':'_':'_':[]) =
  tok (CTokGnuC GnuCExtTok)
idkwtok ('_':'_':'b':'u':'i':'l':'t':'i':'n':'_':rest)
  | rest == "va_arg"             = tok (CTokGnuC GnuCVaArg)
  | rest == "offsetof"           = tok (CTokGnuC GnuCOffsetof)
  | rest == "types_compatible_p" = tok (CTokGnuC GnuCTyCompat)
idkwtok cs = \pos -> do
  name <- getNewName
  let ident = lexemeToIdent pos cs name
  tyident <- isTypeIdent ident
  if tyident
    then return (CTokTyIdent pos ident)
    else return (CTokIdent   pos ident)
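-- A worked example of the catch-all clause above (a sketch; it assumes the
-- parser has already recorded `size_t' as a typedef name, which is what
-- `isTypeIdent' consults):
--
--   lexeme        isTypeIdent?   token produced
--   "size_t"      True           CTokTyIdent pos ident
--   "my_var"      False          CTokIdent   pos ident
--
-- This is the mechanism behind the `typedef-name' note in the DOCU section:
-- the distinction cannot be made lexically, so it relies on state maintained
-- by a later phase.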
-- skip a parenthesised token group, e.g. the `((...))' following
-- `__attribute__' (only referenced by the commented-out alternative in
-- `idkwtok' above)
--
ignoreAttribute :: P ()
ignoreAttribute = skipTokens 0
  where skipTokens n = do
          tok <- lexToken
          case tok of
            CTokRParen _ | n == 1    -> return ()
                         | otherwise -> skipTokens (n-1)
            CTokLParen _             -> skipTokens (n+1)
            _                        -> skipTokens n

tok :: (Position -> CToken) -> Position -> P CToken
tok tc pos = return (tc pos)

-- converts the first character denotation of a C-style string to a character
-- and the remaining string
--
oneChar :: String -> (Char, String)
oneChar ('\\':c:cs) = case c of
  'n'  -> ('\n', cs)
  't'  -> ('\t', cs)
  'v'  -> ('\v', cs)
  'b'  -> ('\b', cs)
  'r'  -> ('\r', cs)
  'f'  -> ('\f', cs)
  'a'  -> ('\a', cs)
  'e'  -> ('\ESC', cs)          -- GNU C extension
  '\\' -> ('\\', cs)
  '?'  -> ('?', cs)
  '\'' -> ('\'', cs)
  '"'  -> ('"', cs)
  'x'  -> case head (readHex cs) of
            (i, cs') -> (toEnum i, cs')
  _    -> case head (readOct (c:cs)) of
            (i, cs') -> (toEnum i, cs')
oneChar (c   :cs) = (c, cs)

normalizeEscapes [] = []
normalizeEscapes cs = case oneChar cs of
                        (c, cs') -> c : normalizeEscapes cs'

-- take the text of a `#line'-style directive and return the position it
-- specifies (row and, if present, file name); the column is reset to 0
--
adjustPos :: String -> Position -> Position
adjustPos str (Position fname row _) = Position fname' row' 0
  where
    str'            = dropWhite . drop 1 $ str
    (rowStr, str'') = span isDigit str'
    row'            = read rowStr
    str'''          = dropWhite str''
    fnameStr        = takeWhile (/= '"') . drop 1 $ str'''
    fname'          | null str''' || head str''' /= '"' = fname
                    -- try and get more sharing of file name strings
                    | fnameStr == fname                  = fname
                    | otherwise                          = fnameStr
    --
    dropWhite = dropWhile (\c -> c == ' ' || c == '\t')

{-# INLINE token_ #-}
-- token that ignores the string
token_ :: (Position -> CToken) -> Position -> Int -> String -> P CToken
token_ tok pos _ _ = return (tok pos)

{-# INLINE token #-}
-- token that uses the string
token :: (Position -> a -> CToken) -> (String -> a)
      -> Position -> Int -> String -> P CToken
token tok read pos len str = return (tok pos (read $ take len str))


-- -----------------------------------------------------------------------------
-- The input type

type AlexInput = (Position,   -- current position,
                  String)     -- current input string

alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar _ = error "alexInputPrevChar not used"

alexGetChar :: AlexInput -> Maybe (Char, AlexInput)
alexGetChar (p, [])    = Nothing
alexGetChar (p, (c:s)) = let p' = alexMove p c in p' `seq` Just (c, (p', s))

alexMove :: Position -> Char -> Position
alexMove (Position f l c) '\t' = Position f l     (((c+7) `div` 8)*8+1)
alexMove (Position f l c) '\n' = Position f (l+1) 1
alexMove (Position f l c) _    = Position f l     (c+1)

lexicalError :: P a
lexicalError = do
  pos <- getPos
  (c:cs) <- getInput
  failP pos
        ["Lexical error!",
         "The character " ++ show c ++ " does not fit here."]

parseError :: P a
parseError = do
  tok <- getLastToken
  failP (posOf tok)
        ["Syntax error!",
         "The symbol `" ++ show tok ++ "' does not fit here."]

lexToken :: P CToken
lexToken = do
  pos <- getPos
  inp <- getInput
  case alexScan (pos, inp) 0 of
    AlexEOF        -> return CTokEof
    AlexError inp' -> lexicalError
    AlexSkip  (pos', inp') len -> do
        setPos pos'
        setInput inp'
        lexToken
    AlexToken (pos', inp') len action -> do
        setPos pos'
        setInput inp'
        tok <- action pos len inp
        setLastToken tok
        return tok
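-- `lexC' below follows the continuation-passing convention that Happy expects
-- from a threaded lexer: the parser hands over a continuation that receives
-- the next token.  A grammar file would hook it in roughly as follows (a
-- sketch only; the actual directives live in the parser, not here):
--
--   %monad     { P } { >>= } { return }
--   %lexer     { lexC } { CTokEof }
--   %tokentype { CToken }
--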
lexC :: (CToken -> P a) -> P a
lexC cont = do
  tok <- lexToken
  cont tok
}