| 1 | ----------------------------------------------------------------------------- |
|---|
| 2 | -- |
|---|
| 3 | -- (c) The University of Glasgow, 2004-2006 |
|---|
| 4 | -- |
|---|
| 5 | -- Lexer for concrete Cmm. We try to stay close to the C-- spec, but there |
|---|
| 6 | -- are a few minor differences: |
|---|
| 7 | -- |
|---|
| 8 | -- * extra keywords for our macros, and float32/float64 types |
|---|
| 9 | -- * global registers (Sp,Hp, etc.) |
|---|
| 10 | -- |
|---|
| 11 | ----------------------------------------------------------------------------- |
|---|
| 12 | |
|---|
| 13 | { |
|---|
| 14 | {-# LANGUAGE BangPatterns #-} |
|---|
| 15 | {-# OPTIONS -Wwarn -w #-} |
|---|
| 16 | -- The above -Wwarn supression flag is a temporary kludge. |
|---|
| 17 | -- While working on this module you are encouraged to remove it and fix |
|---|
| 18 | -- any warnings in the module. See |
|---|
| 19 | -- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings |
|---|
| 20 | -- for details |
|---|
| 21 | |
|---|
| 22 | module CmmLex ( |
|---|
| 23 | CmmToken(..), cmmlex, |
|---|
| 24 | ) where |
|---|
| 25 | |
|---|
| 26 | import OldCmm |
|---|
| 27 | import Lexer |
|---|
| 28 | |
|---|
| 29 | import SrcLoc |
|---|
| 30 | import UniqFM |
|---|
| 31 | import StringBuffer |
|---|
| 32 | import FastString |
|---|
| 33 | import Ctype |
|---|
| 34 | import Util |
|---|
| 35 | --import TRACE |
|---|
| 36 | |
|---|
| 37 | import Data.Word |
|---|
| 38 | import Data.Char |
|---|
| 39 | } |
|---|
| 40 | |
|---|
| 41 | $whitechar = [\ \t\n\r\f\v\xa0] -- \xa0 is Unicode no-break space |
|---|
| 42 | $white_no_nl = $whitechar # \n |
|---|
| 43 | |
|---|
| 44 | $ascdigit = 0-9 |
|---|
| 45 | $unidigit = \x01 -- Trick Alex into handling Unicode. See alexGetChar. |
|---|
| 46 | $digit = [$ascdigit $unidigit] |
|---|
| 47 | $octit = 0-7 |
|---|
| 48 | $hexit = [$digit A-F a-f] |
|---|
| 49 | |
|---|
| 50 | $unilarge = \x03 -- Trick Alex into handling Unicode. See alexGetChar. |
|---|
| 51 | $asclarge = [A-Z \xc0-\xd6 \xd8-\xde] |
|---|
| 52 | $large = [$asclarge $unilarge] |
|---|
| 53 | |
|---|
| 54 | $unismall = \x04 -- Trick Alex into handling Unicode. See alexGetChar. |
|---|
| 55 | $ascsmall = [a-z \xdf-\xf6 \xf8-\xff] |
|---|
| 56 | $small = [$ascsmall $unismall \_] |
|---|
| 57 | |
|---|
| 58 | $namebegin = [$large $small \. \$ \@] |
|---|
| 59 | $namechar = [$namebegin $digit] |
|---|
| 60 | |
|---|
| 61 | @decimal = $digit+ |
|---|
| 62 | @octal = $octit+ |
|---|
| 63 | @hexadecimal = $hexit+ |
|---|
| 64 | @exponent = [eE] [\-\+]? @decimal |
|---|
| 65 | |
|---|
| 66 | @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent |
|---|
| 67 | |
|---|
| 68 | @escape = \\ ([abfnrt\\\'\"\?] | x $hexit{1,2} | $octit{1,3}) |
|---|
| 69 | @strchar = ($printable # [\"\\]) | @escape |
|---|
| 70 | |
|---|
| 71 | cmm :- |
|---|
| 72 | |
|---|
| 73 | $white_no_nl+ ; |
|---|
| 74 | ^\# pragma .* \n ; -- Apple GCC 3.3 CPP generates pragmas in its output |
|---|
| 75 | |
|---|
| 76 | ^\# (line)? { begin line_prag } |
|---|
| 77 | |
|---|
| 78 | -- single-line line pragmas, of the form |
|---|
| 79 | -- # <line> "<file>" <extra-stuff> \n |
|---|
| 80 | <line_prag> $digit+ { setLine line_prag1 } |
|---|
| 81 | <line_prag1> \" [^\"]* \" { setFile line_prag2 } |
|---|
| 82 | <line_prag2> .* { pop } |
|---|
| 83 | |
|---|
| 84 | <0> { |
|---|
| 85 | \n ; |
|---|
| 86 | |
|---|
| 87 | [\:\;\{\}\[\]\(\)\=\`\~\/\*\%\-\+\&\^\|\>\<\,\!] { special_char } |
|---|
| 88 | |
|---|
| 89 | ".." { kw CmmT_DotDot } |
|---|
| 90 | "::" { kw CmmT_DoubleColon } |
|---|
| 91 | ">>" { kw CmmT_Shr } |
|---|
| 92 | "<<" { kw CmmT_Shl } |
|---|
| 93 | ">=" { kw CmmT_Ge } |
|---|
| 94 | "<=" { kw CmmT_Le } |
|---|
| 95 | "==" { kw CmmT_Eq } |
|---|
| 96 | "!=" { kw CmmT_Ne } |
|---|
| 97 | "&&" { kw CmmT_BoolAnd } |
|---|
| 98 | "||" { kw CmmT_BoolOr } |
|---|
| 99 | |
|---|
| 100 | P@decimal { global_regN (\n -> VanillaReg n VGcPtr) } |
|---|
| 101 | R@decimal { global_regN (\n -> VanillaReg n VNonGcPtr) } |
|---|
| 102 | F@decimal { global_regN FloatReg } |
|---|
| 103 | D@decimal { global_regN DoubleReg } |
|---|
| 104 | L@decimal { global_regN LongReg } |
|---|
| 105 | Sp { global_reg Sp } |
|---|
| 106 | SpLim { global_reg SpLim } |
|---|
| 107 | Hp { global_reg Hp } |
|---|
| 108 | HpLim { global_reg HpLim } |
|---|
| 109 | CCCS { global_reg CCCS } |
|---|
| 110 | CurrentTSO { global_reg CurrentTSO } |
|---|
| 111 | CurrentNursery { global_reg CurrentNursery } |
|---|
| 112 | HpAlloc { global_reg HpAlloc } |
|---|
| 113 | BaseReg { global_reg BaseReg } |
|---|
| 114 | |
|---|
| 115 | $namebegin $namechar* { name } |
|---|
| 116 | |
|---|
| 117 | 0 @octal { tok_octal } |
|---|
| 118 | @decimal { tok_decimal } |
|---|
| 119 | 0[xX] @hexadecimal { tok_hexadecimal } |
|---|
| 120 | @floating_point { strtoken tok_float } |
|---|
| 121 | |
|---|
| 122 | \" @strchar* \" { strtoken tok_string } |
|---|
| 123 | } |
|---|
| 124 | |
|---|
| 125 | { |
|---|
| 126 | data CmmToken |
|---|
| 127 | = CmmT_SpecChar Char |
|---|
| 128 | | CmmT_DotDot |
|---|
| 129 | | CmmT_DoubleColon |
|---|
| 130 | | CmmT_Shr |
|---|
| 131 | | CmmT_Shl |
|---|
| 132 | | CmmT_Ge |
|---|
| 133 | | CmmT_Le |
|---|
| 134 | | CmmT_Eq |
|---|
| 135 | | CmmT_Ne |
|---|
| 136 | | CmmT_BoolAnd |
|---|
| 137 | | CmmT_BoolOr |
|---|
| 138 | | CmmT_CLOSURE |
|---|
| 139 | | CmmT_INFO_TABLE |
|---|
| 140 | | CmmT_INFO_TABLE_RET |
|---|
| 141 | | CmmT_INFO_TABLE_FUN |
|---|
| 142 | | CmmT_INFO_TABLE_CONSTR |
|---|
| 143 | | CmmT_INFO_TABLE_SELECTOR |
|---|
| 144 | | CmmT_else |
|---|
| 145 | | CmmT_export |
|---|
| 146 | | CmmT_section |
|---|
| 147 | | CmmT_align |
|---|
| 148 | | CmmT_goto |
|---|
| 149 | | CmmT_if |
|---|
| 150 | | CmmT_jump |
|---|
| 151 | | CmmT_foreign |
|---|
| 152 | | CmmT_never |
|---|
| 153 | | CmmT_prim |
|---|
| 154 | | CmmT_return |
|---|
| 155 | | CmmT_returns |
|---|
| 156 | | CmmT_import |
|---|
| 157 | | CmmT_switch |
|---|
| 158 | | CmmT_case |
|---|
| 159 | | CmmT_default |
|---|
| 160 | | CmmT_bits8 |
|---|
| 161 | | CmmT_bits16 |
|---|
| 162 | | CmmT_bits32 |
|---|
| 163 | | CmmT_bits64 |
|---|
| 164 | | CmmT_float32 |
|---|
| 165 | | CmmT_float64 |
|---|
| 166 | | CmmT_gcptr |
|---|
| 167 | | CmmT_GlobalReg GlobalReg |
|---|
| 168 | | CmmT_Name FastString |
|---|
| 169 | | CmmT_String String |
|---|
| 170 | | CmmT_Int Integer |
|---|
| 171 | | CmmT_Float Rational |
|---|
| 172 | | CmmT_EOF |
|---|
| 173 | deriving (Show) |
|---|
| 174 | |
|---|
| 175 | -- ----------------------------------------------------------------------------- |
|---|
| 176 | -- Lexer actions |
|---|
| 177 | |
|---|
| 178 | type Action = RealSrcSpan -> StringBuffer -> Int -> P (RealLocated CmmToken) |
|---|
| 179 | |
|---|
| 180 | begin :: Int -> Action |
|---|
| 181 | begin code _span _str _len = do pushLexState code; lexToken |
|---|
| 182 | |
|---|
| 183 | pop :: Action |
|---|
| 184 | pop _span _buf _len = do popLexState; lexToken |
|---|
| 185 | |
|---|
| 186 | special_char :: Action |
|---|
| 187 | special_char span buf len = return (L span (CmmT_SpecChar (currentChar buf))) |
|---|
| 188 | |
|---|
| 189 | kw :: CmmToken -> Action |
|---|
| 190 | kw tok span buf len = return (L span tok) |
|---|
| 191 | |
|---|
| 192 | global_regN :: (Int -> GlobalReg) -> Action |
|---|
| 193 | global_regN con span buf len |
|---|
| 194 | = return (L span (CmmT_GlobalReg (con (fromIntegral n)))) |
|---|
| 195 | where buf' = stepOn buf |
|---|
| 196 | n = parseUnsignedInteger buf' (len-1) 10 octDecDigit |
|---|
| 197 | |
|---|
| 198 | global_reg :: GlobalReg -> Action |
|---|
| 199 | global_reg r span buf len = return (L span (CmmT_GlobalReg r)) |
|---|
| 200 | |
|---|
| 201 | strtoken :: (String -> CmmToken) -> Action |
|---|
| 202 | strtoken f span buf len = |
|---|
| 203 | return (L span $! (f $! lexemeToString buf len)) |
|---|
| 204 | |
|---|
| 205 | name :: Action |
|---|
| 206 | name span buf len = |
|---|
| 207 | case lookupUFM reservedWordsFM fs of |
|---|
| 208 | Just tok -> return (L span tok) |
|---|
| 209 | Nothing -> return (L span (CmmT_Name fs)) |
|---|
| 210 | where |
|---|
| 211 | fs = lexemeToFastString buf len |
|---|
| 212 | |
|---|
| 213 | reservedWordsFM = listToUFM $ |
|---|
| 214 | map (\(x, y) -> (mkFastString x, y)) [ |
|---|
| 215 | ( "CLOSURE", CmmT_CLOSURE ), |
|---|
| 216 | ( "INFO_TABLE", CmmT_INFO_TABLE ), |
|---|
| 217 | ( "INFO_TABLE_RET", CmmT_INFO_TABLE_RET ), |
|---|
| 218 | ( "INFO_TABLE_FUN", CmmT_INFO_TABLE_FUN ), |
|---|
| 219 | ( "INFO_TABLE_CONSTR", CmmT_INFO_TABLE_CONSTR ), |
|---|
| 220 | ( "INFO_TABLE_SELECTOR",CmmT_INFO_TABLE_SELECTOR ), |
|---|
| 221 | ( "else", CmmT_else ), |
|---|
| 222 | ( "export", CmmT_export ), |
|---|
| 223 | ( "section", CmmT_section ), |
|---|
| 224 | ( "align", CmmT_align ), |
|---|
| 225 | ( "goto", CmmT_goto ), |
|---|
| 226 | ( "if", CmmT_if ), |
|---|
| 227 | ( "jump", CmmT_jump ), |
|---|
| 228 | ( "foreign", CmmT_foreign ), |
|---|
| 229 | ( "never", CmmT_never ), |
|---|
| 230 | ( "prim", CmmT_prim ), |
|---|
| 231 | ( "return", CmmT_return ), |
|---|
| 232 | ( "returns", CmmT_returns ), |
|---|
| 233 | ( "import", CmmT_import ), |
|---|
| 234 | ( "switch", CmmT_switch ), |
|---|
| 235 | ( "case", CmmT_case ), |
|---|
| 236 | ( "default", CmmT_default ), |
|---|
| 237 | ( "bits8", CmmT_bits8 ), |
|---|
| 238 | ( "bits16", CmmT_bits16 ), |
|---|
| 239 | ( "bits32", CmmT_bits32 ), |
|---|
| 240 | ( "bits64", CmmT_bits64 ), |
|---|
| 241 | ( "float32", CmmT_float32 ), |
|---|
| 242 | ( "float64", CmmT_float64 ), |
|---|
| 243 | -- New forms |
|---|
| 244 | ( "b8", CmmT_bits8 ), |
|---|
| 245 | ( "b16", CmmT_bits16 ), |
|---|
| 246 | ( "b32", CmmT_bits32 ), |
|---|
| 247 | ( "b64", CmmT_bits64 ), |
|---|
| 248 | ( "f32", CmmT_float32 ), |
|---|
| 249 | ( "f64", CmmT_float64 ), |
|---|
| 250 | ( "gcptr", CmmT_gcptr ) |
|---|
| 251 | ] |
|---|
| 252 | |
|---|
| 253 | tok_decimal span buf len |
|---|
| 254 | = return (L span (CmmT_Int $! parseUnsignedInteger buf len 10 octDecDigit)) |
|---|
| 255 | |
|---|
| 256 | tok_octal span buf len |
|---|
| 257 | = return (L span (CmmT_Int $! parseUnsignedInteger (offsetBytes 1 buf) (len-1) 8 octDecDigit)) |
|---|
| 258 | |
|---|
| 259 | tok_hexadecimal span buf len |
|---|
| 260 | = return (L span (CmmT_Int $! parseUnsignedInteger (offsetBytes 2 buf) (len-2) 16 hexDigit)) |
|---|
| 261 | |
|---|
| 262 | tok_float str = CmmT_Float $! readRational str |
|---|
| 263 | |
|---|
| 264 | tok_string str = CmmT_String (read str) |
|---|
| 265 | -- urk, not quite right, but it'll do for now |
|---|
| 266 | |
|---|
| 267 | -- ----------------------------------------------------------------------------- |
|---|
| 268 | -- Line pragmas |
|---|
| 269 | |
|---|
| 270 | setLine :: Int -> Action |
|---|
| 271 | setLine code span buf len = do |
|---|
| 272 | let line = parseUnsignedInteger buf len 10 octDecDigit |
|---|
| 273 | setSrcLoc (mkRealSrcLoc (srcSpanFile span) (fromIntegral line - 1) 1) |
|---|
| 274 | -- subtract one: the line number refers to the *following* line |
|---|
| 275 | -- trace ("setLine " ++ show line) $ do |
|---|
| 276 | popLexState |
|---|
| 277 | pushLexState code |
|---|
| 278 | lexToken |
|---|
| 279 | |
|---|
| 280 | setFile :: Int -> Action |
|---|
| 281 | setFile code span buf len = do |
|---|
| 282 | let file = lexemeToFastString (stepOn buf) (len-2) |
|---|
| 283 | setSrcLoc (mkRealSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span)) |
|---|
| 284 | popLexState |
|---|
| 285 | pushLexState code |
|---|
| 286 | lexToken |
|---|
| 287 | |
|---|
| 288 | -- ----------------------------------------------------------------------------- |
|---|
| 289 | -- This is the top-level function: called from the parser each time a |
|---|
| 290 | -- new token is to be read from the input. |
|---|
| 291 | |
|---|
| 292 | cmmlex :: (Located CmmToken -> P a) -> P a |
|---|
| 293 | cmmlex cont = do |
|---|
| 294 | (L span tok) <- lexToken |
|---|
| 295 | --trace ("token: " ++ show tok) $ do |
|---|
| 296 | cont (L (RealSrcSpan span) tok) |
|---|
| 297 | |
|---|
| 298 | lexToken :: P (RealLocated CmmToken) |
|---|
| 299 | lexToken = do |
|---|
| 300 | inp@(loc1,buf) <- getInput |
|---|
| 301 | sc <- getLexState |
|---|
| 302 | case alexScan inp sc of |
|---|
| 303 | AlexEOF -> do let span = mkRealSrcSpan loc1 loc1 |
|---|
| 304 | setLastToken span 0 |
|---|
| 305 | return (L span CmmT_EOF) |
|---|
| 306 | AlexError (loc2,_) -> do failLocMsgP loc1 loc2 "lexical error" |
|---|
| 307 | AlexSkip inp2 _ -> do |
|---|
| 308 | setInput inp2 |
|---|
| 309 | lexToken |
|---|
| 310 | AlexToken inp2@(end,buf2) len t -> do |
|---|
| 311 | setInput inp2 |
|---|
| 312 | let span = mkRealSrcSpan loc1 end |
|---|
| 313 | span `seq` setLastToken span len |
|---|
| 314 | t span buf len |
|---|
| 315 | |
|---|
| 316 | -- ----------------------------------------------------------------------------- |
|---|
| 317 | -- Monad stuff |
|---|
| 318 | |
|---|
| 319 | -- Stuff that Alex needs to know about our input type: |
|---|
| 320 | type AlexInput = (RealSrcLoc,StringBuffer) |
|---|
| 321 | |
|---|
| 322 | alexInputPrevChar :: AlexInput -> Char |
|---|
| 323 | alexInputPrevChar (_,s) = prevChar s '\n' |
|---|
| 324 | |
|---|
| 325 | -- backwards compatibility for Alex 2.x |
|---|
| 326 | alexGetChar :: AlexInput -> Maybe (Char,AlexInput) |
|---|
| 327 | alexGetChar inp = case alexGetByte inp of |
|---|
| 328 | Nothing -> Nothing |
|---|
| 329 | Just (b,i) -> c `seq` Just (c,i) |
|---|
| 330 | where c = chr $ fromIntegral b |
|---|
| 331 | |
|---|
| 332 | alexGetByte :: AlexInput -> Maybe (Word8,AlexInput) |
|---|
| 333 | alexGetByte (loc,s) |
|---|
| 334 | | atEnd s = Nothing |
|---|
| 335 | | otherwise = b `seq` loc' `seq` s' `seq` Just (b, (loc', s')) |
|---|
| 336 | where c = currentChar s |
|---|
| 337 | b = fromIntegral $ ord $ c |
|---|
| 338 | loc' = advanceSrcLoc loc c |
|---|
| 339 | s' = stepOn s |
|---|
| 340 | |
|---|
| 341 | getInput :: P AlexInput |
|---|
| 342 | getInput = P $ \s@PState{ loc=l, buffer=b } -> POk s (l,b) |
|---|
| 343 | |
|---|
| 344 | setInput :: AlexInput -> P () |
|---|
| 345 | setInput (l,b) = P $ \s -> POk s{ loc=l, buffer=b } () |
|---|
| 346 | } |
|---|