{ module Language.Java.Lexer (L(..), Token(..), lexer) where import Numeric import Data.Char import Debug.Trace (trace) } %wrapper "posn" $digit = [0-9] $nonzero = [1-9] $octdig = [0-7] $hexdig = [0-9A-Fa-f] @lineterm = [\n\r] | \r\n @any = . | @lineterm @tradcomm = "/*" @any* "*/" @linecomm = "//" .* @lineterm @comm = @tradcomm | @linecomm $javaLetter = [a-zA-Z\_\$] $javaDigit = $digit $javaLetterOrDigit = [a-zA-Z0-9\_\$] @octEscape = [0123]? $octdig{1,2} @hexEscape = u $hexdig{4} @charEscape = \\ (@octEscape | @hexEscape | [btnfr\"\']) @expsuffix = [\+\-]? $digit+ @exponent = [eE] @expsuffix @pexponent = [pP] @expsuffix tokens :- $white+ ; @comm ; abstract { \p _ -> L (pos p) $ KW_Abstract } assert { \p _ -> L (pos p) $ KW_Assert } boolean { \p _ -> L (pos p) $ KW_Boolean } break { \p _ -> L (pos p) $ KW_Break } byte { \p _ -> L (pos p) $ KW_Byte } case { \p _ -> L (pos p) $ KW_Case } catch { \p _ -> L (pos p) $ KW_Catch } char { \p _ -> L (pos p) $ KW_Char } class { \p _ -> L (pos p) $ KW_Class } const { \p _ -> L (pos p) $ KW_Const } continue { \p _ -> L (pos p) $ KW_Continue } default { \p _ -> L (pos p) $ KW_Default } do { \p _ -> L (pos p) $ KW_Do } double { \p _ -> L (pos p) $ KW_Double } else { \p _ -> L (pos p) $ KW_Else } enum { \p _ -> L (pos p) $ KW_Enum } extends { \p _ -> L (pos p) $ KW_Extends } final { \p _ -> L (pos p) $ KW_Final } finally { \p _ -> L (pos p) $ KW_Finally } float { \p _ -> L (pos p) $ KW_Float } for { \p _ -> L (pos p) $ KW_For } goto { \p _ -> L (pos p) $ KW_Goto } if { \p _ -> L (pos p) $ KW_If } implements { \p _ -> L (pos p) $ KW_Implements } import { \p _ -> L (pos p) $ KW_Import } instanceof { \p _ -> L (pos p) $ KW_Instanceof } int { \p _ -> L (pos p) $ KW_Int } interface { \p _ -> L (pos p) $ KW_Interface } long { \p _ -> L (pos p) $ KW_Long } native { \p _ -> L (pos p) $ KW_Native } new { \p _ -> L (pos p) $ KW_New } package { \p _ -> L (pos p) $ KW_Package } private { \p _ -> L (pos p) $ KW_Private } protected { \p _ -> L (pos p) $ KW_Protected } public { \p _ -> L (pos p) $ KW_Public } return { \p _ -> L (pos p) $ KW_Return } short { \p _ -> L (pos p) $ KW_Short } static { \p _ -> L (pos p) $ KW_Static } strictfp { \p _ -> L (pos p) $ KW_Strictfp } super { \p _ -> L (pos p) $ KW_Super } switch { \p _ -> L (pos p) $ KW_Switch } synchronized { \p _ -> L (pos p) $ KW_Synchronized } this { \p _ -> L (pos p) $ KW_This } throw { \p _ -> L (pos p) $ KW_Throw } throws { \p _ -> L (pos p) $ KW_Throws } transient { \p _ -> L (pos p) $ KW_Transient } try { \p _ -> L (pos p) $ KW_Try } void { \p _ -> L (pos p) $ KW_Void } volatile { \p _ -> L (pos p) $ KW_Volatile } while { \p _ -> L (pos p) $ KW_While } 0 { \p _ -> L (pos p) $ IntTok 0 } 0 [lL] { \p _ -> L (pos p) $ LongTok 0 } 0 $octdig+ { \p s -> L (pos p) $ IntTok (fst . head $ readOct s) } 0 $octdig+ [lL] { \p s -> L (pos p) $ LongTok (fst . head $ readOct (init s)) } $nonzero $digit* { \p s -> L (pos p) $ IntTok (read s) } $nonzero $digit* [lL] { \p s -> L (pos p) $ LongTok (read (init s)) } 0 [xX] $hexdig { \p s -> L (pos p) $ IntTok (fst . head $ readHex (drop 2 s)) } 0 [xX] $hexdig [lL] { \p s -> L (pos p) $ IntTok (fst . head $ readHex (init (drop 2 s))) } $digit+ \. $digit* @exponent? [dD]? { \p s -> L (pos p) $ DoubleTok (fst . head $ readFloat $ '0':s) } \. $digit+ @exponent? [dD]? { \p s -> L (pos p) $ DoubleTok (fst . head $ readFloat $ '0':s) } $digit+ \. $digit* @exponent? [fF] { \p s -> L (pos p) $ FloatTok (fst . head $ readFloat $ '0':s) } \. $digit+ @exponent? [fF] { \p s -> L (pos p) $ FloatTok (fst . head $ readFloat $ '0':s) } $digit+ @exponent? [dD]? { \p s -> L (pos p) $ DoubleTok (fst . head $ readFloat s) } $digit+ @exponent? [fF] { \p s -> L (pos p) $ FloatTok (fst . head $ readFloat s) } 0 [xX] $hexdig* \.? $hexdig* @pexponent [dD]? { \p s -> L (pos p) $ DoubleTok (readHexExp (drop 2 s)) } 0 [xX] $hexdig* \.? $hexdig* @pexponent [fF] { \p s -> L (pos p) $ FloatTok (readHexExp (drop 2 s)) } true { \p _ -> L (pos p) $ BoolTok True } false { \p _ -> L (pos p) $ BoolTok False } '(@charEscape)' { \p s -> L (pos p) $ CharTok (readCharTok s) } '(~[\'])' { \p s -> L (pos p) $ CharTok (readCharTok s) } \" (@charEscape | ~[\"])* \" { \p s -> L (pos p) $ StringTok (readStringTok s) } null {\p _ -> L (pos p) $ NullTok } $javaLetter $javaLetterOrDigit* { \p s -> L (pos p) $ IdentTok s } \( { \p _ -> L (pos p) $ OpenParen } \) { \p _ -> L (pos p) $ CloseParen } \[ { \p _ -> L (pos p) $ OpenSquare } \] { \p _ -> L (pos p) $ CloseSquare } \{ { \p _ -> L (pos p) $ OpenCurly } \} { \p _ -> L (pos p) $ CloseCurly } \; { \p _ -> L (pos p) $ SemiColon } \, { \p _ -> L (pos p) $ Comma } \. { \p _ -> L (pos p) $ Period } "=" { \p _ -> L (pos p) $ Op_Equal } ">" { \p _ -> L (pos p) $ Op_GThan } "<" { \p _ -> L (pos p) $ Op_LThan } "!" { \p _ -> L (pos p) $ Op_Bang } "~" { \p _ -> L (pos p) $ Op_Tilde } "?" { \p _ -> L (pos p) $ Op_Query } ":" { \p _ -> L (pos p) $ Op_Colon } "==" { \p _ -> L (pos p) $ Op_Equals } "<=" { \p _ -> L (pos p) $ Op_LThanE } ">=" { \p _ -> L (pos p) $ Op_GThanE } "!=" { \p _ -> L (pos p) $ Op_BangE } "&&" { \p _ -> L (pos p) $ Op_AAnd } "||" { \p _ -> L (pos p) $ Op_OOr } "++" { \p _ -> L (pos p) $ Op_PPlus } "--" { \p _ -> L (pos p) $ Op_MMinus } "+" { \p _ -> L (pos p) $ Op_Plus } "-" { \p _ -> L (pos p) $ Op_Minus } "*" { \p _ -> L (pos p) $ Op_Star } "/" { \p _ -> L (pos p) $ Op_Slash } "&" { \p _ -> L (pos p) $ Op_And } "|" { \p _ -> L (pos p) $ Op_Or } "^" { \p _ -> L (pos p) $ Op_Caret } "%" { \p _ -> L (pos p) $ Op_Percent } "<<" { \p _ -> L (pos p) $ Op_LShift } ">>" { \p _ -> L (pos p) $ Op_RShift } ">>>" { \p _ -> L (pos p) $ Op_RRShift } "+=" { \p _ -> L (pos p) $ Op_PlusE } "-=" { \p _ -> L (pos p) $ Op_MinusE } "*=" { \p _ -> L (pos p) $ Op_StarE } "/=" { \p _ -> L (pos p) $ Op_SlashE } "&=" { \p _ -> L (pos p) $ Op_AndE } "|=" { \p _ -> L (pos p) $ Op_OrE } "^=" { \p _ -> L (pos p) $ Op_CaretE } "%=" { \p _ -> L (pos p) $ Op_PercentE } "<<=" { \p _ -> L (pos p) $ Op_LShiftE } ">>=" { \p _ -> L (pos p) $ Op_RShiftE } ">>>=" { \p _ -> L (pos p) $ Op_RRShiftE } { readHexExp :: Floating a => String -> a readHexExp s = let (m, suf) = head $ readHex s (e, _) = case suf of p:s | toLower p == 'p' -> head $ readHex s _ -> (0, "") in m ** e readCharTok :: String -> Char readCharTok s = trace s $ read . convChar $ s readStringTok :: String -> String readStringTok = read . convChar convChar :: String -> String convChar ('\\':'u':s) = '\\':'x':convChar s convChar (x:s) = x:convChar s convChar "" = "" data L a = L Pos a deriving (Show, Eq) -- (line, column) type Pos = (Int, Int) pos :: AlexPosn -> Pos pos (AlexPn _ l c) = (l,c) data Token -- Keywords = KW_Abstract | KW_Assert | KW_Boolean | KW_Break | KW_Byte | KW_Case | KW_Catch | KW_Char | KW_Class | KW_Const | KW_Continue | KW_Default | KW_Do | KW_Double | KW_Else | KW_Enum | KW_Extends | KW_Final | KW_Finally | KW_Float | KW_For | KW_Goto | KW_If | KW_Implements | KW_Import | KW_Instanceof | KW_Int | KW_Interface | KW_Long | KW_Native | KW_New | KW_Package | KW_Private | KW_Protected | KW_Public | KW_Return | KW_Short | KW_Static | KW_Strictfp | KW_Super | KW_Switch | KW_Synchronized | KW_This | KW_Throw | KW_Throws | KW_Transient | KW_Try | KW_Void | KW_Volatile | KW_While -- Separators | OpenParen | CloseParen | OpenSquare | CloseSquare | OpenCurly | CloseCurly | SemiColon | Comma | Period -- Literals | IntTok Integer | LongTok Integer | DoubleTok Double | FloatTok Double | CharTok Char | StringTok String | BoolTok Bool | NullTok -- Identifiers | IdentTok String -- Operators | Op_Equal | Op_GThan | Op_LThan | Op_Bang | Op_Tilde | Op_Query | Op_Colon | Op_Equals | Op_LThanE | Op_GThanE | Op_BangE | Op_AAnd | Op_OOr | Op_PPlus | Op_MMinus | Op_Plus | Op_Minus | Op_Star | Op_Slash | Op_And | Op_Or | Op_Caret | Op_Percent | Op_LShift | Op_RShift | Op_RRShift | Op_PlusE | Op_MinusE | Op_StarE | Op_SlashE | Op_AndE | Op_OrE | Op_CaretE | Op_PercentE | Op_LShiftE | Op_RShiftE | Op_RRShiftE deriving (Show, Eq) lexer :: String -> [L Token] lexer = alexScanTokens }