| 1 | module LexCore where |
|---|
| 2 | |
|---|
| 3 | import ParserCoreUtils |
|---|
| 4 | import Panic |
|---|
| 5 | import Data.Char |
|---|
| 6 | import Numeric |
|---|
| 7 | |
|---|
| 8 | isNameChar :: Char -> Bool |
|---|
| 9 | isNameChar c = isAlpha c || isDigit c || (c == '_') || (c == '\'') |
|---|
| 10 | || (c == '$') || (c == '-') || (c == '.') |
|---|
| 11 | |
|---|
| 12 | isKeywordChar :: Char -> Bool |
|---|
| 13 | isKeywordChar c = isAlpha c || (c == '_') |
|---|
| 14 | |
|---|
| 15 | lexer :: (Token -> P a) -> P a |
|---|
| 16 | lexer cont [] = cont TKEOF [] |
|---|
| 17 | lexer cont ('\n':cs) = \line -> lexer cont cs (line+1) |
|---|
| 18 | lexer cont ('-':'>':cs) = cont TKrarrow cs |
|---|
| 19 | |
|---|
| 20 | lexer cont (c:cs) |
|---|
| 21 | | isSpace c = lexer cont cs |
|---|
| 22 | | isLower c || (c == '_') = lexName cont TKname (c:cs) |
|---|
| 23 | | isUpper c = lexName cont TKcname (c:cs) |
|---|
| 24 | | isDigit c || (c == '-') = lexNum cont (c:cs) |
|---|
| 25 | |
|---|
| 26 | lexer cont ('%':cs) = lexKeyword cont cs |
|---|
| 27 | lexer cont ('\'':cs) = lexChar cont cs |
|---|
| 28 | lexer cont ('\"':cs) = lexString [] cont cs |
|---|
| 29 | lexer cont ('#':cs) = cont TKhash cs |
|---|
| 30 | lexer cont ('(':cs) = cont TKoparen cs |
|---|
| 31 | lexer cont (')':cs) = cont TKcparen cs |
|---|
| 32 | lexer cont ('{':cs) = cont TKobrace cs |
|---|
| 33 | lexer cont ('}':cs) = cont TKcbrace cs |
|---|
| 34 | lexer cont ('=':cs) = cont TKeq cs |
|---|
| 35 | lexer cont (':':'=':':':cs) = cont TKcoloneqcolon cs |
|---|
| 36 | lexer cont (':':':':cs) = cont TKcoloncolon cs |
|---|
| 37 | lexer cont ('*':cs) = cont TKstar cs |
|---|
| 38 | lexer cont ('.':cs) = cont TKdot cs |
|---|
| 39 | lexer cont ('\\':cs) = cont TKlambda cs |
|---|
| 40 | lexer cont ('@':cs) = cont TKat cs |
|---|
| 41 | lexer cont ('?':cs) = cont TKquestion cs |
|---|
| 42 | lexer cont (';':cs) = cont TKsemicolon cs |
|---|
| 43 | -- 20060420 GHC spits out constructors with colon in them nowadays. jds |
|---|
| 44 | -- 20061103 but it's easier to parse if we split on the colon, and treat them |
|---|
| 45 | -- as several tokens |
|---|
| 46 | lexer cont (':':cs) = cont TKcolon cs |
|---|
| 47 | -- 20060420 Likewise does it create identifiers starting with dollar. jds |
|---|
| 48 | lexer cont ('$':cs) = lexName cont TKname ('$':cs) |
|---|
| 49 | lexer _ (c:_) = failP "invalid character" [c] |
|---|
| 50 | |
|---|
| 51 | lexChar :: (Token -> String -> Int -> ParseResult a) -> String -> Int |
|---|
| 52 | -> ParseResult a |
|---|
| 53 | lexChar cont ('\\':'x':h1:h0:'\'':cs) |
|---|
| 54 | | isHexEscape [h1,h0] = cont (TKchar (hexToChar h1 h0)) cs |
|---|
| 55 | lexChar _ ('\\':cs) = failP "invalid char character" ('\\':(take 10 cs)) |
|---|
| 56 | lexChar _ ('\'':_) = failP "invalid char character" ['\''] |
|---|
| 57 | lexChar _ ('\"':_) = failP "invalid char character" ['\"'] |
|---|
| 58 | lexChar cont (c:'\'':cs) = cont (TKchar c) cs |
|---|
| 59 | lexChar _ cs = panic ("lexChar: " ++ show cs) |
|---|
| 60 | |
|---|
| 61 | lexString :: String -> (Token -> [Char] -> Int -> ParseResult a) |
|---|
| 62 | -> String -> Int -> ParseResult a |
|---|
| 63 | lexString s cont ('\\':'x':h1:h0:cs) |
|---|
| 64 | | isHexEscape [h1,h0] = lexString (s++[hexToChar h1 h0]) cont cs |
|---|
| 65 | lexString _ _ ('\\':_) = failP "invalid string character" ['\\'] |
|---|
| 66 | lexString _ _ ('\'':_) = failP "invalid string character" ['\''] |
|---|
| 67 | lexString s cont ('\"':cs) = cont (TKstring s) cs |
|---|
| 68 | lexString s cont (c:cs) = lexString (s++[c]) cont cs |
|---|
| 69 | lexString _ _ [] = panic "lexString []" |
|---|
| 70 | |
|---|
| 71 | isHexEscape :: String -> Bool |
|---|
| 72 | isHexEscape = all (\c -> isHexDigit c && (isDigit c || isLower c)) |
|---|
| 73 | |
|---|
| 74 | hexToChar :: Char -> Char -> Char |
|---|
| 75 | hexToChar h1 h0 = chr (digitToInt h1 * 16 + digitToInt h0) |
|---|
| 76 | |
|---|
| 77 | lexNum :: (Token -> String -> a) -> String -> a |
|---|
| 78 | lexNum cont cs = |
|---|
| 79 | case cs of |
|---|
| 80 | ('-':cs) -> f (-1) cs |
|---|
| 81 | _ -> f 1 cs |
|---|
| 82 | where f sgn cs = |
|---|
| 83 | case span isDigit cs of |
|---|
| 84 | (digits,'.':c:rest) |
|---|
| 85 | | isDigit c -> cont (TKrational (fromInteger sgn * r)) rest' |
|---|
| 86 | where ((r,rest'):_) = readFloat (digits ++ ('.':c:rest)) |
|---|
| 87 | -- When reading a floating-point number, which is |
|---|
| 88 | -- a bit complicated, use the standard library function |
|---|
| 89 | -- "readFloat" |
|---|
| 90 | (digits,rest) -> cont (TKinteger (sgn * (read digits))) rest |
|---|
| 91 | |
|---|
| 92 | lexName :: (a -> String -> b) -> (String -> a) -> String -> b |
|---|
| 93 | lexName cont cstr cs = cont (cstr name) rest |
|---|
| 94 | where (name,rest) = span isNameChar cs |
|---|
| 95 | |
|---|
| 96 | lexKeyword :: (Token -> [Char] -> Int -> ParseResult a) -> String -> Int |
|---|
| 97 | -> ParseResult a |
|---|
| 98 | lexKeyword cont cs = |
|---|
| 99 | case span isKeywordChar cs of |
|---|
| 100 | ("module",rest) -> cont TKmodule rest |
|---|
| 101 | ("data",rest) -> cont TKdata rest |
|---|
| 102 | ("newtype",rest) -> cont TKnewtype rest |
|---|
| 103 | ("forall",rest) -> cont TKforall rest |
|---|
| 104 | ("rec",rest) -> cont TKrec rest |
|---|
| 105 | ("let",rest) -> cont TKlet rest |
|---|
| 106 | ("in",rest) -> cont TKin rest |
|---|
| 107 | ("case",rest) -> cont TKcase rest |
|---|
| 108 | ("of",rest) -> cont TKof rest |
|---|
| 109 | ("cast",rest) -> cont TKcast rest |
|---|
| 110 | ("note",rest) -> cont TKnote rest |
|---|
| 111 | ("external",rest) -> cont TKexternal rest |
|---|
| 112 | ("local",rest) -> cont TKlocal rest |
|---|
| 113 | ("_",rest) -> cont TKwild rest |
|---|
| 114 | _ -> failP "invalid keyword" ('%':cs) |
|---|