{-# OPTIONS_GHC -fglasgow-exts -optc-w -cpp #-} {-| Lexical analyzer. > No words were laid on stream or stone > When Durin woke and walked alone. > He named the nameless hills and dells; > He drank from yet untasted wells... -} module Pugs.Lexer ( wordAlpha, wordAny, isWordAlpha, isWordAny, maybeParens, parens, whiteSpace, mandatoryWhiteSpace, lexeme, identifier, identLetter, braces, brackets, angles, balancedDelim, decimal, ruleVerbatimIdentifier, ruleDelimitedIdentifier, ruleQualifiedIdentifier, symbol, interpolatingStringLiteral, escapeCode, rule, verbatimRule, literalRule, tryRule, tryVerbatimRule, tryChoice, ruleComma, ruleScope, ruleTrait, ruleTraitName, ruleBareTrait, ruleType, verbatimParens, verbatimBrackets, verbatimBraces, eof ) where import Pugs.Internals import Pugs.AST import Pugs.Rule import Pugs.Types import Pugs.Parser.Types import Pugs.Parser.Charnames import Text.ParserCombinators.Parsec.Pos (sourceColumn, sourceLine) import qualified Text.ParserCombinators.Parsec as Parsec (eof) eof :: RuleParser () eof = Parsec.eof "" identStart, identLetter :: RuleParser Char identStart = satisfy isWordAlpha identLetter = satisfy isWordAny wordAlpha :: RuleParser Char wordAny :: RuleParser Char wordAlpha = satisfy isWordAlpha "alphabetic word character" wordAny = satisfy isWordAny "word character" isWordAny :: Char -> Bool isWordAlpha :: Char -> Bool isWordAny x = (isAlphaNum x || x == '_') isWordAlpha x = (isAlpha x || x == '_') maybeParens :: RuleParser a -> RuleParser a maybeParens p = choice [ parens p, p ] parens, braces, angles, brackets :: RuleParser a -> RuleParser a parens p = between (lexeme (char '(')) (lexeme (char ')')) p braces p = between (lexeme (char '{')) (lexeme (char '}')) p angles p = between (lexeme (char '<')) (lexeme (char '>')) p brackets p = between (lexeme (char '[')) (lexeme (char ']')) p mandatoryWhiteSpace :: RuleParser () mandatoryWhiteSpace = skipMany1 (simpleSpace <|> comment) balancedDelim :: Char -> Char balancedDelim c = case c of '\x0028' -> '\x0029'; '\x003C' -> '\x003E'; '\x005B' -> '\x005D'; #ifndef HADDOCK '\x007B' -> '\x007D'; '\x00AB' -> '\x00BB'; '\x0F3A' -> '\x0F3B'; '\x0F3C' -> '\x0F3D'; '\x169B' -> '\x169C'; '\x2039' -> '\x203A'; '\x2045' -> '\x2046'; '\x207D' -> '\x207E'; '\x208D' -> '\x208E'; '\x2208' -> '\x220B'; '\x2209' -> '\x220C'; '\x220A' -> '\x220D'; '\x2215' -> '\x29F5'; '\x223C' -> '\x223D'; '\x2243' -> '\x22CD'; '\x2252' -> '\x2253'; '\x2254' -> '\x2255'; '\x2264' -> '\x2265'; '\x2266' -> '\x2267'; '\x2268' -> '\x2269'; '\x226A' -> '\x226B'; '\x226E' -> '\x226F'; '\x2270' -> '\x2271'; '\x2272' -> '\x2273'; '\x2274' -> '\x2275'; '\x2276' -> '\x2277'; '\x2278' -> '\x2279'; '\x227A' -> '\x227B'; '\x227C' -> '\x227D'; '\x227E' -> '\x227F'; '\x2280' -> '\x2281'; '\x2282' -> '\x2283'; '\x2284' -> '\x2285'; '\x2286' -> '\x2287'; '\x2288' -> '\x2289'; '\x228A' -> '\x228B'; '\x228F' -> '\x2290'; '\x2291' -> '\x2292'; '\x2298' -> '\x29B8'; '\x22A2' -> '\x22A3'; '\x22A6' -> '\x2ADE'; '\x22A8' -> '\x2AE4'; '\x22A9' -> '\x2AE3'; '\x22AB' -> '\x2AE5'; '\x22B0' -> '\x22B1'; '\x22B2' -> '\x22B3'; '\x22B4' -> '\x22B5'; '\x22B6' -> '\x22B7'; '\x22C9' -> '\x22CA'; '\x22CB' -> '\x22CC'; '\x22D0' -> '\x22D1'; '\x22D6' -> '\x22D7'; '\x22D8' -> '\x22D9'; '\x22DA' -> '\x22DB'; '\x22DC' -> '\x22DD'; '\x22DE' -> '\x22DF'; '\x22E0' -> '\x22E1'; '\x22E2' -> '\x22E3'; '\x22E4' -> '\x22E5'; '\x22E6' -> '\x22E7'; '\x22E8' -> '\x22E9'; '\x22EA' -> '\x22EB'; '\x22EC' -> '\x22ED'; '\x22F0' -> '\x22F1'; '\x22F2' -> '\x22FA'; '\x22F3' -> '\x22FB'; '\x22F4' -> '\x22FC'; '\x22F6' -> '\x22FD'; '\x22F7' -> '\x22FE'; '\x2308' -> '\x2309'; '\x230A' -> '\x230B'; '\x2329' -> '\x232A'; '\x23B4' -> '\x23B5'; '\x2768' -> '\x2769'; '\x276A' -> '\x276B'; '\x276C' -> '\x276D'; '\x276E' -> '\x276F'; '\x2770' -> '\x2771'; '\x2772' -> '\x2773'; '\x2774' -> '\x2775'; '\x27C3' -> '\x27C4'; '\x27C5' -> '\x27C6'; '\x27D5' -> '\x27D6'; '\x27DD' -> '\x27DE'; '\x27E2' -> '\x27E3'; '\x27E4' -> '\x27E5'; '\x27E6' -> '\x27E7'; '\x27E8' -> '\x27E9'; '\x27EA' -> '\x27EB'; '\x2983' -> '\x2984'; '\x2985' -> '\x2986'; '\x2987' -> '\x2988'; '\x2989' -> '\x298A'; '\x298B' -> '\x298C'; '\x298D' -> '\x298E'; '\x298F' -> '\x2990'; '\x2991' -> '\x2992'; '\x2993' -> '\x2994'; '\x2995' -> '\x2996'; '\x2997' -> '\x2998'; '\x29C0' -> '\x29C1'; '\x29C4' -> '\x29C5'; '\x29CF' -> '\x29D0'; '\x29D1' -> '\x29D2'; '\x29D4' -> '\x29D5'; '\x29D8' -> '\x29D9'; '\x29DA' -> '\x29DB'; '\x29F8' -> '\x29F9'; '\x29FC' -> '\x29FD'; '\x2A2B' -> '\x2A2C'; '\x2A2D' -> '\x2A2E'; '\x2A34' -> '\x2A35'; '\x2A3C' -> '\x2A3D'; '\x2A64' -> '\x2A65'; '\x2A79' -> '\x2A7A'; '\x2A7D' -> '\x2A7E'; '\x2A7F' -> '\x2A80'; '\x2A81' -> '\x2A82'; '\x2A83' -> '\x2A84'; '\x2A8B' -> '\x2A8C'; '\x2A91' -> '\x2A92'; '\x2A93' -> '\x2A94'; '\x2A95' -> '\x2A96'; '\x2A97' -> '\x2A98'; '\x2A99' -> '\x2A9A'; '\x2A9B' -> '\x2A9C'; '\x2AA1' -> '\x2AA2'; '\x2AA6' -> '\x2AA7'; '\x2AA8' -> '\x2AA9'; '\x2AAA' -> '\x2AAB'; '\x2AAC' -> '\x2AAD'; '\x2AAF' -> '\x2AB0'; '\x2AB3' -> '\x2AB4'; '\x2ABB' -> '\x2ABC'; '\x2ABD' -> '\x2ABE'; '\x2ABF' -> '\x2AC0'; '\x2AC1' -> '\x2AC2'; '\x2AC3' -> '\x2AC4'; '\x2AC5' -> '\x2AC6'; '\x2ACD' -> '\x2ACE'; '\x2ACF' -> '\x2AD0'; '\x2AD1' -> '\x2AD2'; '\x2AD3' -> '\x2AD4'; '\x2AD5' -> '\x2AD6'; '\x2AEC' -> '\x2AED'; '\x2AF7' -> '\x2AF8'; '\x2AF9' -> '\x2AFA'; '\x2E02' -> '\x2E03'; '\x2E04' -> '\x2E05'; '\x2E09' -> '\x2E0A'; '\x2E0C' -> '\x2E0D'; '\x2E1C' -> '\x2E1D'; '\x3008' -> '\x3009'; '\x300A' -> '\x300B'; '\x300C' -> '\x300D'; '\x300E' -> '\x300F'; '\x3010' -> '\x3011'; '\x3014' -> '\x3015'; '\x3016' -> '\x3017'; '\x3018' -> '\x3019'; '\x301A' -> '\x301B'; '\x301D' -> '\x301E'; '\xFD3E' -> '\xFD3F'; '\xFE17' -> '\xFE18'; '\xFE35' -> '\xFE36'; '\xFE37' -> '\xFE38'; '\xFE39' -> '\xFE3A'; '\xFE3B' -> '\xFE3C'; '\xFE3D' -> '\xFE3E'; '\xFE3F' -> '\xFE40'; '\xFE41' -> '\xFE42'; '\xFE43' -> '\xFE44'; '\xFE47' -> '\xFE48'; '\xFE59' -> '\xFE5A'; '\xFE5B' -> '\xFE5C'; '\xFE5D' -> '\xFE5E'; '\xFF08' -> '\xFF09'; '\xFF1C' -> '\xFF1E'; '\xFF3B' -> '\xFF3D'; '\xFF5B' -> '\xFF5D'; '\xFF5F' -> '\xFF60'; '\xFF62' -> '\xFF63'; other -> other #endif {- -- balanced: parses an open/close delimited expression of any non-alphanumeric character balanced :: RuleParser String balanced = do notFollowedBy alphaNum opendelim <- anyChar contents <- many $ satisfy (/= balancedDelim opendelim) char $ balancedDelim opendelim return contents -- The \b rule. _ruleWordBoundary :: RuleParser () _ruleWordBoundary = do prev <- getPrevCharClass case prev of SpaceClass -> return () _ -> do curr <- getCurrCharClass guard (prev /= curr) -- The rule. ruleWs :: RuleParser () ruleWs = do prev <- getPrevCharClass case prev of SpaceClass -> whiteSpace _ -> do curr <- getCurrCharClass if prev == curr then mandatoryWhiteSpace else case curr of SpaceClass -> whiteSpace _ -> return () -} {-| Match one or more identifiers, separated internally by the given delimiter. Returns a list of the identifiers matched, discarding the delimiters. You can always recreate them using \"@concat $ intersperse delim@\" if you want, or else use 'ruleQualifiedIdentifier'. -} ruleDelimitedIdentifier :: String -- ^ Delimiter (e.g. \'@::@\') -> RuleParser [String] ruleDelimitedIdentifier delim = verbatimRule "delimited identifier" $ do -- Allowing the leading delim actually leads to subtle oddness with things -- like `use jsan:.Foo` and `use pugs:::Foo`, so I took it out. --option "" (try $ string delim) -- leading delimiter ruleVerbatimIdentifier `sepBy1` (try $ string delim) ruleQualifiedIdentifier :: RuleParser String ruleQualifiedIdentifier = verbatimRule "qualified identifier" $ do chunks <- ruleDelimitedIdentifier "::" return $ concat (intersperse "::" chunks) ruleVerbatimIdentifier :: RuleParser String ruleVerbatimIdentifier = verbatimRule "identifier" $ do c <- identStart cs <- many identLetter return (c:cs) {- {-| Match any amount of whitespace (not including newlines), followed by a newline (as matched by 'ruleEndOfLine'). -} ruleWhiteSpaceLine :: RuleParser () ruleWhiteSpaceLine = do many $ satisfy (\x -> isSpace x && x /= '\n') ruleEndOfLine {-| Match either a single newline, or EOF (which constitutes the termination of a line anyway). -} ruleEndOfLine :: RuleParser () ruleEndOfLine = choice [ do { char '\n'; return () }, eof ] -} symbol :: String -> RuleParser String symbol s = try $ do rv <- string s let lastCh = last s ahead f = lookAhead (satisfy (f lastCh)) <|> (eof >> return ' ') if isWordAny lastCh then ahead aheadWord >> whiteSpace >> notFollowedBy (try (char '=' >> char '>')) else ahead aheadSym >> whiteSpace return rv where aheadWord x '=' = not $ x `elem` "xXZ" aheadWord _ y = not $ isWordAny y aheadSym '-' '>' = False -- XXX hardcode aheadSym '!' '~' = False -- XXX hardcode aheadSym '\\''(' = False -- XXX hardcode aheadSym '|' '<' = False -- XXX hardcode aheadSym x '=' = not (x `elem` "!~+-*&/|.%^") aheadSym '?' y = not (y `elem` "&|^?") aheadSym '+' y = not (y `elem` "&|^+") aheadSym '~' y = not (y `elem` "&|^~") aheadSym '^' y = not (y `elem` "^.") aheadSym x y = y `elem` ";!" || x /= y interpolatingStringLiteral :: RuleParser String -- ^ Opening delimiter -> RuleParser String -- ^ Closing delimiter -> RuleParser Exp -- ^ Interpolator -> RuleParser Exp -- ^ Entire string -- (without delims) interpolatingStringLiteral startRule endRule interpolator = do list <- stringList 0 return $ Ann (Cxt (CxtItem $ mkType "Str")) (homogenConcat list) where homogenConcat :: [Exp] -> Exp homogenConcat [] = Val (VStr "") homogenConcat [v@(Val (VStr _))] = v homogenConcat (Val (VStr x):Val (VStr y):xs) = homogenConcat (Val (VStr (x ++ y)) : xs) homogenConcat (x:xs) = App (_Var "&infix:~") Nothing [x, homogenConcat xs] stringList :: Int -> RuleParser [Exp] stringList i = choice [ do parse <- interpolator rest <- stringList i return (parse:rest) , do ch <- try endRule if i == 0 then return [] else do rest <- stringList (i-1) return (Val (VStr ch):rest) , do ch <- try startRule rest <- stringList (i+1) return (Val (VStr ch):rest) , do char <- anyChar rest <- stringList i return (Val (VStr [char]):rest) ] -- | Backslashed non-alphanumerics (except for @\^@) translate into themselves. escapeCode :: RuleParser String escapeCode = charNum <|> ch charEsc <|> ch charAscii <|> charControl <|> ch anyChar "escape code" where ch = fmap (:[]) charControl :: RuleParser String charControl = do char 'c' code <- upper <|> oneOf "@[" <|> digit case code of '[' -> do charNames <- many1 (noneOf ",]") `sepBy1` many1 ruleComma char ']' forM charNames $ \charName -> do if all isDigit charName then return $ chr (read charName) else case nameToCode charName of Just c -> return (chr c) _ -> fail $ "Invalid unicode character name: " ++ charName _ | isDigit code -> do cs <- many digit return [chr $ read (code:cs)] _ -> return [toEnum (fromEnum code - fromEnum '@')] -- This is currently the only escape that can return multiples. charNum :: RuleParser String charNum = do codes <- choice [ many1 digit >>= \ds -> case ds of "0" -> return [0] -- "\08..." and "\09..." are treated as "\0" and then "8..." or "9...". ('0':xs@(x:_)) | x == '8' || x == '9' -> return (0:map (toInteger . ord) xs) _ -> error ("Error: Invalid escape sequence \\" ++ ds ++ "; write as decimal \\c" ++ ds ++ " or octal \\o" ++ ds ++ " instead") -- return [read ds] , based 'o' 8 octDigit , based 'x' 16 hexDigit -- , based 'c' 10 digit ] return $ map (toEnum . fromInteger) codes where based ch num p = do char ch choice [ verbatimBrackets (number num p `sepEndBy1` ruleComma) , fmap (:[]) (number num p) ] ruleComma :: RuleParser () ruleComma = do lexeme (char ',') return () number :: Integer -> RuleParser Char -> RuleParser Integer number base baseDigit = do{ digits <- many1 baseDigit ; let n = foldl (\x d -> base*x + toInteger (digitToInt d)) 0 digits ; seq n (return n) } charEsc :: RuleParser Char charEsc = choice (map parseEsc escMap) where parseEsc (c,code) = do{ char c; return code } charAscii :: RuleParser Char charAscii = choice (map parseAscii asciiMap) where parseAscii (asc,code) = try (do{ string asc; return code }) -- escape code tables escMap :: [(Char, Char)] escMap = zip ("abfnrtv\\\"\'") ("\a\b\f\n\r\t\v\\\"\'") asciiMap :: [(String, Char)] asciiMap = zip (ascii3codes ++ ascii2codes) (ascii3 ++ ascii2) ascii2codes :: [String] ascii2codes = ["BS","HT","LF","VT","FF","CR","SO","SI","EM", "FS","GS","RS","US","SP"] ascii3codes :: [String] ascii3codes = ["NUL","SOH","STX","ETX","EOT","ENQ","ACK","BEL", "DLE","DC1","DC2","DC3","DC4","NAK","SYN","ETB", "CAN","SUB","ESC","DEL"] ascii2 :: [Char] ascii2 = ['\BS','\HT','\LF','\VT','\FF','\CR','\SO','\SI', '\EM','\FS','\GS','\RS','\US','\SP'] ascii3 :: [Char] ascii3 = ['\NUL','\SOH','\STX','\ETX','\EOT','\ENQ','\ACK', '\BEL','\DLE','\DC1','\DC2','\DC3','\DC4','\NAK', '\SYN','\ETB','\CAN','\SUB','\ESC','\DEL'] rule :: String -> RuleParser a -> RuleParser a rule name = ( name) . lexeme verbatimRule :: String -> RuleParser a -> RuleParser a verbatimRule name = ( name) literalRule :: String -> RuleParser a -> RuleParser a literalRule name = ( name) . postSpace tryRule :: String -> RuleParser a -> RuleParser a tryRule name = ( name) . lexeme . try tryVerbatimRule :: String -> RuleParser a -> RuleParser a tryVerbatimRule name = ( name) . try ruleScope :: RuleParser Scope ruleScope = rule "scope" $ do scope <- ruleScopeName return $ readScope scope where readScope "state" = SState readScope "has" = SHas readScope "my" = SMy readScope "our" = SOur readScope "constant"= SConstant readScope _ = SState ruleScopeName :: RuleParser String ruleScopeName = choice $ map symbol scopeNames scopeNames :: [String] scopeNames = map (map toLower) . map (tail . show) $ ([minBound .. maxBound] :: [Scope]) postSpace :: RuleParser a -> RuleParser a postSpace rule = try $ do rv <- rule notFollowedBy wordAny whiteSpace return rv ruleTrait :: [String] -> RuleParser (String, String) ruleTrait auxs = rule "trait" $ do aux <- choice $ map symbol auxs trait <- do optional $ string "::" -- XXX Bad Hack ruleQualifiedIdentifier -- XXX: For now, we *only* *parse* "is export(...)". The arguments are -- thrown away. So module writers can give a sensible export lists now, -- as otherwise -- Otherwise, it'll be a pain to go back to every module and -- change it all once the proper behaviour is implemented. optional $ verbatimParens $ many $ satisfy (/= ')') return (aux, trait) ruleTraitName :: String -> RuleParser String ruleTraitName trait = rule "named trait" $ do symbol "is" symbol trait ruleQualifiedIdentifier ruleBareTrait :: String -> RuleParser String ruleBareTrait trait = rule "bare trait" $ do choice [ ruleTraitName trait , do symbol trait str <- ruleQualifiedIdentifier -- Hierarchical types like Hash of Str -- not yet recognised many . try $ do { whiteSpace; symbol "of"; ruleQualifiedIdentifier } return str ] ruleType :: RuleParser String ruleType = literalRule "context" $ do -- Valid type names: Foo, Bar::Baz, ::Grtz, ::?CLASS, but not :Foo lead <- count 1 wordAlpha <|> (string "::" >> return []) rest <- many (wordAny <|> oneOf ":&|?") return (lead ++ rest) {-| Attempt each of the given parsers in turn until one succeeds, but if one of them fails we backtrack (i.e. retroactively consume no input) before trying the next one. -} tryChoice :: [RuleParser a] -- ^ List of candidate parsers -> RuleParser a tryChoice = choice . map try {-| Match '@(@', followed by the given parser, followed by '@)@'. -} verbatimParens :: RuleParser a -> RuleParser a verbatimParens = between (lexeme $ char '(') (char ')') {-| Match '@\[@', followed by the given parser, followed by '@\]@'. -} verbatimBrackets :: RuleParser a -> RuleParser a verbatimBrackets = between (lexeme $ char '[') (char ']') {-| Match '@{@', followed by the given parser, followed by '@}@'. -} verbatimBraces :: RuleParser a -> RuleParser a verbatimBraces = between (lexeme $ char '{') (char '}') ----------------------------------------------------------- -- Chars & Strings ----------------------------------------------------------- ----------------------------------------------------------- -- Numbers ----------------------------------------------------------- decimal :: RuleParser Integer decimal = number 10 digit ----------------------------------------------------------- -- Identifiers & Reserved words ----------------------------------------------------------- identifier, ident :: RuleParser String identifier = lexeme . try $ ident ident = do{ c <- identStart ; cs <- many identLetter ; return (c:cs) } "identifier" ----------------------------------------------------------- -- White space & symbols ----------------------------------------------------------- lexeme :: RuleParser a -> RuleParser a lexeme p = do{ x <- p; whiteSpace; return x } whiteSpace :: RuleParser () whiteSpace = skipMany (simpleSpace <|> comment) comment :: RuleParser () comment = do char '#' "" pos <- getPosition if sourceColumn pos /= 2 then multiLineComment <|> skipToLineEnd else do -- Beginning of line - parse #line directive isPlain <- (<|> return True) $ try $ do string "line" many1 $ satisfy (\x -> isSpace x && x /= '\n') return False if isPlain then skipToLineEnd else do line <- decimal file <- (<|> return Nothing) $ try $ do many1 $ satisfy (\x -> isSpace x && x /= '\n') fileNameQuoted <|> fileNameBare case file of Just Nothing -> skipToLineEnd _ -> do many $ satisfy (/= '\n') setPosition $ pos `setSourceLine` (fromInteger line - 1) `setSourceColumn` 1 `setSourceName` maybe (sourceName pos) fromJust file fileNameQuoted :: RuleParser (Maybe (Maybe String)) fileNameQuoted = try $ do char '"' file <- many (satisfy (/= '"')) char '"' many $ satisfy (\x -> isSpace x && x /= '\n') lookAhead (satisfy (== '\n')) return . Just $ Just file fileNameBare :: RuleParser (Maybe (Maybe String)) fileNameBare = try $ do file <- many1 $ satisfy (not . isSpace) many $ satisfy (\x -> isSpace x && x /= '\n') (<|> return (Just Nothing)) $ try $ do lookAhead (satisfy (== '\n')) return . Just $ Just file skipToLineEnd :: RuleParser () skipToLineEnd = do skipMany (satisfy (/= '\n')) simpleSpace :: RuleParser () simpleSpace = do skipMany1 (satisfy isSpace) pos <- getPosition updateState (\s -> s{ s_wsLine = sourceLine pos, s_wsColumn = sourceColumn pos }) multiLineComment :: RuleParser () multiLineComment = do openOne <- satisfy (\x -> balancedDelim x /= x) more <- many (char openOne) let closeOne = balancedDelim openOne openAll = string (openOne:more) closeAll = string (replicate (1 + length more) (balancedDelim openOne)) scanOne = do c <- anyChar if c == closeOne then return () else do if c == openOne then scanOne >> scanOne else do scanOne scanAll = choice [ do { try closeAll; return () } , do { try openAll; scanAll; scanAll } , do { anyChar; scanAll } ] if null more then scanOne else scanAll