{-# LANGUAGE BangPatterns #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE TupleSections #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} -- | Duet syntax tokenizer. module Duet.Tokenizer where import Control.Monad import Data.Char import Data.List import Data.Text (Text) import qualified Data.Text as T import Duet.Printer import Duet.Types import Text.Parsec hiding (anyToken) import Text.Parsec.Text import Text.Printf tokenize :: FilePath -> Text -> Either ParseError [(Token, Location)] tokenize fp t = parse tokensTokenizer fp t tokensTokenizer :: Parser [(Token, Location)] tokensTokenizer = manyTill (many space >>= tokenTokenizer) (try (spaces >> eof)) tokenTokenizer :: [Char] -> Parser (Token, Location) tokenTokenizer prespaces = choice [ if isSuffixOf "\n" prespaces then do pos <- getPosition pure ( NonIndentedNewline , Location (sourceLine pos) (sourceColumn pos) (sourceLine pos) (sourceColumn pos)) else unexpected "indented newline" , atomThenSpace If "if" , atomThenSpace Then "then" , atomThenSpace ClassToken "class" , atomThenSpace InstanceToken "instance" , atomThenSpace Where "where" , atomThenSpace Data "data" , atomThenSpace Else "else" , atomThenSpace ForallToken "forall" , atomThenSpace Case "case" , atomThenSpace Of "of" , atom Bang "!" , atom Period "." , atom Backslash "\\" , atom OpenParen "(" , atom CloseParen ")" , atom Imply "=>" , atom Equals "=" , atom Bar "|" , atom Colons "::" , atom RightArrow "->" , atomThenSpace Let "let" , atomThenSpace In "in" , atom Comma "," , do tok <- parsing Operator (fmap T.pack (choice [ string "*" , string "+" , try (string ">=") , try (string "<=") , string ">" , string "<" , string "/" ])) "operator (e.g. *, <, +, etc.)" when (null prespaces) (unexpected (tokenString tok ++ ", there should be spaces before and after operators.")) lookAhead spaces1 ("space after " ++ tokenString tok) pure tok , specialParsing Character (do _ <- string "'" chars <- many1 (satisfy (\c->c/='\n' &&c/= '\'')) "character e.g. 'a'" when (length chars > 1) (unexpected (concat [ "character: you wrote\n" , "'" ++ ellipsis 5 chars ++ "\n" , "but only one character is allowed inside single quotes, like this:\n'" ++ take 1 chars ++ "'" , "\nPerhaps you forgot to put the closing single quote?\n" , "You may also have meant to use double quotes for text, e.g.\n" , "\"" ++ takeWhile (/= '\'') chars ++ "\"" ])) _ <- string "'" pure (head chars)) "character (e.g. 'a', 'z', '9', etc.)" , parsing String (do _ <- string "\"" chars <- many (satisfy (\c -> c /= '"')) when (any (== '\\') chars) (unexpected "\\ character, not allowed inside a string.") when (any (== '\n') chars) (unexpected "newline character, not allowed inside a string.") _ <- string "\"" "double quotes (\") to close the string" pure (T.pack chars)) "string (e.g. \"hello\", \"123\", etc.)" , parsing Constructor (do c <- satisfy isUpper variable <- many (satisfy (flip elem (['A' .. 'Z']++['a' .. 'z']))) pure (T.singleton c <> T.pack variable)) "constructor (e.g. “Rocket”, “Just”, etc.)" , parsing Variable (do variable <- do start <- many1 (satisfy (flip elem ("_" ++ ['a' .. 'z']))) end <- many (satisfy (flip elem ("_" ++ ['A' .. 'Z'] ++['a' .. 'z'] ++ ['0' .. '9']))) pure (start ++ end) pure (T.pack variable)) "variable (e.g. “elephant”, “age”, “t2”, etc.)" , parseNumbers prespaces ] where spaces1 :: Parser () spaces1 = space >> spaces ellipsis :: Int -> [Char] -> [Char] ellipsis n text = if length text > 2 then take n text ++ "…" else text specialParsing :: (t1 -> t) -> Parser t1 -> String -> Parser (t, Location) specialParsing constructor parser description = do start <- getPosition thing <- parser description end <- getPosition pure ( constructor thing , Location (sourceLine start) (sourceColumn start) (sourceLine end) (sourceColumn end)) atom :: t -> String -> Parser (t, Location) atom constructor text = do start <- getPosition _ <- try (string text) smartQuotes text end <- getPosition pure ( constructor , Location (sourceLine start) (sourceColumn start) (sourceLine end) (sourceColumn end)) atomThenSpace :: t -> String -> Parser (t, Location) atomThenSpace constructor text = do start <- getPosition _ <- try ((string text smartQuotes text) <* (lookAhead spaces1 ("space or newline after " ++ smartQuotes text))) end <- getPosition pure ( constructor , Location (sourceLine start) (sourceColumn start) (sourceLine end) (sourceColumn end)) parsing :: (Text -> t) -> Parser Text -> String -> Parser (t, Location) parsing constructor parser description = do start <- getPosition text <- parser description mapM_ (bailOnUnsupportedKeywords text) [ "class" , "data" , "default" , "deriving" , "do" , "forall" , "import" , "infix" , "infixl" , "infixr" , "instance" , "module" , "if" , "then" , "else" , "case" , "newtype" , "qualified" , "type" , "where" , "foreign" , "ccall" , "as" , "safe" , "unsafe" ] end <- getPosition pure ( constructor text , Location (sourceLine start) (sourceColumn start) (sourceLine end) (sourceColumn end)) where supportedKeywords = ["class","data","forall","instance","if","then","else","case"] bailOnUnsupportedKeywords text word = when (text == word) (unexpected (if elem word supportedKeywords then "the keyword " ++ curlyQuotes (T.unpack word) ++ " isn't in the right place or is incomplete. Try adding a space after it?" else ("“" ++ T.unpack word ++ "”: that keyword isn't allowed, " ++ ext))) where ext = "but you could use this instead: " ++ T.unpack word ++ "_" parseNumbers :: [a] -> Parser (Token, Location) parseNumbers prespaces = parser "number (e.g. 42, 3.141, etc.)" where parser = do start <- getPosition neg <- fmap Just (char '-') <|> pure Nothing let operator = do end <- getPosition pure ( Operator "-" , Location (sourceLine start) (sourceColumn start) (sourceLine end) (sourceColumn end)) number :: (forall a. (Num a) => a -> a) -> Parser (Token, Location) number f = do x <- many1 digit (do _ <- char '.' y <- many1 digit ("decimal component, e.g. " ++ x ++ ".0") end <- getPosition pure ( Decimal (f (read (x ++ "." ++ y))) , Location (sourceLine start) (sourceColumn start) (sourceLine end) (sourceColumn end))) <|> (do end <- getPosition pure ( Integer (f (read x)) , Location (sourceLine start) (sourceColumn start) (sourceLine end) (sourceColumn end))) case neg of Nothing -> number id Just {} -> do when (null prespaces) (unexpected (curlyQuotes "-" ++ ", there should be a space before it.")) (number (* (-1)) "number (e.g. 123)") <|> operator <* (space ("space after operator " ++ curlyQuotes "-")) smartQuotes :: [Char] -> [Char] smartQuotes t = "“" <> t <> "”" equalToken :: Token -> TokenParser Location equalToken p = fmap snd (satisfyToken (==p) tokenStr p) -- | Consume the given predicate from the token stream. satisfyToken :: (Token -> Bool) -> TokenParser (Token, Location) satisfyToken p = consumeToken (\tok -> if p tok then Just tok else Nothing) -- | The parser @anyToken@ accepts any kind of token. It is for example -- used to implement 'eof'. Returns the accepted token. anyToken :: TokenParser (Token, Location) anyToken = consumeToken Just -- | Consume the given predicate from the token stream. consumeToken :: (Token -> Maybe a) -> TokenParser (a, Location) consumeToken f = do u <- getState tokenPrim tokenString tokenPosition (\(tok, loc) -> if locationStartColumn loc > u then fmap (, loc) (f tok) else Nothing) -- | Make a string out of the token, for error message purposes. tokenString :: (Token, Location) -> [Char] tokenString = tokenStr . fst tokenStr :: Token -> [Char] tokenStr tok = case tok of If -> curlyQuotes "if" Then -> curlyQuotes "then" Imply -> curlyQuotes "=>" RightArrow -> curlyQuotes "->" Else -> curlyQuotes "else" Where -> curlyQuotes "where" ClassToken -> curlyQuotes "class" Data -> curlyQuotes "data" InstanceToken -> curlyQuotes "instance" Case -> curlyQuotes "case" Of -> curlyQuotes "of" Let -> curlyQuotes "let" NonIndentedNewline -> "non-indented newline" In -> curlyQuotes "in" Backslash -> curlyQuotes ("backslash " ++ curlyQuotes "\\") OpenParen -> "opening parenthesis " ++ curlyQuotes "(" CloseParen -> "closing parenthesis " ++ curlyQuotes ")" Equals -> curlyQuotes "=" Colons -> curlyQuotes "::" ForallToken -> curlyQuotes "forall" Variable t -> "variable " ++ curlyQuotes (T.unpack t) Constructor t -> "constructor " ++ curlyQuotes (T.unpack t) Character !c -> "character '" ++ (T.unpack (T.singleton c)) ++ "'" String !t -> "string " ++ show t Operator !t -> "operator " ++ curlyQuotes (T.unpack t) Comma -> curlyQuotes "," Integer !i -> "integer " ++ show i Decimal !d -> "decimal " ++ printf "%f" d Bar -> curlyQuotes "|" Period -> curlyQuotes "." Bang -> curlyQuotes "!" -- | Update the position by the token. tokenPosition :: SourcePos -> (Token, Location) -> t -> SourcePos tokenPosition pos (_, l) _ = setSourceColumn (setSourceLine pos line) col where (line,col) = (locationStartLine l, locationStartColumn l) type TokenParser e = forall s m. Stream s m (Token, Location) => ParsecT s Int m e -- | @notFollowedBy p@ only succeeds when parser @p@ fails. This parser -- does not consume any input. This parser can be used to implement the -- \'longest match\' rule. For example, when recognizing keywords (for -- example @let@), we want to make sure that a keyword is not followed -- by a legal identifier character, in which case the keyword is -- actually an identifier (for example @lets@). We can program this -- behaviour as follows: -- -- > keywordLet = try (do{ string "let" -- > ; notFollowedBy alphaNum -- > }) notFollowedBy' :: TokenParser (Token, Location) -> TokenParser () notFollowedBy' p = try ((do c <- try p unexpected (tokenString c)) <|> return ()) -- | This parser only succeeds at the end of the input. This is not a -- primitive parser but it is defined using 'notFollowedBy'. -- -- > eof = notFollowedBy anyToken "end of input" endOfTokens :: TokenParser () endOfTokens = notFollowedBy' anyToken "end of input"