module Lexer (tokenize) where
import Control.Applicative ((<|>), (<$>), (<*>))
import Data.Char (isSymbol)
import Text.Parsec (Parsec, alphaNum, anyChar, char, choice, digit, eof, letter,
many, many1, manyTill, newline, notFollowedBy, oneOf, parse,
satisfy, string, try, upper)
import Tokens
import Control.Monad (liftM)
token :: Parsec String u Token
token = NUMBER <$> integer
<|> whitespace
<|> chrs [ ('(',LPAREN) , (')',RPAREN)
, ('{',LBRACE) , ('}',RBRACE)
, ('[',LBRACKET), (']',RBRACKET)
, (',',COMMA) , (';',SEMI), ('_',UNDERSCORE)
, ('\\',LAMBDA) , ('\x03BB',LAMBDA) ]
<|> reserveds [ ("True",TRUE), ("False",FALSE)
, ("if",IF), ("then",THEN), ("else",ELSE)
, ("case",CASE), ("of",OF), ("data", DATA)
, ("let",LET), ("in",IN) ]
<|> do { char '`'; v <- variable; char '`'; return $ OP v }
<|> anyOp
<|> do { char '"'; s <- many $ backslashed <|> satisfy (/='"'); char '"'
; return $ STRING s}
<|> do { char '\''; c <- backslashed <|> satisfy (/='\''); char '\''
; return $ CHAR c}
<|> (ID <$> variable)
<|> typeVar
<|> do { try $ string "\r\n" <|> string "\n"; return NEWLINE }
chrs :: [(Char, Token)] -> Parsec String u Token
chrs = choice . map chr
where chr (c, t) = char c >> return t
reserveds :: [(String, Token)] -> Parsec String u Token
reserveds = choice . map reserved
where reserved (s, t) = try $ string s >>
notFollowedBy (alphaNum <|> char '_') >>
return t
anyOp = do op <- many1 (satisfy isSymbol <|> oneOf "+-/*=.$<>:&|^?%#@~!")
case op of { ".." -> return DOT2
; "->" -> return ARROW; "\8594" -> return ARROW
; _ -> return $ OP op }
backslashed :: Parsec String u Char
backslashed = do { char '\\'; c <- anyChar
; return . read $ ['\'','\\',c,'\''] }
integer :: Parsec String u Int
integer = read <$> many1 digit
variable :: Parsec String u String
variable = identifier $ letter <|> char '_'
typeVar :: Parsec String u Token
typeVar = TYPE <$> identifier upper
identifier :: Parsec String u Char -> Parsec String u String
identifier c = (:) <$> c <*> (many $ alphaNum <|> oneOf "_\'")
whitespace = (lineComment >> return NEWLINE) <|>
(many1 (multiComment <|> many1 (char ' ')) >> return SPACES)
lineComment = do try $ string "--"
manyTill anyChar $ newline <|> (eof >> return '\n')
multiComment = do { try $ string "{-"; closeComment }
closeComment = manyTill anyChar . choice $
[ try $ string "-}"
, do { try $ string "{-"; closeComment; closeComment }
]
tokenParser :: Parsec String u [Token]
tokenParser = many1 token
tokenize :: String -> Either String [Token]
tokenize s = case parse tokenParser "" s of
Right ts -> Right ts
Left err -> Left $ "Syntax error: " ++ show err