module SMR.Source.Lexer ( lexTokens , Located (..) , Location(..)) where import SMR.Source.Token import SMR.Data.Located import Data.Text (Text) import qualified Data.Text as Text import qualified Data.Char as Char -- Lexer ---------------------------------------------------------------------- -- | Lex a sequence of tokens. lexTokens :: Location -> [Char] -> ([Located Token], Location, [Char]) lexTokens lStart0 cs0 = case skipSpace lStart0 cs0 of (lStart, []) -> ( LL lStart lStart KEnd : [] , lStart, []) (lStart, cs) -> case lexToken lStart cs of Nothing -> ([], lStart, cs) Just (k, cs') | (ks, lStart', cs'') <- lexTokens (endOfLocated k) cs' -> (k : ks, lStart', cs'') -- | Lex a single token. lexToken :: Location -> [Char] -> Maybe (Located Token, [Char]) lexToken lStart xx = case xx of [] -> Nothing c : cs -- Punctuation. | isCharPunc c -> let lEnd = incCharOfLocation 1 lStart tok = KPunc c in Just (LL lStart lEnd tok, cs) -- Variable name. | Just (space, xx') <- takeSpace c cs , Just (name, lEnd, csRest) <- lexName (incCharOfLocation 1 lStart) xx' -> let tok = KName space name in Just (LL lStart lEnd tok, csRest) -- Natural number. | Char.isDigit c , Just (nat, lEnd, csRest) <- lexNat lStart (c : cs) -> let tok = KNat nat in Just (LL lStart lEnd tok, csRest) -- Text string. | c == '\"' , Just (tx, lEnd, csRest) <- lexText lStart cs -> let tok = KText tx in Just (LL lStart lEnd tok, csRest) | otherwise -> Nothing -- | Lex a variable name. lexName :: Location -> [Char] -> Maybe (Text, Location, [Char]) lexName lStart xx = go lStart [] xx where go lStart' acc [] | not $ null acc = let name = Text.pack $ reverse acc in Just (name, lStart', []) | otherwise = Nothing go lStart' acc (c : cs) | isNameBodyChar c = go (incCharOfLocation 1 lStart') (c : acc) cs | otherwise = let name = Text.pack $ reverse acc in Just (name, lStart', c : cs) -- | Lex a natural number. lexNat :: Location -> [Char] -> Maybe (Integer, Location, [Char]) lexNat lStart xx = go lStart [] xx where go lStart' acc [] | not $ null acc , all Char.isDigit acc , nat <- read $ reverse acc = Just (nat, lStart', []) go lStart' acc (c : cs) | Char.isDigit c = go (incCharOfLocation 1 lStart') (c : acc) cs | all Char.isDigit acc , not $ null acc , nat <- read $ reverse acc = Just (nat, lStart', c : cs) go _ _ _ = Nothing -- | Lex a string. lexText :: Location -> [Char] -> Maybe (Text, Location, [Char]) lexText lStart xx = go lStart [] xx where go _ _ [] = Nothing go lStart' acc ('\"' : cs) = Just (Text.pack $ reverse acc, lStart', cs) go lStart' acc ('\\' : c : cs) = let l' = incCharOfLocation 1 lStart' in case c of '\"' -> go l' (c : acc) cs '\\' -> go l' (c : acc) cs 'b' -> go l' ('\b' : acc) cs 'f' -> go l' ('\f' : acc) cs 'n' -> go l' ('\n' : acc) cs 'r' -> go l' ('\r' : acc) cs 't' -> go l' ('\t' : acc) cs -- TODO: read hex encoded special chars. _ -> Nothing go lStart' acc (c : cs) = let l' = incCharOfLocation 1 lStart' in go l' (c : acc) cs -- Whitespace ----------------------------------------------------------------- skipSpace :: Location -> [Char] -> (Location, [Char]) skipSpace lStart xx = case xx of [] -> (lStart, xx) c : cs -- Skip whitespace. | c == ' ' -> skipSpace (incCharOfLocation 1 lStart) cs | c == '\n' -> skipSpace (incLineOfLocation 1 lStart) cs | c == '\t' -> skipSpace (incCharOfLocation 8 lStart) cs -- Skip comments | c == '-' , c2 : cs2 <- cs , c2 == '-' -> skipSpace lStart $ dropWhile (\x -> x /= '\n') cs2 | otherwise -> (lStart, xx) -- | Take the namespace qualifier from the front of a name. takeSpace :: Char -> [Char] -> Maybe (Space, [Char]) takeSpace c cs | Char.isLower c = Just (SVar, c : cs) | c == '@' = Just (SMac, cs) | c == '%' = Just (SSym, cs) | c == '+' = Just (SSet, cs) | c == '#' , c' : cs' <- cs , c' == '#' = Just (SKey, cs') | c == '#' = Just (SPrm, cs) | otherwise = Nothing -- Character Classes ---------------------------------------------------------- -- | Check if this character can appear in the body of a name. isNameBodyChar :: Char -> Bool isNameBodyChar c = Char.isLower c || Char.isUpper c || Char.isDigit c || (c == '-' || c == '\'' || c == '_') -- | Check if this is a punctuation character. isCharPunc :: Char -> Bool isCharPunc c | c == '(' = True | c == ')' = True | c == '{' = True | c == '}' = True | c == '[' = True | c == ']' = True | c == '<' = True | c == '>' = True | c == '^' = True | c == ',' = True | c == ':' = True | c == '\\' = True | c == '.' = True | c == ';' = True | c == '=' = True | c == '$' = True | c == '!' = True | c == '~' = True | c == '?' = True | otherwise = False