% % (c) 1998-99, sof % A lexer for MS & OMG IDLs \begin{code} {-# LANGUAGE ScopedTypeVariables #-} module Lex ( lexIDL -- :: (IDLToken -> LexM a) -> LexM a ) where import LexM import Char import Numeric import IDLToken import List ( isPrefixOf ) import Literal import BasicTypes import Utils ( deEscapeString, notNull ) import SrcLoc import Opts ( optIncludeAsImport, optExcludeSysIncludes ) \end{code} \begin{code} lexIDL :: (IDLToken -> LexM a) -> LexM a lexIDL cont = do eof <- isEOF if eof then cont T_eof else do c <- getNextChar case c of ' ' -> lexIDL cont '\t' -> lexIDL cont '\v' -> lexIDL cont '\b' -> lexIDL cont '\r' -> lexIDL cont '\f' -> lexIDL cont '\n' -> incLineNo (lexIDL cont) '/' -> do c1 <- getNextChar case c1 of '/' -> lex_oneline_comment cont '*' -> lex_nested_comment cont (1::Int){-one seen-} _ -> do putBackChar c1 cont T_div '(' -> cont T_oparen ')' -> cont T_cparen '{' -> cont T_ocurly '}' -> cont T_ccurly '[' -> cont T_osquare ']' -> cont T_csquare ',' -> cont T_comma '.' -> do c1 <- getNextChar case c1 of '.' -> do c2 <- getNextChar case c2 of '.' -> cont T_dotdotdot _ -> do putBackChar c1 putBackChar c2 cont T_dot _ -> do putBackChar c1 cont T_dot ';' -> cont T_semi ':' -> do c1 <- getNextChar case c1 of ':' -> cont T_dcolon _ -> putBackChar c1 >> cont T_colon 'L' -> do c1 <- getNextChar case c1 of '\"' -> lex_string (\ (T_string_lit ls) -> cont (T_literal (WStringLit ls))) _ -> putBackChar c1 >> start_lex_id 'L' cont '\"' -> lex_string cont -- matching " '\'' -> lex_char cont '=' -> do c1 <- getNextChar case c1 of '=' -> cont T_eqeq _ -> putBackChar c1 >> cont T_equal '!' -> do c1 <- getNextChar case c1 of '=' -> cont T_neq _ -> putBackChar c1 >> cont T_negate '+' -> cont T_plus '-' -> cont T_minus '?' -> cont T_question '<' -> do c1 <- getNextChar case c1 of '<' -> cont (T_shift L) '=' -> cont T_le _ -> putBackChar c1 >> cont T_lt '>' -> do c1 <- getNextChar case c1 of '>' -> cont (T_shift R) '=' -> cont T_ge _ -> putBackChar c1 >> cont T_gt '|' -> do c1 <- getNextChar case c1 of '|' -> cont T_rel_or _ -> putBackChar c1 >> cont T_or '^' -> cont T_xor '&' -> do c1 <- getNextChar case c1 of '&' -> cont T_rel_and _ -> putBackChar c1 >> cont T_and '*' -> cont T_times '%' -> cont T_mod '~' -> cont T_not '#' -> do -- I'm delaying the decision on whether #pragma lines should -- be parsed or not. For the moment, we'll just chop off everything -- after #pragma. cs1 <- getStream let cs2 = dropWhile isSpace cs1 if "define" `isPrefixOf` cs2 then setStream (drop (6::Int) cs2) >> cont T_hdefine else if "pragma" `isPrefixOf` cs2 then case span (/='\n') (drop (6::Int){-length of "pragma"-} cs2) of (prag,[]) -> do setStream [] cont (T_pragma prag) (prag,_:cs3) -> do setStream cs3 cont (T_pragma prag) else if ("line" `isPrefixOf` cs2) || (notNull cs1 && isSpace (head cs1)) then let cs1_no_line = dropWhile (not.isDigit) cs1 in -- munge, munge - get at the line and loc. case (reads cs1_no_line) of ((ln,rs):_) -> case (reads (dropWhile isSpace rs)) of ((fn,rs1):_) -> do -- drop any trailing info (such as flags).. let (flags, rs2) = break (=='\n') rs1 setStream rs2 sl <- getSrcLoc osl <- getOrigSrcLoc let f = modSrcLoc sl ofn = modSrcLoc osl in_system = '3' `elem` flags at_end = '2' `elem` flags flg <- getSystemContextFlag inSystemContext in_system $ do setSrcLoc (mkSrcLoc fn ln) $ if (optIncludeAsImport && at_end) then cont T_include_end else if (optExcludeSysIncludes && at_end && (flg || ofn == fn)) then -- emit end marker if we're leaving a system context -- (or name of source module.) cont T_include_end else if (optExcludeSysIncludes && not at_end && (in_system || ofn == fn) && f /= fn) then -- emit start marker if we're entering a system context -- (or name of source module.) cont (T_include_start fn) else if (not optIncludeAsImport || optExcludeSysIncludes || f == fn) then lexIDL cont -- nothing new. else cont (T_include_start fn) _ -> do sloc <- getSrcLoc cont (T_unknown (sloc,c:cs1)) _ -> do sloc <- getSrcLoc cont (T_unknown (sloc,c:cs1)) else do sloc <- getSrcLoc cont (T_unknown (sloc,c:cs1)) x -> start_lex_id x cont start_lex_id :: Char -> (IDLToken -> LexM a) -> LexM a start_lex_id c cont = do putBackChar c if isDigit c then lex_num cont else if isHexDigit c then lex_guid' cont else lex_id cont {- spool_on :: String -> [Char] -> [Char] spool_on s [] = [] spool_on s ('\n':'#':xs) = case dropWhile isSpace (dropWhile isDigit (dropWhile isSpace xs)) of ('"':xs1) | s `isPrefixOf` xs1 -> dropWhile (/= '\n') xs1 -- matching '"' xs -> spool_on s xs spool_on s (x:xs) = spool_on s xs -} lex_oneline_comment :: (IDLToken -> LexM a) -> LexM a lex_oneline_comment cont = do cs <- getStream case dropWhile (/='\n') cs of [] -> setStream [] >> incLineNo (lexIDL cont) (_:xs) -> setStream xs >> incLineNo (lexIDL cont) lex_nested_comment :: (IDLToken -> LexM a) -> Int -> LexM a lex_nested_comment cont count = do cs <- getStream case dropWhile (\ x -> x /='/' && x /= '*' && x /= '\n') cs of [] -> do setStream [] lexIDL cont ('\n':cs1) -> do setStream cs1 incLineNo (lex_nested_comment cont count) ('/':'*':cs1) -> do setStream cs1 lex_nested_comment cont (count+1) ('*':'/':cs1) -> do setStream cs1 if count == 1 then lexIDL cont else lex_nested_comment cont (count-1) (_:cs1) -> do setStream cs1 lex_nested_comment cont count lex_num :: (IDLToken -> LexM a) -> LexM a lex_num cont = do cs <- getStream case cs of '0':'x':cs1 -> case readHex cs1 of [(i,cs2)] -> setStream (removeL cs2) >> cont (T_literal (IntegerLit (ILit 16 i))) _ -> getSrcLoc >>= \ sc -> cont (T_unknown (sc,cs)) '0':cs1 -> case readOct cs1 of [(i,cs2)] -> case cs2 of '-':cs3 -> try_lex_guid cs3 c:cs3 | isHexDigit c -> case dropWhile (isHexDigit) cs3 of '-':cs4 -> try_lex_guid cs4 _ -> do -- or should that be an error ? setStream cs2 cont (T_literal (IntegerLit (ILit 8 i))) _ -> do setStream (removeL cs2) cont (T_literal (IntegerLit (ILit 8 i))) where -- this may just be the start of a GUID. try_lex_guid cs3 = case lex_guid (takeWhile (isHexDigit) cs) cs3 of Nothing -> do setStream (removeL cs2) cont (T_literal (IntegerLit (ILit 8 i))) Just (guid,cs4) -> do setStream cs4 cont (T_literal (GuidLit guid)) _ -> -- this is a mess. let (as, bs) = span (isHexDigit) cs in case bs of '-':bs1 -> case lex_guid as bs1 of Nothing -> do sloc <- getSrcLoc cont (T_unknown (sloc, cs)) Just (guid,cs2) -> do setStream cs2 cont (T_literal (GuidLit guid)) _ -> case reads cs of [(i,cs2)] -> do setStream (removeL cs2) cont (T_literal (IntegerLit (ILit 10 i))) _ -> case reads cs of [(d,cs2)] -> do setStream cs2 let rs = takeWhile (\ x -> isDigit x || x == '.') cs cont (T_literal (FloatingLit (rs,d))) _ -> do sloc <- getSrcLoc cont (T_unknown (sloc, cs)) _ -> -- this is a mess. let (as, bs) = span (isHexDigit) cs in case bs of '-':bs1 -> case lex_guid as bs1 of Nothing -> do sloc <- getSrcLoc cont (T_unknown (sloc, cs)) Just (guid,cs2) -> do setStream cs2 cont (T_literal (GuidLit guid)) _ -> case reads cs of [(i,cs1)] -> do setStream (removeL cs1) cont (T_literal (IntegerLit (ILit 10 i))) _ -> case reads cs of [(d,cs1)] -> do setStream cs1 let rs = takeWhile (\ x -> isDigit x || x == '.') cs cont (T_literal (FloatingLit (rs,d))) _ -> getSrcLoc >>= \ sc -> cont (T_unknown (sc,cs)) where removeL ('L':xs) = xs removeL xs = xs lex_guid' :: (IDLToken -> LexM a) -> LexM a lex_guid' cont = do cs <- getStream case readHex cs of [((_::Int),cs1)] -> case cs1 of ('-':cs2) -> -- this may just be the start of a GUID. case lex_guid (takeWhile (isHexDigit) cs) cs2 of Nothing -> lex_id cont Just (guid,cs3) -> do setStream cs3 cont (T_literal (GuidLit guid)) _ -> lex_id cont _ -> lex_id cont lex_guid :: String -> String -> Maybe ([String],String) lex_guid d1 cs1 = case span (isHexDigit) cs1 of (d2,'-':cs2) -> case span (isHexDigit) cs2 of (d3,'-':cs3) -> case span (isHexDigit) cs3 of (d4,'-':cs4) -> case span (isHexDigit) cs4 of ([],_) -> Nothing (d5,cs5) -> Just ([d1,d2,d3,d4,d5],cs5) _ -> Nothing _ -> Nothing _ -> Nothing lex_id :: (IDLToken -> LexM a) -> LexM a lex_id cont = do cs <- getStream case span is_id_char cs of ([],(r:rs)) -> do setStream rs sloc <- getSrcLoc cont (T_unknown (sloc, [r])) (is,rs) -> do t <- lookupSymbol is case t of Just tok@T_safearray -> -- SIGH. case rs of ('(':rs2) -> setTok tok >> setStream rs2 >> cont tok _ -> do setTok tok setStream rs res <- lookupType is -- check to see whether SAFEARRAY -- is to be considered an ID or a TYPE.. case res of Nothing -> cont (T_id "SAFEARRAY") Just x -> cont x Just (T_include _) -> do let -- Sigh, trying to integrate CPP's #include syntax -- into IDLs will lead to trouble (what's the parse of -- ""?) Not unfixable, but let's delay doing so until -- we move to using a lexer generator. rs' = dropWhile isSpace rs ((as,bs), wrapper) = case rs' of '(':xs -> -- ("foo.h") or (foo.h) (break (==')') xs, id) '<':xs -> -- (break (=='>') xs , \ x -> '<':x ++ ">") '"':xs -> -- "foo.h" Don't even think of using double qoutes -- in your filenames! (break (=='"') xs, \ x -> '"':x ++ "\"") _ -> (break isSpace rs', id) bs' = case bs of [] -> [] (_:xs) -> xs tok = T_include (wrapper as) setTok tok setStream bs' cont tok Just T_ignore_start -> do ls <- getStream case (dropUntil "__ignore_end__" ls) of [] -> cont T_eof xs -> setStream xs >> lexIDL cont Just tok -> setTok tok >> setStream rs >> cont tok Nothing -> setTok (T_id is) >> setStream rs >> cont (T_id is) is_id_char :: Char -> Bool is_id_char ch = isAlpha ch || isDigit ch || ch == '_' || ch == '$' || ch == '.' dropUntil :: String -> String -> String dropUntil _pref [] = [] dropUntil pref ls@(_:xs) | pref `isPrefixOf` ls = drop (length pref) ls | otherwise = dropUntil pref xs lex_string :: (IDLToken -> LexM a) -> LexM a lex_string cont = do cs <- getStream --assert (head cs /= '\"') case loop cs of (str,cs1) -> do setStream cs1 cont (T_string_lit (deEscapeString str)) where not_quote_nor_esc ch = ch /= '\"' && ch /= '\\' loop cs = case span not_quote_nor_esc cs of (ls,'\\':rs) -> case rs of ('"':rs1) -> -- escaped quote, just continue. let (as,bs) = loop rs1 in (ls++'\\':'\"':as, bs) ('\\':rs1) -> -- want \\" to be interpreted as \\ ", not \ \" let (as,bs) = loop rs1 in (ls++'\\':'\\':as, bs) _ -> -- just continue. let (as,bs) = loop rs in (ls++'\\':as, bs) (ls,'\"':rs) -> (ls,rs) x -> x lex_char :: (IDLToken -> LexM a) -> LexM a lex_char cont = do cs <- getStream loop cs where not_quote_nor_esc ch = ch /= '\'' && ch /= '\\' loop cs = case span not_quote_nor_esc cs of ([],'\\':rs) -> case rs of '\\':'\'':rs1 -> setStream rs1 >> cont (T_literal (CharLit '\\')) x:'\'':rs1 -> setStream rs1 >> cont (T_literal (CharLit (read ['\\',x]))) _ -> getSrcLoc >>= \ sl -> cont (T_unknown (sl,cs)) (ls,'\'':rs) -> setStream rs >> cont (T_literal (CharLit (read ls))) (ls,rs) -> setStream rs >> getSrcLoc >>= \ sl -> cont (T_unknown (sl,ls)) \end{code}