--  Compiler Toolkit: Self-optimizing lexers
--
--  Author : Manuel M. T. Chakravarty
--  Created: 24 February 95, 2 March 99
--
--  Version $Revision: 1.1 $ from $Date: 2002/07/28 03:35:20 $
--
--  Copyright (c) [1995..2000] Manuel M. T. Chakravarty
--  Copyright (c) 2004-2008 Don Stewart
--
--  This library is free software; you can redistribute it and/or
--  modify it under the terms of the GNU Library General Public
--  License as published by the Free Software Foundation; either
--  version 2 of the License, or (at your option) any later version.
--
--  This library is distributed in the hope that it will be useful,
--  but WITHOUT ANY WARRANTY; without even the implied warranty of
--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
--  Library General Public License for more details.
--
--- DESCRIPTION ---------------------------------------------------------------
--
--  Self-optimizing lexer combinators.
--
--  For detailed information, see ``Lazy Lexing is Fast'', Manuel
--  M. T. Chakravarty, in A. Middeldorp and T. Sato, editors, Proceedings of
--  the Fourth Fuji International Symposium on Functional and Logic
--  Programming, Springer-Verlag, LNCS 1722, 1999.  (See my Web page for
--  details.)
--
--    http://www.cse.unsw.edu.au/~chak/papers/Cha99.html
--
--  Thanks to Simon L. Peyton Jones and Roman Lechtchinsky for their helpful
--  suggestions that improved the design of this library.
--
--- DOCU ----------------------------------------------------------------------
--
--  language: Haskell 98
--
--  The idea is to combine the benefits of off-line generators with
--  combinators like in `Parsers.hs' (which builds on Swierstra/Duponcheel's
--  technique for self-optimizing parser combinators).  In essence, a state
--  transition graph representing a lexer table is computed on the fly, to
--  make lexing deterministic and based on cheap table lookups.
--
--  Regular expressions map to Haskell expressions as follows.  If `x' and
--  `y' are regular expressions,
--
--               -> epsilon
--    xy         -> x +> y
--    x*y        -> x `star` y
--    x+y        -> x `plus` y
--    x?y        -> x `quest` y
--
--  Given such a Haskellized regular expression `hre', we can use
--
--    (1) hre `lexaction` \lexeme -> Nothing
--    (2) hre `lexaction` \lexeme -> Just token
--    (3) hre `lexmeta`   \lexeme s -> (res, s', Nothing)
--    (4) hre `lexmeta`   \lexeme s -> (res, s', Just l)
--
--  where `epsilon' is required at the end of `hre' if it otherwise ends on
--  `star', `plus', or `quest', and then, we have
--
--    (1) discards the `lexeme' accepted by `hre',
--    (2) turns the `lexeme' accepted by `hre' into a token,
--    (3) while discarding the lexeme accepted by `hre', transforms the user
--        state, and
--    (4) while discarding the lexeme accepted by `hre', transforms the user
--        state and returns a lexer to be used for the next lexeme.
--
--  The component `res' of a meta action's result can be `Nothing', `Just
--  (Left err)', or `Just (Right token)' to return nothing, an error, or a
--  token from the meta action, respectively.
--
--  * This module makes essential use of graphical data structures (for
--    representing the state transition graph) and of laziness (for
--    maintaining the last action in `execLexer').
--
--  NOTES:
--
--  * In this implementation, the combinators `quest', `star', and `plus' are
--    *right* associative - this differs from the ``Lazy Lexing is Fast''
--    paper.  The change was made on a suggestion by Martin Norbäck.
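--
--  As a quick illustration of the notation (a hypothetical sketch; the
--  token constructor `Ident' is assumed and not part of this module), the
--  regular expression `a(b*)', closed with an action that turns the lexeme
--  into a token, reads as
--
--    char 'a' +> (char 'b' `star` epsilon)
--      `lexaction` \lexeme -> Just (Ident lexeme)
--
--  where the trailing `epsilon' closes the `star', as required above.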
--
--- TODO ----------------------------------------------------------------------
--
--  * error correction is missing
--
--  * in (>||<), in the last case, `(addBoundsNum bn bn')' is too simple, as
--    the number of outgoing edges is not the sum of the numbers of the
--    individual states when there are conflicting edges, ie, ones labeled
--    with the same character; however, the number is only used to decide a
--    heuristic, so it is questionable whether it is worth spending the
--    additional effort of computing the accurate number
--
--  * Unicode poses a problem, as the character domain becomes too big for
--    using arrays to represent transition tables, and even sparse structures
--    will impose a significant overhead when character ranges are naively
--    represented.  So, it might be time for finite maps again.
--
--    Regarding the character ranges, there seem to be at least two
--    possibilities.  Doaitse explicitly uses ranges and avoids expanding
--    them.  The problem with this approach is that we may only have
--    predicates such as `isAlphaNum' to determine whether a given character
--    belongs to some character class.  From this representation it is
--    difficult to efficiently compute a range.  The second approach, as
--    proposed by Tom Pledger (on the Haskell list), would be to use the
--    predicates directly and make the whole business efficient by caching
--    predicate queries.  In other words, for any given character, once we
--    have determined (in a given state) what the following state on
--    accepting that character is, we need not consult the predicates again
--    if we memoise the successor state the first time around.
--

module Lexers (Regexp, Lexer, Action, Meta, Error,
               epsilon, char, (+>), lexaction, lexactionErr, lexmeta,
               action, meta, (>|<), (>||<), star, plus, quest, alt,
               string, with, LexerState, execLexer)
where

import Prelude hiding (last)
import Data.Maybe (fromMaybe)
import Data.Array (Array, (!), assocs, accumArray)

infixr 4 `quest`, `star`, `plus`
infixl 3 +>, `lexaction`, `lexmeta`, `action`, `meta`
infixl 2 >|<, >||<


-- constants
-- ---------

-- we use the dense representation if a table has at least the given number
-- of (non-error) elements
--
denseMin :: Int
denseMin  = 20


-- data structures
-- ---------------

-- represents the number of (non-error) elements and the bounds of a table
--
type BoundsNum = (Int, Char, Char)

-- empty bounds
--
-- nullBoundsNum :: BoundsNum
-- nullBoundsNum  = (0, maxBound, minBound)

-- combine two bounds
--
addBoundsNum :: BoundsNum -> BoundsNum -> BoundsNum
addBoundsNum (n, lc, hc) (n', lc', hc')  = (n + n', min lc lc', max hc hc')

-- check whether a character is in the bounds
--
inBounds :: Char -> BoundsNum -> Bool
inBounds c (_, lc, hc) = c >= lc && c <= hc

-- Lexer errors
--
type Error = String

-- Lexical actions take a lexeme and may return a token; in a variant, an
-- error can be returned (EXPORTED)
--
-- * if no token is returned, the current lexeme is discarded and lexing
--   continues, looking for the next token
--
type Action    t = String -> Maybe t
type ActionErr t = String -> Either Error t

-- Meta actions transform the lexeme and a user-defined state; they may
-- return a lexer, which is then used for accepting the next token (this is
-- important to implement non-regular behaviour like nested comments)
-- (EXPORTED)
--
type Meta s t = String -> s -> (Maybe (Either Error t), -- err/tok?
                                s,                      -- state
                                Maybe (Lexer s t))      -- lexer?
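
-- As an illustrative sketch (hypothetical, not part of this module's
-- interface): a meta action matching the `Meta' type above that counts
-- newlines in an `Int' user state, produces neither a token nor an error,
-- and keeps the current lexer for the next lexeme.
--
{-
countLines :: Meta Int t
countLines lexeme s = (Nothing,                               -- no token/error
                       s + length (filter (== '\n') lexeme),  -- new state
                       Nothing)                               -- same lexer
-}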
-- tree structure used to represent the lexer table (EXPORTED ABSTRACTLY)
--
-- * each node in the tree corresponds to a state of the lexer; the
--   associated actions are those that apply when the corresponding state is
--   reached
--
data Lexer s t = Lexer (LexAction s t) (Cont s t)

-- represents the continuation of a lexer
--
data Cont s t = -- on top of the tree, where entries are dense, we use arrays
                --
                Dense BoundsNum (Array Char (Lexer s t))
                --
                -- further down, where the valid entries are sparse, we use
                -- association lists, to save memory (the first argument is
                -- the length of the list)
                --
              | Sparse BoundsNum [(Char, Lexer s t)]
                --
                -- end of an automaton
                --
              | Done

-- lexical action (EXPORTED ABSTRACTLY)
--
data LexAction s t = Action (Meta s t)
                   | NoAction

-- a regular expression (EXPORTED)
--
type Regexp s t = Lexer s t -> Lexer s t


-- basic combinators
-- -----------------

-- Empty lexeme (EXPORTED)
--
epsilon :: Regexp s t
epsilon  = id

-- One-character regexp (EXPORTED)
--
char :: Char -> Regexp s t
char c  = \l -> Lexer NoAction (Sparse (1, c, c) [(c, l)])

-- Concatenation of regexps (EXPORTED)
--
(+>) :: Regexp s t -> Regexp s t -> Regexp s t
(+>)  = (.)

-- Close a regular expression with an action that converts the lexeme into
-- a token (EXPORTED)
--
-- * Note: normal actions see only the lexeme; they can neither inspect nor
--   change the user state, nor switch to a different lexer - use a meta
--   action for that.
--
action :: Regexp s t -> Action t -> Lexer s t
action  = lexaction

lexaction :: Regexp s t -> Action t -> Lexer s t
lexaction re a  = re `lexmeta` a'
  where
    a' lexeme s = case a lexeme of
                    Nothing -> (Nothing,        s, Nothing)
                    Just t  -> (Just (Right t), s, Nothing)

-- Variant for actions that may return an error (EXPORTED)
--
lexactionErr :: Regexp s t -> ActionErr t -> Lexer s t
lexactionErr re a  = re `lexmeta` a'
  where
    a' lexeme s = (Just (a lexeme), s, Nothing)

-- Close a regular expression with a meta action (EXPORTED)
--
-- * Note: meta actions receive the current user state and have to thread
--   the (possibly transformed) state through themselves.
--
meta :: Regexp s t -> Meta s t -> Lexer s t
meta  = lexmeta

lexmeta :: Regexp s t -> Meta s t -> Lexer s t
lexmeta re a  = re (Lexer (Action a) Done)

-- useful for building meta actions that return a token
--
with :: b -> Maybe (Either a b)
with a  = Just (Right a)

-- disjunctive combination of two regexps (EXPORTED)
--
(>|<) :: Regexp s t -> Regexp s t -> Regexp s t
re >|< re'  = \l -> re l >||< re' l

-- disjunctive combination of two lexers (EXPORTED)
--
(>||<) :: Lexer s t -> Lexer s t -> Lexer s t
(Lexer a c) >||< (Lexer a' c')  = Lexer (joinActions a a') (joinConts c c')

-- combine two disjunctive continuations
--
joinConts :: Cont s t -> Cont s t -> Cont s t
joinConts Done c'   = c'
joinConts c    Done = c
joinConts c    c'   = let
                        (bn , cls ) = listify c
                        (bn', cls') = listify c'
                      in
                      -- note: `addBoundsNum' can, at this point, only
                      -- approximate the number of *non-overlapping* cases;
                      -- however, the bounds are correct
                      --
                      aggregate (addBoundsNum bn bn') (cls ++ cls')
  where
    listify (Dense  n arr) = (n, assocs arr)
    listify (Sparse n cls) = (n, cls)
    listify _              = error "Lexers.listify: Impossible argument!"

-- combine two actions; the latter one is used in case of overlap (!)
--
joinActions :: LexAction s t -> LexAction s t -> LexAction s t
joinActions NoAction a'       = a'
joinActions a        NoAction = a
joinActions _        a'       = a'  -- error "Lexers.>||<: Overlapping actions!"
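
-- To illustrate `lexmeta' and `with' (a hypothetical sketch, not part of
-- the interface): a lexer for a single digit whose meta action returns the
-- digit's numeric value as the token via `with' and counts accepted digits
-- in the user state.
--
{-
digit :: Lexer Int Int
digit = alt ['0'..'9'] `lexmeta` \lexeme s ->
          (with (read lexeme), s + 1, Nothing)
-}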
-- Note: `n' is only an upper bound of the number of non-overlapping cases
--
aggregate :: BoundsNum -> [(Char, Lexer s t)] -> Cont s t
aggregate bn@(n, lc, hc) cls
  | n >= denseMin = Dense  bn (accumArray (>||<) noLexer (lc, hc) cls)
  | otherwise     = Sparse bn (accum (>||<) cls)
  where
    noLexer = Lexer NoAction Done

-- combine the elements in the association list that have the same key
--
accum :: Eq a => (b -> b -> b) -> [(a, b)] -> [(a, b)]
accum _ []            = []
accum f ((c, el):ces) =
  let (ce, ces') = gather c el ces
  in
  ce : accum f ces'
  where
    gather k e []                 = ((k, e), [])
    gather k e (ke'@(k', e'):kes)
      | k == k'   = gather k (f e e') kes
      | otherwise = let (ke'', kes') = gather k e kes
                    in
                    (ke'', ke':kes')


-- non-basic combinators
-- ---------------------

-- x `star` y corresponds to the regular expression x*y (EXPORTED)
--
star :: Regexp s t -> Regexp s t -> Regexp s t
--
-- The definition used below can be obtained by equational reasoning from
-- this one (which is much easier to understand):
--
--   star re1 re2 = let self = (re1 +> self >|< epsilon) in self +> re2
--
-- However, in the above, `self' is of type `Regexp s t' (ie, a functional),
-- whereas below it is of type `Lexer s t'.  Thus, below we have a graphical
-- body (a finite representation of an infinite structure), which doesn't
-- grow with the size of the accepted lexeme - in contrast to the definition
-- using the functional recursion.
--
star re1 re2  = \l -> let self = re1 self >||< re2 l
                      in
                      self

-- x `plus` y corresponds to the regular expression x+y (EXPORTED)
--
plus :: Regexp s t -> Regexp s t -> Regexp s t
plus re1 re2  = re1 +> (re1 `star` re2)

-- x `quest` y corresponds to the regular expression x?y (EXPORTED)
--
quest :: Regexp s t -> Regexp s t -> Regexp s t
quest re1 re2  = (re1 +> re2) >|< re2

-- accepts a non-empty set of alternative characters (EXPORTED)
--
alt :: [Char] -> Regexp s t
--
-- Equiv. to `(foldr1 (>|<) . map char) cs', but much faster
--
alt []  = error "Lexers.alt: Empty character set!"
alt cs  = \l -> let bnds = (length cs, minimum cs, maximum cs)
                in
                Lexer NoAction (aggregate bnds [(c, l) | c <- cs])

-- accepts a character sequence (EXPORTED)
--
string :: String -> Regexp s t
string []  = error "Lexers.string: Empty string!"
string cs  = (foldr1 (+>) . map char) cs
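
-- A sketch of a typical composite regexp built from the combinators above
-- (hypothetical, for illustration only): an identifier consists of a
-- lower-case letter followed by zero or more alphanumeric characters; the
-- trailing `epsilon' closes the `star', as required by the DOCU section.
--
{-
identRE :: Regexp s t
identRE = alt ['a'..'z'] +> (alt (['a'..'z'] ++ ['0'..'9']) `star` epsilon)
-}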

-- execution of a lexer
-- --------------------

-- threaded top-down during lexing (current input, meta state) (EXPORTED)
--
type LexerState s = (String, s)

-- apply a lexer, yielding a token sequence and a list of errors (EXPORTED)
--
-- * Currently, all errors are fatal; thus, the result is undefined in case
--   of an error (this changes when error correction is added).
--
-- * The final lexer state is returned.
--
-- * The order of the error messages is undefined.
--
execLexer :: Lexer s t -> LexerState s -> ([t], LexerState s, [Error])
--
-- * the following is moderately tuned
--
execLexer _ st@([], _) = ([], st, [])
execLexer l st         =
  case lexOne l st of
    (Nothing , _ , st') -> execLexer l st'
    (Just res, l', st') -> let (ts, final, allErrs) = execLexer l' st'
                           in case res of
                                Left  err -> (ts  , final, err:allErrs)
                                Right t   -> (t:ts, final, allErrs)
  where
    -- accept a single lexeme
    --
    -- lexOne :: Lexer s t -> LexerState s
    --        -> (Maybe (Either Error t), Lexer s t, LexerState s)
    --
    lexOne l0 st' = oneLexeme l0 st' zeroDL lexErr
      where
        -- the result triple of `lexOne' that signals a lexical error; the
        -- result state is advanced by one character for error correction
        --
        lexErr = let
                   (cs, s) = st'
                   err     = "Lexical error!\n" ++
                             "The character " ++ show (head cs) ++
                             " does not fit here; skipping it."
                 in
                 (Just (Left err), l, (tail cs, s))

        -- we take an open list of characters down, in which we accumulate
        -- the lexeme; this function returns maybe a token or an error, the
        -- next lexer to use (it can be altered by a meta action), and the
        -- new lexer state
        --
        -- we implement the "principle of the longest match" by taking a
        -- potential result triple down (in the last argument); the
        -- potential result triple is updated whenever we pass by an action
        -- (different from `NoAction'); initially, it is the error result
        --
        -- (dons): from the paper: "we have to take the last possible
        -- action before the automaton gets stuck ... when no further
        -- transition is possible, we execute the last action on the
        -- associated lexeme."  This means that even once all possible
        -- actions have been looked at, we currently wait until the _next_
        -- character before returning the last action we're carrying
        -- around.  For interactive use, we instead need to execute the
        -- last action as soon as we encounter it, not once we've gone past
        -- it.  Hmm...
        --
        -- oneLexeme :: Lexer s t
        --           -> LexerState s
        --           -> DList Char
        --           -> (Maybe (Either Error t), Lexer s t, LexerState s)
        --           -> (Maybe (Either Error t), Lexer s t, LexerState s)
        --
        oneLexeme (Lexer a cont) state@(cs, s) csDL last =
          let last' = doaction a csDL state last
          in case cs of
               []      -> last'            -- at end, it has to be this action
               (c:cs') -> oneChar cont c (cs', s) csDL last'   -- keep looking

        -- there are more characters: look at the next one; if the next
        -- table is `Done', then no further transition is possible, so
        -- immediately execute our action
        --
        oneChar tbl c state csDL last =
          case peek tbl c of
            Nothing             -> last
            Just (Lexer a Done) -> doaction a (csDL `snocDL` c) state last
            Just l'             -> continue l' c state csDL last

        -- do the lookup
        --
        peek Done            _ = Nothing
        peek (Dense  bn arr) c | c `inBounds` bn = Just $ arr ! c
        peek (Sparse bn cls) c | c `inBounds` bn = lookup c cls
        peek _               _ = Nothing

        -- continue within the current lexeme
        --
        continue l' c state csDL last =
          oneLexeme l' state (csDL `snocDL` c) last

        -- execute the action if present and finalise the current lexeme
        --
        doaction (Action f) csDL (cs, s) _last =
          case f (closeDL csDL) s of
            (Nothing, s', l') | not . null $ cs -> lexOne (fromMaybe l0 l')
                                                          (cs, s')
            (res,     s', l')                   -> (res, fromMaybe l0 l',
                                                    (cs, s'))
        doaction NoAction   _csDL _state last = last           -- no change
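
-- An end-to-end usage sketch (hypothetical; the token type `Token' is
-- assumed, not part of this module): a lexer for lower-case identifiers
-- that silently discards blanks, run with `execLexer' over a trivial `()'
-- user state.
--
{-
data Token = Ident String deriving Show

lexer :: Lexer () Token
lexer = (ident `lexaction` \lexeme -> Just (Ident lexeme))
        >||< (char ' ' `lexaction` \_ -> Nothing)       -- discard blanks
  where
    ident = alt ['a'..'z'] +> (alt ['a'..'z'] `star` epsilon)

-- execLexer lexer ("ab cd", ()) evaluates to
--   ([Ident "ab", Ident "cd"], ("", ()), [])
-}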

------------------------------------------------------------------------------
--
-- The functional equivalent of the difference lists known from logic
-- programming; they provide an O(1) append.

-- | a difference list is a function that, given a list, returns the
-- original contents of the difference list prepended to the given list
-- (EXPORTED)
--
type DList a = [a] -> [a]

-- | open a list for use as a difference list (EXPORTED)
--
{-
openDL :: [a] -> DList a
openDL  = (++)
-}

-- | create a difference list containing no elements (EXPORTED)
--
zeroDL :: DList a
zeroDL  = id

-- | create a difference list with a given single element (EXPORTED)
--
{-
unitDL :: a -> DList a
unitDL  = (:)
-}

-- | append a single element to a difference list (EXPORTED)
--
snocDL :: DList a -> a -> DList a
snocDL dl x  = \l -> dl (x:l)

-- | append two difference lists (EXPORTED)
--
{-
joinDL :: DList a -> DList a -> DList a
joinDL  = (.)
-}

-- | close a difference list into a normal list (EXPORTED)
--
closeDL :: DList a -> [a]
closeDL  = ($[])
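
-- For illustration (hypothetical, not part of the module): building the
-- string "abc" with O(1) appends and then closing the difference list.
--
{-
abc :: String
abc = closeDL (zeroDL `snocDL` 'a' `snocDL` 'b' `snocDL` 'c')   -- == "abc"
-}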