{- Haskell module for constructing indentation aware parser combinators. Copyright (C) 2007 Piyush P Kurur, This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . -} {-| A module to construct indentation aware parsers. Many programming language have indentation based syntax rules e.g. python and Haskell. This module exports combinators to create such parsers. The input source can be thought of as a list of tokens. Abstractly each token occurs at a line and a column and has a width. The column number of a token measures is indentation. If t1 and t2 are two tokens then we say that indentation of t1 is more than t2 if the column number of occurrence of t1 is greater than that of t2. Currently this module supports two kind of indentation based syntactic structures which we now describe: [Block] A block of indentation /c/ is a sequence of tokens with indentation at least /c/. Examples for a block is a where clause of Haskell with no explicit braces. [Line fold] A line fold starting at line /l/ and indentation /c/ is a sequence of tokens that start at line /l/ and possibly continue to subsequent lines as long as the indentation is greater than /c/. Such a sequence of lines need to be /folded/ to a single line. An example is MIME headers. Line folding based binding separation is used in Haskell as well. The module exports three combinators are @'indentParser'@, @'block'@ and @'lineFold'@. To construct parsers for indentation based grammars one typically applies the @'indentParser'@. A block can then be parsed using the combinator @'block'@ and a line fold using @'lineFold'@. Generating indentation aware tokenisers could be tricky. Given a language description via the 'Text.ParserCombinators.Parsec.Language.LanguageDef' record use module @'Text.ParserCombinators.Parsec.IndentParser.Token'@ to generate its tokeiser (this will apply @'indentParser'@ on all tokenisers and then the user can forget about @'indentParser'@ combinator). Warning: Internally indentations are implemented using Parser states. If one wants to use parser states as well then use the 'getState' and 'setState' functions exported by this module instead of those exported from the parsec library. Also use the 'parseTest' and 'runParser' function exported from this module instead of the one exported from Parsec. -} module Text.ParserCombinators.Parsec.IndentParser ( -- * Parser type. IndentParser, IndentCharParser, IndentMode(..), indentParser, noIndent, block, lineFold, betweenOrBlock, betweenOrLineFold, -- * User state manipulation. getState, setState, -- * Running and testing. runParser, parse, parseFromFile, parseTest, ) where import Text.ParserCombinators.Parsec hiding ( getState, setState, parseTest, runParser, parse, parseFromFile ) import Text.ParserCombinators.Parsec.Pos import Text.ParserCombinators.Parsec.IndentParser.Prim -- Some comparisons of indentations. indentGT :: SourcePos -> SourcePos -> Bool indentGT pos1 pos2 = sourceColumn pos1 > sourceColumn pos2 indentGE pos1 pos2 = sourceColumn pos1 >= sourceColumn pos2 indentEq pos1 pos2 = sourceColumn pos1 == sourceColumn pos2 {-| The combinator indentParser makes its input parser indentation aware. Usually one would want to make all the tokenisers indentation aware. -} indentParser :: IndentParser tok st a -> IndentParser tok st a indentParser p = do indMode <- getIndentMode case indMode of NoIndent -> p Block -> inBlockMode p LineFold -> inLineFoldMode p where inBlockMode p = do curCol <- column indCol <- indentColumn if curCol >= indCol then p else pzero inLineFoldMode p = do curPos <- getPosition oldPos <- getIndentPos if curPos `indentGT` oldPos || curPos == oldPos then p else pzero {-| The parser @`noIndent` p@ runs @p@ ignoring any indentation based structure. This can be used to parse for example an explicitly braced where clause in Haskell. -} noIndent :: IndentParser tok st a -> IndentParser tok st a noIndent p = saveIndent (p `withIndentMode` NoIndent) {-| The parser @'block' p@ parses a /block/ of @p@. -} block :: IndentParser tok st a -> IndentParser tok st a {-| The parser @lineFold p@ parses a folded line of @p@. -} lineFold :: IndentParser tok st a -> IndentParser tok st a {- The complication in the definition of block and lineFold is in the handling of nested indentation. We now describe the rules for nesting of blocks and lineFolds. (1) A block or a lineFold can start any where in a NoIndent chunk (2) A block inside a lineFold or block has to be indented more than the previous indentation, otherwise it is an empty block. (3) A lineFold inside a lineFold has to be indented more than the previous indentation otherwise it is an empty lineFold (4) A lineFold inside a block should be indented at least as much as the previous indentation otherwise it is an empty lineFold. Based on these rules we define the parsers isEmptyBlock and isEmptyLineFold. -} isEmptyBlock = do indm <- getIndentMode case indm of NoIndent -> return False _ -> do curCol <- column indCol <- indentColumn return (curCol <= indCol) isEmptyLineFold = do indm <- getIndentMode case indm of NoIndent -> return False Block -> do curCol <- column indCol <- indentColumn return (curCol < indCol) LineFold -> do curCol <- column indCol <- indentColumn return (curCol <= indCol) {- If the current block (or line fold) is empty we cannot outright reject because the argument parser @p@ could be some parser that accepts an empty string. So the strategy we use is that in this case we set the indentation position to be one more than the current position. So this will gurantee that @p@, if it is a valid indentation parser will see empty input. -} setPosBlock = do test <- isEmptyBlock pos <- if test then nextPos else getPosition setIndentPos pos setPosLineFold = do test <- isEmptyLineFold pos <- if test then nextPos else getPosition setIndentPos pos {- Also a block p (lineFold p) should contain p and nothing else. So when we are done with it we need to ensure that there is no more input. This is done using the parser eobReached (eolfReached) -} eobReached :: IndentParser tok st () eobReached = do indCol <- indentColumn col <- column if indCol <= col then (endOfInput <|> prematureEnd) else return () where prematureEnd = do indPos <- getIndentPos pos <- getPosition fail ("premature block termination " ++ "started at " ++ show indPos ++ " and ended at " ++ show pos) eolfReached = do indCol <- indentColumn col <- column if indCol < col then (endOfInput <|> prematureEnd) else return () where prematureEnd = do indPos <- getIndentPos pos <- getPosition fail ("premature line fold termination " ++ "started at " ++ show indPos ++ " and ended at " ++ show pos) {- Having defined these parsers we can now define the parser block and lineFold. -} block p = saveIndent $ do setPosBlock x <- p `withIndentMode` Block eobReached return x lineFold p = saveIndent $ do setPosLineFold x <- p `withIndentMode` LineFold eolfReached return x {-| The parser @betweenOrBlock open close p@ parses @p@ between @open@ and @close@. If open is matched @p@ is parsed in 'NoIndent' mode otherwise a block @p@ is parsed in 'Block' mode. For eg. the parser for parsing haskell where clause would look like > whereClause = do reserved where; betweenOrBlock bindings -} betweenOrBlock :: IndentParser tok st open -> IndentParser tok st close -> IndentParser tok st a -> IndentParser tok st a betweenOrBlock left right p = do left; x <- noIndent p; right; return x <|> block p {-| Similar to betweenOrBlock but uses lineFold instead of block. -} betweenOrLineFold :: IndentParser tok st open -> IndentParser tok st close -> IndentParser tok st a -> IndentParser tok st a betweenOrLineFold left right p = do left; x <- noIndent p; right; return x <|> lineFold p -- Some position parsers. -- | This returns the current line number line :: GenParser tok st Line line = fmap sourceLine getPosition -- | This returns the current column number column :: GenParser tok st Column column = fmap sourceColumn getPosition indentColumn :: IndentParser tok st Column indentColumn = fmap sourceColumn getIndentPos nextPos :: GenParser tok st SourcePos nextPos = do pos <- getPosition return (pos `incSourceColumn` 1) {-| The parser @saveIndent p@, saves the current indentation, runs @p@, and restores the indentation back. -} saveIndent :: IndentParser tok st a -> IndentParser tok st a saveIndent p = do indMode <- getIndentMode indPos <- getIndentPos x <- p setIndentMode indMode setIndentPos indPos return x withIndentMode :: IndentParser tok st a -> IndentMode -> IndentParser tok st a withIndentMode p indm = do setIndentMode indm p {- When an block or a lineFold is to be entered care must be taken to set the indentation position. If the current indentation mode is NoIndent then just set the current position. In case we are already inside a line fold or a block then the indentation should be the max of current position and previous indentation position + 1. The reason we do not outright reject is because block p could be -} endOfInput = do ins <- getInput case ins of [] -> return () _ -> pzero