Text/ParserCombinators/Parsec/IndentParser.hs


{- 

Haskell module for constructing indentation aware parser combinators.
Copyright (C) 2007  Piyush P Kurur, <http://www.cse.iitk.ac.in/~ppk>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
  

-}

{-| 

A module to construct indentation aware parsers. Many programming
language have indentation based syntax rules e.g. python and Haskell.
This module exports combinators to create such parsers. 

The input source can be thought of as a list of tokens. Abstractly
each token occurs at a line and a column and has a width. The column
number of a token measures is indentation. If t1 and t2 are two tokens
then we say that indentation of t1 is more than t2 if the column
number of occurrence of t1 is greater than that of t2.

Currently this module supports two kind of indentation based syntactic
structures which we now describe:

[Block] A block of indentation /c/ is a sequence of tokens with
indentation at least /c/.  Examples for a block is a where clause of
Haskell with no explicit braces.

[Line fold] A line fold starting at line /l/ and indentation /c/ is a
sequence of tokens that start at line /l/ and possibly continue to
subsequent lines as long as the indentation is greater than /c/. Such
a sequence of lines need to be /folded/ to a single line. An example
is MIME headers. Line folding based binding separation is used in
Haskell as well.


The module exports three combinators are @'indentParser'@, @'block'@
and @'lineFold'@.  To construct parsers for indentation based grammars
one typically applies the @'indentParser'@.  A block can then be
parsed using the combinator @'block'@ and a line fold using
@'lineFold'@. Generating indentation aware tokenisers could be tricky.
Given a language description via the
'Text.ParserCombinators.Parsec.Language.LanguageDef' record use module
@'Text.ParserCombinators.Parsec.IndentParser.Token'@ to generate its
tokeiser (this will apply @'indentParser'@ on all tokenisers and then
the user can forget about @'indentParser'@ combinator).


Warning:

Internally indentations are implemented using Parser states. If one
wants to use parser states as well then use the 'getState' and
'setState' functions exported by this module instead of those exported
from the parsec library.  Also use the 'parseTest' and 'runParser'
function exported from this module instead of the one exported from
Parsec.


-}

module Text.ParserCombinators.Parsec.IndentParser
    (
                    -- * Parser type.
                    IndentParser,
                    IndentCharParser,
                    IndentMode(..),
                    indentParser,
                    noIndent, block, lineFold,
                    betweenOrBlock, betweenOrLineFold,
                    -- * User state manipulation.
                    getState, setState,
                    -- * Running and testing.
                    runParser, parse, parseFromFile,
                    parseTest,

) where


import Text.ParserCombinators.Parsec hiding (
                                             getState,
                                             setState,
                                             parseTest,
                                             runParser,
                                             parse,
                                             parseFromFile
                                            )


import Text.ParserCombinators.Parsec.Pos
import Text.ParserCombinators.Parsec.IndentParser.Prim


-- Some comparisons of indentations.


indentGT :: SourcePos -> SourcePos -> Bool
indentGT pos1 pos2 = sourceColumn pos1 > sourceColumn pos2
indentGE pos1 pos2 = sourceColumn pos1 >= sourceColumn pos2
indentEq pos1 pos2 = sourceColumn pos1 == sourceColumn pos2

              



{-| 

  The combinator indentParser makes its input parser indentation
  aware. Usually one would want to make all the tokenisers indentation
  aware. 

-}





indentParser :: IndentParser tok st a -> IndentParser tok st a
indentParser p = do indMode <- getIndentMode
                    case indMode of
                      NoIndent -> p
                      Block    -> inBlockMode p
                      LineFold -> inLineFoldMode p
    where inBlockMode    p = do curCol <- column
                                indCol <- indentColumn
                                if curCol >= indCol then p else pzero

          inLineFoldMode p = do curPos <- getPosition
                                oldPos <- getIndentPos
                                if curPos `indentGT` oldPos 
                                       || curPos == oldPos then p else pzero


{-|

The parser @`noIndent` p@ runs @p@ ignoring any indentation based
structure. This can be used to parse for example an explicitly braced
where clause in Haskell.



-}

noIndent :: IndentParser tok st a -> IndentParser tok st a
noIndent p = saveIndent (p `withIndentMode` NoIndent)

 


{-|
The parser @'block' p@ parses a /block/ of @p@.
-} 
block :: IndentParser tok st a -> IndentParser tok st a

{-| 

The parser @lineFold p@ parses a folded line of @p@.

-}
lineFold :: IndentParser tok st a -> IndentParser tok st a


{-

The complication in the definition of block and lineFold is in the
handling of nested indentation. We now describe the rules for nesting
of blocks and lineFolds.

(1) A block or a lineFold can start any where in a NoIndent chunk
(2) A block inside a lineFold or block has to be indented 
    more than the previous indentation, otherwise it is an empty block.

(3) A lineFold inside a lineFold has to be indented more than
    the previous indentation otherwise it is an empty lineFold
(4) A lineFold inside a block should be indented at least as much
    as the previous indentation otherwise it is an empty lineFold.

Based on these rules we define the parsers isEmptyBlock and
isEmptyLineFold.

-}


isEmptyBlock = do indm <- getIndentMode
                  case indm of
                    NoIndent -> return False
                    _        -> do curCol <- column
                                   indCol <- indentColumn
                                   return (curCol <= indCol)

isEmptyLineFold = do indm <- getIndentMode
                     case indm of
                       NoIndent -> return False
                       Block     -> do curCol <- column
                                       indCol <- indentColumn
                                       return (curCol < indCol)
                       LineFold  -> do curCol <- column
                                       indCol <- indentColumn
                                       return (curCol <= indCol)


{- 

  If the current block (or line fold) is empty we cannot outright
  reject because the argument parser @p@ could be some parser that
  accepts an empty string.  So the strategy we use is that in this
  case we set the indentation position to be one more than the current
  position. So this will gurantee that @p@, if it is a valid
  indentation parser will see empty input.

-}



setPosBlock = do test <- isEmptyBlock
                 pos <- if test then nextPos else getPosition
                 setIndentPos pos
                  

setPosLineFold = do test <- isEmptyLineFold
                    pos <- if test then nextPos else getPosition
                    setIndentPos pos


{- 
   Also a block p (lineFold p) should contain p and nothing else. So
   when we are done with it we need to ensure that there is no more
   input. This is done using the parser eobReached (eolfReached)

-}




eobReached :: IndentParser tok st ()
eobReached = do indCol <- indentColumn
                col    <- column
                if indCol <= col then (endOfInput <|> prematureEnd)
                   else return ()
    where prematureEnd = do indPos <- getIndentPos
                            pos <- getPosition
                            fail ("premature block termination " ++
                                     "started at " ++ show indPos ++
                                     " and ended at " ++ show pos)
              
                


eolfReached = do indCol <- indentColumn
                 col  <- column
                 if indCol < col  then (endOfInput <|> prematureEnd)
                    else return ()
    where prematureEnd = do indPos <- getIndentPos
                            pos <- getPosition
                            fail ("premature line fold termination " ++
                                     "started at " ++ show indPos ++
                                     " and ended at " ++ show pos)
              


{-
  Having defined these parsers we can now define the parser block and
  lineFold.
-}

block p = saveIndent $ do setPosBlock
                          x <- p `withIndentMode` Block
                          eobReached
                          return x


lineFold p = saveIndent $ do setPosLineFold
                             x <- p `withIndentMode` LineFold
                             eolfReached
                             return x


                  

{-|

  The parser @betweenOrBlock open close p@ parses @p@ between @open@
and @close@. If open is matched @p@ is parsed in 'NoIndent' mode otherwise
a block @p@ is parsed in 'Block' mode. For eg. the parser for parsing
haskell where clause would look like

> whereClause = do reserved where; betweenOrBlock bindings

-}

betweenOrBlock :: IndentParser tok st open -> 
                  IndentParser tok st close ->
                  IndentParser tok st a -> 
                  IndentParser tok st a

betweenOrBlock left right p = do left; x <- noIndent p; right; return x
                              <|> block p

{-|

Similar to betweenOrBlock but uses lineFold instead of block.

-}
betweenOrLineFold :: IndentParser tok st open -> 
                     IndentParser tok st close -> 
                     IndentParser tok st a ->
                     IndentParser tok st a
betweenOrLineFold left right p = do left; x <- noIndent p; right; return x
                                 <|> lineFold p




-- Some position parsers.


-- | This returns the current line number

line :: GenParser tok st Line
line   = fmap sourceLine   getPosition

-- | This returns the current column number

column :: GenParser tok st Column
column = fmap sourceColumn getPosition

indentColumn :: IndentParser tok st Column
indentColumn = fmap sourceColumn getIndentPos


nextPos :: GenParser tok st SourcePos
nextPos = do pos <- getPosition
             return (pos `incSourceColumn` 1)


{-| 

The parser @saveIndent p@, saves the current indentation, runs @p@,
and restores the indentation back.

-}

saveIndent :: IndentParser tok st a -> IndentParser tok st a
saveIndent p = do indMode <- getIndentMode
                  indPos  <- getIndentPos
                  x <- p
                  setIndentMode indMode
                  setIndentPos  indPos
                  return x
    
withIndentMode :: IndentParser tok st a -> IndentMode -> IndentParser tok st a
withIndentMode p indm = do setIndentMode indm
                           p
               

{- 

When an block or a lineFold is to be entered care must be taken to set
the indentation position. If the current indentation mode is NoIndent
then just set the current position. In case we are already inside
a line fold or a block then the indentation should be the max of
current position and previous indentation position + 1. The reason
we do not outright reject is because block p could be 

-}


endOfInput = do ins <- getInput
                case ins of
                  [] -> return ()
                  _  -> pzero