src/Text/Parsec/IndentParsec/Token.hs

{-# LANGUAGE FlexibleContexts #-}
{-|

Module to create indentation aware tokenisers. Despite the simplicity
of parser combinators, getting tokenisers for common language
contructs right is tricky. The parsec way of handling this involves
the following steps.

  * Define the description of the language via the
    @`Text.Parsec.Language.LanguageDef`@ record.

  * Apply the 'Text.Parsec.Token.makeTokenParser' combinator get hold
    of @'Text.Parsec.Token.TokenParser'@. The actual tokenisers are
    the fields of this record.

This module provides a similar interfaces for generating indentation
aware tokenisers. There are few specific things that an indentation
aware tokeniser should be careful aboute

   1. All tokenisers should be indentation aware.

   2. Whitespaces and comments should be skipped irrespective on which
      indentation mode one is in

   3. The tokenisers should themselves be lexeme parsers and should skip trailing
      whitespace.

Getting all this working can often be tricky.

-}

module Text.Parsec.IndentParsec.Token
       (
       -- * Usage.
       -- $usage
         GenIndentTokenParser
       , IndentTokenParser
       , identifier
       , operator
       , reserved
       , reservedOp
       , charLiteral
       , stringLiteral
       , natural
       , integer
       , float
       , naturalOrFloat
       , decimal
       , hexadecimal
       , octal
       , symbol
       , lexeme
       , whiteSpace
       , semi
       , comma
       , colon
       , dot
       , parens, parensBlock
       , braces, bracesBlock
       , angles, anglesBlock
       , brackets, bracketsBlock
       , semiSep, semiSepOrFoldedLines
       , semiSep1, semiSepOrFoldedLines1
       , commaSep, commaSepOrFoldedLines
       , commaSep1, commaSepOrFoldedLines1
       ) where


import Text.Parsec.IndentParsec.Prim
import Text.Parsec.IndentParsec.Combinator
import Text.Parsec(many)

import qualified Text.Parsec.Token as T
import Text.Parsec(Stream)
import Text.Parsec.Combinator hiding (between)


{- $usage

For each combinator @foo@ for every field @foo@ of the
'Text.Parsec.Token.TokenParser' with essentially the same semantics
but for the returned parser being indentation aware. There are certain
new combinators that are defined specifically for parsing indentation
based syntactic constructs:

[Grouping Parsers] A grouping parser takes an input parser @p@ and
returns a parser that parses @p@ between two /grouping delimiters/.
There are three flavours of grouping parsers: @foo@, @fooBlock@ where
@foo@ can be one of @angles@, @braces@, @parens@, @brackets@. For
example, consider the parser @'braces' tokP p@ parses @p@ delimited by
'{' and '}'. In this case @p@ does not care about indentation. The
parser @'bracesBlock' tokP p@ is like @braces tokP p@ but if no
explicit delimiting braces are given parses @p@ within an indented
block.

> bracesBlock tokP p    = braces tokP p <|> blockOf p

[Seperator Parsers] A seperator parser takes as input a parser say @p@
and returns a parser that parses a list of @p@ seperated by a
seperator. The module exports the combinators @fooSep@, @fooSep1@,
@fooSepOrFoldedLines@ and @fooSepOrFoldedLines1@, where @foo@ is
either @semi@ (in which case the seperator is a semicolon) or @comma@
(in which case the seperator is a comma).

To illustrate the use of this module we now give, as an incomplete
example, a parser that parses a where clause in Haskell which
illustrates the use of this module.

>   import qualified Text.Parsec.Language as L
>   import qualified Text.Parsec.Toke as T
>   import qualified Text.Parsec.IndentToken as IT
>   tokP = T.makeTokenParser L.haskellDef
>   mySemiSep = IT.semiSepOrFoldedLines tokP
>   myBraces = IT.bracesBlock tokP
>   identifier = IT.identifier tokP
>   ....
>   symbol = IT.symbol tokP
>
>   binding = mySemiSep bind
>   bind    = do id <- identifier
>                symbol (char '=')
>                e <- expr
>                return (id,e)
>
>  whereClause = do reserved "where"; braceBlock binding

-}

type GenIndentTokenParser i s u m = T.GenTokenParser s u (IndentT i m)
type IndentTokenParser s u m = GenIndentTokenParser HaskellLike s u m

-- | Indentation aware tokeniser to match a valid identifier.
identifier :: (Indentation i, Monad m)
           => GenIndentTokenParser i s u m
           -> GenIndentParsecT i s u m String
identifier = tokeniser . T.identifier

-- | Indentation aware tokeniser matches an operator.
operator  :: (Indentation i, Monad m)
          => GenIndentTokenParser i s u m
          -> GenIndentParsecT i s u m String
operator = tokeniser . T.operator

-- | Indentation aware tokeniser to match a reserved word.
reserved :: (Indentation i, Monad m)
         => GenIndentTokenParser i s u m
         -> String -- ^ The reserved word.
         -> GenIndentParsecT i s u m ()
reserved tokP = tokeniser . T.reserved tokP

-- | Indentation aware parser to match a reserved operator of the
-- language.
reservedOp :: (Indentation i, Monad m)
           => GenIndentTokenParser i s u m
           -> String -- ^ The reserved operator to be matched.
           -> GenIndentParsecT i s u m ()
reservedOp tokP = tokeniser . T.reservedOp tokP

-- | Indentation aware parser to match a character literal (the syntax
-- is assumend to be that of Hasekell which matches that of most
-- programming language

charLiteral :: (Indentation i, Monad m)
            => GenIndentTokenParser i s u m
            -> GenIndentParsecT i s u m Char
charLiteral = tokeniser . T.charLiteral

-- | Indentation aware parser to match a string literal (the syntax is
-- assumend to be that of Hasekell which matches that of most
-- programming language).
stringLiteral :: (Indentation i, Monad m)
              => GenIndentTokenParser i s u m
              -> GenIndentParsecT i s u m String
stringLiteral = tokeniser . T.stringLiteral

-- | Indentation aware parser to match a natural number.
natural :: (Indentation i, Monad m)
        => GenIndentTokenParser i s u m
        -> GenIndentParsecT i s u m Integer
natural = tokeniser . T.natural

-- | Indentation aware parser to match an integer.
integer :: (Indentation i, Monad m)
        => GenIndentTokenParser i s u m
        -> GenIndentParsecT i s u m Integer
integer = tokeniser . T.integer

-- | Indentation aware tokeniser to match a floating point number.
float :: (Indentation i, Monad m)
      => GenIndentTokenParser i s u m
      -> GenIndentParsecT i s u m Double
float = tokeniser . T.float

-- | Indentation aware tokensier to match either a natural number or
-- Floating point number.
naturalOrFloat :: (Indentation i, Monad m)
               => GenIndentTokenParser i s u m
               -> GenIndentParsecT i s u m (Either Integer Double)
naturalOrFloat = tokeniser . T.naturalOrFloat

-- | Indentation aware tokensier to match an integer in decimal.
decimal :: (Indentation i, Monad m)
        => GenIndentTokenParser i s u m
        -> GenIndentParsecT i s u m Integer
decimal = tokeniser . T.decimal

-- | Indentation aware tokeniser to match an integer in hexadecimal.
hexadecimal :: (Indentation i, Monad m)
            => GenIndentTokenParser i s u m
            -> GenIndentParsecT i s u m Integer
hexadecimal = tokeniser . T.hexadecimal

-- | Indentation aware tokeniser to match an integer in ocatal.
octal :: (Indentation i, Monad m)
      => GenIndentTokenParser i s u m
      -> GenIndentParsecT i s u m Integer
octal = tokeniser . T.octal

-- | Indentation aware tokeniser that is equvalent to @`string`@.
symbol :: (Indentation i, Monad m)
       => GenIndentTokenParser i s u m
       -> String
       -> GenIndentParsecT i s u m String
symbol tokP = tokeniser . T.symbol tokP

-- | Creates a lexeme tokeniser. The resultant tokeniser indentation
-- aware and skips trailing white spaces/comments.
lexeme :: (Indentation i, Monad m)
       => GenIndentTokenParser i s u m
       -> GenIndentParsecT i s u m a
       -> GenIndentParsecT i s u m a
lexeme tokP = tokeniser . T.lexeme tokP

-- | The parser whiteSpace skips spaces and comments. This does not
-- care about indentation as skipping spaces should be done
-- irrespective of the indentation
whiteSpace :: (Indentation i, Monad m)
           => GenIndentTokenParser i s u m
           -> GenIndentParsecT i s u m ()
whiteSpace = T.whiteSpace

-- | Matches a semicolon and returns ';'.
semi :: (Indentation i, Monad m)
     => GenIndentTokenParser i s u m
     -> GenIndentParsecT i s u m String
semi = tokeniser . T.semi

-- | Matches a comma and returns ",".
comma :: (Indentation i, Monad m)
      => GenIndentTokenParser i s u m
      -> GenIndentParsecT i s u m String
comma = tokeniser . T.comma

-- | Matches a colon and returns ":".
colon :: (Indentation i, Monad m)
      => GenIndentTokenParser i s u m
      -> GenIndentParsecT i s u m String
colon = tokeniser . T.colon

-- | Matches a dot and returns ".".
dot :: (Indentation i, Monad m)
    => GenIndentTokenParser i s u m
    -> GenIndentParsecT i s u m String
dot = tokeniser . T.dot

lparen   :: (Monad m, Indentation i)
         => GenIndentTokenParser i s u m
         -> GenIndentParsecT i s u m String
rparen   :: (Monad m, Indentation i)
         => GenIndentTokenParser i s u m
         -> GenIndentParsecT i s u m String
lbrace   :: (Monad m, Indentation i)
         => GenIndentTokenParser i s u m
         -> GenIndentParsecT i s u m String
rbrace   :: (Monad m, Indentation i)
         => GenIndentTokenParser i s u m
         -> GenIndentParsecT i s u m String
langle   :: (Monad m, Indentation i)
         => GenIndentTokenParser i s u m
         -> GenIndentParsecT i s u m String
rangle   :: (Monad m, Indentation i)
         => GenIndentTokenParser i s u m
         -> GenIndentParsecT i s u m String
lbracket :: (Monad m, Indentation i)
         => GenIndentTokenParser i s u m
         -> GenIndentParsecT i s u m String
rbracket :: (Monad m, Indentation i)
         => GenIndentTokenParser i s u m
         -> GenIndentParsecT i s u m String

lparen tokP = symbol tokP "("
rparen tokP = symbol tokP ")"

lbrace tokP = symbol tokP "{"
rbrace tokP = symbol tokP "}"

langle tokP = symbol tokP "<"
rangle tokP = symbol tokP ">"

lbracket tokP = symbol tokP "["
rbracket tokP = symbol tokP "]"

-- | Match the input parser @p@ within a pair of paranthesis.

parens :: (Indentation i, Show i,
                       Monad m, Stream s (IndentT i m) t, Show t)
       => GenIndentTokenParser i s u m
       -> GenIndentParsecT i s u m a
       -> GenIndentParsecT i s u m a
parens tokP = lparen tokP  `between` rparen tokP

{-|

Same as `parens` but if no explicit paranthesis are given, matches @p@
inside an indented block.

-}

parensBlock :: (Monad m, Stream s (IndentT HaskellLike m) t, Show t)
            => GenIndentTokenParser HaskellLike s u m
            -> GenIndentParsecT HaskellLike s u m a
            -> GenIndentParsecT HaskellLike s u m a
parensBlock tokP = lparen tokP `betweenBlock` rparen tokP

-- | Match the input parser @p@ within a pair of braces
braces :: (Indentation i, Show i,
                       Monad m, Stream s (IndentT i  m) t, Show t)
        => GenIndentTokenParser i s u m
        -> GenIndentParsecT i s u m a
        -> GenIndentParsecT i s u m a
braces tokP = lbrace tokP  `between` rbrace tokP

{-|

Same as `braces` but if no explicit braces are given, matches @p@
inside an indented block.

-}

bracesBlock :: (Monad m, Stream s (IndentT HaskellLike m) t, Show t)
            => GenIndentTokenParser HaskellLike s u m
            -> GenIndentParsecT HaskellLike s u m a
            -> GenIndentParsecT HaskellLike s u m a
bracesBlock tokP = lbrace tokP  `betweenBlock` rbrace tokP

{-|

Match the input parser @p@ within a pair of angular brackets, i.e. '<'
and '>'.

-}
angles :: (Indentation i, Show i, Monad m, 
                       Stream s (IndentT i m) t, Show t)
       => GenIndentTokenParser i s u m
       -> GenIndentParsecT i s u m a
       -> GenIndentParsecT i s u m a
angles tokP = langle tokP  `between` rangle tokP

{-|

Same as `angles` but if no explicit anglular brackets are given,
matches p inside and indented block.

-}

anglesBlock :: (Monad m, Stream s (IndentT HaskellLike m) t, Show t)
            => GenIndentTokenParser HaskellLike s u m
            -> GenIndentParsecT HaskellLike s u m a
            -> GenIndentParsecT HaskellLike s u m a
anglesBlock tokP = langle tokP  `betweenBlock` rangle tokP

-- | Match p within a angular brackets i.e. '[' and ']'.
brackets :: (Indentation i, Show i, Monad m, Stream s (IndentT i m) t, Show t)
         => GenIndentTokenParser i s u m
         -> GenIndentParsecT i s u m a
         -> GenIndentParsecT i s u m a
brackets tokP = lbracket tokP  `between` rbracket tokP

{-|

Same as `brackets` but if no explicit brackets are given, matches p
inside and indented block.

-}

bracketsBlock :: (Monad m, Stream s (IndentT HaskellLike m) t, Show t)
              => GenIndentTokenParser HaskellLike s u m
              -> GenIndentParsecT HaskellLike s u m a
              -> GenIndentParsecT HaskellLike s u m a
bracketsBlock tokP = lbracket tokP  `betweenBlock` rbracket tokP

-- | Parse zero or more @p@ seperated by by a semicolon
semiSep :: (Indentation i, Monad m, Stream s (IndentT i m) t, Show t)
           => GenIndentTokenParser i s u m
           -> GenIndentParsecT i s u m a
           -> GenIndentParsecT i s u m [a]
semiSep tokP p =  sepBy p $ semi tokP

-- |  Parse one or more @p@ seperated by a semicolon
semiSep1 :: (Indentation i, Monad m, Stream s (IndentT i m) t, Show t)
           => GenIndentTokenParser i s u m
           -> GenIndentParsecT i s u m a
           -> GenIndentParsecT i s u m [a]
semiSep1 tokP p =  sepBy1 p $ semi tokP

{-|

Parse zero or more @p@ seperated by semicolon or new line. Long lines
are continued using line folding.

-}
semiSepOrFoldedLines :: (Monad m, Stream s (IndentT HaskellLike m ) t, Show t)
                     => GenIndentTokenParser HaskellLike s u m
                     -> GenIndentParsecT HaskellLike s u m a
                     -> GenIndentParsecT HaskellLike s u m [a]
semiSepOrFoldedLines tokP p = fmap concat .  many
                                          . foldedLinesOf
                                          . sepEndBy1 p
                                          $ semi tokP

{-|

Parse one or more @p@ seperated by semicolon or new line. Long lines
are continued using line folding.

-}
semiSepOrFoldedLines1 :: (Monad m, Stream s (IndentT HaskellLike m ) t, Show t)
                      => GenIndentTokenParser HaskellLike s u m
                      -> GenIndentParsecT HaskellLike s u m a
                      -> GenIndentParsecT HaskellLike s u m [a]
semiSepOrFoldedLines1 tokP p = do first <- foldedLinesOf . sepEndBy1 p
                                                         $ semi tokP
                                  rest <- semiSepOrFoldedLines tokP p
                                  return (first ++ rest)


-- | Parse zero or more @p@ seperated by by a comma.
commaSep :: (Indentation i, Monad m, Stream s (IndentT i m) t, Show t)
           => GenIndentTokenParser i s u m
           -> GenIndentParsecT i s u m a
           -> GenIndentParsecT i s u m [a]
commaSep tokP p =  sepBy p $ comma tokP

-- |  Parse one or more @p@ seperated by a comma.
commaSep1 :: (Indentation i, Monad m, Stream s (IndentT i m) t, Show t)
           => GenIndentTokenParser i s u m
           -> GenIndentParsecT i s u m a
           -> GenIndentParsecT i s u m [a]
commaSep1 tokP p =  sepBy1 p $ comma tokP

{-|

Parse zero or more @p@ seperated by comma or new line. Long lines are
continued using line folding.

-}
commaSepOrFoldedLines :: (Monad m, Stream s (IndentT HaskellLike m ) t, Show t)
                     => GenIndentTokenParser HaskellLike s u m
                     -> GenIndentParsecT HaskellLike s u m a
                     -> GenIndentParsecT HaskellLike s u m [a]
commaSepOrFoldedLines tokP p = fmap concat .  many
                                           . foldedLinesOf
                                           . sepEndBy1 p
                                           $ comma tokP

{-|

Parse one or more @p@ seperated by comma or new line. Long lines are
continued using line folding.

-}
commaSepOrFoldedLines1 :: (Monad m, Stream s (IndentT HaskellLike m ) t, Show t)
                      => GenIndentTokenParser HaskellLike s u m
                      -> GenIndentParsecT HaskellLike s u m a
                      -> GenIndentParsecT HaskellLike s u m [a]
commaSepOrFoldedLines1 tokP p = do first <- foldedLinesOf . sepEndBy1 p
                                                         $ comma tokP
                                   rest <- commaSepOrFoldedLines tokP p
                                   return (first ++ rest)