library/Web/RBB/Crawler/MetaParser.hs

{- |
Module      : Web.RBB.Crawler.MetaParser
Description :  Extract meta data from the repository
Copyright   :  (c) Sebastian Witte
License     :  BSD3

Maintainer  :  woozletoff@gmail.com
Stability   :  experimental

-}
module Web.RBB.Crawler.MetaParser
    where

import Control.Applicative (pure, (*>), (<$>), (<*), (<*>))
import Control.Monad
import Data.Char
import Data.Text           (Text, pack)
import Text.Parsec

-- | The 'Meta' data type represents a model of all the meta data that can be
-- embedded into a commit.
data Meta =
    Tags [(TagQuantifier, Text)]
    -- ^ $tag
    | Title Text
    -- ^ $title
    | Context FilePath
    -- ^ Context entry, usually a relative path for the blog entry repository.
    --
    -- There is no validity check performed.
    | None
    -- ^ Everything that is not a tag quantified in pieces of text followed by
    -- two newlines or the end of input.
    deriving (Eq, Show)

-- | Data type representing a the prefix of a tag.
data TagQuantifier
    = TagAdd
    -- ^ Add the tag to the current set of tags.
    | TagRemove
    -- ^ Remove the tag from the current set of tags.
    | TagReplace
    -- ^ Replace the current set of tags with all the tags given in this tag
    -- definition.
    deriving (Eq, Show, Read, Ord, Enum, Bounded)

-- | Helper parser for a case-insensitive character.
ciChar :: Char -> Parsec String u Char
ciChar c = char (toLower c) <|> char (toUpper c)

-- | Helper parser for a case-insensitive string.
ciString :: String -> Parsec String u String
ciString = mapM ciChar

-- | Parse meta data from the given 'String'. The order of the list items is
-- the same as the order in which the meta data appears in the input.
parseMeta :: String -> Either ParseError [Meta]
parseMeta inp =
    parse (many pSplice) "slice" (inp++"\n\n")
        >>= mapM (parse pMeta "meta parser") . filter (not . all isSpace)

pSplice :: Parsec String u String
pSplice = blankLines
    <|> unwords . words . unlines <$> many1 (notFollowedBy blankLine *> anyLine)

skipSpaces :: Parsec String u ()
skipSpaces = skipMany $ char ' ' <|> char '\t'

blankLine :: Parsec String u Char
blankLine = skipSpaces *> newline

blankLines :: Parsec String u String
blankLines = many1 blankLine

anyLine :: Parsec String u String
anyLine = skipSpaces *> (anyChar `manyTill` newline)

pMeta :: Parsec String u Meta
pMeta = try pTags <|> try pTitle <|> try pContext <|> pNone

pNone :: Parsec String u Meta
pNone = const None <$> many1 anyChar

-- $tag
--
-- A tag can be applied to any published document. For now they are associated
-- with the changed files of a commit. This means that you have to edit a file
-- in order to make update the tags. (Adding/Removing an empty line at the end
-- will do.) This limitation lies within the technical choice of meta data
-- representation. In the future, features from within the blog could be used to
-- do such simple tasks.
--
-- The sytax of tags is quite simple:
-- @tag[s]:@ in any case followed by a space or newline separated list of tags.
-- Tags which spaces can be escaped with quotation marks.
--
-- >>> parseMeta "tAgS   :  foo \"bar'mitz wa\" +'quz''"
--
-- Tags can also be prefixed with @+@ or @-@. If only tags with a prefix are
-- used, they will be added or removed from the current state of the blog's
-- entry. However, if at leas one of the provided tags has no prefix, the tags
-- will be overwritten by the given tags.
--

pTags :: Parsec String u Meta
pTags = Tags
    <$> (try (ciString "tag" *> optional (ciChar 's') *> try spaces *> char ':')
    *> spaces *> pSpaceElements <* spaces)

pQuantifierPrefix :: Parsec String u TagQuantifier
pQuantifierPrefix = quantify <$> (try (char '+') <|> try (char '-') <|> pure '=')
  where
    quantify c
        | c == '+' = TagAdd
        | c == '-' = TagRemove
        | otherwise = TagReplace

pSpaceElements :: Parsec String u [(TagQuantifier, Text)]
pSpaceElements = many pSpaceDelimitedElement

pSpaceDelimitedElement :: Parsec String u (TagQuantifier, Text)
pSpaceDelimitedElement = (\q t -> (q, pack t))
    <$> pQuantifierPrefix
    <*> (char '"' *> anyChar `manyTill` char '"' <* tryEndOfTag
        <|> char '\'' *> anyChar `manyTill` try (char '\'' *> tryEndOfTag)
        <|> ((:) <$> satisfy (not . isSpace)
                 <*> anyChar `manyTill` tryEndOfTag))

tryEndOfTag :: Parsec String u ()
tryEndOfTag = (try (void space) *> spaces) <|> eof

-- $title
-- A title is a String delimited by 2 newlines or the end of input. Newlines
-- withing the title defition are replaced by spaces.
--
-- >>> parseMeta "title: This is an example\nwith a newline within it"
--
pTitle :: Parsec String u Meta
pTitle = Title . pack . unwords
    <$> (try (ciString "title") *> spaces *> char ':' *> spaces
    *> many ((:) <$> anyChar <*> anyChar `manyTill` tryEndOfTag))

pContext :: Parsec String u Meta
pContext = Context . unwords
    <$> (try (ciString "context") *> spaces *> char ':' *> spaces
    *> many ((:) <$> anyChar <*> anyChar `manyTill` tryEndOfTag))