Text/MMark/Parser.hs

-- |
-- Module      :  Text.MMark.Parser
-- Copyright   :  © 2017–2018 Mark Karpov
-- License     :  BSD 3 clause
--
-- Maintainer  :  Mark Karpov <markkarpov92@gmail.com>
-- Stability   :  experimental
-- Portability :  portable
--
-- MMark markdown parser.

{-# LANGUAGE BangPatterns              #-}
{-# LANGUAGE DataKinds                 #-}
{-# LANGUAGE FlexibleContexts          #-}
{-# LANGUAGE LambdaCase                #-}
{-# LANGUAGE NoMonomorphismRestriction #-}
{-# LANGUAGE OverloadedStrings         #-}
{-# LANGUAGE RankNTypes                #-}
{-# LANGUAGE TypeFamilies              #-}

module Text.MMark.Parser
  ( MMarkErr (..)
  , parse )
where

import Control.Applicative (Alternative, liftA2)
import Control.Monad
import Data.Bifunctor (Bifunctor (..))
import Data.Bool (bool)
import Data.HTML.Entities (htmlEntityMap)
import Data.List.NonEmpty (NonEmpty (..), (<|))
import Data.Maybe (isNothing, fromJust, fromMaybe, catMaybes, isJust)
import Data.Monoid (Any (..))
import Data.Ratio ((%))
import Data.Semigroup (Semigroup (..))
import Data.Text (Text)
import Data.Void
import Lens.Micro ((^.))
import Text.MMark.Parser.Internal
import Text.MMark.Type
import Text.MMark.Util
import Text.Megaparsec hiding (parse, State (..))
import Text.Megaparsec.Char hiding (eol)
import Text.URI (URI)
import Text.URI.Lens (uriPath)
import qualified Control.Monad.Combinators.NonEmpty as NE
import qualified Data.Char                  as Char
import qualified Data.DList                 as DList
import qualified Data.HashMap.Strict        as HM
import qualified Data.List.NonEmpty         as NE
import qualified Data.Set                   as E
import qualified Data.Text                  as T
import qualified Data.Text.Encoding         as TE
import qualified Data.Yaml                  as Yaml
import qualified Text.Email.Validate        as Email
import qualified Text.Megaparsec.Char.Lexer as L
import qualified Text.URI                   as URI

----------------------------------------------------------------------------
-- Auxiliary data types

-- | Frame that describes where we are in parsing inlines.

data InlineFrame
  = EmphasisFrame      -- ^ Emphasis with asterisk @*@
  | EmphasisFrame_     -- ^ Emphasis with underscore @_@
  | StrongFrame        -- ^ Strong emphasis with asterisk @**@
  | StrongFrame_       -- ^ Strong emphasis with underscore @__@
  | StrikeoutFrame     -- ^ Strikeout
  | SubscriptFrame     -- ^ Subscript
  | SuperscriptFrame   -- ^ Superscript
  deriving (Eq, Ord, Show)

-- | State of inline parsing that specifies whether we expect to close one
-- frame or there is a possibility to close one of two alternatives.

data InlineState
  = SingleFrame InlineFrame             -- ^ One frame to be closed
  | DoubleFrame InlineFrame InlineFrame -- ^ Two frames to be closed
  deriving (Eq, Ord, Show)

----------------------------------------------------------------------------
-- Top-level API

-- | Parse a markdown document in the form of a strict 'Text' value and
-- either report parse errors or return an 'MMark' document. Note that the
-- parser has the ability to report multiple parse errors at once.

parse
  :: FilePath
     -- ^ File name (only to be used in error messages), may be empty
  -> Text
     -- ^ Input to parse
  -> Either (NonEmpty (ParseError Char MMarkErr)) MMark
     -- ^ Parse errors or parsed document
parse file input =
  case runBParser pMMark file input of
    Left errs -> Left errs
    Right ((myaml, rawBlocks), defs) ->
      let parsed = doInline <$> rawBlocks
          doInline = fmap
            $ first (replaceEof "end of inline block")
            . runIParser defs pInlinesTop
          e2p = either DList.singleton (const DList.empty)
      in case NE.nonEmpty . DList.toList $ foldMap (foldMap e2p) parsed of
           Nothing -> Right MMark
             { mmarkYaml      = myaml
             , mmarkBlocks    = fmap fromRight <$> parsed
             , mmarkExtension = mempty }
           Just errs -> Left errs

----------------------------------------------------------------------------
-- Block parser

-- | Parse an MMark document on block level.

pMMark :: BParser (Maybe Yaml.Value, [Block Isp])
pMMark = do
  meyaml <- optional pYamlBlock
  setTabWidth (mkPos 4)
  blocks <- pBlocks
  eof
  return $ case meyaml of
    Nothing ->
      (Nothing, blocks)
    Just (Left (pos, err)) ->
      (Nothing, prependErr pos (YamlParseError err) blocks)
    Just (Right yaml) ->
      (Just yaml, blocks)

-- | Parse a YAML block. On success return the actual parsed 'Yaml.Value' in
-- 'Right', otherwise return 'SourcePos' of parse error and 'String'
-- describing the error as generated by the @yaml@ package in 'Left'.

pYamlBlock :: BParser (Either (SourcePos, String) Yaml.Value)
pYamlBlock = do
  dpos <- getPosition
  string "---" *> sc' *> eol
  let go = do
        l <- takeWhileP Nothing notNewline
        void (optional eol)
        e <- atEnd
        if e || T.stripEnd l == "---"
          then return []
          else (l :) <$> go
  ls <- go
  return $
    case (Yaml.decodeEither . TE.encodeUtf8 . T.intercalate "\n") ls of
      Left err' ->
        let (apos, err) = splitYamlError (sourceName dpos) err'
        in Left (fromMaybe dpos apos, err)
      Right v -> Right v

-- | Parse several (possibly zero) blocks in a row.

pBlocks :: BParser [Block Isp]
pBlocks = catMaybes <$> many pBlock

-- | Parse a single block of markdown document.

pBlock :: BParser (Maybe (Block Isp))
pBlock = do
  sc
  rlevel <- refLevel
  alevel <- L.indentLevel
  done   <- atEnd
  if done || alevel < rlevel then empty else
    case compare alevel (ilevel rlevel) of
      LT -> choice
        [ Just <$> pThematicBreak
        , Just <$> pAtxHeading
        , Just <$> pFencedCodeBlock
        , Just <$> pUnorderedList
        , Just <$> pOrderedList
        , Just <$> pBlockquote
        , pReferenceDef
        , Just <$> pTable
        , Just <$> pParagraph ]
      _  ->
          Just <$> pIndentedCodeBlock

-- | Parse a thematic break.

pThematicBreak :: BParser (Block Isp)
pThematicBreak = do
  l' <- lookAhead nonEmptyLine
  let l = T.filter (not . isSpace) l'
  if T.length l >= 3   &&
     (T.all (== '*') l ||
      T.all (== '-') l ||
      T.all (== '_') l)
    then ThematicBreak <$ nonEmptyLine <* sc
    else empty

-- | Parse an ATX heading.

pAtxHeading :: BParser (Block Isp)
pAtxHeading = do
  (void . lookAhead . try) hashIntro
  withRecovery recover $ do
    hlevel <- length <$> hashIntro
    sc1'
    ispPos <- getPosition
    r <- someTill (satisfy notNewline <?> "heading character") . try $
      optional (sc1' *> some (char '#') *> sc') *> (eof <|> eol)
    let toBlock = case hlevel of
          1 -> Heading1
          2 -> Heading2
          3 -> Heading3
          4 -> Heading4
          5 -> Heading5
          _ -> Heading6
    toBlock (IspSpan ispPos (T.strip (T.pack r))) <$ sc
  where
    hashIntro = count' 1 6 (char '#')
    recover err =
      Heading1 (IspError err) <$ takeWhileP Nothing notNewline <* sc

-- | Parse a fenced code block.

pFencedCodeBlock :: BParser (Block Isp)
pFencedCodeBlock = do
  alevel <- L.indentLevel
  (ch, n, infoString) <- pOpeningFence
  let content = label "code block content" (option "" nonEmptyLine <* eol)
  ls <- manyTill content (pClosingFence ch n)
  CodeBlock infoString (assembleCodeBlock alevel ls) <$ sc

-- | Parse the opening fence of a fenced code block.

pOpeningFence :: BParser (Char, Int, Maybe Text)
pOpeningFence = p '`' <|> p '~'
  where
    p ch = try $ do
      void $ count 3 (char ch)
      n  <- (+ 3) . length <$> many (char ch)
      ml <- optional
        (T.strip <$> someEscapedWith notNewline <?> "info string")
      guard (maybe True (not . T.any (== '`')) ml)
      (ch, n,
         case ml of
           Nothing -> Nothing
           Just l  ->
             if T.null l
               then Nothing
               else Just l) <$ eol

-- | Parse the closing fence of a fenced code block.

pClosingFence :: Char -> Int -> BParser ()
pClosingFence ch n =  try . label "closing code fence" $ do
  clevel <- ilevel <$> refLevel
  void $ L.indentGuard sc' LT clevel
  void $ count n (char ch)
  (void . many . char) ch
  sc'
  eof <|> eol

-- | Parse an indented code block.

pIndentedCodeBlock :: BParser (Block Isp)
pIndentedCodeBlock = do
  alevel <- L.indentLevel
  clevel <- ilevel <$> refLevel
  let go ls = do
        indented <- lookAhead $
          (>= clevel) <$> (sc *> L.indentLevel)
        if indented
          then do
            l        <- option "" nonEmptyLine
            continue <- eol'
            let ls' = ls . (l:)
            if continue
              then go ls'
              else return ls'
          else return ls
      -- NOTE This is a bit unfortunate, but it's difficult to guarantee
      -- that preceding space is not yet consumed when we get to
      -- interpreting input as an indented code block, so we need to restore
      -- the space this way.
      f x      = T.replicate (unPos alevel - 1) " " <> x
      g []     = []
      g (x:xs) = f x : xs
  ls <- g . ($ []) <$> go id
  CodeBlock Nothing (assembleCodeBlock clevel ls) <$ sc

-- | Parse an unorederd list.

pUnorderedList :: BParser (Block Isp)
pUnorderedList = do
  (bullet, bulletPos, minLevel, indLevel) <-
    pListBullet Nothing
  x  <- innerBlocks bulletPos minLevel indLevel
  xs <- many $ do
    (_, bulletPos', minLevel', indLevel') <-
      pListBullet (Just (bullet, bulletPos))
    innerBlocks bulletPos' minLevel' indLevel'
  return (UnorderedList (normalizeListItems (x:|xs)))
  where
    innerBlocks bulletPos minLevel indLevel = do
      p <- getPosition
      let tooFar = sourceLine p > sourceLine bulletPos <> pos1
          rlevel = slevel minLevel indLevel
      if tooFar || sourceColumn p < minLevel
        then return [bool Naked Paragraph tooFar emptyIspSpan]
        else subEnv True rlevel pBlocks

-- | Parse a list bullet. Return a tuple with the following components (in
-- order):
--
--     * 'Char' used to represent the bullet
--     * 'SourcePos' at which the bullet was located
--     * the closest column position where content could start
--     * the indentation level after the bullet

pListBullet
  :: Maybe (Char, SourcePos)
     -- ^ Bullet 'Char' and start position of the first bullet in a list
  -> BParser (Char, SourcePos, Pos, Pos)
pListBullet mbullet = try $ do
  pos    <- getPosition
  l      <- (<> mkPos 2) <$> L.indentLevel
  bullet <-
    case mbullet of
      Nothing -> char '-' <|> char '+' <|> char '*'
      Just (bullet, bulletPos) -> do
        guard (sourceColumn pos >= sourceColumn bulletPos)
        char bullet
  eof <|> sc1
  l'     <- L.indentLevel
  return (bullet, pos, l, l')

-- | Parse an ordered list.

pOrderedList :: BParser (Block Isp)
pOrderedList = do
  (startIx, del, startPos, minLevel, indLevel) <-
    pListIndex Nothing
  x  <- innerBlocks startPos minLevel indLevel
  xs <- manyIndexed (startIx + 1) $ \expectedIx -> do
    (actualIx, _, startPos', minLevel', indLevel') <-
      pListIndex (Just (del, startPos))
    let f blocks =
          if actualIx == expectedIx
            then blocks
            else prependErr
                   startPos'
                   (ListIndexOutOfOrder actualIx expectedIx)
                   blocks
    f <$> innerBlocks startPos' minLevel' indLevel'
  return . OrderedList startIx . normalizeListItems $
    (if startIx <= 999999999
       then x
       else prependErr startPos (ListStartIndexTooBig startIx) x)
    :| xs
  where
    innerBlocks indexPos minLevel indLevel = do
      p <- getPosition
      let tooFar = sourceLine p > sourceLine indexPos <> pos1
          rlevel = slevel minLevel indLevel
      if tooFar || sourceColumn p < minLevel
        then return [bool Naked Paragraph tooFar emptyIspSpan]
        else subEnv True rlevel pBlocks

-- | Parse a list index. Return a tuple with the following components (in
-- order):
--
--     * 'Word' parsed numeric index
--     * 'Char' used as delimiter after the numeric index
--     * 'SourcePos' at which the index was located
--     * the closest column position where content could start
--     * the indentation level after the index

pListIndex
  :: Maybe (Char, SourcePos)
     -- ^ Delimiter 'Char' and start position of the first index in a list
  -> BParser (Word, Char, SourcePos, Pos, Pos)
pListIndex mstart = try $ do
  pos <- getPosition
  i   <- L.decimal
  del <- case mstart of
    Nothing -> char '.' <|> char ')'
    Just (del, startPos) -> do
      guard (sourceColumn pos >= sourceColumn startPos)
      char del
  l   <- (<> pos1) <$> L.indentLevel
  eof <|> sc1
  l'  <- L.indentLevel
  return (i, del, pos, l, l')

-- | Parse a block quote.

pBlockquote :: BParser (Block Isp)
pBlockquote = do
  minLevel <- try $ do
    minLevel <- (<> pos1) <$> L.indentLevel
    void (char '>')
    eof <|> sc
    l <- L.indentLevel
    return $
      if l > minLevel
        then minLevel <> pos1
        else minLevel
  indLevel <- L.indentLevel
  if indLevel >= minLevel
    then do
      let rlevel = slevel minLevel indLevel
      xs <- subEnv False rlevel pBlocks
      return (Blockquote xs)
    else return (Blockquote [])

-- | Parse a link\/image reference definition and register it.

pReferenceDef :: BParser (Maybe (Block Isp))
pReferenceDef = do
  (pos, dlabel) <- try (pRefLabel <* char ':')
  withRecovery recover $ do
    sc' <* optional eol <* sc'
    uri <- pUri
    hadSpN <- optional $
      (sc1' *> option False (True <$ eol)) <|> (True <$ (sc' <* eol))
    sc'
    mtitle <-
      if isJust hadSpN
        then optional pTitle <* sc'
        else return Nothing
    case (hadSpN, mtitle) of
      (Just True,  Nothing) -> return ()
      _                     -> hidden eof <|> eol
    conflict <- registerReference dlabel (uri, mtitle)
    when conflict $ do
      setPosition pos
      customFailure (DuplicateReferenceDefinition dlabel)
    Nothing <$ sc
  where
    recover err =
      Just (Naked (IspError err)) <$ takeWhileP Nothing notNewline <* sc

-- | Parse a pipe table.

pTable :: BParser (Block Isp)
pTable = do
  (n, headerRow) <- try $ do
    let pipe' = option False (True <$ pipe)
    l <- pipe'
    headerRow <- NE.sepBy1 cell (try (pipe <* notFollowedBy eol))
    r <- pipe'
    let n = NE.length headerRow
    guard (n > 1 || l || r)
    eol <* sc'
    lookAhead nonEmptyLine >>= guard . isHeaderLike
    return (n, headerRow)
  caligns <- rowWrapper (NE.fromList <$> sepByCount n calign pipe)
  otherRows <- many $ do
    lookAhead (option True (isBlank <$> nonEmptyLine)) >>= guard . not
    rowWrapper (NE.fromList <$> sepByCount n cell pipe)
  Table caligns (headerRow :| otherRows) <$ sc
  where
    cell = do
      startPos <- getPosition
      txt      <- fmap (T.stripEnd . T.pack) . foldMany' . choice $
        [ (++) . T.unpack <$> hidden (string "\\|")
        , (:) <$> label "inline content" (satisfy cellChar) ]
      return (IspSpan startPos txt)
    cellChar x = x /= '|' && notNewline x
    rowWrapper p = do
      void (optional pipe)
      r <- p
      void (optional pipe)
      eof <|> eol
      sc'
      return r
    pipe = char '|' <* sc'
    calign = do
      let colon' = option False (True <$ char ':')
      l <- colon'
      void (count 3 (char '-') <* many (char '-'))
      r <- colon'
      sc'
      return $
        case (l, r) of
          (False, False) -> CellAlignDefault
          (True,  False) -> CellAlignLeft
          (False, True)  -> CellAlignRight
          (True,  True)  -> CellAlignCenter
    isHeaderLike txt =
      T.length (T.filter isHeaderConstituent txt) % T.length txt >
      8 % 10
    isHeaderConstituent x =
      isSpace x || x == '|' || x == '-' || x == ':'

-- | Parse a paragraph or naked text (is some cases).

pParagraph :: BParser (Block Isp)
pParagraph = do
  startPos   <- getPosition
  allowNaked <- isNakedAllowed
  rlevel     <- refLevel
  let go ls = do
        l <- lookAhead (option "" nonEmptyLine)
        broken <- succeeds . lookAhead . try $ do
          sc
          alevel <- L.indentLevel
          guard (alevel < ilevel rlevel)
          unless (alevel < rlevel) . choice $
            [ void (char '>')
            , void pThematicBreak
            , void pAtxHeading
            , void pOpeningFence
            , void (pListBullet Nothing)
            , void (pListIndex  Nothing) ]
        if isBlank l
          then return (ls, Paragraph)
          else if broken
                 then return (ls, Naked)
                 else do
                   void nonEmptyLine
                   continue <- eol'
                   let ls' = ls . (l:)
                   if continue
                     then go ls'
                     else return (ls', Naked)
  l        <- nonEmptyLine
  continue <- eol'
  (ls, toBlock) <-
    if continue
      then go id
      else return (id, Naked)
  (if allowNaked then toBlock else Paragraph)
    (IspSpan startPos (assembleParagraph (l:ls []))) <$ sc

----------------------------------------------------------------------------
-- Inline parser

-- | The top level inline parser.

pInlinesTop :: IParser (NonEmpty Inline)
pInlinesTop = do
  inlines <- pInlines
  eof <|> void pLfdr
  return inlines

-- | Parse inlines using settings from given 'InlineConfig'.

pInlines :: IParser (NonEmpty Inline)
pInlines = do
  done        <- atEnd
  allowsEmpty <- isEmptyAllowed
  if done
    then
      if allowsEmpty
        then (return . nes . Plain) ""
        else unexpEic EndOfInput
    else NE.some $ do
      mch <- lookAhead (anyChar <?> "inline content")
      case mch of
        '`' -> pCodeSpan
        '[' -> do
          allowsLinks <- isLinksAllowed
          if allowsLinks
            then pLink
            else unexpEic (Tokens $ nes '[')
        '!' -> do
          gotImage <- (succeeds . void . lookAhead . string) "!["
          allowsImages <- isImagesAllowed
          if gotImage
            then if allowsImages
                   then pImage
                   else unexpEic (Tokens . NE.fromList $ "![")
            else pPlain
        '<' -> do
          allowsLinks <- isLinksAllowed
          if allowsLinks
            then try pAutolink <|> pPlain
            else pPlain
        '\\' ->
          try pHardLineBreak <|> pPlain
        ch ->
          if isFrameConstituent ch
            then pEnclosedInline
            else pPlain

-- | Parse a code span.

pCodeSpan :: IParser Inline
pCodeSpan = do
  n <- try (length <$> some (char '`'))
  let finalizer = try $ do
        void $ count n (char '`')
        notFollowedBy (char '`')
  r <- CodeSpan . collapseWhiteSpace . T.concat <$>
    manyTill (label "code span content" $
               takeWhile1P Nothing (== '`') <|>
               takeWhile1P Nothing (/= '`'))
      finalizer
  r <$ lastChar OtherChar

-- | Parse a link.

pLink :: IParser Inline
pLink = do
  void (char '[')
  pos <- getPosition
  txt <- disallowLinks (disallowEmpty pInlines)
  void (char ']')
  (dest, mtitle) <- pLocation pos txt
  Link txt dest mtitle <$ lastChar OtherChar

-- | Parse an image.

pImage :: IParser Inline
pImage = do
  (pos, alt)    <- emptyAlt <|> nonEmptyAlt
  (src, mtitle) <- pLocation pos alt
  Image alt src mtitle <$ lastChar OtherChar
  where
    emptyAlt = do
      pos <- getPosition
      void (string "![]")
      let alt       = nes (Plain "")
          newColumn = sourceColumn pos <> mkPos 2
      return (pos { sourceColumn = newColumn }, alt)
    nonEmptyAlt = do
      void (string "![")
      pos <- getPosition
      alt <- disallowImages (disallowEmpty pInlines)
      void (char ']')
      return (pos, alt)

-- | Parse an autolink.

pAutolink :: IParser Inline
pAutolink = between (char '<') (char '>') $ do
  uri' <- URI.parser
  let (txt, uri) =
        case isEmailUri uri' of
          Nothing ->
            ( (nes . Plain . URI.render) uri'
            , uri' )
          Just email ->
            ( nes (Plain email)
            , URI.makeAbsolute mailtoScheme uri' )
  Link txt uri Nothing <$ lastChar OtherChar

-- | Parse inline content inside an enclosing construction such as emphasis,
-- strikeout, superscript, and\/or subscript markup.

pEnclosedInline :: IParser Inline
pEnclosedInline = disallowEmpty $ pLfdr >>= \case
  SingleFrame x ->
    liftFrame x <$> pInlines <* pRfdr x
  DoubleFrame x y -> do
    inlines0  <- pInlines
    thisFrame <- pRfdr x <|> pRfdr y
    let thatFrame = if thisFrame == x then y else x
    minlines1 <- optional pInlines
    void (pRfdr thatFrame)
    return . liftFrame thatFrame $
      case minlines1 of
        Nothing ->
          nes (liftFrame thisFrame inlines0)
        Just inlines1 ->
          liftFrame thisFrame inlines0 <| inlines1

-- | Parse a hard line break.

pHardLineBreak :: IParser Inline
pHardLineBreak = do
  void (char '\\')
  eol
  notFollowedBy eof
  sc'
  lastChar SpaceChar
  return LineBreak

-- | Parse plain text.

pPlain :: IParser Inline
pPlain = fmap (Plain . bakeText) . foldSome $ do
  ch <- lookAhead (anyChar <?> "inline content")
  let newline' =
        (('\n':) . dropWhile isSpace) <$ eol <* sc' <* lastChar SpaceChar
  case ch of
    '\\' -> (:) <$>
      ((escapedChar <* lastChar OtherChar) <|>
        try (char '\\' <* notFollowedBy eol <* lastChar OtherChar))
    '\n' ->
      newline'
    '\r' ->
      newline'
    '!' -> do
      notFollowedBy (string "![")
      (:) <$> char '!' <* lastChar PunctChar
    '<' -> do
      notFollowedBy pAutolink
      (:) <$> char '<' <* lastChar PunctChar
    '&' -> choice
      [ (:) <$> numRef
      , (++) . reverse <$> entityRef
      , (:) <$> char '&' ] <* lastChar PunctChar
    _ ->
      (:) <$>
        if Char.isSpace ch
          then char ch <* lastChar SpaceChar
          else if isSpecialChar ch
                 then failure
                   (Just . Tokens . nes $ ch)
                   (E.singleton . Label . NE.fromList $ "inline content")
                 else if Char.isPunctuation ch
                        then char ch <* lastChar PunctChar
                        else char ch <* lastChar OtherChar

----------------------------------------------------------------------------
-- Auxiliary inline-level parsers

-- | Parse an inline and reference-style link\/image location.

pLocation
  :: SourcePos         -- ^ Location where the content inlines start
  -> NonEmpty Inline   -- ^ The inner content inlines
  -> IParser (URI, Maybe Text) -- ^ URI and optionally title
pLocation innerPos inner = do
  mr <- optional (inplace <|> withRef)
  case mr of
    Nothing ->
      collapsed innerPos inner <|> shortcut innerPos inner
    Just (dest, mtitle) ->
      return (dest, mtitle)
  where
    inplace = do
      void (char '(')
      sc'
      dest     <- pUri
      hadSpace <- option False (True <$ sc1)
      mtitle   <- if hadSpace
        then optional pTitle <* sc'
        else return Nothing
      void (char ')')
      return (dest, mtitle)
    withRef =
      pRefLabel >>= uncurry lookupRef
    collapsed pos inlines = do
      -- NOTE We need to do these manipulations so the failure caused by
      -- 'string' "" does not overwrite our custom failures.
      pos' <- getPosition
      setPosition pos
      (void . hidden . string) "[]"
      setPosition pos'
      lookupRef pos (mkLabel inlines)
    shortcut pos inlines =
      lookupRef pos (mkLabel inlines)
    lookupRef pos dlabel =
      lookupReference dlabel >>= \case
        Left names -> do
          setPosition pos
          customFailure (CouldNotFindReferenceDefinition dlabel names)
        Right x ->
          return x
    mkLabel = T.unwords . T.words . asPlainText

-- | Parse a URI.

pUri :: (Ord e, Show e, MonadParsec e Text m) => m URI
pUri = between (char '<') (char '>') URI.parser <|> naked
  where
    naked = do
      let f x = not (isSpaceN x || x == ')')
          l   = "end of URI"
      (s, s') <- T.span f <$> getInput
      when (T.null s) . void $
        (satisfy f <?> "URI") -- this will now fail
      setInput s
      r <- region (replaceEof l) (URI.parser <* label l eof)
      setInput s'
      return r

-- | Parse a title of a link or an image.

pTitle :: MonadParsec MMarkErr Text m => m Text
pTitle = choice
  [ p '\"' '\"'
  , p '\'' '\''
  , p '('  ')' ]
  where
    p start end = between (char start) (char end) $
      let f x = x /= end
      in manyEscapedWith f "unescaped character"

-- | Parse label of a reference link.

pRefLabel :: MonadParsec MMarkErr Text m => m (SourcePos, Text)
pRefLabel = do
  try $ do
    void (char '[')
    notFollowedBy (char ']')
  pos <- getPosition
  sc
  let f x = x /= '[' && x /= ']'
  dlabel <- someEscapedWith f <?> "reference label"
  void (char ']')
  return (pos, dlabel)

-- | Parse an opening markup sequence corresponding to given 'InlineState'.

pLfdr :: IParser InlineState
pLfdr = try $ do
  pos <- getPosition
  let r st = st <$ string (inlineStateDel st)
  st <- hidden $ choice
    [ r (DoubleFrame StrongFrame StrongFrame)
    , r (DoubleFrame StrongFrame EmphasisFrame)
    , r (SingleFrame StrongFrame)
    , r (SingleFrame EmphasisFrame)
    , r (DoubleFrame StrongFrame_ StrongFrame_)
    , r (DoubleFrame StrongFrame_ EmphasisFrame_)
    , r (SingleFrame StrongFrame_)
    , r (SingleFrame EmphasisFrame_)
    , r (DoubleFrame StrikeoutFrame StrikeoutFrame)
    , r (DoubleFrame StrikeoutFrame SubscriptFrame)
    , r (SingleFrame StrikeoutFrame)
    , r (SingleFrame SubscriptFrame)
    , r (SingleFrame SuperscriptFrame) ]
  let dels = inlineStateDel st
      failNow = do
        setPosition pos
        (customFailure . NonFlankingDelimiterRun . toNesTokens) dels
  lch <- getLastChar
  rch <- getNextChar OtherChar
  when (lch >= rch) failNow
  return st

-- | Parse a closing markup sequence corresponding to given 'InlineFrame'.

pRfdr :: InlineFrame -> IParser InlineFrame
pRfdr frame = try $ do
  let dels = inlineFrameDel frame
      expectingInlineContent = region $ \case
        TrivialError pos us es -> TrivialError pos us $
          E.insert (Label $ NE.fromList "inline content") es
        other -> other
  pos <- getPosition
  (void . expectingInlineContent . string) dels
  let failNow = do
        setPosition pos
        (customFailure . NonFlankingDelimiterRun . toNesTokens) dels
  lch <- getLastChar
  rch <- getNextChar SpaceChar
  when (lch <= rch) failNow
  return frame

-- | Get 'CharType' of the next char in the input stream.

getNextChar
  :: CharType          -- ^ What we should consider frame constituent characters
  -> IParser CharType
getNextChar frameType = lookAhead (option SpaceChar (charType <$> anyChar))
  where
    charType ch
      | isFrameConstituent ch = frameType
      | Char.isSpace       ch = SpaceChar
      | ch == '\\'            = OtherChar
      | Char.isPunctuation ch = PunctChar
      | otherwise             = OtherChar

----------------------------------------------------------------------------
-- Parsing helpers

manyIndexed :: (Alternative m, Num n) => n -> (n -> m a) -> m [a]
manyIndexed n' m = go n'
  where
    go !n = liftA2 (:) (m n) (go (n + 1)) <|> pure []

foldMany :: MonadPlus m => m (a -> a) -> m (a -> a)
foldMany f = go id
  where
    go g =
      optional f >>= \case
        Nothing -> pure g
        Just h  -> go (h . g)

foldMany' :: MonadPlus m => m ([a] -> [a]) -> m [a]
foldMany' f = ($ []) <$> go id
  where
    go g =
      optional f >>= \case
        Nothing -> pure g
        Just h  -> go (g . h)

foldSome :: MonadPlus m => m (a -> a) -> m (a -> a)
foldSome f = liftA2 (flip (.)) f (foldMany f)

foldSome' :: MonadPlus m => m ([a] -> [a]) -> m [a]
foldSome' f = liftA2 ($) f (foldMany' f)

sepByCount :: MonadPlus m => Int -> m a -> m sep -> m [a]
sepByCount 0 _ _   = pure []
sepByCount n p sep = liftA2 (:) p (count (n - 1) (sep *> p))

nonEmptyLine :: BParser Text
nonEmptyLine = takeWhile1P Nothing notNewline

manyEscapedWith :: MonadParsec MMarkErr Text m
  => (Char -> Bool)
  -> String
  -> m Text
manyEscapedWith f l = fmap T.pack . foldMany' . choice $
  [ (:) <$> escapedChar
  , (:) <$> numRef
  , (++) . reverse <$> entityRef
  , (:) <$> satisfy f <?> l ]

someEscapedWith :: MonadParsec MMarkErr Text m
  => (Char -> Bool)
  -> m Text
someEscapedWith f = fmap T.pack . foldSome' . choice $
  [ (:) <$> escapedChar
  , (:) <$> numRef
  , (++) . reverse <$> entityRef
  , (:) <$> satisfy f ]

escapedChar :: MonadParsec e Text m => m Char
escapedChar = label "escaped character" $
  try (char '\\' *> satisfy isAsciiPunctuation)

-- | Parse an HTML5 entity reference.

entityRef :: MonadParsec MMarkErr Text m => m String
entityRef = do
  pos  <- getPosition
  let f (TrivialError _ us es) = TrivialError (nes pos) us es
      f (FancyError   _ xs)    = FancyError   (nes pos) xs
  name <- try . region f $ between (char '&') (char ';')
    (takeWhile1P Nothing Char.isAlphaNum <?> "HTML5 entity name")
  case HM.lookup name htmlEntityMap of
    Nothing -> do
      setPosition pos
      customFailure (UnknownHtmlEntityName name)
    Just txt -> return (T.unpack txt)

-- | Parse a numeric character using the given numeric parser.

numRef :: MonadParsec MMarkErr Text m => m Char
numRef = do
  pos <- getPosition
  let f = between (string "&#") (char ';')
  n   <- try (f (char' 'x' *> L.hexadecimal)) <|> f L.decimal
  if n == 0 || n > fromEnum (maxBound :: Char)
    then do
      setPosition pos
      customFailure (InvalidNumericCharacter n)
    else return (Char.chr n)

sc :: MonadParsec e Text m => m ()
sc = void $ takeWhileP (Just "white space") isSpaceN

sc1 :: MonadParsec e Text m => m ()
sc1 = void $ takeWhile1P (Just "white space") isSpaceN

sc' :: MonadParsec e Text m => m ()
sc' = void $ takeWhileP (Just "white space") isSpace

sc1' :: MonadParsec e Text m => m ()
sc1' = void $ takeWhile1P (Just "white space") isSpace

eol :: MonadParsec e Text m => m ()
eol = void . label "newline" $ choice
  [ string "\n"
  , string "\r\n"
  , string "\r" ]

eol' :: MonadParsec e Text m => m Bool
eol' = option False (True <$ eol)

----------------------------------------------------------------------------
-- Char classification

isSpace :: Char -> Bool
isSpace x = x == ' ' || x == '\t'

isSpaceN :: Char -> Bool
isSpaceN x = isSpace x || isNewline x

isNewline :: Char -> Bool
isNewline x = x == '\n' || x == '\r'

notNewline :: Char -> Bool
notNewline = not . isNewline

isFrameConstituent :: Char -> Bool
isFrameConstituent = \case
  '*' -> True
  '^' -> True
  '_' -> True
  '~' -> True
  _   -> False

isMarkupChar :: Char -> Bool
isMarkupChar x = isFrameConstituent x || f x
  where
    f = \case
      '[' -> True
      ']' -> True
      '`' -> True
      _   -> False

isSpecialChar :: Char -> Bool
isSpecialChar x = isMarkupChar x || x == '\\' || x == '!' || x == '<'

isAsciiPunctuation :: Char -> Bool
isAsciiPunctuation x =
  (x >= '!' && x <= '/') ||
  (x >= ':' && x <= '@') ||
  (x >= '[' && x <= '`') ||
  (x >= '{' && x <= '~')

----------------------------------------------------------------------------
-- Other helpers

slevel :: Pos -> Pos -> Pos
slevel a l = if l >= ilevel a then a else l

ilevel :: Pos -> Pos
ilevel = (<> mkPos 4)

isBlank :: Text -> Bool
isBlank = T.all isSpace

assembleCodeBlock :: Pos -> [Text] -> Text
assembleCodeBlock indent ls = T.unlines (stripIndent indent <$> ls)

stripIndent :: Pos -> Text -> Text
stripIndent indent txt = T.drop m txt
  where
    m = snd $ T.foldl' f (0, 0) (T.takeWhile isSpace txt)
    f (!j, !n) ch
      | j  >= i    = (j, n)
      | ch == ' '  = (j + 1, n + 1)
      | ch == '\t' = (j + 4, n + 1)
      | otherwise  = (j, n)
    i = unPos indent - 1

assembleParagraph :: [Text] -> Text
assembleParagraph = go
  where
    go []     = ""
    go [x]    = T.dropWhileEnd isSpace x
    go (x:xs) = x <> "\n" <> go xs

collapseWhiteSpace :: Text -> Text
collapseWhiteSpace =
  T.stripEnd . T.filter (/= '\0') . snd . T.mapAccumL f True
  where
    f seenSpace ch =
      case (seenSpace, g ch) of
        (False, False) -> (False, ch)
        (True,  False) -> (False, ch)
        (False, True)  -> (True,  ' ')
        (True,  True)  -> (True,  '\0')
    g ' '  = True
    g '\t' = True
    g '\n' = True
    g _    = False

inlineStateDel :: InlineState -> Text
inlineStateDel = \case
  SingleFrame x   -> inlineFrameDel x
  DoubleFrame x y -> inlineFrameDel x <> inlineFrameDel y

liftFrame :: InlineFrame -> NonEmpty Inline -> Inline
liftFrame = \case
  StrongFrame      -> Strong
  EmphasisFrame    -> Emphasis
  StrongFrame_     -> Strong
  EmphasisFrame_   -> Emphasis
  StrikeoutFrame   -> Strikeout
  SubscriptFrame   -> Subscript
  SuperscriptFrame -> Superscript

inlineFrameDel :: InlineFrame -> Text
inlineFrameDel = \case
  EmphasisFrame    -> "*"
  EmphasisFrame_   -> "_"
  StrongFrame      -> "**"
  StrongFrame_     -> "__"
  StrikeoutFrame   -> "~~"
  SubscriptFrame   -> "~"
  SuperscriptFrame -> "^"

replaceEof :: forall e. Show e => String -> ParseError Char e -> ParseError Char e
replaceEof altLabel = \case
  TrivialError pos us es -> TrivialError pos (f <$> us) (E.map f es)
  FancyError   pos xs    -> FancyError pos xs
  where
    f EndOfInput = Label (NE.fromList altLabel)
    f x          = x

isEmailUri :: URI -> Maybe Text
isEmailUri uri =
  case URI.unRText <$> uri ^. uriPath of
    [x] ->
      if Email.isValid (TE.encodeUtf8 x) &&
          (isNothing (URI.uriScheme uri) ||
           URI.uriScheme uri == Just mailtoScheme)
        then Just x
        else Nothing
    _ -> Nothing

splitYamlError :: FilePath -> String -> (Maybe SourcePos, String)
splitYamlError file str = maybe (Nothing, str) (first pure) (parseMaybe p str)
  where
    p :: Parsec Void String (SourcePos, String)
    p = do
      void (string "YAML parse exception at line ")
      l <- mkPos . (+ 2) <$> L.decimal
      void (string ", column ")
      c <- mkPos . (+ 1) <$> L.decimal
      void (string ":\n")
      r <- takeRest
      return (SourcePos file l c, r)

emptyIspSpan :: Isp
emptyIspSpan = IspSpan (initialPos "") ""

normalizeListItems :: NonEmpty [Block Isp] -> NonEmpty [Block Isp]
normalizeListItems xs' =
  if getAny $ foldMap (foldMap (Any . isParagraph)) (drop 1 x :| xs)
    then fmap toParagraph <$> xs'
    else case x of
           [] -> xs'
           (y:ys) -> r $ (toNaked y : ys) :| xs
  where
    (x:|xs) = r xs'
    r = NE.reverse . fmap reverse
    isParagraph = \case
      OrderedList _ _ -> False
      UnorderedList _ -> False
      Naked         _ -> False
      _               -> True
    toParagraph (Naked inner) = Paragraph inner
    toParagraph other         = other
    toNaked (Paragraph inner) = Naked inner
    toNaked other             = other

succeeds :: Alternative m => m () -> m Bool
succeeds m = True <$ m <|> pure False

prependErr :: SourcePos -> MMarkErr -> [Block Isp] -> [Block Isp]
prependErr pos custom blocks = Naked (IspError err) : blocks
  where
    err = FancyError (nes pos) (E.singleton $ ErrorCustom custom)

mailtoScheme :: URI.RText 'URI.Scheme
mailtoScheme = fromJust (URI.mkScheme "mailto")

toNesTokens :: Text -> NonEmpty Char
toNesTokens = NE.fromList . T.unpack

unexpEic :: MonadParsec e Text m => ErrorItem Char -> m a
unexpEic x = failure
  (Just x)
  (E.singleton . Label . NE.fromList $ "inline content")

nes :: a -> NonEmpty a
nes a = a :| []

fromRight :: Either a b -> b
fromRight (Right x) = x
fromRight _         =
  error "Text.MMark.Parser.fromRight: the impossible happened"

bakeText :: (String -> String) -> Text
bakeText = T.pack . reverse . ($ [])