{-|
This is a tag soup parser with a custom tag data structure.

The parser works only on proper Unicode texts,
that is, you must have decoded it before,
e.g. using decoding functions from hxt or encoding package.
-}
module Text.HTML.TagSoup.HT.Parser (
    CharType,
    runSoup, runSoupWithPositions, runSoupWithPositionsName,
    runTag, runInnerOfTag,
  ) where

import Text.HTML.TagSoup.HT.Parser.Combinator
   (allowFail, withDefault,
    char, dropSpaces, getPos,
    many, many1, many1Satisfy, readUntil,
    satisfy, string,
    emit, mfix, )

import qualified Text.HTML.TagSoup.HT.Parser.Combinator as Parser

import qualified Text.HTML.TagSoup.HT.Tag         as Tag
import qualified Text.HTML.TagSoup.HT.PositionTag as PosTag
import qualified Text.HTML.TagSoup.HT.Position    as Position
import qualified Text.HTML.TagSoup.HT.HTMLChar    as HTMLChar

import qualified Text.HTML.TagSoup.Entity as HTMLEntity

import Control.Monad (mplus, msum, when, liftM, )

-- import Control.Monad.Identity (Identity(..), )

import Data.Char (isAlphaNum, isAscii, isDigit, chr, )
import Data.Maybe (fromMaybe, )



-- * run parser in several ways

{- |
Parse a single tag, throws an error if there is a syntax error.
This is useful for parsing a match pattern.
-}
runTag :: (CharType char, Show char) =>
   String -> Tag.T char
runTag str =
   let tags =
          fromMaybe (error "runTag: no parse at all") $
          Parser.write "string" parsePosTag str
       makeError = error $
          "runTag: parsing results in\n" ++
          unlines (map show tags)
   in  case tags of
          [(_,tag)] ->
              if Tag.isWarning tag
                then makeError
                else tag
          _ -> makeError

{- |
Parse the inner of a single tag.
That is, @runTag \"\<bla\>\"@ is the same as @runInnerOfTag \"\<bla\>\"@.
-}
runInnerOfTag :: (CharType char, Show char) =>
   String -> Tag.T char
runInnerOfTag str = runTag $ "<"++str++">"



runSoupWithPositionsName :: CharType char =>
   FilePath -> String -> [PosTag.T char]
runSoupWithPositionsName fileName =
   Parser.runIdentity .
   Parser.write fileName (many parsePosTag >> return ())


-- | Parse an HTML document to a list of 'Tag.T'.
-- Automatically expands out escape characters.
runSoupWithPositions :: CharType char =>
   String -> [PosTag.T char]
runSoupWithPositions =
   Parser.runIdentity .
   Parser.write "input" (many parsePosTag >> return ())

-- | Like 'runSoupWithPositions' but hides source file positions.
runSoup :: CharType char => String -> [Tag.T char]
runSoup = map snd . runSoupWithPositions



-- * parser parts

type Parser     char a = Parser.Full     (PosTag.T char) a
type ParserEmit char a = Parser.Emitting (PosTag.T char) a


parsePosTag :: CharType char => Parser char ()
parsePosTag = do
   pos <- getPos
   mplus
      (do char '<'
          allowFail $ withDefault
             (msum $
                parseSpecialTag pos :
                parseProcessingTag pos :
                parseCloseTag pos :
                parseOpenTag pos :
                [])
             (do emitTag pos (Tag.Text [fromChar '<'])
                 emitWarning pos "A '<', that is not part of a tag. Encode it as &lt; please."))
      (parseText pos)


parseOpenTag :: CharType char => Position.T -> Parser char ()
parseOpenTag pos =
   do name <- parseName
      allowFail $
         do dropSpaces
            mfix
               (\attrs ->
                  emit (pos, Tag.Open name attrs) >>
                  many parseAttribute)
            withDefault
               (do closePos <- getPos
                   string "/>"
                   allowFail $ emitTag closePos (Tag.Close name))
               (do junkPos <- getPos
                   readUntilTerm
                      (\ junk ->
                         emitWarningWhen
                            (not $ null junk)
                            junkPos ("Junk in opening tag: \"" ++ junk ++ "\""))
                      ("Unterminated open tag \"" ++ name ++ "\"") ">")

parseCloseTag :: Position.T -> Parser char ()
parseCloseTag pos =
   do char '/'
      name <- parseName
      allowFail $
         do emitTag pos (Tag.Close name)
            dropSpaces
            junkPos <- getPos
            readUntilTerm
               (\ junk ->
                  emitWarningWhen
                     (not $ null junk)
                     junkPos ("Junk in closing tag: \"" ++ junk ++"\""))
               ("Unterminated closing tag \"" ++ name ++"\"") ">"

parseSpecialTag :: Position.T -> Parser char ()
parseSpecialTag pos =
   do char '!'
      msum $
       (do string "--"
           allowFail $ readUntilTerm
              (\ cmt -> emitTag pos (Tag.Comment cmt))
              "Unterminated comment" "-->") :
       (do string "[CDATA["
           allowFail $ readUntilTerm
              (\ cdata -> emitTag pos (Tag.Special "[CDATA[" cdata))
              "Unterminated cdata" "]]>") :
       (do name <- many1Satisfy isAlphaNum
           allowFail $
              do dropSpaces
                 readUntilTerm
                    (\ info -> emitTag pos (Tag.Special name info))
                    ("Unterminated special tag \"" ++ name ++ "\"") ">") :
       []

parseProcessingTag :: CharType char => Position.T -> Parser char ()
parseProcessingTag pos =
   do char '?'
      name <- parseName
      allowFail $
         do dropSpaces
            mfix
               (\proc ->
                  emit (pos, Tag.Processing name proc) >>
                  if elem name ["xml", "xml-stylesheet"]
                    then
                      do attrs <- many parseAttribute
                         junkPos <- getPos
                         readUntilTerm
                            (\ junk ->
                               emitWarningWhen (not $ null junk) junkPos
                                  ("Junk in processing info tag: \"" ++ junk ++ "\""))
                            ("Unterminated processing info tag \"" ++ name ++ "\"") "?>"
                         return $ Tag.KnownProcessing attrs
                    else readUntilTerm (return . Tag.UnknownProcessing)
                            "Unterminated processing instruction" "?>")
            return ()

parseText :: CharType char => Position.T -> Parser char ()
parseText pos =
   mfix
     (\ text ->
        allowFail (emitTag pos (Tag.Text text)) >>
        parseString1 ('<'/=))
     >> return ()


parseAttribute :: CharType char => Parser char (Tag.Attribute char)
parseAttribute =
   parseName >>= \name -> allowFail $
   do dropSpaces
      value <-
         withDefault
            (string "=" >> allowFail (dropSpaces >> parseValue))
            (return [])
      dropSpaces
      return (name, value)

parseName :: Parser char String
parseName =
   -- we must restrict to ASCII alphanum characters in order to exclude umlauts
   many1Satisfy (\c -> isAlphaNum c && isAscii c || c `elem` "_-:")

parseValue :: CharType char => ParserEmit char [char]
parseValue =
   (msum $
      parseQuoted "Unterminated doubly quoted value string" '"' :
      parseQuoted "Unterminated singly quoted value string" '\'' :
      [])
   `withDefault`
   parseUnquotedValue

parseUnquotedValueChar :: ParserEmit Char String
parseUnquotedValueChar =
   let parseValueChar =
          do pos <- getPos
             str <- parseChar (not . flip elem " >\"\'")
             let wrong = filter (not . isValidValueChar) str
             allowFail $
                emitWarningWhen (not (null wrong)) pos $
                "Illegal characters in unquoted value: " ++ wrong
             return str
   in  liftM concat $ many parseValueChar

parseUnquotedValueHTMLChar :: ParserEmit HTMLChar.T [HTMLChar.T]
parseUnquotedValueHTMLChar =
   let parseValueChar =
          do pos <- getPos
             hc <- parseHTMLChar (not . flip elem " >\"\'")
             case hc of
                HTMLChar.Char c ->
                   allowFail $
                   emitWarningWhen (not (isValidValueChar c)) pos $
                      "Illegal characters in unquoted value: '" ++ c : "'"
                _ -> return ()
             return hc
   in  many parseValueChar

isValidValueChar :: Char -> Bool
isValidValueChar c  =  isAlphaNum c || c `elem` "_-:."

parseQuoted :: CharType char => String -> Char -> Parser char [char]
parseQuoted termMsg quote =
   char quote >>
   (allowFail $
    do str <- parseString (quote/=)
       withDefault
          (char quote >> return ())
          (do termPos <- getPos
              emitWarning termPos termMsg)
       return str)

{-
Instead of using 'generateTag' we could also wrap the call to 'readUntilTerm'
in 'mfix' in order to emit a tag, where some information is read later.
-}
readUntilTerm ::
   (String -> ParserEmit char a) -> String -> String -> ParserEmit char a
readUntilTerm generateTag termWarning termPat =
   do ~(termFound,str) <- readUntil termPat
      result <- generateTag str
      termPos <- getPos
      emitWarningWhen (not termFound) termPos termWarning
      return result


class CharType char where
   fromChar :: Char -> char
   parseString  :: (Char -> Bool) -> ParserEmit char [char]
   parseString1 :: (Char -> Bool) -> Parser     char [char]
   parseUnquotedValue :: ParserEmit char [char]

instance CharType Char where
   fromChar = id
   parseString  p = liftM concat $ many  (parseChar p)
   parseString1 p = liftM concat $ many1 (parseChar p)
   parseUnquotedValue = parseUnquotedValueChar

instance CharType HTMLChar.T where
   fromChar = HTMLChar.Char
   parseString  p = many  (parseHTMLChar p)
   parseString1 p = many1 (parseHTMLChar p)
   parseUnquotedValue = parseUnquotedValueHTMLChar


parseChar :: (Char -> Bool) -> Parser char String
parseChar p =
   do pos <- getPos
      x <- parseHTMLChar p
      let returnChar c = return $ c:[]
      allowFail $
         case x of
            HTMLChar.Char c -> returnChar c
            HTMLChar.NumericRef num -> returnChar (chr num)
            HTMLChar.NamedRef name ->
               maybe
                  (let refName = '&':name++";"
                   in  emitWarning pos ("Unknown HTML entity " ++ refName) >>
                       return refName)
                  (returnChar . chr)
                  (lookup name HTMLEntity.htmlEntities)


parseHTMLChar :: (Char -> Bool) -> Parser char HTMLChar.T
parseHTMLChar p =
   do pos <- getPos
      c <- satisfy p
      allowFail $
        if c=='&'
          then
            withDefault
              (do ent <-
                     mplus
                        (char '#' >>
                         liftM (HTMLChar.NumericRef . read) (many1Satisfy isDigit))
                        (liftM HTMLChar.NamedRef $ many1Satisfy isAlphaNum)
                  char ';'
                  return ent)
              (emitWarning pos "Ill formed entity" >>
               return (HTMLChar.Char '&'))
          else return (HTMLChar.Char c)


emitWarningWhen :: Bool -> Position.T -> String -> ParserEmit char ()
emitWarningWhen cond pos msg =
   when cond $ emitWarning pos msg

emitWarning :: Position.T -> String -> ParserEmit char ()
emitWarning pos msg = emitTag pos (Tag.Warning msg)

emitTag :: Position.T -> Tag.T char -> ParserEmit char ()
emitTag = curry emit