module Text.HTML.Tagchup.Parser.Tag where import Text.HTML.Tagchup.Parser.Combinator (allowFail, withDefault, voidChar, dropSpaces, getPos, many, many0toN, many1toN, many1Satisfy, readUntil, satisfy, voidString, emit, modifyEmission, ) import qualified Text.HTML.Tagchup.Parser.Combinator as Parser import qualified Text.HTML.Tagchup.Parser.Status as Status import qualified Text.HTML.Tagchup.Parser.Stream as Stream import qualified Text.HTML.Tagchup.PositionTag as PosTag import qualified Text.HTML.Tagchup.Tag as Tag import qualified Text.XML.Basic.Position as Position import qualified Text.HTML.Basic.Character as HTMLChar import qualified Text.XML.Basic.ProcessingInstruction as PI import qualified Text.XML.Basic.Attribute as Attr import qualified Text.XML.Basic.Name as Name import qualified Text.XML.Basic.Tag as TagName import qualified Text.HTML.Tagchup.Character as Chr import Text.HTML.Tagchup.Character (fromChar, ) import qualified Text.HTML.Basic.Entity as HTMLEntity import qualified Control.Monad.Exception.Synchronous as Exc import Control.Monad.Trans.Writer (runWriterT, ) import Control.Monad.Trans.State (StateT(..), ) import Control.Monad (mplus, msum, when, liftM, ) import Data.Monoid (Monoid, mempty, mconcat, ) import qualified Data.Map as Map import Data.Tuple.HT (mapSnd, ) import Data.Char (isAlphaNum, chr, ord, ) import Data.Maybe (maybeToList, ) -- import qualified Numeric type Warning = (Position.T, String) type Parser source a = Parser.Full source Warning a type ParserEmit source a = Parser.Emitting source Warning a parsePosTagMergeWarnings :: (Stream.C source, StringType sink, Name.Attribute name, Name.Tag name) => StateT (Status.T source) Maybe [PosTag.T name sink] parsePosTagMergeWarnings = liftM (\((ot,ct),warns) -> ot : map (\(pos,warn) -> PosTag.cons pos $ Tag.Warning warn) warns ++ maybeToList ct) $ runWriterT parsePosTag parsePosTag :: (Stream.C source, StringType sink, Name.Attribute name, Name.Tag name) => Parser source (PosTag.T name sink, Maybe (PosTag.T name sink)) parsePosTag = do let omitClose :: Monad m => m t -> m (t, Maybe t) omitClose = liftM (\t -> (t, Nothing)) pos <- getPos mplus (do voidChar '<' allowFail $ withDefault (msum $ omitClose (parseSpecialTag pos) : omitClose (parseProcessingTag pos) : omitClose (parseCloseTag pos) : parseOpenTag pos : []) (do emitWarning pos "A '<', that is not part of a tag. Encode it as < please." omitClose (returnTag pos (Tag.Text $ stringFromChar '<')))) (omitClose (parseText pos)) {- | Parsing an open tag may also emit a close tag if the tag is self-closing, e.g. @
@. For formatting self-closing tags correctly it would be better to emit tags in the order @open tag, close tag, warnings@. However, if there are infinitely many warnings, we don't know whether a self-closing slash comes and thus whether there is a close tag or not. This implies, that we cannot even emit the warnings. Thus we choose the order @open tag, warnings, close tag@. -} parseOpenTag :: (Stream.C source, StringType sink, Name.Attribute name, Name.Tag name) => Position.T -> Parser source (PosTag.T name sink, Maybe (PosTag.T name sink)) parseOpenTag pos = do name <- parseName allowFail $ do dropSpaces tag <- returningTag pos (Tag.Open name) $ modifyEmission (restrictWarnings 10) $ many parseAttribute liftM ((,) tag) $ withDefault (do closePos <- getPos voidString "/>" allowFail $ liftM Just $ returnTag closePos (Tag.Close name)) (do junkPos <- getPos readUntilTerm (\ junk -> emitWarningWhen (not $ null junk) junkPos ("Junk in opening tag: \"" ++ junk ++ "\"")) ("Unterminated open tag \"" ++ Name.toString name ++ "\"") ">" return Nothing) parseCloseTag :: (Stream.C source, Name.Tag name) => Position.T -> Parser source (PosTag.T name sink) parseCloseTag pos = do voidChar '/' name <- parseName allowFail $ do tag <- returnTag pos (Tag.Close name) dropSpaces junkPos <- getPos readUntilTerm (\ junk -> emitWarningWhen (not $ null junk) junkPos ("Junk in closing tag: \"" ++ junk ++"\"")) ("Unterminated closing tag \"" ++ Name.toString name ++"\"") ">" return tag parseSpecialTag :: (Stream.C source, Name.Tag name) => Position.T -> Parser source (PosTag.T name sink) parseSpecialTag pos = do voidChar '!' msum $ (do voidString "--" allowFail $ readUntilTerm (\ cmt -> returnTag pos (Tag.Comment cmt)) "Unterminated comment" "-->") : (do voidString TagName.cdataString allowFail $ readUntilTerm (\ cdata -> returnTag pos (Tag.cdata cdata)) "Unterminated cdata" "]]>") : (do name <- parseName allowFail $ do dropSpaces readUntilTerm (\ info -> returnTag pos (Tag.Special name info)) ("Unterminated special tag \"" ++ Name.toString name ++ "\"") ">") : [] parseProcessingTag :: (Stream.C source, StringType sink, Name.Attribute name, Name.Tag name) => Position.T -> Parser source (PosTag.T name sink) parseProcessingTag pos = do voidChar '?' name <- parseName allowFail $ do dropSpaces returningTag pos (Tag.Processing name) $ if Name.matchAny ["xml", "xml-stylesheet"] name then do attrs <- many parseAttribute junkPos <- getPos readUntilTerm (\ junk -> emitWarningWhen (not $ null junk) junkPos ("Junk in processing info tag: \"" ++ junk ++ "\"")) ("Unterminated processing info tag \"" ++ Name.toString name ++ "\"") "?>" return $ PI.Known attrs else readUntilTerm (return . PI.Unknown) "Unterminated processing instruction" "?>" parseText :: (Stream.C source, StringType sink) => Position.T -> Parser source (PosTag.T name sink) parseText pos = returningTag pos Tag.Text (parseCharAsString (const True)) -- returningTag pos Tag.Text (parseCharAsString ('<'/=)) -- returningTag pos Tag.Text (parseString1 ('<'/=)) parseAttribute :: (Stream.C source, StringType sink, Name.Attribute name) => Parser source (Attr.T name sink) parseAttribute = parseName >>= \name -> allowFail $ do dropSpaces value <- withDefault (voidString "=" >> allowFail (dropSpaces >> parseValue)) (return mempty) dropSpaces return $ Attr.Cons name value parseName :: (Stream.C source, Name.C pname) => Parser source pname parseName = liftM Name.fromString $ many1Satisfy (\c -> isAlphaNum c || c `elem` "_-.:") parseValue :: (Stream.C source, StringType sink) => ParserEmit source sink parseValue = (msum $ parseQuoted "Unterminated doubly quoted value string" '"' : parseQuoted "Unterminated singly quoted value string" '\'' : []) `withDefault` parseUnquotedValueAsString parseUnquotedValueChar :: (Stream.C source) => ParserEmit source String parseUnquotedValueChar = let parseValueChar = do pos <- getPos str <- parseUnicodeChar (not . flip elem " >\"\'") let wrong = filter (not . isValidValueChar) str allowFail $ emitWarningWhen (not (null wrong)) pos $ "Illegal characters in unquoted value: " ++ wrong return str in liftM concat $ many parseValueChar parseUnquotedValueHTMLChar :: (Stream.C source) => ParserEmit source [HTMLChar.T] parseUnquotedValueHTMLChar = let parseValueChar = do pos <- getPos hc <- parseHTMLChar (not . flip elem " >\"\'") {- We do the check after each parseHTMLChar and not after (many parseValueChar) in order to correctly interleave warnings. -} allowFail $ mapM_ (checkUnquotedChar pos) hc return hc in liftM concat $ many parseValueChar checkUnquotedChar :: Position.T -> HTMLChar.T -> ParserEmit source () checkUnquotedChar pos x = case x of HTMLChar.Unicode c -> emitWarningWhen (not (isValidValueChar c)) pos $ "Illegal characters in unquoted value: '" ++ c : "'" _ -> return () isValidValueChar :: Char -> Bool isValidValueChar c = isAlphaNum c || c `elem` "_-:." parseQuoted :: (Stream.C source, StringType sink) => String -> Char -> Parser source sink parseQuoted termMsg quote = voidChar quote >> (allowFail $ do str <- parseString (quote/=) withDefault (voidChar quote) (do termPos <- getPos emitWarning termPos termMsg) return str) {- Instead of using 'generateTag' we could also wrap the call to 'readUntilTerm' in 'mfix' in order to emit a tag, where some information is read later. -} readUntilTerm :: (Stream.C source) => (String -> ParserEmit source a) -> String -> String -> ParserEmit source a readUntilTerm generateTag termWarning termPat = do ~(termFound,str) <- readUntil termPat result <- generateTag str termPos <- getPos emitWarningWhen (not termFound) termPos termWarning return result class Chr.C char => CharType char where parseChar :: (Stream.C source) => (Char -> Bool) -> Parser source [char] parseUnquotedValue :: (Stream.C source) => ParserEmit source [char] instance CharType Char where parseChar = parseUnicodeChar parseUnquotedValue = parseUnquotedValueChar instance CharType HTMLChar.T where parseChar = parseHTMLChar parseUnquotedValue = parseUnquotedValueHTMLChar class Monoid sink => StringType sink where stringFromChar :: Char -> sink parseCharAsString :: (Stream.C source) => (Char -> Bool) -> Parser source sink parseUnquotedValueAsString :: (Stream.C source) => ParserEmit source sink instance CharType char => StringType [char] where stringFromChar c = [fromChar c] parseCharAsString = parseChar parseUnquotedValueAsString = parseUnquotedValue parseString :: (Stream.C source, StringType sink) => (Char -> Bool) -> ParserEmit source sink parseString p = liftM mconcat $ many (parseCharAsString p) {- parseString1 :: (Stream.C source, StringType sink) => (Char -> Bool) -> Parser name source sink sink parseString1 p = liftM mconcat $ many1 (parseCharAsString p) -} parseUnicodeChar :: (Stream.C source) => (Char -> Bool) -> Parser source String parseUnicodeChar p = do pos <- getPos x <- parseHTMLChar p allowFail $ liftM concat $ mapM (htmlCharToString pos) x htmlCharToString :: Position.T -> HTMLChar.T -> ParserEmit source String htmlCharToString pos x = let returnChar c = return $ c:[] in case x of HTMLChar.Unicode c -> returnChar c HTMLChar.CharRef num -> returnChar (chr num) HTMLChar.EntityRef name -> maybe (let refName = '&':name++";" in emitWarning pos ("Unknown HTML entity " ++ refName) >> return refName) returnChar (Map.lookup name HTMLEntity.mapNameToChar) {- | Only well formed entity references are interpreted as single HTMLChars, whereas ill-formed entity references are interpreted as sequence of unicode characters without special meaning. E.g. "& ;" is considered as plain "& ;", and only "&" is considered an escaped ampersand. It is a very common error in HTML documents to not escape an ampersand. With the interpretation used here, those ampersands are left as they are. At most one warning can be emitted. -} parseHTMLChar :: (Stream.C source) => (Char -> Bool) -> Parser source [HTMLChar.T] parseHTMLChar p = do pos <- getPos c <- satisfy p allowFail $ if c=='&' then withDefault (do ent <- mplus (do voidChar '#' digits <- allowFail $ many0toN 10 (satisfy isAlphaNum) -- exclude ';', '"', '<' -- include 'x' Exc.switch (\e -> allowFail (emitWarning pos ("Error in numeric entity: " ++ e)) >> return (map HTMLChar.fromUnicode ('&':'#':digits))) (return . (:[]) . HTMLChar.CharRef . ord) (HTMLEntity.numberToChar digits)) (liftM ((:[]) . HTMLChar.EntityRef) $ many1toN 10 (satisfy isAlphaNum)) voidChar ';' return ent) (emitWarning pos "Non-terminated entity reference" >> return [HTMLChar.Unicode '&']) else return [HTMLChar.Unicode c] {- readHex :: (Num a) => String -> a readHex str = case Numeric.readHex str of [(n,"")] -> n _ -> error "readHex: no parse" {- We cannot emit specific warnings, because the sub-parsers simply fail and then throw away the warnings. -} parseHTMLCharGenericWarning :: (Stream.C source) => (Char -> Bool) -> Parser source [HTMLChar.T] parseHTMLCharGenericWarning p = do pos <- getPos c <- satisfy p allowFail $ if c=='&' then withDefault (do ent <- mplus (voidChar '#' >> liftM HTMLChar.CharRef (mplus (voidChar 'x' >> liftM readHex (many1toN 8 (satisfy isHexDigit))) (liftM read (many1toN 10 (satisfy isDigit))))) (liftM HTMLChar.EntityRef $ many1toN 10 (satisfy isAlphaNum)) voidChar ';' return [ent]) (emitWarning pos "Ill formed entity" >> return [HTMLChar.Unicode '&']) else return [HTMLChar.Unicode c] -} restrictWarnings :: Int -> [Warning] -> [Warning] restrictWarnings n = uncurry (++) . mapSnd (\rest -> case rest of (pos, _) : _ -> [(pos, "further warnings suppressed")] _ -> []) . splitAt n -- these functions have intentionally restricted types emitWarningWhen :: Bool -> Position.T -> String -> ParserEmit source () emitWarningWhen cond pos msg = when cond $ emitWarning pos msg emitWarning :: Position.T -> String -> ParserEmit source () emitWarning = curry emit returnTag :: Position.T -> Tag.T name sink -> ParserEmit source (PosTag.T name sink) returnTag p t = return (PosTag.cons p t) returningTag :: (Monad m) => Position.T -> (a -> Tag.T name sink) -> m a -> m (PosTag.T name sink) returningTag pos f = liftM (PosTag.cons pos . f)