{- | Detects character encoding, decodes, and convert all special characters to HTML references. CDATA sections are converted to plain text. Input is read from standard input, output is written to standard output. The encoding can be given as one argument, which then overrides encoding information within the document. Since an HTML tree is used as internal representation, in the output opening and closing tags are correctly nested. This may be an improvement, but if the input is written with bugs or strange behaviour of a particular HTML viewer in mind, this may be a loss. > escape-tidy-html utf-8 output.xhtml See also @escape-html@ example from tagchup package. -} module Main where import qualified Text.XML.HXT.DOM.Unicode as Unicode import qualified Text.HTML.WraXML.Document as HtmlDoc import qualified Text.XML.WraXML.Document.Tagchup as TagchupDoc import qualified Text.XML.WraXML.Tree as XmlTree import qualified Text.XML.WraXML.String as XmlString import qualified Text.HTML.Tagchup.Parser as Parser import qualified Text.HTML.Basic.Character as HtmlChar import qualified Text.XML.Basic.Name.LowerCase as Name import System.Environment (getArgs, ) getDecoder :: XmlString.Encoding -> XmlString.Encoded -> String getDecoder = maybe Unicode.latin1ToUnicode (fst.) . Unicode.getDecodingFct escape :: HtmlDoc.T i Name.T String -> String escape = flip HtmlDoc.format "" . fmap (map HtmlChar.asciiFromUnicode) . HtmlDoc.lift id (map XmlTree.textFromCData) main :: IO () main = do argv <- getArgs case argv of [] -> interact (escape . XmlString.evalDecodeAdaptive . HtmlDoc.decodeAdaptive getDecoder . TagchupDoc.toXmlDocument . Parser.runSoupWithPositions) [encoding] -> interact (escape . TagchupDoc.toXmlDocument . Parser.runSoupWithPositions . getDecoder encoding) _ -> putStrLn "Error: more than one decoding given"