-- | A module for conversion from HTML to BlazeHtml Haskell code.
--
module Main where
import Control.Monad (forM_, when)
import Control.Applicative ((<$>))
import Data.List (stripPrefix)
import Data.Maybe (listToMaybe, fromMaybe)
import Data.Char (toLower, isSpace)
import Control.Arrow (first)
import System.Environment (getArgs)
import System.FilePath (dropExtension)
import qualified Data.Map as M
import System.Console.GetOpt
import Text.HTML.TagSoup
import Util.Sanitize (sanitize)
import Util.GenerateHtmlCombinators hiding (main)
-- | Simple type to represent attributes.
--
type Attributes = [(String, String)]
-- | Intermediate tree representation. This representation contains several
-- constructors aimed at pretty-printing.
--
data Html = Parent String Attributes Html
| Block [Html]
| Text String
| Comment String
| Doctype
deriving (Show)
-- | Different combinator types.
--
data CombinatorType = ParentCombinator
| LeafCombinator
| UnknownCombinator
deriving (Eq, Show)
-- | Traverse the list of tags to produce an intermediate representation of the
-- HTML tree.
--
makeTree :: HtmlVariant -- ^ HTML variant used
-> Bool -- ^ Should ignore errors
-> [String] -- ^ Stack of open tags
-> [Tag String] -- ^ Tags to parse
-> (Html, [Tag String]) -- ^ (Result, unparsed part)
makeTree _ ignore stack []
| null stack || ignore = (Block [], [])
| otherwise = error $ "Error: tags left open at the end: " ++ show stack
makeTree variant ignore stack (TagPosition row _ : x : xs) = case x of
TagOpen tag attrs -> if toLower' tag == "!doctype"
then addHtml Doctype xs
else let tag' = toLower' tag
(inner, t) = case combinatorType variant tag' of
LeafCombinator -> (Block [], xs)
_ -> makeTree variant ignore (tag' : stack) xs
p = Parent tag' (map (first toLower') attrs) inner
in addHtml p t
-- The closing tag must match the stack. If it is a closing leaf, we can
-- ignore it
TagClose tag ->
let isLeafCombinator = combinatorType variant tag == LeafCombinator
matchesStack = listToMaybe stack == Just (toLower' tag)
in case (isLeafCombinator, matchesStack, ignore) of
-- It's a leaf combinator, don't care about this element
(True, _, _) -> makeTree variant ignore stack xs
-- It's a parent and the stack doesn't match
(False, False, False) -> error $
"Line " ++ show row ++ ": " ++ show tag ++ " closed but "
++ show stack ++ " should be closed instead."
-- Stack might not match but we ignore it anyway
(False, _, _) -> (Block [], xs)
TagText text -> addHtml (Text text) xs
TagComment comment -> addHtml (Comment comment) xs
_ -> makeTree variant ignore stack xs
where
addHtml html xs' = let (Block l, r) = makeTree variant ignore stack xs'
in (Block (html : l), r)
toLower' = map toLower
makeTree _ _ _ _ = error "TagSoup error"
-- | Remove empty text from the HTML.
--
removeEmptyText :: Html -> Html
removeEmptyText (Block b) = Block $ map removeEmptyText $ flip filter b $ \h ->
case h of Text text -> any (not . isSpace) text
_ -> True
removeEmptyText (Parent tag attrs inner) =
Parent tag attrs $ removeEmptyText inner
removeEmptyText x = x
-- | Try to eliminiate Block constructors as much as possible.
--
minimizeBlocks :: Html -> Html
minimizeBlocks (Parent t a (Block [x])) = minimizeBlocks $ Parent t a x
minimizeBlocks (Parent t a x) = Parent t a $ minimizeBlocks x
minimizeBlocks (Block x) = Block $ map minimizeBlocks x
minimizeBlocks x = x
-- | Get the type of a combinator, using a given variant.
--
combinatorType :: HtmlVariant -> String -> CombinatorType
combinatorType variant combinator
| combinator == "docTypeHtml" = ParentCombinator
| combinator `elem` parents variant = ParentCombinator
| combinator `elem` leafs variant = LeafCombinator
| otherwise = UnknownCombinator
-- | Create a special @@ parent that includes the docype.
--
joinHtmlDoctype :: Html -> Html
joinHtmlDoctype (Block (Doctype : Parent "html" attrs inner : xs)) =
Block $ Parent "docTypeHtml" attrs inner : xs
joinHtmlDoctype x = x
-- | Produce the Blaze code from the HTML. The result is a list of lines.
--
fromHtml :: HtmlVariant -- ^ Used HTML variant
-> Bool -- ^ Should ignore errors
-> Html -- ^ HTML tree
-> [String] -- ^ Resulting lines of code
fromHtml _ _ Doctype = ["docType"]
fromHtml _ _ (Text text) = ["\"" ++ concatMap escape (trim text) ++ "\""]
where
-- Remove whitespace on both ends of a string
trim = reverse . dropWhile isSpace . reverse . dropWhile isSpace
-- Escape a number of characters
escape '"' = "\\\""
escape '\n' = "\\n"
escape x = [x]
fromHtml _ _ (Comment comment) = map ("-- " ++) $ lines comment
fromHtml variant ignore (Block block) =
concatMap (fromHtml variant ignore) block
fromHtml variant ignore (Parent tag attrs inner) =
case combinatorType variant tag of
-- Actual parent tags
ParentCombinator -> case inner of
(Block ls) -> if null ls
then [combinator ++
(if null attrs then " " else " $ ") ++ "mempty"]
else (combinator ++ " $ do") :
indent (fromHtml variant ignore inner)
-- We join non-block parents for better readability.
x -> let ls = fromHtml variant ignore x
apply = if dropApply x then " " else " $ "
in case ls of (y : ys) -> (combinator ++ apply ++ y) : ys
[] -> [combinator]
-- Leaf tags
LeafCombinator -> [combinator]
-- Unknown tag
UnknownCombinator -> if ignore
then fromHtml variant ignore inner
else error $ "Tag " ++ tag ++ " is illegal in "
++ show variant
where
combinator = qualifiedSanitize "H." tag ++ attributes'
attributes' = attrs >>= \(k, v) -> case k `elem` attributes variant of
True -> " ! " ++ qualifiedSanitize "A." k ++ " " ++ show v
False -> case stripPrefix "data-" k of
Just prefix -> " ! "
++ "dataAttribute" ++ " "
++ show prefix
++ " " ++ show v
Nothing | ignore -> ""
| otherwise -> error $ "Attribute "
++ k ++ " is illegal in "
++ show variant
-- Qualifies a tag with the given qualifier if needed, and sanitizes it.
qualifiedSanitize qualifier tag' =
(if isNameClash variant tag' then qualifier else "") ++ sanitize tag'
-- Check if we can drop the apply operator ($), for readability reasons.
-- This would change:
--
-- > p $ "Some text"
--
-- Into
--
-- > p "Some text"
--
dropApply (Parent _ _ _) = False
dropApply (Block _) = False
dropApply _ = null attrs
-- | Produce the code needed for initial imports.
--
getImports :: HtmlVariant -> [String]
getImports variant =
[ "{-# LANGUAGE OverloadedStrings #-}"
, ""
, import_ "Prelude"
, qualify "Prelude" "P"
, import_ "Data.Monoid (mempty)"
, ""
, import_ h
, qualify h "H"
, import_ a
, qualify a "A"
]
where
import_ = ("import " ++)
qualify name short = "import qualified " ++ name ++ " as " ++ short
h = getModuleName variant
a = getAttributeModuleName variant
-- | Convert the HTML to blaze code.
--
blazeFromHtml :: HtmlVariant -- ^ Variant to use
-> Bool -- ^ Produce standalone code
-> Bool -- ^ Should we ignore errors
-> String -- ^ Template name
-> String -- ^ HTML code
-> String -- ^ Resulting code
blazeFromHtml variant standalone ignore name =
unlines . addSignature . fromHtml variant ignore
. joinHtmlDoctype . minimizeBlocks
. removeEmptyText . fst . makeTree variant ignore []
. parseTagsOptions parseOptions { optTagPosition = True }
where
addSignature body = if standalone then [ name ++ " :: Html"
, name ++ " = do"
] ++ indent body
else body
-- | Indent block of code.
--
indent :: [String] -> [String]
indent = map (" " ++)
-- | Main function
--
main :: IO ()
main = do
args <- getOpt Permute options <$> getArgs
case args of
(o, n, []) -> let v = getVariant o
s = standalone' o
i = ignore' o
in do imports' v o
main' v s i n
(_, _, _) -> putStr help
where
-- No files given, work with stdin
main' variant standalone ignore [] = interact $
blazeFromHtml variant standalone ignore "template"
-- Handle all files
main' variant standalone ignore files = forM_ files $ \file -> do
body <- readFile file
putStrLn $ blazeFromHtml variant standalone ignore
(dropExtension file) body
-- Print imports if needed
imports' variant opts = when (standalone' opts) $
putStrLn $ unlines $ getImports variant
-- Should we produce standalone code?
standalone' opts = ArgStandalone `elem` opts
-- Should we ignore errors?
ignore' opts = ArgIgnoreErrors `elem` opts
-- Get the variant from the options
getVariant opts = fromMaybe defaultHtmlVariant $ listToMaybe $
flip concatMap opts $ \o -> case o of (ArgHtmlVariant x) -> [x]
_ -> []
-- | Help information.
--
help :: String
help = unlines $
[ "This is a tool to convert HTML code to BlazeHtml code. It is still"
, "experimental and the results might need to be edited manually."
, ""
, "USAGE"
, ""
, " blaze-from-html [OPTIONS...] [FILES ...]"
, ""
, "When no files are given, it works as a filter."
, ""
, "EXAMPLE"
, ""
, " blaze-from-html -v html4-strict index.html"
, ""
, "This converts the index.html file to Haskell code, writing to stdout."
, ""
, "OPTIONS"
, usageInfo "" options
, "VARIANTS"
, ""
] ++
map ((" " ++) . fst) (M.toList htmlVariants) ++
[ ""
, "By default, " ++ show defaultHtmlVariant ++ " is used."
]
-- | Options for the CLI program
--
data Arg = ArgHtmlVariant HtmlVariant
| ArgStandalone
| ArgIgnoreErrors
deriving (Show, Eq)
-- | A description of the options
--
options :: [OptDescr Arg]
options =
[ Option "v" ["html-variant"] htmlVariantOption "HTML variant to use"
, Option "s" ["standalone"] (NoArg ArgStandalone) "Produce standalone code"
, Option "e" ["ignore-errors"] (NoArg ArgIgnoreErrors) "Ignore most errors"
]
where
htmlVariantOption = flip ReqArg "VARIANT" $ \name -> ArgHtmlVariant $
fromMaybe (error $ "No HTML variant called " ++ name ++ " found.")
(M.lookup name htmlVariants)
-- | The default HTML variant
--
defaultHtmlVariant :: HtmlVariant
defaultHtmlVariant = html5