{-|
Module : Text.FromHTML
Description : Simple library for transformation of HTML to other formats
Copyright : (c) Marek Suchánek, 2018
License : MIT
Maintainer : marek.suchanek@fit.cvut.cz
Stability : experimental
Portability : POSIX
Simplified API for transformation of HTML to other formats with Pandoc
and wkhtmltopdf in Haskell code. It requires @wkhtmltopdf@ installed
locally (see <https://wkhtmltopdf.org>).
-}
module Text.FromHTML
( fromHTML
, ExportType(..)
) where
-- import Debug.Trace
import qualified Control.Exception as Exception
import qualified Data.Text as T
import qualified Data.Text.Encoding as E
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
import qualified Text.Pandoc as Pandoc
import qualified Text.Pandoc.Templates as PandocTemplates
import qualified Text.Pandoc.Writers as PandocWriters
import qualified Text.Pandoc.Error as PandocError
import qualified Text.Pandoc.PDF as PandocPDF
import GHC.IO.Handle
import System.Process
import System.IO.Unsafe
-- | Output formats that 'fromHTML' can produce.
--
-- The constructor order is significant for the derived 'Enum' and 'Bounded'
-- instances.
data ExportType
  = HTML       -- ^ The input HTML itself, returned UTF-8 encoded
  | LaTeX      -- ^ LaTeX source
  | RTF        -- ^ Rich Text Format
  | RST        -- ^ reStructuredText
  | Markdown   -- ^ Markdown
  | AsciiDoc   -- ^ AsciiDoc
  | Docx       -- ^ Docx document (binary output)
  | ODT        -- ^ ODT document (binary output)
  | DokuWiki   -- ^ DokuWiki markup
  | MediaWiki  -- ^ MediaWiki markup
  | EPUB2      -- ^ EPUB version 2 (binary output)
  | EPUB3      -- ^ EPUB version 3 (binary output)
  | PDF        -- ^ PDF rendered by the external @wkhtmltopdf@ process
  deriving (Eq, Show, Read, Enum, Bounded)
-- | A pure Pandoc writer whose output is normalised to a strict 'B.ByteString',
-- regardless of whether the underlying Pandoc writer emits text or bytes.
type Writer =
  Pandoc.WriterOptions -> Pandoc.Pandoc -> Pandoc.PandocPure B.ByteString
-- | Discard the error of an 'Either', keeping only a successful result.
--
-- The 'Show' constraint is not needed by this definition; it is kept so the
-- tracing variant below can be swapped in without touching the signature.
eitherToMaybe :: Show a => Either a b -> Maybe b
eitherToMaybe = either (const Nothing) Just
-- Variant for debugging
-- eitherToMaybe :: Show a => Either a b -> Maybe b
-- eitherToMaybe (Right x) = Just x
-- eitherToMaybe (Left x) = traceShow x Nothing
-- | Reader options used when parsing the input HTML: Pandoc's defaults with
-- 'Pandoc.readerStandalone' switched on.
readerOptions :: Pandoc.ReaderOptions
readerOptions = Pandoc.def { Pandoc.readerStandalone = True }

-- | Base writer options (Pandoc's defaults). The explicit signature pins the
-- type instead of relying on inference from the use sites.
writerOptions :: Pandoc.WriterOptions
writerOptions = Pandoc.def
-- | Convert an HTML document, given as a 'String', to the requested format.
--
-- * 'HTML' returns the input unchanged, encoded as UTF-8.
-- * 'PDF' is delegated to 'writerHTML2PDF'.
-- * Every other format is parsed with Pandoc and rendered by the matching
--   pure Pandoc writer; a failure of either step yields 'Nothing'.
fromHTML :: ExportType -> String -> Maybe B.ByteString
fromHTML extp html = case extp of
  HTML -> Just (E.encodeUtf8 (T.pack html)) -- HTML is already provided!
  PDF -> writerHTML2PDF html
  _ -> html2pd html >>= eitherToMaybe . Pandoc.runPure . runWriter extp
-- | Render a parsed document in the given format: fetch the template for the
-- format, plug it into the base writer options and run the selected writer.
runWriter :: ExportType -> Pandoc.Pandoc -> Pandoc.PandocPure B.ByteString
runWriter extp pd =
  getTemplate extp >>= \tpl ->
    writer extp (writerOptions { Pandoc.writerTemplate = tpl }) pd
-- | Load Pandoc's default template for the given export type.
--
-- Yields 'Nothing' for export types that have no template assigned here
-- (only 'PDF'). Any error raised by the template lookup stays inside
-- 'Pandoc.PandocPure'.
getTemplate :: ExportType -> Pandoc.PandocPure (Maybe String)
getTemplate = traverse PandocTemplates.getDefaultTemplate . templateName
  where
    -- Name of the default template per export type. Listing every
    -- constructor (instead of a wildcard) lets the compiler flag a newly
    -- added export type that has no decision made for it yet.
    templateName :: ExportType -> Maybe String
    templateName HTML = Just "html5"
    templateName LaTeX = Just "latex"
    templateName RTF = Just "rtf"
    templateName RST = Just "rst"
    templateName Markdown = Just "markdown"
    -- Was the empty string, which does not name any template.
    templateName AsciiDoc = Just "asciidoc"
    templateName Docx = Just "docx"
    templateName ODT = Just "odt"
    templateName DokuWiki = Just "dokuwiki"
    templateName MediaWiki = Just "mediawiki"
    templateName EPUB2 = Just "epub2"
    templateName EPUB3 = Just "epub3"
    templateName PDF = Nothing
-- | Parse an HTML 'String' into a Pandoc document using 'readerOptions'.
-- A reader error results in 'Nothing'.
html2pd :: String -> Maybe Pandoc.Pandoc
html2pd =
  eitherToMaybe . Pandoc.runPure . Pandoc.readHtml readerOptions . T.pack
-- Ugly PDF writer from HTML (disabled variant based on Pandoc's makePDF, kept for reference)
-- writerHTML2PDF opts pd = fixError . unsafePerformIO . Pandoc.runIO $ PandocPDF.makePDF "wkhtmltopdf" ["--quiet"] PandocWriters.writeHtml5String opts pd
-- where
-- fixError (Left pderr) = Left pderr
-- fixError (Right (Left bserr)) = Left . PandocError.PandocSomeError . T.unpack . E.decodeUtf8 . BL.toStrict $ bserr
-- fixError (Right (Right x)) = Right (BL.toStrict x)
-- | Pure-looking wrapper around the HTML to PDF conversion in 'html2pdf'.
--
-- Returns 'Nothing' when the conversion raises an 'Exception.IOException'
-- (for example when the @wkhtmltopdf@ process cannot be started), instead of
-- handing out a 'Just' whose contents throw once they are forced.
--
-- NOTE: this relies on 'unsafePerformIO'. The result depends on an external
-- program, so it is not referentially transparent in the strict sense; the
-- wrapper exists only to fit the pure 'Maybe' based API of this module. The
-- @NOINLINE@ pragma keeps the compiler from inlining (and thereby possibly
-- duplicating) the process invocation.
writerHTML2PDF :: String -> Maybe B.ByteString
writerHTML2PDF html =
  unsafePerformIO $ toMaybe <$> Exception.try (html2pdf html)
  where
    -- The annotation fixes the exception type that 'Exception.try' catches.
    toMaybe :: Either Exception.IOException B.ByteString -> Maybe B.ByteString
    toMaybe = either (const Nothing) Just
{-# NOINLINE writerHTML2PDF #-}
-- | Convert HTML to PDF by piping it through the external @wkhtmltopdf@
-- process.
--
-- The HTML is sent to the standard input of the process as UTF-8 bytes
-- (matching the @--encoding utf-8@ option) and the PDF is read back from its
-- standard output. An exception raised while starting or talking to the
-- process propagates to the caller.
html2pdf :: String -> IO B.ByteString
html2pdf html = do
  (Just hIn, Just hOut, _, ph) <- createProcess cprocess
  -- Encode explicitly: 'hPutStr' would use the locale encoding of the handle,
  -- which breaks non-ASCII input whenever the locale is not UTF-8.
  B.hPut hIn (E.encodeUtf8 (T.pack html))
  hClose hIn
  pdf <- B.hGetContents hOut
  -- Wait for the child so it does not linger as a zombie process. As before,
  -- the exit code is not inspected and whatever was produced is returned.
  _ <- waitForProcess ph
  return pdf
  where
    -- Both "-" arguments make wkhtmltopdf read from stdin and write to stdout.
    opts = ["--quiet", "--encoding", "utf-8", "-", "-"]
    cprocess = (proc "wkhtmltopdf" opts)
      { std_in = CreatePipe
      , std_out = CreatePipe
      }
-- | Pick the writer for an export type and adapt it to the uniform 'Writer'
-- shape: text output is UTF-8 encoded, lazy byte output is made strict.
writer :: ExportType -> Writer
writer extp opts pd = case pandocWriter extp of
  Pandoc.TextWriter toText -> E.encodeUtf8 <$> toText opts pd
  Pandoc.ByteStringWriter toBytes -> BL.toStrict <$> toBytes opts pd
-- | Map an export type to the pure Pandoc writer that produces it.
--
-- 'PDF' shares the HTML5 writer, because PDF output cannot be produced
-- within 'Pandoc.PandocPure'.
pandocWriter :: ExportType -> Pandoc.Writer Pandoc.PandocPure
pandocWriter extp = case extp of
  HTML -> html5
  LaTeX -> Pandoc.TextWriter PandocWriters.writeLaTeX
  RTF -> Pandoc.TextWriter PandocWriters.writeRTF
  RST -> Pandoc.TextWriter PandocWriters.writeRST
  Markdown -> Pandoc.TextWriter PandocWriters.writeMarkdown
  AsciiDoc -> Pandoc.TextWriter PandocWriters.writeAsciiDoc
  DokuWiki -> Pandoc.TextWriter PandocWriters.writeDokuWiki
  MediaWiki -> Pandoc.TextWriter PandocWriters.writeMediaWiki
  Docx -> Pandoc.ByteStringWriter PandocWriters.writeDocx
  ODT -> Pandoc.ByteStringWriter PandocWriters.writeODT
  EPUB2 -> Pandoc.ByteStringWriter PandocWriters.writeEPUB2
  EPUB3 -> Pandoc.ByteStringWriter PandocWriters.writeEPUB3
  PDF -> html5 -- cannot be done as PandocPure
  where
    html5 = Pandoc.TextWriter PandocWriters.writeHtml5String