{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.SelfContained Copyright : Copyright (C) 2011-2019 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane Stability : alpha Portability : portable Functions for converting an HTML file into one that can be viewed offline, by incorporating linked images, CSS, and scripts into the HTML using data URIs. -} module Text.Pandoc.SelfContained ( makeDataURI, makeSelfContained ) where import Prelude import Codec.Compression.GZip as Gzip import Control.Applicative ((<|>)) import Control.Monad.Except (throwError) import Control.Monad.Trans (lift) import Data.ByteString (ByteString) import Data.ByteString.Base64 import qualified Data.ByteString.Char8 as B import qualified Data.ByteString.Lazy as L import Data.Char (isAlphaNum, isAscii, toLower) import Data.List (isPrefixOf) import Network.URI (escapeURIString) import System.FilePath (takeDirectory, takeExtension, ()) import Text.HTML.TagSoup import Text.Pandoc.Class (PandocMonad (..), fetchItem, getInputFiles, report, setInputFiles) import Text.Pandoc.Error import Text.Pandoc.Logging import Text.Pandoc.MIME (MimeType) import Text.Pandoc.Shared (isURI, renderTags', trim) import Text.Pandoc.UTF8 (toString) import Text.Parsec (ParsecT, runParserT) import qualified Text.Parsec as P isOk :: Char -> Bool isOk c = isAscii c && isAlphaNum c makeDataURI :: (MimeType, ByteString) -> String makeDataURI (mime, raw) = if textual then "data:" ++ mime' ++ "," ++ escapeURIString isOk (toString raw) else "data:" ++ mime' ++ ";base64," ++ toString (encode raw) where textual = "text/" `Data.List.isPrefixOf` mime mime' = if textual && ';' `notElem` mime then mime ++ ";charset=utf-8" else mime -- mime type already has charset convertTags :: PandocMonad m => [Tag String] -> m [Tag String] convertTags [] = return [] convertTags (t@TagOpen{}:ts) | fromAttrib "data-external" t == "1" = (t:) <$> convertTags ts convertTags (t@(TagOpen tagname as):ts) | tagname `elem` ["img", "embed", "video", "input", "audio", "source", "track", "section"] = do as' <- mapM processAttribute as rest <- convertTags ts return $ TagOpen tagname as' : rest where processAttribute (x,y) = if x `elem` ["src", "data-src", "href", "poster", "data-background-image"] then do enc <- getDataURI (fromAttrib "type" t) y return (x, enc) else return (x,y) convertTags (t@(TagOpen "script" as):TagClose "script":ts) = case fromAttrib "src" t of [] -> (t:) <$> convertTags ts src -> do let typeAttr = fromAttrib "type" t res <- getData typeAttr src rest <- convertTags ts case res of Left dataUri -> return $ TagOpen "script" (("src",dataUri) : [(x,y) | (x,y) <- as, x /= "src"]) : TagClose "script" : rest Right (mime, bs) | ("text/javascript" `isPrefixOf` mime || "application/javascript" `isPrefixOf` mime || "application/x-javascript" `isPrefixOf` mime) && not (" return $ TagOpen "script" [("type", typeAttr)|not (null typeAttr)] : TagText (toString bs) : TagClose "script" : rest | otherwise -> return $ TagOpen "script" (("src",makeDataURI (mime, bs)) : [(x,y) | (x,y) <- as, x /= "src"]) : TagClose "script" : rest convertTags (t@(TagOpen "link" as):ts) = case fromAttrib "href" t of [] -> (t:) <$> convertTags ts src -> do res <- getData (fromAttrib "type" t) src case res of Left dataUri -> do rest <- convertTags ts return $ TagOpen "link" (("href",dataUri) : [(x,y) | (x,y) <- as, x /= "href"]) : rest Right (mime, bs) | "text/css" `isPrefixOf` mime && null (fromAttrib "media" t) && not (" do rest <- convertTags $ dropWhile (==TagClose "link") ts return $ TagOpen "style" [("type", mime)] : TagText (toString bs) : TagClose "style" : rest | otherwise -> do rest <- convertTags ts return $ TagOpen "link" (("href",makeDataURI (mime, bs)) : [(x,y) | (x,y) <- as, x /= "href"]) : rest convertTags (t:ts) = (t:) <$> convertTags ts cssURLs :: PandocMonad m => FilePath -> ByteString -> m ByteString cssURLs d orig = do res <- runParserT (parseCSSUrls d) () "css" orig case res of Left e -> do report $ CouldNotParseCSS (show e) return orig Right bs -> return bs parseCSSUrls :: PandocMonad m => FilePath -> ParsecT ByteString () m ByteString parseCSSUrls d = B.concat <$> P.many (pCSSWhite <|> pCSSComment <|> pCSSImport d <|> pCSSUrl d <|> pCSSOther) pCSSImport :: PandocMonad m => FilePath -> ParsecT ByteString () m ByteString pCSSImport d = P.try $ do P.string "@import" P.spaces res <- (pQuoted <|> pUrl) >>= handleCSSUrl d P.spaces P.char ';' P.spaces case res of Left b -> return $ B.pack "@import " <> b Right (_, b) -> return b -- Note: some whitespace in CSS is significant, so we can't collapse it! pCSSWhite :: PandocMonad m => ParsecT ByteString () m ByteString pCSSWhite = B.singleton <$> P.space <* P.spaces pCSSComment :: PandocMonad m => ParsecT ByteString () m ByteString pCSSComment = P.try $ do P.string "/*" P.manyTill P.anyChar (P.try (P.string "*/")) return B.empty pCSSOther :: PandocMonad m => ParsecT ByteString () m ByteString pCSSOther = (B.pack <$> P.many1 (P.noneOf "u/ \n\r\t")) <|> (B.singleton <$> P.char 'u') <|> (B.singleton <$> P.char '/') pCSSUrl :: PandocMonad m => FilePath -> ParsecT ByteString () m ByteString pCSSUrl d = P.try $ do res <- pUrl >>= handleCSSUrl d case res of Left b -> return b Right (mt,b) -> do let enc = makeDataURI (mt, b) return (B.pack $ "url(" ++ enc ++ ")") pQuoted :: PandocMonad m => ParsecT ByteString () m (String, ByteString) pQuoted = P.try $ do quote <- P.oneOf "\"'" url <- P.manyTill P.anyChar (P.char quote) let fallback = B.pack ([quote] ++ trim url ++ [quote]) return (url, fallback) pUrl :: PandocMonad m => ParsecT ByteString () m (String, ByteString) pUrl = P.try $ do P.string "url(" P.spaces quote <- P.option Nothing (Just <$> P.oneOf "\"'") url <- P.manyTill P.anyChar (maybe (P.lookAhead (P.char ')')) P.char quote) P.spaces P.char ')' let fallback = B.pack ("url(" ++ maybe "" (:[]) quote ++ trim url ++ maybe "" (:[]) quote ++ ")") return (url, fallback) handleCSSUrl :: PandocMonad m => FilePath -> (String, ByteString) -> ParsecT ByteString () m (Either ByteString (MimeType, ByteString)) handleCSSUrl d (url, fallback) = case escapeURIString (/='|') (trim url) of '#':_ -> return $ Left fallback 'd':'a':'t':'a':':':_ -> return $ Left fallback u -> do let url' = if isURI u then u else d u res <- lift $ getData "" url' case res of Left uri -> return $ Left (B.pack $ "url(" ++ uri ++ ")") Right (mt, raw) -> do -- note that the downloaded CSS may -- itself contain url(...). b <- if "text/css" `isPrefixOf` mt then cssURLs d raw else return raw return $ Right (mt, b) getDataURI :: PandocMonad m => MimeType -> String -> m String getDataURI mimetype src = do res <- getData mimetype src case res of Left uri -> return uri Right x -> return $ makeDataURI x getData :: PandocMonad m => MimeType -> String -> m (Either String (MimeType, ByteString)) getData _ src@('d':'a':'t':'a':':':_) = return $ Left src-- already data: uri getData mimetype src = do let ext = map toLower $ takeExtension src (raw, respMime) <- fetchItem src let raw' = if ext `elem` [".gz", ".svgz"] then B.concat $ L.toChunks $ Gzip.decompress $ L.fromChunks [raw] else raw mime <- case (mimetype, respMime) of ("",Nothing) -> throwError $ PandocSomeError $ "Could not determine mime type for `" ++ src ++ "'" (x, Nothing) -> return x (_, Just x ) -> return x result <- if "text/css" `isPrefixOf` mime then do oldInputs <- getInputFiles setInputFiles [src] res <- cssURLs (takeDirectory src) raw' setInputFiles oldInputs return res else return raw' return $ Right (mime, result) -- | Convert HTML into self-contained HTML, incorporating images, -- scripts, and CSS using data: URIs. makeSelfContained :: PandocMonad m => String -> m String makeSelfContained inp = do let tags = parseTags inp out' <- convertTags tags return $ renderTags' out'