{-# LANGUAGE OverloadedStrings, TupleSections, ViewPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE TypeFamilies #-}
module Text.HTML.TagStream.Text where
import Control.Applicative
import Control.Monad (unless, when, liftM)
import qualified Control.Monad.Fail as Fail
import Control.Monad.Trans.Class (lift)
import Control.Monad.Trans.Resource (MonadThrow)
import Data.Char
import qualified Data.Conduit.List as CL
import Data.Default
import Prelude hiding (mapM)
import qualified Data.Attoparsec.ByteString.Char8 as S
import Data.Attoparsec.Text
import Data.ByteString (ByteString)
import qualified Data.CaseInsensitive as CI
import Data.Conduit
import Data.Functor.Identity (runIdentity)
import Data.Maybe (fromMaybe)
import Data.Monoid (mconcat)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.Builder as B
import Data.Traversable (mapM)
import qualified Text.XML.Stream.Parse as XML
#if MIN_VERSION_conduit(1, 0, 0)
#else
import Data.Conduit.Internal (pipeL)
#endif
import qualified Data.Conduit.List as C
import qualified Data.Conduit.Attoparsec as C
import qualified Data.Conduit.Text as C
import qualified Text.HTML.TagStream.ByteString as S
import Text.HTML.TagStream.Entities
import Text.HTML.TagStream.Types
import Text.HTML.TagStream.Utils (splitAccum)
type Token = Token' Text
type Attr = Attr' Text
quoted :: Char -> Parser Text
quoted q = T.append <$> takeTill (in2 ('\\',q))
<*> ( char q *> pure ""
<|> char '\\' *> atLeast 1 (quoted q) )
quotedOr :: Parser Text -> Parser Text
quotedOr p = maybeP (satisfy (in2 ('"','\''))) >>=
maybe p quoted
attrValue :: Parser Text
attrValue = quotedOr $ takeTill ((=='>') ||. isSpace)
attrName :: Parser Text
attrName = quotedOr $
T.cons <$> satisfy (/='>')
<*> takeTill (in3 ('/','>','=') ||. isSpace)
tagEnd :: Parser Bool
tagEnd = char '>' *> pure False
<|> string "/>" *> pure True
attr :: Parser Attr
attr = (,) <$> attrName <* skipSpace
<*> ( boolP (char '=') >>=
cond (skipSpace *> attrValue)
(pure "")
)
attrs :: Parser ([Attr], Bool)
attrs = loop []
where
loop acc = skipSpace *> (Left <$> tagEnd <|> Right <$> attr) >>=
either
(return . (reverse acc,))
(loop . (:acc))
comment :: Parser Token
comment = Comment <$> comment'
where comment' = T.append <$> takeTill (=='-')
<*> ( string "-->" *> return ""
<|> atLeast 1 comment' )
special :: Parser Token
special = Special
<$> ( T.cons <$> satisfy (not . ((=='-') ||. isSpace))
<*> takeTill ((=='>') ||. isSpace)
<* skipSpace )
<*> takeTill (=='>') <* char '>'
tag :: Parser Token
tag = do
t <- string "/" *> return TagTypeClose
<|> string "!" *> return TagTypeSpecial
<|> return TagTypeNormal
case t of
TagTypeClose ->
TagClose <$> takeTill (=='>')
<* char '>'
TagTypeSpecial -> boolP (string "--") >>=
cond comment special
TagTypeNormal -> do
name <- takeTill (in3 ('<','>','/') ||. isSpace)
(as, close) <- attrs
return $ TagOpen name as close
incomplete :: Parser Token
incomplete = Incomplete . T.cons '<' <$> takeText
text :: Parser Token
text = Text <$> atLeast 1 (takeTill (=='<'))
decodeEntitiesText :: Monad m => Conduit Token m Token
decodeEntitiesText =
decodeEntities
Dec { decToS = L.toStrict . B.toLazyText
, decBreak = T.break
, decBuilder = B.fromText
, decDrop = T.drop
, decEntity = decodeEntity
, decUncons = T.uncons }
where decodeEntity entity =
CL.sourceList ["&",entity,";"]
$= XML.parseTextPos def { XML.psDecodeEntities = XML.decodeHtmlEntities }
$= CL.map snd
$$ XML.content
token :: Parser Token
token = char '<' *> (tag <|> incomplete)
<|> text
tillScriptEnd :: Token -> Parser [Token]
tillScriptEnd t = reverse <$> loop [t]
<|> (:[]) . Incomplete . T.append script <$> takeText
where
script = L.toStrict . B.toLazyText $ showToken id t
loop acc = (:acc) <$> scriptEnd
<|> (text >>= loop . (:acc))
scriptEnd = string "</script>" *> return (TagClose "script")
html :: Parser [Token]
html = tokens <|> pure []
where
tokens :: Parser [Token]
tokens = do
t <- token
case t of
(TagOpen name _ close)
| not close && name=="script"
-> (++) <$> tillScriptEnd t <*> html
_ -> (t:) <$> html
decode :: Text -> Either String [Token]
decode = fmap decodeEntitiesText' . parseOnly html
where
decodeEntitiesText' tokens = runIdentity $ mapM_ yield tokens $$ decodeEntitiesText =$ CL.consume
atLeast :: Int -> Parser Text -> Parser Text
atLeast 0 p = p
atLeast n p = T.cons <$> anyChar <*> atLeast (n-1) p
cond :: a -> a -> Bool -> a
cond a1 a2 b = if b then a1 else a2
(||.) :: Applicative f => f Bool -> f Bool -> f Bool
(||.) = liftA2 (||)
in2 :: Eq a => (a,a) -> a -> Bool
in2 (a1,a2) a = a==a1 || a==a2
in3 :: Eq a => (a,a,a) -> a -> Bool
in3 (a1,a2,a3) a = a==a1 || a==a2 || a==a3
boolP :: Parser a -> Parser Bool
boolP p = p *> pure True <|> pure False
maybeP :: Parser a -> Parser (Maybe a)
maybeP p = Just <$> p <|> return Nothing
cc :: [Text] -> B.Builder
cc = mconcat . map B.fromText
showToken :: (Text -> Text) -> Token -> B.Builder
showToken hl (TagOpen name as close) =
cc $ [hl "<", name]
++ map showAttr as
++ [hl (if close then "/>" else ">")]
where
showAttr :: Attr -> Text
showAttr (key, value) = T.concat $ [" ", key, hl "=\""] ++ map escape (T.unpack value) ++ [hl "\""]
escape '"' = "\\\""
escape '\\' = "\\\\"
escape c = T.singleton c
showToken hl (TagClose name) = cc [hl "</", name, hl ">"]
showToken _ (Text s) = B.fromText s
showToken hl (Comment s) = cc [hl "<!--", s, hl "-->"]
showToken hl (Special name s) = cc [hl "<!", name, " ", s, hl ">"]
showToken _ (Incomplete s) = B.fromText s
tokenStream :: Fail.MonadFail m
#if MIN_VERSION_conduit(1, 0, 0)
=> Conduit Text m Token
#else
=> GInfConduit Text m Token
#endif
tokenStream =
loop T.empty =$= decodeEntitiesText
where
#if MIN_VERSION_conduit(1, 0, 0)
loop accum = await >>= maybe (close accum ()) (push accum)
#else
loop accum = awaitE >>= either (close accum) (push accum)
#endif
push accum input =
case parseOnly html (accum `T.append` input) of
Right (splitAccum -> (accum', tokens)) -> mapM_ yield tokens >> loop accum'
Left err -> fail err
close s r = do
unless (T.null s) $ yield $ Text s
return r
tokenStreamBS :: (MonadThrow m, Fail.MonadFail m)
#if MIN_VERSION_conduit(1, 0, 0)
=> Conduit ByteString m Token
#else
=> GLInfConduit ByteString m Token
#endif
tokenStreamBS = do
tk <- C.sinkParser (skipBOM *> S.skipSpace *> S.char '<' *> S.tag)
let (mencoding, yieldToken) =
case tk of
(TagOpen "?xml" as _) ->
(lookup "encoding" as, False)
_ -> (Nothing, True)
let codec = fromMaybe C.utf8 (mencoding >>= getCodec . CI.mk)
when yieldToken $ (lift (mapM (decodeBS codec) tk) >>= yield) =$= decodeEntitiesText
#if MIN_VERSION_conduit(1, 0, 0)
C.decode codec =$= tokenStream
#else
C.decode codec `pipeL` tokenStream
#endif
where
skipBOM :: S.Parser ()
skipBOM =
( S.string "\xff\xfe"
<|> S.string "\xef\xbb\xbf"
) *> return ()
<|> return ()
getCodec :: CI.CI ByteString -> Maybe C.Codec
getCodec c =
case c of
"utf-8" -> Just C.utf8
"utf8" -> Just C.utf8
"iso8859" -> Just C.iso8859_1
_ -> Nothing
decodeBS codec bs = liftM T.concat $ yield bs $= C.decode codec $$ C.consume