module Text.HTML.TagStream.Text where
import Control.Applicative
import Control.Monad (unless, when, liftM)
import Control.Monad.Trans.Class (lift)
import Data.Char
import qualified Data.Conduit.List as CL
import Data.Default
import Prelude hiding (mapM)
import qualified Data.Attoparsec.ByteString.Char8 as S
import Data.Attoparsec.Text
import Data.ByteString (ByteString)
import qualified Data.CaseInsensitive as CI
import Data.Conduit
import Data.Functor.Identity (runIdentity)
import Data.Maybe (fromMaybe)
import Data.Monoid (mconcat)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.Builder as B
import Data.Traversable (mapM)
import qualified Text.XML.Stream.Parse as XML
#if MIN_VERSION_conduit(1, 0, 0)
#else
import Data.Conduit.Internal (pipeL)
#endif
import qualified Data.Conduit.List as C
import qualified Data.Conduit.Attoparsec as C
import qualified Data.Conduit.Text as C
import qualified Text.HTML.TagStream.ByteString as S
import Text.HTML.TagStream.Entities
import Text.HTML.TagStream.Types
import Text.HTML.TagStream.Utils (splitAccum)
type Token = Token' Text
type Attr = Attr' Text
quoted :: Char -> Parser Text
quoted q = T.append <$> takeTill (in2 ('\\',q))
<*> ( char q *> pure ""
<|> char '\\' *> atLeast 1 (quoted q) )
quotedOr :: Parser Text -> Parser Text
quotedOr p = maybeP (satisfy (in2 ('"','\''))) >>=
maybe p quoted
attrValue :: Parser Text
attrValue = quotedOr $ takeTill ((=='>') ||. isSpace)
attrName :: Parser Text
attrName = quotedOr $
T.cons <$> satisfy (/='>')
<*> takeTill (in3 ('/','>','=') ||. isSpace)
tagEnd :: Parser Bool
tagEnd = char '>' *> pure False
<|> string "/>" *> pure True
attr :: Parser Attr
attr = (,) <$> attrName <* skipSpace
<*> ( boolP (char '=') >>=
cond (skipSpace *> attrValue)
(pure "")
)
attrs :: Parser ([Attr], Bool)
attrs = loop []
where
loop acc = skipSpace *> (Left <$> tagEnd <|> Right <$> attr) >>=
either
(return . (reverse acc,))
(loop . (:acc))
comment :: Parser Token
comment = Comment <$> comment'
where comment' = T.append <$> takeTill (=='-')
<*> ( string "-->" *> return ""
<|> atLeast 1 comment' )
special :: Parser Token
special = Special
<$> ( T.cons <$> satisfy (not . ((=='-') ||. isSpace))
<*> takeTill ((=='>') ||. isSpace)
<* skipSpace )
<*> takeTill (=='>') <* char '>'
tag :: Parser Token
tag = do
t <- string "/" *> return TagTypeClose
<|> string "!" *> return TagTypeSpecial
<|> return TagTypeNormal
case t of
TagTypeClose ->
TagClose <$> takeTill (=='>')
<* char '>'
TagTypeSpecial -> boolP (string "--") >>=
cond comment special
TagTypeNormal -> do
name <- takeTill (in3 ('<','>','/') ||. isSpace)
(as, close) <- attrs
return $ TagOpen name as close
incomplete :: Parser Token
incomplete = Incomplete . T.cons '<' <$> takeText
text :: Parser Token
text = Text <$> atLeast 1 (takeTill (=='<'))
decodeEntitiesText :: Monad m => Conduit Token m Token
decodeEntitiesText =
decodeEntities
Dec { decToS = L.toStrict . B.toLazyText
, decBreak = T.break
, decBuilder = B.fromText
, decDrop = T.drop
, decEntity = decodeEntity
, decUncons = T.uncons }
where decodeEntity entity =
CL.sourceList ["&",entity,";"]
$= XML.parseText def { XML.psDecodeEntities = XML.decodeHtmlEntities }
$= CL.map snd
$$ XML.content
token :: Parser Token
token = char '<' *> (tag <|> incomplete)
<|> text
tillScriptEnd :: Token -> Parser [Token]
tillScriptEnd t = reverse <$> loop [t]
<|> (:[]) . Incomplete . T.append script <$> takeText
where
script = L.toStrict . B.toLazyText $ showToken id t
loop acc = (:acc) <$> scriptEnd
<|> (text >>= loop . (:acc))
scriptEnd = string "</script>" *> return (TagClose "script")
html :: Parser [Token]
html = tokens <|> pure []
where
tokens :: Parser [Token]
tokens = do
t <- token
case t of
(TagOpen name _ close)
| not close && name=="script"
-> (++) <$> tillScriptEnd t <*> html
_ -> (t:) <$> html
decode :: Text -> Either String [Token]
decode = fmap decodeEntitiesText' . parseOnly html
where
decodeEntitiesText' tokens = runIdentity $ mapM_ yield tokens $$ decodeEntitiesText =$ CL.consume
atLeast :: Int -> Parser Text -> Parser Text
atLeast 0 p = p
atLeast n p = T.cons <$> anyChar <*> atLeast (n1) p
cond :: a -> a -> Bool -> a
cond a1 a2 b = if b then a1 else a2
(||.) :: Applicative f => f Bool -> f Bool -> f Bool
(||.) = liftA2 (||)
in2 :: Eq a => (a,a) -> a -> Bool
in2 (a1,a2) a = a==a1 || a==a2
in3 :: Eq a => (a,a,a) -> a -> Bool
in3 (a1,a2,a3) a = a==a1 || a==a2 || a==a3
boolP :: Parser a -> Parser Bool
boolP p = p *> pure True <|> pure False
maybeP :: Parser a -> Parser (Maybe a)
maybeP p = Just <$> p <|> return Nothing
cc :: [Text] -> B.Builder
cc = mconcat . map B.fromText
showToken :: (Text -> Text) -> Token -> B.Builder
showToken hl (TagOpen name as close) =
cc $ [hl "<", name]
++ map showAttr as
++ [hl (if close then "/>" else ">")]
where
showAttr :: Attr -> Text
showAttr (key, value) = T.concat $ [" ", key, hl "=\""] ++ map escape (T.unpack value) ++ [hl "\""]
escape '"' = "\\\""
escape '\\' = "\\\\"
escape c = T.singleton c
showToken hl (TagClose name) = cc [hl "</", name, hl ">"]
showToken _ (Text s) = B.fromText s
showToken hl (Comment s) = cc [hl "<!--", s, hl "-->"]
showToken hl (Special name s) = cc [hl "<!", name, " ", s, hl ">"]
showToken _ (Incomplete s) = B.fromText s
tokenStream :: Monad m
#if MIN_VERSION_conduit(1, 0, 0)
=> Conduit Text m Token
#else
=> GInfConduit Text m Token
#endif
tokenStream =
loop T.empty =$= decodeEntitiesText
where
#if MIN_VERSION_conduit(1, 0, 0)
loop accum = await >>= maybe (close accum ()) (push accum)
#else
loop accum = awaitE >>= either (close accum) (push accum)
#endif
push accum input =
case parseOnly html (accum `T.append` input) of
Right (splitAccum -> (accum', tokens)) -> mapM_ yield tokens >> loop accum'
Left err -> fail err
close s r = do
unless (T.null s) $ yield $ Text s
return r
tokenStreamBS :: MonadThrow m
#if MIN_VERSION_conduit(1, 0, 0)
=> Conduit ByteString m Token
#else
=> GLInfConduit ByteString m Token
#endif
tokenStreamBS = do
tk <- C.sinkParser (skipBOM *> S.skipSpace *> S.char '<' *> S.tag)
let (mencoding, yieldToken) =
case tk of
(TagOpen "?xml" as _) ->
(lookup "encoding" as, False)
_ -> (Nothing, True)
let codec = fromMaybe C.utf8 (mencoding >>= getCodec . CI.mk)
when yieldToken $ lift (mapM (decodeBS codec) tk) >>= yield
#if MIN_VERSION_conduit(1, 0, 0)
C.decode codec =$= tokenStream
#else
C.decode codec `pipeL` tokenStream
#endif
where
skipBOM :: S.Parser ()
skipBOM =
( S.string "\xff\xfe"
<|> S.string "\xef\xbb\xbf"
) *> return ()
<|> return ()
getCodec :: CI.CI ByteString -> Maybe C.Codec
getCodec c =
case c of
"utf-8" -> Just C.utf8
"utf8" -> Just C.utf8
"iso8859" -> Just C.iso8859_1
_ -> Nothing
decodeBS codec bs = liftM T.concat $ yield bs $= C.decode codec $$ C.consume