module Text.HTML.TagStream.Text where
import Prelude hiding (mapM)
import Control.Applicative
import Control.Monad (unless, when, liftM)
import Control.Monad.Trans.Class (lift)
import Data.Traversable (mapM)
import Data.Maybe (fromMaybe)
import Data.Monoid (mconcat)
import Data.Char (isSpace)
import Data.ByteString (ByteString)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.Builder as B
import qualified Data.CaseInsensitive as CI
import qualified Data.Attoparsec.ByteString.Char8 as S
import Data.Attoparsec.Text
import Data.Conduit
#if MIN_VERSION_conduit(1, 0, 0)
import Data.Conduit.Internal (unConduitM)
#else
import Data.Conduit.Internal (pipeL)
#endif
import qualified Data.Conduit.List as C
import qualified Data.Conduit.Attoparsec as C
import qualified Data.Conduit.Text as C
import qualified Text.HTML.TagStream.ByteString as S
import Text.HTML.TagStream.Types
import Text.HTML.TagStream.Utils (splitAccum)
type Token = Token' Text
type Attr = Attr' Text
quoted :: Char -> Parser Text
quoted q = T.append <$> takeTill (in2 ('\\',q))
<*> ( char q *> pure ""
<|> char '\\' *> atLeast 1 (quoted q) )
quotedOr :: Parser Text -> Parser Text
quotedOr p = maybeP (satisfy (in2 ('"','\''))) >>=
maybe p quoted
attrValue :: Parser Text
attrValue = quotedOr $ takeTill ((=='>') ||. isSpace)
attrName :: Parser Text
attrName = quotedOr $
T.cons <$> satisfy (/='>')
<*> takeTill (in3 ('/','>','=') ||. isSpace)
tagEnd :: Parser Bool
tagEnd = char '>' *> pure False
<|> string "/>" *> pure True
attr :: Parser Attr
attr = (,) <$> attrName <* skipSpace
<*> ( boolP (char '=') >>=
cond (skipSpace *> attrValue)
(pure "")
)
attrs :: Parser ([Attr], Bool)
attrs = loop []
where
loop acc = skipSpace *> (Left <$> tagEnd <|> Right <$> attr) >>=
either
(return . (reverse acc,))
(loop . (:acc))
comment :: Parser Token
comment = Comment <$> comment'
where comment' = T.append <$> takeTill (=='-')
<*> ( string "-->" *> return ""
<|> atLeast 1 comment' )
special :: Parser Token
special = Special
<$> ( T.cons <$> satisfy (not . ((=='-') ||. isSpace))
<*> takeTill ((=='>') ||. isSpace)
<* skipSpace )
<*> takeTill (=='>') <* char '>'
tag :: Parser Token
tag = do
t <- string "/" *> return TagTypeClose
<|> string "!" *> return TagTypeSpecial
<|> return TagTypeNormal
case t of
TagTypeClose ->
TagClose <$> takeTill (=='>')
<* char '>'
TagTypeSpecial -> boolP (string "--") >>=
cond comment special
TagTypeNormal -> do
name <- takeTill (in3 ('<','>','/') ||. isSpace)
(as, close) <- attrs
return $ TagOpen name as close
incomplete :: Parser Token
incomplete = Incomplete . T.cons '<' <$> takeText
text :: Parser Token
text = Text <$> atLeast 1 (takeTill (=='<'))
token :: Parser Token
token = char '<' *> (tag <|> incomplete)
<|> text
tillScriptEnd :: Token -> Parser [Token]
tillScriptEnd t = reverse <$> loop [t]
<|> (:[]) . Incomplete . T.append script <$> takeText
where
script = L.toStrict . B.toLazyText $ showToken id t
loop acc = (:acc) <$> scriptEnd
<|> (text >>= loop . (:acc))
scriptEnd = string "</script>" *> return (TagClose "script")
html :: Parser [Token]
html = tokens <|> pure []
where
tokens :: Parser [Token]
tokens = do
t <- token
case t of
(TagOpen name _ close)
| not close && name=="script"
-> (++) <$> tillScriptEnd t <*> html
_ -> (t:) <$> html
decode :: Text -> Either String [Token]
decode = parseOnly html
atLeast :: Int -> Parser Text -> Parser Text
atLeast 0 p = p
atLeast n p = T.cons <$> anyChar <*> atLeast (n1) p
cond :: a -> a -> Bool -> a
cond a1 a2 b = if b then a1 else a2
(||.) :: Applicative f => f Bool -> f Bool -> f Bool
(||.) = liftA2 (||)
in2 :: Eq a => (a,a) -> a -> Bool
in2 (a1,a2) a = a==a1 || a==a2
in3 :: Eq a => (a,a,a) -> a -> Bool
in3 (a1,a2,a3) a = a==a1 || a==a2 || a==a3
boolP :: Parser a -> Parser Bool
boolP p = p *> pure True <|> pure False
maybeP :: Parser a -> Parser (Maybe a)
maybeP p = Just <$> p <|> return Nothing
cc :: [Text] -> B.Builder
cc = mconcat . map B.fromText
showToken :: (Text -> Text) -> Token -> B.Builder
showToken hl (TagOpen name as close) =
cc $ [hl "<", name]
++ map showAttr as
++ [hl (if close then "/>" else ">")]
where
showAttr :: Attr -> Text
showAttr (key, value) = T.concat $ [" ", key, hl "=\""] ++ map escape (T.unpack value) ++ [hl "\""]
escape '"' = "\\\""
escape '\\' = "\\\\"
escape c = T.singleton c
showToken hl (TagClose name) = cc [hl "</", name, hl ">"]
showToken _ (Text s) = B.fromText s
showToken hl (Comment s) = cc [hl "<!--", s, hl "-->"]
showToken hl (Special name s) = cc [hl "<!", name, " ", s, hl ">"]
showToken _ (Incomplete s) = B.fromText s
tokenStream :: Monad m
#if MIN_VERSION_conduit(1, 0, 0)
=> Conduit Text m Token
#else
=> GInfConduit Text m Token
#endif
tokenStream =
loop T.empty
where
#if MIN_VERSION_conduit(1, 0, 0)
loop accum = await >>= maybe (close accum ()) (push accum)
#else
loop accum = awaitE >>= either (close accum) (push accum)
#endif
push accum input =
case parseOnly html (accum `T.append` input) of
Right (splitAccum -> (accum', tokens)) -> mapM_ yield tokens >> loop accum'
Left err -> fail err
close s r = do
unless (T.null s) $ yield $ Text s
return r
tokenStreamBS :: MonadThrow m
#if MIN_VERSION_conduit(1, 0, 0)
=> Conduit ByteString m Token
#else
=> GLInfConduit ByteString m Token
#endif
tokenStreamBS = do
tk <- C.sinkParser (skipBOM *> S.skipSpace *> S.char '<' *> S.tag)
let (mencoding, yieldToken) =
case tk of
(TagOpen "?xml" as _) ->
(lookup "encoding" as, False)
_ -> (Nothing, True)
let codec = fromMaybe C.utf8 (mencoding >>= getCodec . CI.mk)
when yieldToken $ lift (mapM (decodeBS codec) tk) >>= yield
#if MIN_VERSION_conduit(1, 0, 0)
C.decode codec =$= tokenStream
#else
C.decode codec `pipeL` tokenStream
#endif
where
skipBOM :: S.Parser ()
skipBOM =
( S.string "\xff\xfe"
<|> S.string "\xef\xbb\xbf"
) *> return ()
<|> return ()
getCodec :: CI.CI ByteString -> Maybe C.Codec
getCodec c =
case c of
"utf-8" -> Just C.utf8
"utf8" -> Just C.utf8
"iso8859" -> Just C.iso8859_1
_ -> Nothing
decodeBS codec bs = liftM T.concat $ yield bs $= C.decode codec $$ C.consume