{-# LANGUAGE OverloadedStrings, TupleSections, ViewPatterns #-} {-# LANGUAGE TypeFamilies #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE CPP #-} module Text.HTML.TagStream ( Token (..) , tokenStream ) where import Control.Applicative import Control.Monad (unless) import Control.Monad.Trans.Resource (MonadThrow) import Data.Char import qualified Data.Conduit.List as CL import Data.Attoparsec.Text import Data.Conduit import qualified Data.Conduit.Attoparsec as CA import Data.Maybe (fromMaybe, isJust) import Data.Monoid ((<>)) import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Lazy as L import qualified Data.Text.Lazy.Builder as B import qualified Text.XML.Stream.Parse as XML import Data.Map (Map) import qualified Data.Map.Strict as Map import Control.Arrow (first) data Token = TagOpen Text (Map Text Text) Bool | TagClose Text | Text Text | Comment Text | Special Text Text | Incomplete Text deriving (Eq, Show) data TagType = TagTypeClose | TagTypeSpecial | TagTypeNormal {-- - match quoted string, can fail. -} quoted :: Char -> Parser Text quoted q = T.append <$> takeTill (in2 ('\\',q)) <*> ( char q *> pure "" <|> char '\\' *> atLeast 1 (quoted q) ) quotedOr :: Parser Text -> Parser Text quotedOr p = maybeP (satisfy (in2 ('"','\''))) >>= maybe p quoted {-- - attribute value, can't fail. -} attrValue :: Parser Text attrValue = quotedOr $ takeTill ((=='>') ||. isSpace) {-- - attribute name, at least one char, can fail when meet tag end. - might match self-close tag end "/>" , make sure match `tagEnd' first. -} attrName :: Parser Text attrName = quotedOr $ T.cons <$> satisfy (/='>') <*> takeTill (in3 ('/','>','=') ||. isSpace) {-- - tag end, return self-close or not, can fail. -} tagEnd :: Parser Bool tagEnd = char '>' *> pure False <|> string "/>" *> pure True {-- - attribute pair or tag end, can fail if tag end met. -} attr :: Parser (Text, Text) attr = (,) <$> attrName <* skipSpace <*> ( boolP (char '=') >>= cond (skipSpace *> attrValue) (pure "") ) {-- - all attributes before tag end. can't fail. -} attrs :: Parser (Map Text Text, Bool) attrs = loop Map.empty where loop acc = skipSpace *> (Left <$> tagEnd <|> Right <$> attr) >>= either (return . (acc,)) (\(key, value) -> loop $ Map.insert key value acc) {-- - comment tag without prefix. -} comment :: Parser Token comment = Comment <$> comment' where comment' = T.append <$> takeTill (=='-') <*> ( string "-->" *> return "" <|> atLeast 1 comment' ) {-- - tags begine with -} special :: Parser Token special = Special <$> ( T.cons <$> satisfy (not . ((=='-') ||. isSpace)) <*> takeTill ((=='>') ||. isSpace) <* skipSpace ) <*> takeTill (=='>') <* char '>' {-- - parse a tag, can fail. -} tag :: Parser Token tag = do t <- string "/" *> return TagTypeClose <|> string "!" *> return TagTypeSpecial <|> return TagTypeNormal case t of TagTypeClose -> TagClose <$> takeTill (=='>') <* char '>' TagTypeSpecial -> boolP (string "--") >>= cond comment special TagTypeNormal -> do name <- takeTill (in3 ('<','>','/') ||. isSpace) (as, close) <- attrs return $ TagOpen name (Map.map decodeString as) close {-- - record incomplete tag for streamline processing. -} incomplete :: Parser Token incomplete = Incomplete . T.cons '<' <$> takeText {-- - parse text node. consume at least one char, to make sure progress. -} text :: Parser Token text = Text <$> atLeast 1 (takeTill (=='<')) decodeEntity :: MonadThrow m => Text -> m Text decodeEntity entity = runConduit $ CL.sourceList ["&",entity,";"] #if MIN_VERSION_xml_conduit(1,9,0) .| XML.parseText XML.def { XML.psDecodeEntities = XML.decodeHtmlEntities } #else .| XML.parseText' XML.def { XML.psDecodeEntities = XML.decodeHtmlEntities } #endif .| XML.content token :: Parser Token token = char '<' *> (tag <|> incomplete) <|> text {-- - treat script tag specially, can't fail. -} tillScriptEnd :: Token -> Parser [Token] tillScriptEnd open = loop mempty where loop acc = do chunk <- takeTill (== '<') let acc' = acc <> B.fromText chunk finish = pure [open, Text $ L.toStrict $ B.toLazyText acc', TagClose "script"] hasContent = (string "/script>" *> finish) <|> loop (acc' <> "<") (char '<' *> hasContent) <|> finish tokens :: Parser [Token] tokens = do t <- token case t of TagOpen "script" _ False -> tillScriptEnd t Text text0 -> do let parseText = do Text text <- token pure text texts <- many parseText pure [Text $ decodeString $ T.concat $ text0 : texts] _ -> pure [t] {-- - Utils {{{ -} atLeast :: Int -> Parser Text -> Parser Text atLeast 0 p = p atLeast n p = T.cons <$> anyChar <*> atLeast (n-1) p cond :: a -> a -> Bool -> a cond a1 a2 b = if b then a1 else a2 (||.) :: Applicative f => f Bool -> f Bool -> f Bool (||.) = liftA2 (||) in2 :: Eq a => (a,a) -> a -> Bool in2 (a1,a2) a = a==a1 || a==a2 in3 :: Eq a => (a,a,a) -> a -> Bool in3 (a1,a2,a3) a = a==a1 || a==a2 || a==a3 boolP :: Parser a -> Parser Bool boolP p = p *> pure True <|> pure False maybeP :: Parser a -> Parser (Maybe a) maybeP p = Just <$> p <|> return Nothing -- }}} -- {{{ Stream tokenStream :: Monad m => ConduitT Text Token m () tokenStream = CL.filter (not . T.null) .| CA.conduitParserEither tokens .| CL.concatMap go where go (Left e) = error $ "html-conduit: parse error that should never happen occurred! " ++ show e go (Right (_, tokens')) = tokens' splitAccum :: [Token] -> (Text, [Token]) splitAccum [] = (mempty, []) splitAccum (reverse -> (Incomplete s : xs)) = (s, reverse xs) splitAccum tokens = (mempty, tokens) -- Entities -- | A conduit to decode entities from a stream of tokens into a new stream of tokens. decodeEntities :: Monad m => ConduitT Token Token m () decodeEntities = start where start = await >>= maybe (return ()) (\token' -> start' token' >> start) start' (Text t) = (yield t >> yieldWhileText) .| decodeEntities' .| CL.mapMaybe go start' (TagOpen name attrs' bool) = yield (TagOpen name (Map.map decodeString attrs') bool) start' token' = yield token' go t | t == "" = Nothing | otherwise = Just (Text t) -- | Decode entities in a complete string. decodeString :: Text -> Text decodeString input = case makeEntityDecoder input of (value', remainder) | value' /= mempty -> value' <> decodeString remainder | otherwise -> input decodeEntities' :: Monad m => ConduitT Text Text m () decodeEntities' = loop id where loop accum = do mchunk <- await let chunk = accum $ fromMaybe mempty mchunk (newStr, remainder) = makeEntityDecoder chunk yield newStr if isJust mchunk then loop (mappend remainder) else yield remainder -- | Yield contiguous text tokens as strings. yieldWhileText :: Monad m => ConduitT Token Text m () yieldWhileText = loop where loop = await >>= maybe (return ()) go go (Text t) = yield t >> loop go token' = leftover token' -- | Decode the entities in a string type with a decoder. makeEntityDecoder :: Text -> (Text, Text) makeEntityDecoder = first (L.toStrict . B.toLazyText) . go where go s = case T.break (=='&') s of (_,"") -> (B.fromText s, "") (before,restPlusAmp@(T.drop 1 -> rest)) -> case T.break (not . (\c -> isNameChar c || c == '#')) rest of (_,"") -> (B.fromText before, restPlusAmp) (entity,after) -> (before1 <> before2, after') where before1 = B.fromText before (before2, after') = case mdecoded of Nothing -> first (("&" <> B.fromText entity) <>) (go after) Just (B.fromText -> decoded) -> case T.uncons after of Just (';',validAfter) -> first (decoded <>) (go validAfter) Just (_invalid,_rest) -> first (decoded <>) (go after) Nothing -> (mempty, s) mdecoded = if entity == mempty then Nothing else decodeEntity entity -- | Is the character a valid Name starter? isNameStart :: Char -> Bool isNameStart c = c == ':' || c == '_' || isAsciiUpper c || isAsciiLower c || (c >= '\xC0' && c <= '\xD6') || (c >= '\xD8' && c <= '\xF6') || (c >= '\xF8' && c <= '\x2FF') || (c >= '\x370' && c <= '\x37D') || (c >= '\x37F' && c <= '\x1FFF') || (c >= '\x200C' && c <= '\x200D') || (c >= '\x2070' && c <= '\x218F') || (c >= '\x2C00' && c <= '\x2FEF') || (c >= '\x3001' && c <= '\xD7FF') || (c >= '\xF900' && c <= '\xFDCF') || (c >= '\xFDF0' && c <= '\xFFFD') || (c >= '\x10000' && c <= '\xEFFFF') -- | Is the character valid in a Name? isNameChar :: Char -> Bool isNameChar c = c == '-' || c == '.' || c == '\xB7' || isDigit c || isNameStart c || (c >= '\x0300' && c <= '\x036F') || (c >= '\x203F' && c <= '\x2040')