{-# LANGUAGE LambdaCase, BangPatterns, OverloadedStrings #-} {-# LANGUAGE PatternSynonyms #-} {-# LANGUAGE ViewPatterns #-} {-# OPTIONS -ddump-simpl -dsuppress-all -ddump-to-file #-} module Text.HTML.TagStew (parseTags) where import qualified Data.ByteString.Unsafe as B import qualified Data.ByteString.Char8 as BC import qualified Data.ByteString as B import qualified Text.HTML.TagSoup as S import Text.HTML.TagStew.Entity (lookupEntity) import Data.Bits import Data.Char import Data.Word (Word8) data Stack = Text !Int | Entity !Int | TagOpen !Int | TagClose !Int | Attrs !Range !Attrs | AttrName !Range !Int !Attrs | AttrValue !Quote !Range !Range !Int !Attrs | Comment !Int | Script !Int deriving Show newtype Range = MkRange Int deriving Show data Quote = QNone | QSingle | QDouble deriving Show type Attrs = [(Range, Range)] pattern Range :: Int -> Int -> Range pattern Range i j <- (unpack -> (i, j)) where Range i j = MkRange $ i `shiftL` 32 .|. j {-# COMPLETE Range #-} unpack :: Range -> (Int, Int) unpack (MkRange x) = (x `shiftR` 32, x .&. 0xffffffff) {-# INLINE unpack #-} parseTags :: B.ByteString -> [S.Tag B.ByteString] parseTags bs = go 0 (Text 0) where slice (Range i j) = B.unsafeTake (j - i) $ B.unsafeDrop i bs isScript (Range i j) = BC.map toLower (B.unsafeTake (j - i) (B.unsafeDrop i bs)) == "script" {-# INLINE isScript #-} go :: Int -> Stack -> [S.Tag B.ByteString] go loc _ | loc >= B.length bs = [] go !loc !st = case st of Comment i | B.take 3 (B.drop loc bs) == "-->" -> S.TagComment (slice $ Range i loc) : go (loc + 3) (Text (loc + 3)) | otherwise -> char Entity i -> case ch of C_SC -> do case lookupEntity (B.take (loc - i) $ B.drop i bs) of Just x -> S.TagText x : next (Text loc') Nothing -> S.TagText (slice $ Range i loc) : next (Text loc') C_NS -> char _ | isAlphaNum (toEnum (fromEnum ch)) -> char _ -> go loc $ Text (i - 1) Text i -> case ch of C_LT -> pushText (Range i loc) $ next (TagOpen loc') C_AMP -> pushText (Range i loc) $ next (Entity loc') _ -> char TagOpen i -> case ch of C_GT -> pushOpen 1 (Range i loc) [] C_SL -> next $ TagClose loc' _ | isSpace' ch -> next $ AttrName (Range i loc) loc' [] 33 | B.unsafeIndex bs loc' == c2w '-', B.unsafeIndex bs (loc + 2) == c2w '-' -> go (loc + 3) (Comment (loc + 3)) _ -> char Attrs tname attrs -> case ch of C_GT -> pushOpen 1 tname attrs C_SL -> pushOpen 2 tname attrs _ | isSpace' ch -> next $ AttrName tname loc' attrs _ -> char AttrName tname i attrs -> case ch of C_SQ -> next $ AttrValue QSingle tname (Range 0 0) loc' attrs C_DQ -> next $ AttrValue QDouble tname (Range 0 0) loc' attrs C_EQ -> next $ AttrValue QNone tname (Range i loc) loc' attrs C_SL -> pushOpen 2 tname attrs C_GT -> pushOpen 1 tname attrs _ | isSpace' ch -> next $ AttrName tname loc' ((Range i loc, Range 0 0) : attrs) _ -> char AttrValue QSingle tname aname i attrs -> case ch of C_SQ -> next $ Attrs tname $ (aname, Range i loc) : attrs C_BS -> go (loc + 2) st _ -> char AttrValue QDouble tname aname i attrs -> case ch of C_DQ -> next $ Attrs tname $ (aname, Range i loc) : attrs C_BS -> go (loc + 2) st _ -> char AttrValue QNone tname aname i attrs -> case ch of C_SQ -> next $ AttrValue QSingle tname aname (i + 1) attrs C_DQ -> next $ AttrValue QDouble tname aname (i + 1) attrs C_SL -> pushOpen 2 tname $ (aname, Range i loc) : attrs C_GT -> pushOpen 1 tname $ (aname, Range i loc) : attrs _ | isSpace' ch -> next $ AttrName tname loc' $ (aname, Range i loc) : attrs _ -> char TagClose i -> case ch of C_GT -> (S.TagClose $! slice $ Range i loc) : next (Text $ loc + 1) _ -> char Script i | map toLower (BC.unpack (B.take 9 (B.drop loc bs))) == "" -> do pushText (Range i loc) $ S.TagClose (slice $ Range (loc + 2) (loc + 8)) : next (Text $ loc + 9) | otherwise -> char where pushText str@(Range i j) cont | i == j = cont | otherwise = S.TagText (slice str) : cont pushOpen ofs tname attrs = fmap slice (S.TagOpen tname attrs) : if isScript tname then go (loc + ofs) $ Script loc' else next $ Text $ loc + ofs {-# INLINE pushOpen #-} loc' = loc + 1 next = go loc' ch = B.unsafeIndex bs loc byteCount = max 1 $ countLeadingZeros (complement ch) char = go (loc + byteCount) st isSpace' :: Word8 -> Bool isSpace' ch = ch == 13 || ch == 10 || ch == 32 pattern C_LT, C_EQ, C_GT, C_SL, C_SQ, C_DQ, C_BS, C_AMP, C_SC, C_NS :: Word8 pattern C_LT = 60 pattern C_EQ = 61 pattern C_GT = 62 pattern C_SL = 47 pattern C_SQ = 39 pattern C_DQ = 34 pattern C_BS = 92 pattern C_AMP = 38 pattern C_SC = 59 pattern C_NS = 35 c2w :: Char -> Word8 c2w = toEnum . fromEnum