{-# LANGUAGE CPP #-} {-# LANGUAGE OverloadedStrings #-} {-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-} module Test.Mangrove.Html5Lib.TreeConstruction.Parser ( parseTestFile , TreeTest ( .. ) ) where import qualified Data.ByteString.Char8 as BS.C import qualified Data.HashMap.Strict as M import qualified Data.List as L import qualified Data.Maybe as Y import qualified Data.Text as T import qualified Data.Text.Encoding as T import Web.Mangrove.Parse.Tree import Web.Willow.DOM import Test.Mangrove.Html5Lib.Common #if MIN_VERSION_base(4,11,0) #else import Data.Semigroup ( (<>) ) #endif import System.FilePath ( (), (<.>) ) data TreeTest = TreeTest { input :: BS.C.ByteString , errors :: [(Word, Word, T.Text)] , output :: Tree , state :: TreeState } deriving ( Eq, Show, Read ) data RawTest = RawTest { input' :: [BS.C.ByteString] , errors' :: [BS.C.ByteString] , output' :: [BS.C.ByteString] , state' :: Maybe BS.C.ByteString , scripting' :: Maybe Bool } deriving ( Eq, Show, Read ) testFile :: FilePath -> IO FilePath testFile f = ( f) <$> dataFile "tree-construction" parseTestFile :: FilePath -> IO [TreeTest] parseTestFile p = fmap parse $ testFile (p <.> "dat") >>= BS.C.readFile parse :: BS.C.ByteString -> [TreeTest] parse = Y.mapMaybe (parseTest . rawTest) . breakTests where breakTests = breakTests' . BS.C.split '\n' breakTests' [] = [] breakTests' bss = case break BS.C.null bss of (bs, []) -> [bs] (bs, bss') -> bs : breakTests' (drop 1 bss') rawTest :: [BS.C.ByteString] -> RawTest rawTest ("#data":bss) = (rawTest bss') { input' = d } where (d, bss') = break isInstruction bss rawTest ("#errors":bss) = (rawTest bss') { errors' = e } where (e, bss') = break isInstruction bss rawTest ("#document-fragment":bss) = (rawTest bss') { state' = Y.listToMaybe x } where (x, bss') = break isInstruction bss rawTest ("#document":bss) = (rawTest bss') { output' = t } where (t, bss') = break isInstruction bss rawTest ("#script-off":bss) = (rawTest bss') { scripting' = Just False } where (_, bss') = break isInstruction bss rawTest ("#script-on":bss) = (rawTest bss') { scripting' = Just True } where (_, bss') = break isInstruction bss rawTest (_:bss) = rawTest $ dropWhile (not . isInstruction) bss rawTest [] = RawTest { input' = [] , errors' = [] , output' = [] , state' = Nothing , scripting' = Nothing } isInstruction :: BS.C.ByteString -> Bool isInstruction bs = fmap fst (BS.C.uncons bs) == Just '#' parseTest :: RawTest -> Maybe TreeTest parseTest t | null $ input' t = Nothing | null $ output' t = Nothing | scripting' t == Just True = Nothing | otherwise = Just $ TreeTest { input = BS.C.intercalate "\n" $ input' t , errors = Y.mapMaybe parseError $ errors' t , output = parseOutput . map (BS.C.drop 2) . foldr joinMultiline [] $ output' t , state = case state' t of Just bs -> treeFragment (packElement bs) [] Nothing Nothing defaultTreeState Nothing -> defaultTreeState } where joinMultiline bs [] = [bs] joinMultiline bs (bs':bss) = if BS.C.take 2 bs' == "| " then bs : bs' : bss else bs <> "\n" <> bs' : bss htmlElement = emptyElementParams { elementName = "html" , elementNamespace = Just htmlNamespace } packElement = packElement' . parseNode . BS.C.cons '<' . flip BS.C.snoc '>' packElement' (Element d) = d packElement' _ = htmlElement parseError :: BS.C.ByteString -> Maybe (Word, Word, T.Text) parseError _ = Nothing --TODO parseOutput :: [BS.C.ByteString] -> Tree parseOutput bss = Tree { node = Document NoQuirks , children = map snd . foldr encapsulate [] $ map dropParents bss } where dropParents bs = let (ws, n) = BS.C.span (== ' ') bs in (BS.C.length ws, parseNode n) encapsulate (d, e) es = let (cs, ss) = span ((d <) . fst) es (attrs, nonAttrs) = L.partition (\c -> nodeType (node $ snd c) == Just AttributeNode) cs (e', cs') = if nodeType e == Just ElementNode then let Element t = e t' = t { elementAttributes = M.union (elementAttributes t) . fromAttrList $ Y.mapMaybe (unpack . node . snd) attrs } unpack (Attribute a) = Just a unpack _ = Nothing in (Element t', nonAttrs) else (e, cs) in (d, Tree { node = e' , children = map snd cs' }) : ss parseNode :: BS.C.ByteString -> Node parseNode bs | bs == "content" = DocumentFragment | BS.C.take 10 bs == " c == ' ' || c == '>') $ BS.C.drop 10 bs (n, p, s) = if BS.C.take 1 bs' == ">" then (repack 0 0 n', T.empty, T.empty) else let (p', s') = BS.C.break (== '"') $ BS.C.drop 2 bs' in (repack 0 0 n', repack 0 0 p', repack 3 2 s') in DocumentType $ emptyDocumentTypeParams { documentTypeName = n , documentTypePublicId = p , documentTypeSystemId = s } | BS.C.take 5 bs == "