module Test.Mangrove.Html5Lib.Tokenizer ( tests ) where import qualified Data.ByteString.Short as BS.S import qualified Data.List as L import qualified Data.Text as T import qualified Test.HUnit as U import Web.Mangrove.Parse.Common.Error import Web.Mangrove.Parse.Tokenize import Web.Willow.DOM import Test.Mangrove.Html5Lib.Tokenizer.JSON import Test.HUnit ( (~:), (@?=) ) tests :: IO U.Test tests = U.TestLabel "tokenizer" . U.TestList <$> mapM runTestFile [ "test1" , "test2" , "test3" , "test4" , "entities" , "namedEntities" , "numericEntities" , "unicodeChars" -- , "unicodeCharsProblematic" -- Seems to contradict the standard in allowing surrogates to be decoded , "contentModelFlags" , "domjs" , "escapeFlag" , "pendingSpecChanges" -- , "xmlViolation" ] runTestFile :: FilePath -> IO U.Test runTestFile p = U.TestLabel (p ++ ".test") . U.TestList . map run <$> parseTestFile p run :: JsonTest -> U.Test run test = T.unpack (description test) ~: U.TestList $ map (runState test) [ (show state, tokenizerMode state tokState) | state <- initialStates test ] where tokState = case lastStartTag test of Just n -> tokenizerStartTag Nothing n $ defaultTokenizerState Nothing -> defaultTokenizerState runState :: JsonTest -> (String, TokenizerState) -> U.Test runState test (mode, state) = mode ~: U.TestCase $ do let (out, state') = tokenize state $ input test final = finalizeTokenizer state' out' = out ++ final (L.sortOn show . concat $ map mapError out', trimEof $ map mapToken out') @?= (map errorCode $ errors test, concat $ output test) where normalizeToken (EndTag d) = EndTag $ emptyTagParams { tagName = tagName d } normalizeToken tok = tok mapToken = normalizeToken . snd normalizeError (CharacterReferenceOutsideUnicodeRange _) = CharacterReferenceOutsideUnicodeRange 0 normalizeError (DuplicateAttribute _) = DuplicateAttribute (T.empty, T.empty) normalizeError (DuplicateSingletonElement _) = DuplicateSingletonElement emptyElementParams normalizeError (FramesetInBody _) = FramesetInBody emptyElementParams normalizeError (MalformedTableStructure _) = MalformedTableStructure emptyElementParams normalizeError (InvalidByteSequence _) = InvalidByteSequence $ BS.S.pack [] normalizeError (NoncharacterCharacterReference _) = NoncharacterCharacterReference '\NUL' normalizeError (ObsoleteTagName _) = ObsoleteTagName T.empty normalizeError (SurrogateCharacterReference _) = SurrogateCharacterReference '\NUL' normalizeError (UnexpectedCharacterAfterDoctypeSystemIdentifier _) = UnexpectedCharacterAfterDoctypeSystemIdentifier '\NUL' normalizeError (UnexpectedDoctype _) = UnexpectedDoctype emptyDocumentTypeParams normalizeError (UnexpectedDescendantElement _) = UnexpectedDescendantElement emptyElementParams normalizeError (UnexpectedEndTag _) = UnexpectedEndTag emptyElementParams normalizeError (UnmatchedEndTag _) = UnmatchedEndTag emptyElementParams normalizeError err = err mapError = map normalizeError . fst trimEof ts = case reverse ts of (EndOfStream:ts') -> reverse ts' _ -> ts