{-# LANGUAGE OverloadedStrings #-} {-# OPTIONS_GHC -Wno-orphans #-} module Test.Mangrove.Html5Lib.Tokenizer.JSON ( JsonTest ( .. ) , TestError ( .. ) , parseTestFile ) where import qualified Data.Aeson as J import qualified Data.Aeson.Types as J import qualified Data.ByteString as BS import qualified Data.ByteString.Builder as BS.B import qualified Data.ByteString.Lazy as BS.L import qualified Data.ByteString.Short as BS.S import qualified Data.HashMap.Strict as M import qualified Data.List as L import qualified Data.Maybe as Y import qualified Data.Text as T import qualified Numeric as N import qualified Text.Read as R import Web.Mangrove.Parse.Common.Error import Web.Mangrove.Parse.Tokenize import Web.Willow.DOM hiding ( Node ( .. ) ) import Test.Mangrove.Html5Lib.Common import Data.Aeson ( (.:), (.:?), (.!=) ) import Data.Vector ( (!?) ) import System.FilePath ( (), (<.>) ) testFile :: FilePath -> IO FilePath testFile f = ( f) <$> dataFile "tokenizer" parseTestFile :: FilePath -> IO [JsonTest] parseTestFile p = testFile (p <.> "test") >>= J.eitherDecodeFileStrict >>= either fail rewrap where rewrap (TestFile ts) = return ts data JsonTest = JsonTest { description :: T.Text , input :: BS.ByteString , output :: [[Token]] , initialStates :: [CurrentTokenizerState] , lastStartTag :: Maybe T.Text , errors :: [TestError] } deriving ( Eq, Show, Read ) instance J.FromJSON JsonTest where parseJSON = J.withObject "test" $ \v -> do doubleEscaped <- v .:? "doubleEscaped" .!= False JsonTest <$> v .: "description" <*> fmap (BS.L.toStrict . BS.B.toLazyByteString . BS.B.stringUtf8 . unescape doubleEscaped) (v .: "input") <*> fmap (map $ unpackTokens doubleEscaped) (v .: "output") <*> v .:? "initialStates" .!= [DataState] <*> v .:? "lastStartTag" <*> fmap (L.sortOn (show . errorCode)) (v .:? "errors" .!= []) where unpackTokens doubleEscaped (TestToken (Right (Comment str))) = [Comment . T.pack $ unescape doubleEscaped str] unpackTokens _ (TestToken (Right tok)) = [tok] unpackTokens doubleEscaped (TestToken (Left str)) = map Character . unescape doubleEscaped $ T.pack str unescape False = T.unpack unescape True = unescape' . T.splitOn "\\u" unescape' [] = "" unescape' (t:ts) = T.unpack t ++ concatMap unescapeCode ts unescapeCode t = maybe '\NUL' (toEnum . fst) (Y.listToMaybe $ N.readHex code) : ts where (code, ts) = splitAt 4 $ T.unpack t data TestError = TestError { errorCode :: ParseError , line :: Word , column :: Word } deriving ( Eq, Show, Read ) instance J.FromJSON TestError where parseJSON = J.withObject "error" $ \v -> TestError <$> v .: "code" <*> v .: "line" <*> v .: "col" instance J.FromJSON CurrentTokenizerState where parseJSON = J.withText "initialState" $ \v -> case v of "Data state" -> return DataState "PLAINTEXT state" -> return PlainTextState "RCDATA state" -> return RCDataState "RAWTEXT state" -> return RawTextState "Script data state" -> return ScriptDataState "CDATA section state" -> return CDataState _ -> fail $ "Unknown tokenizer state '" ++ T.unpack v ++ "'" newtype TestToken = TestToken (Either String Token) deriving ( Eq, Show, Read ) instance J.FromJSON TestToken where parseJSON = J.withArray "output" $ \v -> case v !? 0 of Just "DOCTYPE" -> do name <- maybe (return Nothing) parseTextOrNull $ v !? 1 public <- maybe (return Nothing) parseTextOrNull $ v !? 2 system <- maybe (return Nothing) parseTextOrNull $ v !? 3 correctness <- maybe (return True) parseBool $ v !? 4 return . TestToken . Right . Doctype $ emptyDoctypeParams { doctypeName = name , doctypePublicId = public , doctypeSystemId = system , doctypeQuirks = not correctness } Just "StartTag" -> do name <- maybe (return T.empty) parseText $ v !? 1 attrs <- maybe (return []) parseAttributes $ v !? 2 selfClosing <- maybe (return False) parseBool $ v !? 3 return . TestToken . Right . StartTag $ emptyTagParams { tagName = name , tagIsSelfClosing = selfClosing , tagAttributes = M.fromList attrs } Just "EndTag" -> J.withText "EndTag" (return . TestToken . Right . packEndTag) $ Y.fromMaybe (J.String T.empty) (v !? 1) Just "Comment" -> J.withText "Comment" (return . TestToken . Right . Comment) $ Y.fromMaybe (J.String T.empty) (v !? 1) Just "Character" -> J.withText "Character" (return . TestToken . Left . T.unpack) $ Y.fromMaybe (J.String T.empty) (v !? 1) Just _ -> fail $ "Unknown output token" Nothing -> fail $ "Empty output token" where packEndTag name = EndTag $ emptyTagParams { tagName = name } parseText = J.withText "text" return parseTextOrNull (J.String t) = return $ Just t parseTextOrNull J.Null = return Nothing parseTextOrNull v = J.typeMismatch "String or Null" v parseBool = J.withBool "bool" return parseAttributes = J.withObject "attributes" $ fmap M.toList . mapM parseText instance J.FromJSON ParseError where parseJSON = J.withText "code" $ \v -> case v of "character-reference-outside-unicode-range" -> return $ CharacterReferenceOutsideUnicodeRange 0 "duplicate-attribute" -> return $ DuplicateAttribute (T.empty, T.empty) "duplicate-singleton-element" -> return $ DuplicateSingletonElement emptyElementParams "frameset-in-body" -> return $ FramesetInBody emptyElementParams "malformed-table-structure" -> return $ MalformedTableStructure emptyElementParams "invalid-byte-sequence" -> return . InvalidByteSequence $ BS.S.pack [] "noncharacter-character-reference" -> return $ NoncharacterCharacterReference '\NUL' "obsolete-tag-name" -> return $ ObsoleteTagName T.empty "surrogate-character-reference" -> return $ SurrogateCharacterReference '\NUL' "unexpected-character-after-doctype-system-identifier" -> return $ UnexpectedCharacterAfterDoctypeSystemIdentifier '\NUL' "unexpected-descendant-element" -> return $ UnexpectedDescendantElement emptyElementParams "unexpected-doctype" -> return $ UnexpectedDoctype emptyDocumentTypeParams "unexpected-end-tag" -> return $ UnexpectedEndTag emptyElementParams "unmatched-end-tag" -> return $ UnmatchedEndTag emptyElementParams _ -> maybe (fail $ err v) return . R.readMaybe . concat . map (T.unpack . conCase) $ T.split (== '-') v where conCase "cdata" = "CData" conCase "eof" = "EOF" conCase t = T.toTitle t err v = "Could not parse error code '" ++ T.unpack v ++ "'" newtype TestFile = TestFile [JsonTest] deriving ( Eq, Show, Read ) instance J.FromJSON TestFile where parseJSON = J.withObject "file" $ \v -> TestFile <$> v .: "tests"