{-# LANGUAGE OverloadedStrings #-} {-| Description: Copyright: (c) 2020 Samuel May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: experimental Portability: portable -} module Test.Willow.Html5Lib.Encoding.Sniffer ( tests ) where import qualified Test.HUnit as U import Test.HUnit ( (~?=), (~:) ) import System.FilePath ( () ) import qualified Data.ByteString as BS import qualified Data.ByteString.Char8 as BS.C import qualified Data.Char as C import qualified Data.Either as E import qualified Data.Maybe as Y import qualified Text.Read as R import Web.Willow.Common.Encoding.Sniffer import Test.Willow.Html5Lib.Common testFile :: FilePath -> IO FilePath testFile f = ( f) <$> dataFile "encoding" tests :: IO U.Test tests = do ts <- mapM id [ test (Just 47) "tests1.dat" -- More complex than the presniffer after that , test Nothing "tests2.dat" ] return $ "encoding" ~: U.TestList ts test :: Maybe Word -> FilePath -> IO U.Test test len p = ((~:) p . U.TestList . cut) <$> parse p where cut = case len of Just i -> take $ fromIntegral i Nothing -> id parse :: FilePath -> IO [U.Test] parse p = do ls <- testFile p >>= BS.readFile let ts = uncurry zip . E.partitionEithers . foldr gatherLines [] . fst . foldr categorizeLines ([], True) $ BS.C.lines ls return $ map (\(testData, enc) -> run testData ~?= parseEncoding enc) ts categorizeLines :: BS.ByteString -> ([Either BS.ByteString BS.ByteString], Bool) -> ([Either BS.ByteString BS.ByteString], Bool) categorizeLines "#data" (ls, _) = (ls, True) categorizeLines "#encoding" (ls, _) = (ls, False) categorizeLines l (ls, True) = (Right l : ls, True) categorizeLines l (ls, False) = (Left l : ls, False) gatherLines :: Either BS.ByteString BS.ByteString -> [Either BS.ByteString BS.ByteString] -> [Either BS.ByteString BS.ByteString] gatherLines l [] = [l] gatherLines (Left l) ((Left l'):ls) = Left (BS.concat [l, "\n", l']) : ls gatherLines (Right l) ((Right l'):ls) = Right (BS.concat [l, "\n", l']) : ls gatherLines l ls = l : ls -- | Avoids relying on 'lookupEncoding', and so is able to test that at the -- same time. parseEncoding :: BS.ByteString -> Encoding parseEncoding enc = Y.fromMaybe Windows1252 . R.readMaybe . foldr foldEncoding [] . map (map (toEnum . fromIntegral) . BS.unpack) $ BS.split 0x2D enc where foldEncoding s@(c:cs) ss@(d:_) | C.isDigit (last s) && C.isDigit d = s' ++ '_' : ss | otherwise = s' ++ ss where s' = C.toUpper c : map C.toLower cs foldEncoding [] ss = ss foldEncoding (c:cs) [] = C.toUpper c : map C.toLower cs run :: BS.ByteString -> Encoding run = confidenceEncoding . sniff emptySnifferEnvironment