{-# LANGUAGE OverloadedStrings #-} import Data.ByteString (ByteString, pack) import Data.Foldable import Data.String.UTF8 (fromString, toRep) import Data.Text () import Data.Text.Encoding import Numeric import Test.Tasty import Test.Tasty.HUnit import Text.Utf8Validator main :: IO () main = defaultMain tests tests :: TestTree tests = testGroup "tests" [ testGroup "simple tests" [ testCase "valid utf8 1" (assertBool "validation returned False" . validateBS $ encodeUtf8 "Hello-µ@ßöäüàá-UTF-8!!") , testCase "valid utf8 2" (assertBool "validation returned False" $ validateBS "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5") , testCase "invalid utf8 1" (assertBool "validation returned False" . not $ validateBS "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5\xed\xa0\x80\x65\x64\x69\x74\x65\x64") , testCase "invalid utf8 2" (assertBool "validation returned False" . not $ validateBS "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5\xf4\x90\x80\x80\x65\x64\x69\x74\x65\x64") ] , testGroup "test sequences" $ map gentest utf8TestSequences , testGroup "codepoints" [ testCase "non-surrogates" $ traverse_ (valCodepoint True) [x | x <- [0 .. 0x10ffff], x < 0xd800 && x > 0xdfff] , testCase "single utf16 surrogates" $ traverse_ (valCodepoint False) [0xd800 .. 0xdfff] , testCase "paired utf16 surrogates" $ traverse_ (valCodepoints False) (do x <- [0xd800 .. 0xdbff] y <- [0xdc00 .. 0xdfff] [[x, y], [y, x]]) ] ] where gentest (s, d) = testCaseSteps s . fmap snd $ foldl' indtest init' d indtest x (e, d) step = let (i, act) = x step in ( i + 1 , do _ <- act _ <- step ("sequence " ++ show i) validateBS d @=? e) init' _ = (1 :: Int, return ()) fromCodepoint cps = toRep $ fromString (map toEnum cps) valCodepoint eq cp = assertEqual (showHex cp "") eq (validateBS $ fromCodepoint [cp]) valCodepoints eq cps = assertEqual (show $ map (flip showHex "") cps) eq (validateBS $ fromCodepoint cps) -- Create test sequences for UTF-8 decoder tests from -- http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt -- implementation translated from case6_x_x.py in autobahntestcase utf8TestSequences :: [(String, [(Bool, ByteString)])] utf8TestSequences = [ ( "Some valid UTF-8 sequences" , [ (True, "hello\x24world") , (True, "hello\xC2\xA2world") , (True, "hello\xE2\x82\xACworld") , (True, "hello\xF0\xA4\xAD\xA2world") , (True, "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5") ]) , ( "First possible sequence of a certain length" , [ (True, "\x00") , (True, "\xc2\x80") , (True, "\xe0\xa0\x80") , (True, "\xf0\x90\x80\x80") ]) , ( "First possible sequence length 5/6 (invalid codepoints)" , [(False, "\xf8\x88\x80\x80\x80"), (False, "\xfc\x84\x80\x80\x80\x80")]) , ( "Last possible sequence of a certain length" , [ (True, "\x7f") , (True, "\xdf\xbf") , (True, "\xef\xbf\xbf") , (True, "\xf4\x8f\xbf\xbf") ]) , ( "Last possible sequence length 4/5/6 (invalid codepoints)" , [ (False, "\xf7\xbf\xbf\xbf") , (False, "\xfb\xbf\xbf\xbf\xbf") , (False, "\xfd\xbf\xbf\xbf\xbf\xbf") ]) , ( "Other boundary conditions" , [ (True, "\xed\x9f\xbf") , (True, "\xee\x80\x80") , (True, "\xef\xbf\xbd") , (True, "\xf4\x8f\xbf\xbf") , (False, "\xf4\x90\x80\x80") ]) , ( "Unexpected continuation bytes" , [ (False, "\x80") , (False, "\xbf") , (False, "\x80\xbf") , (False, "\x80\xbf\x80") , (False, "\x80\xbf\x80\xbf") , (False, "\x80\xbf\x80\xbf\x80") , (False, "\x80\xbf\x80\xbf\x80\xbf") , (False, pack [0x80 .. 0xbe]) ]) , ( "Lonely start characters" , [ (False, pack [a .. b]) | (a, b) <- [(0xc0, 0xde), (0xe0, 0xee), (0xf0, 0xf6), (0xf8, 0xfb), (0xfc, 0xfc)] ]) , ( "Sequences with last continuation byte missing" , [ (False, "\xc0") , (False, "\xe0\x80") , (False, "\xf0\x80\x80") , (False, "\xf8\x80\x80\x80") , (False, "\xfc\x80\x80\x80\x80") , (False, "\xdf") , (False, "\xef\xbf") , (False, "\xf7\xbf\xbf") , (False, "\xfb\xbf\xbf\xbf") , (False, "\xfd\xbf\xbf\xbf\xbf") ]) -- missing: Concatenation of incomplete sequences , ( "Concatenation of incomplete sequences" , [ ( False , "\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80" `mappend` "\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf") ]) , ( "Impossible bytes" , [(False, "\xfe"), (False, "\xff"), (False, "\xfe\xfe\xff\xff")]) , ( "Examples of an overlong ASCII character" , [ (False, "\xc0\xaf") , (False, "\xe0\x80\xaf") , (False, "\xf0\x80\x80\xaf") , (False, "\xf8\x80\x80\x80\xaf") , (False, "\xfc\x80\x80\x80\x80\xaf") ]) , ( "Maximum overlong sequences" , [ (False, "\xc1\xbf") , (False, "\xe0\x9f\xbf") , (False, "\xf0\x8f\xbf\xbf") , (False, "\xf8\x87\xbf\xbf\xbf") , (False, "\xfc\x83\xbf\xbf\xbf\xbf") ]) , ( "Overlong representation of the NUL character" , [ (False, "\xc0\x80") , (False, "\xe0\x80\x80") , (False, "\xf0\x80\x80\x80") , (False, "\xf8\x80\x80\x80\x80") , (False, "\xfc\x80\x80\x80\x80\x80") ]) , ( "Single UTF-16 surrogates" , [ (False, "\xed\xa0\x80") , (False, "\xed\xad\xbf") , (False, "\xed\xae\x80") , (False, "\xed\xaf\xbf") , (False, "\xed\xb0\x80") , (False, "\xed\xbe\x80") , (False, "\xed\xbf\xbf") ]) , ( "Paired UTF-16 surrogates" , [ (False, "\xed\xa0\x80\xed\xb0\x80") , (False, "\xed\xa0\x80\xed\xbf\xbf") , (False, "\xed\xad\xbf\xed\xb0\x80") , (False, "\xed\xad\xbf\xed\xbf\xbf") , (False, "\xed\xae\x80\xed\xb0\x80") , (False, "\xed\xae\x80\xed\xbf\xbf") , (False, "\xed\xaf\xbf\xed\xb0\x80") , (False, "\xed\xaf\xbf\xed\xbf\xbf") ]) , ( "Non-character code points (valid UTF-8)" , [(True, "\xef\xbf\xbe"), (True, "\xef\xbf\xbf")] ++ [ (True, pack [z1, z2, 0xbf, z3]) | z1 <- [0xf0, 0xf1, 0xf2, 0xf3, 0xf4] , z2 <- [0x8f, 0x9f, 0xaf, 0xbf] , z1 /= 0xf4 && z2 /= 0x8f , z1 /= 0xf0 && z2 /= 0x8f , z3 <- [0xbe, 0xbf] ]) , ( "Unicode specials (i.e. replacement char)" , [ (True, "\xef\xbf\xb9") , (True, "\xef\xbf\xba") , (True, "\xef\xbf\xbb") , (True, "\xef\xbf\xbc") , (True, "\xef\xbf\xbd") , (True, "\xef\xbf\xbe") , (True, "\xef\xbf\xbf") ]) ]