{-# LANGUAGE CPP #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TupleSections #-} -- | -- Copyright : (c) 2016 Harendra Kumar -- -- License : BSD-3-Clause -- Maintainer : harendra.kumar@gmail.com -- Stability : experimental -- Portability : GHC -- import Control.Monad (when) import Data.Char (chr, isSpace, ord, toUpper) #if MIN_VERSION_base(4,8,0) import Data.Function ((&)) #endif import Data.List (intercalate, isPrefixOf) import Data.List.Split (splitOn) import Data.Text (Text) import qualified Data.Text as T import Data.Text.Normalize (NormalizationMode(NFD, NFKD, NFC, NFKC), normalize) import Text.Printf (printf) #if !MIN_VERSION_base(4,8,0) (&) :: a -> (a -> b) -> b x & f = f x #endif chrToHex :: Char -> [Char] chrToHex = (map toUpper) . (printf "%.4x") . ord strToHex :: [Char] -> String strToHex = unwords . (map chrToHex) checkEqual :: String -> (Text -> Text) -> (Text, Text) -> IO Bool checkEqual opName op (c1, c2) = if c1 /= op c2 then do putStrLn $ opName ++ " " ++ txtToHex c2 ++ " = " ++ txtToHex (op c2) ++ "; Expected: " ++ txtToHex c1 return False else return True where txtToHex = strToHex . T.unpack checkOp :: String -> NormalizationMode -> [(Text, Text)] -> IO Bool checkOp name op pairs = do res <- mapM (checkEqual name ((normalize op))) pairs return $ all (== True) res checkNFC :: (Text, Text, Text, Text, Text) -> IO Bool checkNFC (c1, c2, c3, c4, c5) = checkOp "toNFC" NFC $ concat [ map (c2,) [c1, c2, c3] , map (c4,) [c4, c5] ] checkNFD :: (Text, Text, Text, Text, Text) -> IO Bool checkNFD (c1, c2, c3, c4, c5) = checkOp "toNFD" NFD $ concat [ map (c3,) [c1, c2, c3] , map (c5,) [c4, c5] ] checkNFKC :: (Text, Text, Text, Text, Text) -> IO Bool checkNFKC (c1, c2, c3, c4, c5) = checkOp "toNFKC" NFKC $ map (c4,) [c1, c2, c3, c4, c5] checkNFKD :: (Text, Text, Text, Text, Text) -> IO Bool checkNFKD (c1, c2, c3, c4, c5) = checkOp "toNFKD" NFKD $ map (c5,) [c1, c2, c3, c4, c5] checkAllTestCases :: Int -> String -> IO () checkAllTestCases lineno line = do case splitOn ";" line of c1 : c2 : c3 : c4 : c5 : _ -> do let cps = map cpToText [c1, c2, c3, c4, c5] mapM_ (checkOneTestCase cps) [checkNFD, checkNFKD, checkNFC, checkNFKC] _ -> error $ "Unrecognized line: " ++ line where cpToText xs = T.pack $ map (chr . read . ("0x" ++)) (words xs) checkOneTestCase cps f = do res <- f (tuplify cps) when (not res) $ do putStrLn ("Failed at line: " ++ show lineno) putStrLn line putStrLn $ codes ++ "; # (" ++ txt error "Bailing out" where strs = map T.unpack cps codes = intercalate ";" $ map strToHex strs txt = intercalate "; " (map T.unpack cps) tuplify [c1, c2, c3, c4, c5] = (c1, c2, c3, c4, c5) tuplify _ = error "tuplify bad arguments" checkLine :: (Int, String) -> IO () checkLine (lineno, line) = do -- marker lines indicating a test block start with @ if "@" `isPrefixOf` line then putStrLn line else checkAllTestCases lineno line testNormalize :: FilePath -> IO () testNormalize file = do contents <- readFile file let ls = lines contents -- split into lines & map (dropWhile isSpace) -- trim leading spaces & zip [1..] -- add numbering & filter ((/= []) . snd) -- remove blank lines & filter (not . ("#" `isPrefixOf`) . snd) -- remove comments checkAll ls where checkAll (x:xs) = checkLine x >> checkAll xs checkAll [] = return () main :: IO () main = do testNormalize "unicode-data/ucd/NormalizationTest.txt" -- Additional test cases not in the unicode standard suite testNormalize "unicode-data/extra/NormalizationTest.txt"