{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} module Data.Text.Utf8Spec where import Control.Exception (evaluate) import Control.Monad (forM_) import Test.Hspec (Spec, anyErrorCall, describe, it, shouldBe, shouldSatisfy, shouldThrow) import Test.Hspec.QuickCheck (prop) import Test.QuickCheck (Gen, choose, forAllShrink, shrink) import qualified Data.Char as Char import Data.Text.TestInstances () import qualified Data.Text.Utf8 as Utf8 spec :: Spec spec = do describe "Properties of the BMP in UTF-8" $ do describe "Char.toLower" $ do {- it "does not generate common suffixes" $ do forM_ bmpCodepoints $ flip shouldSatisfy $ \cp -> let lowerCp = mapCp Char.toLower cp in cp == lowerCp || null (commonSuffix (Utf8.unicode2utf8 cp) (Utf8.unicode2utf8 lowerCp)) -- Sadly, it "actually does" -} it "is idempotent" $ do forM_ bmpCodepoints $ flip shouldSatisfy $ \cp -> Char.toLower cp == Char.toLower (Char.toLower cp) describe "toLowerAscii" $ do it "is equivalent to Char.toLower on ASCII" $ do forM_ asciiCodepoints $ flip shouldSatisfy $ \cp -> Char.toLower cp == Utf8.toLowerAscii cp describe "lowerCodePoint" $ do prop "is equivalent to Char.toLower on all of Unicode" $ \c -> Utf8.lowerCodePoint c `shouldBe` Char.toLower c describe "unlowerCodePoint" $ do it "should return nothing if it's not a lower case of anything" $ do Utf8.unlowerCodePoint 'A' `shouldBe` "" Utf8.unlowerCodePoint 'ẞ' `shouldBe` "" it "should return itself if it doesn't have any casings" $ do Utf8.unlowerCodePoint '1' `shouldBe` "1" it "can return multiple values" $ do Utf8.unlowerCodePoint 'a' `shouldBe` "aA" Utf8.unlowerCodePoint 'ß' `shouldBe` "ẞß" Utf8.unlowerCodePoint 'i' `shouldBe` "İiI" describe "isCaseInvariant" $ do it "holds vacuously for empty texts" $ do Utf8.isCaseInvariant "" `shouldBe` True it "should be true for characters without upper/lower cases" $ do Utf8.isCaseInvariant "." `shouldBe` True Utf8.isCaseInvariant ".,;'123" `shouldBe` True Utf8.isCaseInvariant "💩" `shouldBe` True it "should be false if there are characters with upper/lower cases" $ do Utf8.isCaseInvariant "a" `shouldBe` False Utf8.isCaseInvariant "A.." `shouldBe` False Utf8.isCaseInvariant "ß." `shouldBe` False Utf8.isCaseInvariant "ẞ" `shouldBe` False Utf8.isCaseInvariant "İ" `shouldBe` False describe "dropWhile" $ do it "handles a simple example well" $ do Utf8.dropWhile (== 'b') "bbba" `shouldBe` "a" describe "slicing functions" $ do let -- | Example shown in section "Slicing Functions" in 'Data.Text.Utf8". slicingExample :: Utf8.Text slicingExample = Utf8.Text u8data 1 11 where Utf8.Text u8data _ _ = Utf8.pack "ABCDEFGHIJKLMN" it "satisfies the example in Data.Text.Utf8" $ do let begin = Utf8.CodeUnitIndex 2 let length_ = Utf8.CodeUnitIndex 6 Utf8.unsafeSliceUtf8 begin length_ slicingExample `shouldBe` "DEFGHI" Utf8.unsafeCutUtf8 begin length_ slicingExample `shouldBe` ("BC", "JKL") prop "unsafeSliceUtf8 and unsafeCutUtf8 are complementary" $ forAllShrink (arbitrarySlicingIndices slicingExample) shrink $ \ (begin, length_) -> do let (prefix, suffix) = Utf8.unsafeCutUtf8 begin length_ slicingExample Utf8.concat [prefix, Utf8.unsafeSliceUtf8 begin length_ slicingExample, suffix] `shouldBe` slicingExample describe "Basic Text instances" $ do prop "Show Text behaves like Show String" $ \ (str :: String) -> do show (Utf8.pack str) `shouldBe` show str prop "Eq Text behaves like Eq String" $ \ (a :: String) (b :: String) -> do Utf8.pack a == Utf8.pack b `shouldBe` a == b prop "Ord Text behaves like Ord String" $ \ (a :: String) (b :: String) -> do compare (Utf8.pack a) (Utf8.pack b) `shouldBe` compare a b describe "skipCodePointsBackwards" $ do it "works with ascii" $ do Utf8.skipCodePointsBackwards "abcd" 3 0 `shouldBe` 3 Utf8.skipCodePointsBackwards "abcd" 3 1 `shouldBe` 2 Utf8.skipCodePointsBackwards "abcd" 3 2 `shouldBe` 1 Utf8.skipCodePointsBackwards "abcd" 3 3 `shouldBe` 0 it "moves to start of codepoint if you skip 0" $ do Utf8.skipCodePointsBackwards "💩💩" 0 0 `shouldBe` 0 Utf8.skipCodePointsBackwards "💩💩" 1 0 `shouldBe` 0 Utf8.skipCodePointsBackwards "💩💩" 2 0 `shouldBe` 0 Utf8.skipCodePointsBackwards "💩💩" 3 0 `shouldBe` 0 Utf8.skipCodePointsBackwards "💩💩" 4 0 `shouldBe` 4 Utf8.skipCodePointsBackwards "💩💩" 5 0 `shouldBe` 4 Utf8.skipCodePointsBackwards "💩💩" 6 0 `shouldBe` 4 Utf8.skipCodePointsBackwards "💩💩" 7 0 `shouldBe` 4 it "can skip 1 multi-byte codepoint" $ do Utf8.skipCodePointsBackwards "💩💩" 4 1 `shouldBe` 0 Utf8.skipCodePointsBackwards "💩💩" 5 1 `shouldBe` 0 Utf8.skipCodePointsBackwards "💩💩" 6 1 `shouldBe` 0 Utf8.skipCodePointsBackwards "💩💩" 7 1 `shouldBe` 0 it "can skip multiple multi-byte codepoint" $ do -- a, İ, ẞ and 💩 have byte lengths 1, 2, 3 and 4 Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 15 0 `shouldBe` 15 -- stays at a Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 15 1 `shouldBe` 13 -- skips to İ Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 15 2 `shouldBe` 10 -- skips to ẞ Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 15 3 `shouldBe` 6 -- skips to 💩 Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 15 4 `shouldBe` 3 -- skips to ẞ Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 15 5 `shouldBe` 1 -- skips to İ Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 15 6 `shouldBe` 0 -- skips to a Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 14 2 `shouldBe` 6 -- from İ to 💩 Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 13 2 `shouldBe` 6 -- from İ to 💩 Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 10 3 `shouldBe` 1 -- from ẞ to İ Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 9 3 `shouldBe` 0 -- from 💩 to a it "throws errors when you read out of bounds" $ do evaluate (Utf8.skipCodePointsBackwards "💩💩" 8 0) `shouldThrow` anyErrorCall evaluate (Utf8.skipCodePointsBackwards "💩💩" 7 2) `shouldThrow` anyErrorCall arbitrarySlicingIndices :: Utf8.Text -> Gen (Utf8.CodeUnitIndex, Utf8.CodeUnitIndex) arbitrarySlicingIndices example = do let exampleLength = Utf8.codeUnitIndex $ Utf8.lengthUtf8 example begin <- choose (0, exampleLength) length_ <- choose (0, exampleLength - begin) pure (Utf8.CodeUnitIndex begin, Utf8.CodeUnitIndex length_) asciiCodepoints :: [Char] asciiCodepoints = map Char.chr [0..0x7f] -- | The Basic Multilingual Plane (BMP) contains the Unicode code points -- 0x0000 through 0xFFFF. bmpCodepoints :: [Char] bmpCodepoints = map Char.chr [0..0xffff] commonSuffix :: Eq a => [a] -> [a] -> [a] commonSuffix list list' = reverse $ go (reverse list) (reverse list') where go (x:xs) (y:ys) | x == y = x : go xs ys go _ _ = []