-- | -- Module : Unicode.Char.General.Names -- Copyright : (c) 2022 Composewell Technologies and Contributors -- License : Apache-2.0 -- Maintainer : streamly@composewell.com -- Stability : experimental -- -- Unicode character names and name aliases with 'String' API. -- See Unicode standard 15.0.0, section 4.8. -- -- There are also two optional APIs: -- -- * @ByteString@ API, which requires using the package flag @has-bytestring@. -- * @Text@ API, which requires using the package flag @has-text@. -- -- @since 0.1.0 module Unicode.Char.General.Names ( -- Unicode version unicodeVersion -- * Name , name , nameOrAlias , correctedName -- * Name Aliases , NameAliases.NameAliasType(..) , nameAliases , nameAliasesByType , nameAliasesWithTypes ) where import Control.Applicative ((<|>)) import GHC.Exts ( Addr#, Char(..), Char#, Int# , indexCharOffAddr#, plusAddr#, (+#), (-#), (<#), isTrue#, quotRemInt# , dataToTag#, ord# ) import Unicode.Internal.Bits.Names (unpackNBytes#) import qualified Unicode.Internal.Char.UnicodeData.DerivedName as DerivedName import qualified Unicode.Internal.Char.UnicodeData.NameAliases as NameAliases import Unicode.Internal.Char.Names.Version (unicodeVersion) -- | Name of a character, if defined. -- -- @since 0.1.0 name :: Char -> Maybe String name (C# c#) = case DerivedName.name c# of (# name#, len# #) -> case len# of DerivedName.NoName -> Nothing DerivedName.CjkCompatibilityIdeograph -> Just n where !hex = showHex c# !n = 'C':'J':'K':' ':'C':'O':'M':'P':'A':'T':'I':'B':'I':'L':'I':'T':'Y':' ':'I':'D':'E':'O':'G':'R':'A':'P':'H':'-':hex DerivedName.CjkUnifiedIdeograph -> Just n where !hex = showHex c# !n = 'C':'J':'K':' ':'U':'N':'I':'F':'I':'E':'D':' ':'I':'D':'E':'O':'G':'R':'A':'P':'H':'-':hex DerivedName.TangutIdeograph -> Just n where !hex = showHex c# !n = 'T':'A':'N':'G':'U':'T':' ':'I':'D':'E':'O':'G':'R':'A':'P':'H':'-':hex _ | isTrue# (len# <# DerivedName.HangulSyllable) -> let !n = unpack name# [] len# in Just n | otherwise -> let !rest = unpack name# [] (len# -# DerivedName.HangulSyllable) !n = 'H':'A':'N':'G':'U':'L':' ':'S':'Y':'L':'L':'A':'B':'L':'E':' ':rest in Just n {-# INLINE unpack #-} unpack :: Addr# -> String -> Int# -> String unpack !addr# !acc = \case 0# -> acc !i -> unpack addr# acc' i' where !i' = i -# 1# !c'# = indexCharOffAddr# addr# i' !c' = C# c'# !acc' = c' : acc -- [NOTE] We assume c# >= '\x1000' to avoid to check for padding showHex :: Char# -> String showHex !c# = showIt [] (quotRemInt# (ord# c#) 16#) where showIt !acc (# q, r #) = case q of 0# -> acc' _ -> showIt acc' (quotRemInt# q 16#) where !c = C# (indexCharOffAddr# "0123456789ABCDEF"# r) !acc' = c : acc -- | Returns /corrected/ name of a character (see 'NameAliases.Correction'), -- if defined, otherwise returns its original 'name' if defined. -- -- @since 0.1.0 {-# INLINE correctedName #-} correctedName :: Char -> Maybe String correctedName c@(C# c#) = corrected <|> name c where -- Assumption: fromEnum NameAliases.Correction == 0 !corrected = case indexCharOffAddr# addr# 0# of '\xff'# -> Nothing -- no aliases '\x00'# -> Nothing -- no correction i# -> let !n = unpackNBytes'# (addr# `plusAddr#` ord# i#) in Just n !addr# = NameAliases.nameAliases c# -- | Returns a character’s 'name' if defined, -- otherwise returns its first name alias if defined. -- -- @since 0.1.0 nameOrAlias :: Char -> Maybe String nameOrAlias c@(C# c#) = name c <|> case indexCharOffAddr# addr# 0# of '\xff'# -> Nothing -- no aliases '\x00'# -> let !n = go 1# in Just n _ -> let !n = go 0# in Just n where !addr# = NameAliases.nameAliases c# go t# = case ord# (indexCharOffAddr# (addr# `plusAddr#` t#) 0#) of -- No bound check for t#: there is at least one alias 0# -> go (t# +# 1#) i# -> unpackNBytes'# (addr# `plusAddr#` i#) -- | All name aliases of a character, if defined. -- The names are listed in the original order of the UCD. -- -- See 'nameAliasesWithTypes' for the detailed list by alias type. -- -- @since 0.1.0 {-# INLINE nameAliases #-} nameAliases :: Char -> [String] nameAliases (C# c#) = case indexCharOffAddr# addr0# 0# of '\xff'# -> [] -- no aliases _ -> go (addr0# `plusAddr#` (NameAliases.MaxNameAliasType +# 1#)) where go addr# = case ord# (indexCharOffAddr# addr# 0#) of 0# -> case ord# (indexCharOffAddr# (addr# `plusAddr#` 1#) 0#) of -- End of list 0# -> [] -- skip empty entry l# -> let !s = unpackNBytes# (addr# `plusAddr#` 2#) l# in s : go (addr# `plusAddr#` (l# +# 2#)) l# -> let !s = unpackNBytes# (addr# `plusAddr#` 1#) l# in s : go (addr# `plusAddr#` (l# +# 1#)) where addr0# = NameAliases.nameAliases c# -- | Name aliases of a character for a specific name alias type. -- -- @since 0.1.0 {-# INLINE nameAliasesByType #-} nameAliasesByType :: NameAliases.NameAliasType -> Char -> [String] nameAliasesByType t (C# c#) = case indexCharOffAddr# addr# 0# of '\xff'# -> [] -- no aliases _ -> nameAliasesByType# addr# t where addr# = NameAliases.nameAliases c# -- | Detailed character names aliases. -- The names are listed in the original order of the UCD. -- -- See 'nameAliases' if the alias type is not required. -- -- @since 0.1.0 {-# INLINE nameAliasesWithTypes #-} nameAliasesWithTypes :: Char -> [(NameAliases.NameAliasType, [String])] nameAliasesWithTypes (C# c#) = case indexCharOffAddr# addr# 0# of '\xff'# -> [] -- no aliases '\x00'# -> foldr mk mempty [succ minBound..maxBound] _ -> foldr mk mempty [minBound..maxBound] where addr# = NameAliases.nameAliases c# mk t acc = case nameAliasesByType# addr# t of [] -> acc !as -> (t, as) : acc {-# INLINE unpackNBytes'# #-} unpackNBytes'# :: Addr# -> String unpackNBytes'# addr# = unpackNBytes# (addr# `plusAddr#` 1#) (ord# (indexCharOffAddr# addr# 0#)) {-# INLINE nameAliasesByType# #-} nameAliasesByType# :: Addr# -> NameAliases.NameAliasType -> [String] nameAliasesByType# addr# t = case indexCharOffAddr# (addr# `plusAddr#` t#) 0# of '\0'# -> [] -- no aliases for this type i# -> unpackCStrings# (addr# `plusAddr#` ord# i#) where t# = dataToTag# t {-# INLINE unpackCStrings# #-} unpackCStrings# :: Addr# -> [String] unpackCStrings# = go where go addr# = case ord# (indexCharOffAddr# addr# 0#) of 0# -> [] l# -> let !s = unpackNBytes# (addr# `plusAddr#` 1#) l# in s : go (addr# `plusAddr#` (l# +# 1#))