-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | Haskell implementation of the Unicode Collation Algorithm -- -- This library provides a pure Haskell implementation of the Unicode -- Collation Algorithm described at -- http://www.unicode.org/reports/tr10/. It is not as -- fully-featured or as performant as text-icu, but it avoids a -- dependency on a large C library. Locale-specific tailorings are also -- provided. @package unicode-collation @version 0.1.1 module Text.Collate.Lang -- | Represents a BCP 47 language tag -- (https://tools.ietf.org/html/bcp47). data Lang Lang :: Text -> Maybe Text -> Maybe Text -> [Text] -> [(Text, [(Text, Text)])] -> [Text] -> Lang [langLanguage] :: Lang -> Text [langScript] :: Lang -> Maybe Text [langRegion] :: Lang -> Maybe Text [langVariants] :: Lang -> [Text] [langExtensions] :: Lang -> [(Text, [(Text, Text)])] [langPrivateUse] :: Lang -> [Text] -- | Parse a BCP 47 language tag as a Lang. parseLang :: Text -> Either String Lang -- | Render a Lang in BCP 47 form. renderLang :: Lang -> Text -- | Find best match for a Lang in an association list. lookupLang :: Lang -> [(Lang, a)] -> Maybe (Lang, a) instance Language.Haskell.TH.Syntax.Lift Text.Collate.Lang.Lang instance GHC.Show.Show Text.Collate.Lang.Lang instance GHC.Classes.Ord Text.Collate.Lang.Lang instance GHC.Classes.Eq Text.Collate.Lang.Lang instance Data.String.IsString Text.Collate.Lang.Lang instance Data.Binary.Class.Binary Text.Collate.Lang.Lang -- | This library provides a pure Haskell implementation of the Unicode -- Collation Algorithm, allowing proper sorting of Unicode strings. -- -- The simplest way to use the library is to use the IsString -- instance of Collator (together with the -- OverloadedStrings extension): -- --
--   >>> import Data.List (sortBy)
--   
--   >>> import qualified Data.Text.IO as T
--   
--   >>> mapM_ T.putStrLn $ sortBy (collate "en-US") ["𝒶bc","abC","𝕒bc","Abc","abç","äbc"]
--   abC
--   𝒶bc
--   𝕒bc
--   Abc
--   abç
--   äbc
--   
-- -- Note the difference from the default sort: -- --
--   >>> import Data.List (sort)
--   
--   >>> import qualified Data.Text.IO as T
--   
--   >>> mapM_ T.putStrLn $ sort ["𝒶bc","abC","𝕒bc","Abc","abç","äbc"]
--   Abc
--   abC
--   abç
--   äbc
--   𝒶bc
--   𝕒bc
--   
-- -- A Collator provides a function collate that compares two -- texts, and a function sortKey that returns the sort key. Most -- users will just need collate. -- --
--   >>> let de = collatorFor "de"
--   
--   >>> let se = collatorFor "se"
--   
--   >>> collate de "ö" "z"
--   LT
--   
--   >>> collate se "ö" "z"
--   GT
--   
--   >>> sortKey de "ö"
--   SortKey [0x213C,0x0000,0x0020,0x002B,0x0000,0x0002,0x0002,0x0000,0xFFFF,0xFFFF]
--   
--   >>> sortKey se "ö"
--   SortKey [0x22FD,0x0000,0x0020,0x0000,0x0002,0x0000,0xFFFF]
--   
-- -- Because Collator and Lang have IsString -- instances, you can just specify them using string literals, as in the -- above examples. Note, however, that you won't get any feedback if the -- string doesn't parse correctly as a BCP47 language tag, or if no -- collation is defined for the specified language; instead, you'll just -- get the default (root) collator. For this reason, we don't recommend -- relying on the IsString instance. -- -- If you won't know the language until run time, use parseLang to -- parse it to a Lang, handling parse errors, and then pass the -- Lang to collatorFor. -- --
--   >>> let handleParseError = error  -- or something fancier
--   
--   >>> lang <- either handleParseError return $ parseLang "bs-Cyrl"
--   
--   >>> collate (collatorFor lang) "a" "b"
--   LT
--   
-- -- If you know the language at compile-time, use the collator -- quasi-quoter and you'll get compile-time errors and warnings: -- --
--   >>> :set -XQuasiQuotes
--   
--   >>> let esTraditional = [collator|es-u-co-trad|]
--   
--   >>> let esStandard = [collator|es|]
--   
--   >>> collate esStandard "Co" "Ch"
--   GT
--   
--   >>> collate esTraditional "Co" "Ch"
--   LT
--   
-- -- Note that the unicode extension syntax for BCP47 can be used to -- specify a particular collation for the language (here, Spanish -- "traditional" instead of the default ordering; the alias trad -- is used because of length limits for BCP47 keywords). -- -- The extension syntax can also be used to set collator options. The -- keyword kb can be used to specify the "backwards" accent -- sorting that is sometimes used in French: -- --
--   >>> collate "fr" "côte" "coté"
--   GT
--   
--   >>> collate "fr-u-kb" "côte" "coté"
--   LT
--   
-- -- The keyword ka can be used to specify the variable weighting -- options which affect how punctuation and whitespace are treated: -- --
--   >>> collate "en-u-ka-shifted" "de-luge" "de Luge"
--   LT
--   
--   >>> collate "en-u-ka-noignore" "de-luge" "de Luge"
--   GT
--   
-- -- The keyword kk can be used to turn off the normalization step -- (which is required by the algorithm but can be omitted for better -- performance if the input is already in NFD form (canonical -- decomposition)). -- --
--   >>> let noNormalizeCollator = [collator|en-u-kk-false|]
--   
-- -- The keyword kf can be used to say whether uppercase or -- lowercase letters should be sorted first. -- --
--   >>> collate "en-u-kf-upper" "A" "a"
--   LT
--   
--   >>> collate "en-u-kf-lower" "A" "a"
--   GT
--   
-- -- These options can be combined: -- --
--   >>> collate "de-DE-u-co-phonebk-kb-false-ka-shifted" "Udet" "Über"
--   LT
--   
-- -- Options can also be set using the functions -- setVariableWeighting, setNormalization, -- setUpperBeforeLower, and setFrenchAccents: -- --
--   >>> let frC = setFrenchAccents True [collator|fr|]
--   
--   >>> collate frC "côte" "coté"
--   LT
--   
module Text.Collate data Collator collate :: Collator -> Text -> Text -> Ordering -- | Returns a collator based on a BCP 47 language tag. If no exact match -- is found, we try to find the best match (falling back to the root -- collation if nothing else succeeds). If something other than the -- default collation for a language is desired, the co keyword -- of the unicode extensions can be used (e.g. es-u-co-trad for -- traditional Spanish). Other unicode extensions affect the collator -- options: kb (French "backwards" accent sorting), ka (variable -- weighting), kk (normalization), and kf (uppercase or lowercase -- first). -- -- collatorFor :: Lang -> Collator -- | Create a collator at compile time based on a BCP 47 language tag: -- e.g., [collator|es-u-co-trad|]. Requires the -- QuasiQuotes extension. collator :: QuasiQuoter -- | Default collator based on DUCET table (allkeys.txt). rootCollator :: Collator newtype SortKey SortKey :: [Word16] -> SortKey sortKey :: Collator -> Text -> SortKey -- | VariableWeighting affects how punctuation is treated. See -- http://www.unicode.org/reports/tr10/#Variable_Weighting. data VariableWeighting -- | Don't ignore punctuation (Deluge < deluge-) NonIgnorable :: VariableWeighting -- | Completely ignore punctuation (Deluge = deluge-) Blanked :: VariableWeighting -- | Consider punctuation at lower priority (de-luge < delu-ge < -- deluge < deluge- < Deluge) Shifted :: VariableWeighting -- | Variant of Shifted (deluge < de-luge < delu-ge) ShiftTrimmed :: VariableWeighting -- | Report Lang used for tailoring in a collator. Note that because -- of fallback rules, this may be somewhat different from the Lang -- passed to collatorFor. This Lang won't contain unicode -- extensions used to set options, but it will contain the collation if a -- non-default collation is being used. collatorLang :: Collator -> Maybe Lang -- | Set method for handling variable elements (punctuation and spaces): -- see http://www.unicode.org/reports/tr10/, Tables 11 and 12. 
setVariableWeighting :: VariableWeighting -> Collator -> Collator -- | The Unicode Collation Algorithm expects input to be normalized into -- its canonical decomposition (NFD). By default, collators perform this -- normalization. If your input is already normalized, you can increase -- performance by disabling this step: setNormalization False. setNormalization :: Bool -> Collator -> Collator -- | setFrenchAccents True causes secondary weights to be scanned -- in reverse order, so we get the sorting cote côte coté côté -- instead of cote coté côte côté. The default is usually -- False, except for fr-CA where it is True. setFrenchAccents :: Bool -> Collator -> Collator -- | Most collations default to sorting lowercase letters before uppercase -- (exceptions: mt, da, cu). To select the -- opposite behavior, use setUpperBeforeLower True. setUpperBeforeLower :: Bool -> Collator -> Collator -- | An association list matching Langs with tailored -- Collations. tailorings :: [(Lang, Collation)]