{-# LANGUAGE CPP #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
module Text.Collate.Collator
( Collator(..)
, SortKey(..)
, VariableWeighting(..)
, rootCollator
, collatorLang
, setVariableWeighting
, setFrenchAccents
, setUpperBeforeLower
, setNormalization
, collator
, defaultCollatorOptions
, collatorFor
, mkCollator
)
where
import Text.Collate.Lang
import Text.Collate.Tailorings
import Text.Collate.Collation (getCollationElements, Collation(..),
CollationElement(..), VariableWeighting(..))
import Data.Word (Word16)
import Data.String
import qualified Data.Text.Normalize as N
import qualified Data.Text as T
import Data.Text (Text)
import Data.Ord (comparing)
import Data.Char (ord)
import Data.List (intercalate)
import Text.Printf (printf)
import Language.Haskell.TH.Quote (QuasiQuoter(..))
#if MIN_VERSION_base(4,11,0)
#else
import Data.Semigroup (Semigroup(..))
#endif
data CollatorOptions =
CollatorOptions
{ optLang :: Maybe Lang -- ^ Which lang was used for tailoring
, optVariableWeighting :: VariableWeighting -- ^ Method for handling
-- variable elements (see ,
-- Tables 11 and 12).
, optFrenchAccents :: Bool -- ^ If True, secondary weights are scanned
-- in reverse order, so we get the sorting
-- "cote côte coté côté" instead of "cote coté côte côté"
, optUpperBeforeLower :: Bool -- ^ Sort uppercase letters before lower
, optNormalize :: Bool -- ^ If True, strings are normalized
-- to NFD before collation elements are constructed. If the input
-- is already normalized, this option can be set to False for
-- better performance.
, optCollation :: Collation -- ^ The collation to use.
} deriving (Show, Eq, Ord)
showWordList :: [Word16] -> String
showWordList ws =
"[" ++ intercalate ","
(map (printf "0x%04X" . (fromIntegral :: Word16 -> Int)) ws) ++ "]"
newtype SortKey = SortKey [Word16]
deriving (Eq, Ord)
instance Show SortKey where
show (SortKey ws) = "SortKey " ++ showWordList ws
-- Note that & b < q <<< Q is the same as & b < q, & q <<< Q
-- Another syntactic shortcut is:
-- & a <* bcd-gp-s => & a < b < c < d < e < f < g < p < q < r < s
-- & a =* bB => & a = b = B (without that, we have a contraction)
-- &[before 2] a << b => sorts sorts b before a
data Collator = Collator { collate :: Text -> Text -> Ordering
, sortKey :: Text -> SortKey
, collatorOptions :: CollatorOptions }
instance IsString Collator where
fromString = collatorFor . fromString
-- | Default collator based on DUCET table (@allkeys.txt@).
rootCollator :: Collator
rootCollator =
mkCollator defaultCollatorOptions{ optCollation = ducetCollation }
-- | Report 'Lang' used for tailoring in a collator.
-- Note that because of fallbac rules, this may be somewhat
-- different from the 'Lang' passed to 'collatorFor'. This 'Lang'
-- won't contain unicode extensions used to set options, but
-- it will contain the collation if a non-default collation is being used.
collatorLang :: Collator -> Maybe Lang
collatorLang = optLang . collatorOptions
-- | Set method for handling variable elements (punctuation
-- and spaces): see ,
-- Tables 11 and 12.
setVariableWeighting :: VariableWeighting -> Collator -> Collator
setVariableWeighting w coll =
mkCollator (collatorOptions coll){ optVariableWeighting = w }
-- | The Unicode Collation Algorithm expects input to be normalized
-- into its canonical decomposition (NFD). By default, collators perform
-- this normalization. If your input is already normalized, you can increase
-- performance by disabling this step: @setNormalization False@.
setNormalization :: Bool -> Collator -> Collator
setNormalization normalize coll =
mkCollator (collatorOptions coll){ optNormalize = normalize }
-- | @setFrenchAccents True@ causes secondary weights to be scanned
-- in reverse order, so we get the sorting
-- @cote côte coté côté@ instead of @cote coté côte côté@.
-- The default is usually @False@, except for @fr-CA@ where it is @True@.
setFrenchAccents :: Bool -> Collator -> Collator
setFrenchAccents frAccents coll =
mkCollator (collatorOptions coll){ optFrenchAccents = frAccents }
-- | Most collations default to sorting lowercase letters before
-- uppercase (exceptions: @mt@, @da@, @cu@). To select the opposite
-- behavior, use @setUpperBeforeLower True@.
setUpperBeforeLower :: Bool -> Collator -> Collator
setUpperBeforeLower upperBefore coll =
mkCollator (collatorOptions coll){ optUpperBeforeLower = upperBefore }
-- | Create a collator at compile time based on a BCP 47 language
-- tag: e.g., @[collator|es-u-co-trad|]@. Requires the @QuasiQuotes@ extension.
collator :: QuasiQuoter
collator = QuasiQuoter
{ quoteExp = \langtag -> do
case parseLang (T.pack langtag) of
Left e -> do
fail $ "Could not parse BCP47 tag " <> langtag <> e
Right lang ->
case lookupLang lang tailorings of
Nothing -> [| rootCollator |]
Just (_, _) -> [| collatorFor lang |]
, quotePat = undefined
, quoteType = undefined
, quoteDec = undefined
}
-- | Default 'CollatorOptions'.
defaultCollatorOptions :: CollatorOptions
defaultCollatorOptions =
CollatorOptions
{ optLang = Nothing
, optVariableWeighting = NonIgnorable
, optFrenchAccents = False
, optUpperBeforeLower = False
, optNormalize = True
, optCollation = ducetCollation
}
-- | Returns a collator based on a BCP 47 language tag.
-- If no exact match is found, we try to find the best match
-- (falling back to the root collation if nothing else succeeds).
-- If something other than the default collation for a language
-- is desired, the @co@ keyword of the unicode extensions can be
-- used (e.g. @es-u-co-trad@ for traditional Spanish).
-- Other unicode extensions affect the collator options:
--
-- - The @kb@ keyword has the same effect as
-- 'setFrenchAccents' (e.g. @fr-FR-u-kb-true@).
-- - The @ka@ keyword has the same effect as 'setVariableWeight'
-- (e.g. @fr-FR-u-kb-ka-shifted@ or @en-u-ka-noignore@).
-- - The @kf@ keyword has the same effect as 'setUpperBeforeLower'
-- (e.g. @fr-u-kf-upper@ or @fr-u-kf-lower@).
-- - The @kk@ keyword has the same effect as 'setNormalization'
-- (e.g. @fr-u-kk-false@).
collatorFor :: Lang -> Collator
collatorFor lang = mkCollator opts
where
opts = defaultCollatorOptions{
optLang = langUsed,
optFrenchAccents =
case lookup "u" exts >>= lookup "kb" of
Just "" -> True
-- true is default attribute value
Just "true" -> True
Just _ -> False
Nothing -> langLanguage lang == "cu" ||
(langLanguage lang == "fr" && langRegion lang == Just "CA"),
optVariableWeighting =
case lookup "u" exts >>= lookup "ka" of
Just "" -> NonIgnorable
Just "noignore" -> NonIgnorable
Just "shifted" -> Shifted
Nothing | langLanguage lang == "th"
-> Shifted
_ -> NonIgnorable,
optUpperBeforeLower =
case lookup "u" exts >>= lookup "kf" of
Just "" -> True
Just "upper" -> True
Just _ -> False
Nothing -> langLanguage lang == "mt" ||
langLanguage lang == "da" ||
langLanguage lang == "cu",
optNormalize =
case lookup "u" exts >>= lookup "kk" of
Just "" -> True
Just "true" -> True
Just "false" -> False
_ -> True,
optCollation = ducetCollation <> tailoring }
(langUsed, tailoring) = case lookupLang lang tailorings of
Nothing -> (Nothing, mempty)
Just (l,t) -> (Just l, t)
exts = langExtensions lang
-- | Returns a collator constructed using the collation and
-- variable weighting specified in the options.
mkCollator :: CollatorOptions -> Collator
mkCollator opts =
Collator { collate = comparing sortKey'
, sortKey = sortKey'
, collatorOptions = opts
}
where
sortKey' = toSortKey opts
toSortKey :: CollatorOptions -> Text -> SortKey
toSortKey opts =
mkSortKey opts
. handleVariable (optVariableWeighting opts)
. getCollationElements (optCollation opts)
. T.foldr ((:) . ord) []
. if optNormalize opts
then N.normalize N.NFD
else id
handleVariable :: VariableWeighting -> [CollationElement] -> [CollationElement]
handleVariable NonIgnorable = id
handleVariable Blanked = doVariable False False
handleVariable Shifted = doVariable True False
handleVariable ShiftTrimmed = handleVariable Shifted
doVariable :: Bool -> Bool -> [CollationElement] -> [CollationElement]
doVariable _useL4 _afterVariable [] = []
doVariable useL4 afterVariable (e:es)
| collationVariable e
= e{ collationL1 = 0, collationL2 = 0, collationL3 = 0,
collationL4 = -- Table 11
case useL4 of
True
| collationL1 e == 0
, collationL2 e == 0
, collationL3 e == 0 -> 0
| collationL1 e == 0
, collationL3 e /= 0
, afterVariable -> 0
| collationL1 e /= 0 -> collationL1 e
| collationL1 e == 0
, collationL3 e /= 0
, not afterVariable -> 0xFFFF
_ -> 0
} : doVariable useL4 True es
| collationL1 e == 0 -- "ignorable"
, afterVariable
= e{ collationL1 = 0, collationL2 = 0, collationL3 = 0, collationL4 = 0 }
: doVariable useL4 afterVariable es
| collationL1 e /= 0
, not (collationVariable e)
, useL4
= e{ collationL4 = 0xFFFF } : doVariable useL4 False es
| otherwise
= e : doVariable useL4 False es
mkSortKey :: CollatorOptions -> [CollationElement] -> SortKey
mkSortKey opts elts = SortKey $
l1s ++ (0:l2s) ++ (0:l3s) ++ if null l4s
then []
else 0:l4s
where
l1s = filter (/=0) $ map collationL1 elts
l2s = (if optFrenchAccents opts
then reverse
else id) $ filter (/=0) $ map collationL2 elts
l3s = filter (/=0) $ map ((if optUpperBeforeLower opts
then switchUpperAndLower
else id) . collationL3) elts
l4s = (case optVariableWeighting opts of
ShiftTrimmed -> trimTrailingFFFFs
_ -> id) $ filter (/=0) $ map collationL4 elts
switchUpperAndLower 0x0002 = 0x0008
switchUpperAndLower 0x0008 = 0x0002
switchUpperAndLower x = x
trimTrailingFFFFs :: [Word16] -> [Word16]
trimTrailingFFFFs = reverse . dropWhile (== 0xFFFF) . reverse