{-# LANGUAGE OverloadedStrings #-}

module Data.Text.Slugger (toSlug) where


--------------------------------------------------------------------------------
import qualified Data.Char as Char
import qualified Data.Text as T
import qualified Data.Text.ICU.Char as ICUChar
import qualified Data.Text.ICU.Normalize as ICUN


--------------------------------------------------------------------------------
{- | Converts to a US-ASCII, lowercase, hyphenated, URI-friendly "slug"

__Examples:__

@
toSlug (T.pack "Hey there,   world!")
-- "hey-there-world"

toSlug (T.pack "GARÇON - déjà , Forêt — Zoë")
-- "garcon-deja-foret-zoe"
@
-}
toSlug :: T.Text -> T.Text
toSlug :: Text -> Text
toSlug = Text -> Text
hyphenateWords forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Text
clean forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Text
normalize


--------------------------------------------------------------------------------
normalize :: T.Text -> T.Text
normalize :: Text -> Text
normalize = NormalizationMode -> Text -> Text
ICUN.normalize NormalizationMode
ICUN.NFKD


--------------------------------------------------------------------------------
clean :: T.Text -> T.Text
clean :: Text -> Text
clean = forall a. (Char -> a -> a) -> a -> Text -> a
T.foldr Char -> Text -> Text
buildCleanText Text
T.empty


buildCleanText :: Char -> T.Text -> T.Text
buildCleanText :: Char -> Text -> Text
buildCleanText Char
x Text
acc
    | Char -> Bool
isCharModifier Char
x Bool -> Bool -> Bool
|| Char -> Bool
isSingleQuote Char
x = Text
acc
    | Bool
otherwise = [Text] -> Text
T.concat [Char -> Text
adjustChar Char
x, Text
acc]


isSingleQuote :: Char -> Bool
isSingleQuote :: Char -> Bool
isSingleQuote = (forall a. Eq a => a -> a -> Bool
== Char
'\'')


isCharModifier :: Char -> Bool
isCharModifier :: Char -> Bool
isCharModifier = forall p v. Property p v => p -> Char -> v
ICUChar.property Bool_
ICUChar.Diacritic


adjustChar :: Char -> T.Text
adjustChar :: Char -> Text
adjustChar Char
'æ' = Text
"ae"
adjustChar Char
'Æ' = Text
"ae"
adjustChar Char
'ð' = Text
"d"
adjustChar Char
'Ð' = Text
"d"
adjustChar Char
'ƒ' = Text
"f"
adjustChar Char
'Ƒ' = Text
"f"
adjustChar Char
'ø' = Text
"o"
adjustChar Char
'Ø' = Text
"o"
adjustChar Char
'œ' = Text
"oe"
adjustChar Char
'Œ' = Text
"oe"
adjustChar Char
'ł' = Text
"l"
adjustChar Char
'Ł' = Text
"l"
adjustChar Char
'ß' = Text
"ss"
adjustChar Char
'þ' = Text
"th"
adjustChar Char
'Þ' = Text
"th"
adjustChar Char
'ı' = Text
"i"
-- See Note [Turkish I]
adjustChar Char
x
  | Char -> Bool
isAsciiAlphaNum Char
x = Char -> Text
toLowerAsText Char
x
  | Bool
otherwise = Text
" "


isAsciiAlphaNum :: Char -> Bool
isAsciiAlphaNum :: Char -> Bool
isAsciiAlphaNum Char
x = Char -> Bool
Char.isAscii Char
x Bool -> Bool -> Bool
&& Char -> Bool
Char.isAlphaNum Char
x


toLowerAsText :: Char -> T.Text
toLowerAsText :: Char -> Text
toLowerAsText = Char -> Text
T.singleton forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Char
Char.toLower


--------------------------------------------------------------------------------
hyphenateWords :: T.Text -> T.Text
hyphenateWords :: Text -> Text
hyphenateWords = Text -> [Text] -> Text
T.intercalate (Char -> Text
T.singleton Char
'-') forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> [Text]
T.words

{-
Note: [Turkish I]

Turkish has an unusual casing of the letter 'I'. In Turkish, we have
'i' and 'ı', two separate letters. They correspond to uppercase 'İ'
and 'I'. Notice that this is the opposite of most other languages,
where lowercase 'i' correspond to uppercase 'I' (losing the dot for no
good reason).

Unicode gets this correctly, so a Unicode-aware `toLower` function would
convert uppercase 'I' to 'ı' when on Turkish locale. This tend to break
functions like the one we are writing, if we incorrectly assume that every
ASCII uppercase letter would correspond to an ASCII lowercase letter.

The surprise is that; `Data.Char.toLower` function we use is not
locale-aware, `Data.Text.ICU.toLower` is. Only because of this fact `I`
becomes `i` even on Turkish locale on this function. This note is here
so that we do not start using `Data.Text.ICU.toLower` and break the
library on Turkish locale.
-}