{-# LANGUAGE LambdaCase #-}

{-|
Deburr

A small package exposing the 'deburr' function, which converts Unicode
characters with burrs (umlauts, accents, etc.) to their ASCII counterparts.
The function handles capitalisation of multi-letter replacements (for
example @Æ@ becomes @AE@ or @Ae@ depending on the following character) and
some other edge cases.
-}
module Text.Deburr (deburr) where

import Data.Char (isUpper)

-- | Deburr a string, removing umlauts, accents, etc.
--
-- Every character that has an entry in the replacement tables below is
-- substituted by its ASCII spelling; every other character is passed through
-- unchanged. The output is produced incrementally, one input character (plus
-- one character of lookahead) at a time, so the function also works on long
-- or infinite lazy strings.
--
-- >>> deburr "Jeg spiser brød."
-- "Jeg spiser brod."
--
deburr :: String -> String
deburr = \case
  [] -> []
  x : rest -> deburrLetter x (peek rest) ++ deburr rest
  where
    -- The character following the current one, if any. It is only inspected
    -- by 'deburrLetter' when choosing the case of a two-letter replacement.
    peek :: String -> Maybe Char
    peek = \case
      [] -> Nothing
      c : _ -> Just c

-- | Replace a single character by its ASCII spelling.
--
-- The first argument is the character to convert, the second is the character
-- that follows it in the input ('Nothing' at the end of the string). The
-- following character only matters for the capital ligatures @Æ@, @Þ@ and
-- @Œ@: they become all-caps (@AE@, @TH@, @OE@) when followed by an upper-case
-- character and title-case (@Ae@, @Th@, @Oe@) otherwise.
--
-- Characters without a replacement are returned as a one-character string.
deburrLetter :: Char -> Maybe Char -> String
deburrLetter n nxt
  -- Fast path: every code point handled below lies in U+00C0..U+017F, so
  -- anything outside that range (including all of ASCII) is returned as is.
  | n < '\xc0' || n > '\x017f' = [n]
  | otherwise =
      case n of
        '\xc6' -> cased "AE" "Ae"
        '\xe6' -> "ae"
        '\xde' -> cased "TH" "Th"
        '\xfe' -> "th"
        '\xdf' -> "ss"
        '\x0132' -> "IJ"
        '\x0133' -> "ij"
        '\x0152' -> cased "OE" "Oe"
        '\x0153' -> "oe"
        '\x0149' -> "'n"
        _ ->
          case [plain | (plain, burred) <- singleLetterTable, n `elem` burred] of
            plain : _ -> [plain]
            [] -> [n]
  where
    -- Pick the all-caps form when the next character is upper-case, the
    -- title-case form otherwise (including at the end of the input).
    cased :: String -> String -> String
    cased allCaps titleCase
      | maybe False isUpper nxt = allCaps
      | otherwise = titleCase

-- | Single-letter replacements: each ASCII letter paired with the burred
-- characters that collapse to it. The character sets are pairwise disjoint,
-- and disjoint from the multi-letter cases handled in 'deburrLetter'.
singleLetterTable :: [(Char, String)]
singleLetterTable =
  [ ('A', "\xc0\xc1\xc2\xc3\xc4\xc5\x0100\x0102\x0104")
  , ('a', "\xe0\xe1\xe2\xe3\xe4\xe5\x0101\x0103\x0105")
  , ('C', "\xc7\x0106\x0108\x010a\x010c")
  , ('c', "\xe7\x0107\x0109\x010b\x010d")
  , ('D', "\xd0\x010e\x0110")
  , ('d', "\xf0\x010f\x0111")
  , ('E', "\xc8\xc9\xca\xcb\x0112\x0114\x0116\x0118\x011a")
  , ('e', "\xe8\xe9\xea\xeb\x0113\x0115\x0117\x0119\x011b")
  , ('G', "\x011c\x011e\x0120\x0122")
  , ('g', "\x011d\x011f\x0121\x0123")
  , ('H', "\x0124\x0126")
  , ('h', "\x0125\x0127")
  , ('I', "\xcc\xcd\xce\xcf\x0128\x012a\x012c\x012e\x0130")
  , ('i', "\xec\xed\xee\xef\x0129\x012b\x012d\x012f\x0131")
  , ('J', "\x0134")
  , ('j', "\x0135")
  , ('K', "\x0136")
  , ('k', "\x0137\x0138")
  , ('L', "\x0139\x013b\x013d\x013f\x0141")
  , ('l', "\x013a\x013c\x013e\x0140\x0142")
  , ('N', "\xd1\x0143\x0145\x0147\x014a")
  , ('n', "\xf1\x0144\x0146\x0148\x014b")
  , ('O', "\xd2\xd3\xd4\xd5\xd6\xd8\x014c\x014e\x0150")
  , ('o', "\xf2\xf3\xf4\xf5\xf6\xf8\x014d\x014f\x0151")
  , ('R', "\x0154\x0156\x0158")
  , ('r', "\x0155\x0157\x0159")
  , ('S', "\x015a\x015c\x015e\x0160")
  , ('s', "\x015b\x015d\x015f\x0161\x017f")
  , ('T', "\x0162\x0164\x0166")
  , ('t', "\x0163\x0165\x0167")
  , ('U', "\xd9\xda\xdb\xdc\x0168\x016a\x016c\x016e\x0170\x0172")
  , ('u', "\xf9\xfa\xfb\xfc\x0169\x016b\x016d\x016f\x0171\x0173")
  , ('W', "\x0174")
  , ('w', "\x0175")
  , ('Y', "\xdd\x0176\x0178")
  , ('y', "\xfd\xff\x0177")
  , ('Z', "\x0179\x017b\x017d")
  , ('z', "\x017a\x017c\x017e")
  ]