-- | -- Module: $Header$ -- Description: Encoding function for Punycode -- Copyright: Copyright © 2011 Jon Kristensen -- License: BSD3 -- -- Maintainer: jon.kristensen@pontarius.org -- Stability: unstable -- Portability: portable -- -- Uniquely and reversibly transform a Unicode string into an ASCII string. -- 0.0.2.0 will support going the other way - from ASCII to Unicode. -- TODO: Assmuption: All code points less than initial_n (128) are basic code -- points; code points are unsigned. module Data.Text.Punycode (encode) where import Data.Text (Text) import Data.Char (chr, intToDigit, isAlphaNum, isAscii, ord, toUpper) base :: Int base = 36 tmin :: Int tmin = 1 tmax :: Int tmax = 26 skew :: Int skew = 38 damp :: Int damp = 700 initial_bias :: Int initial_bias = 72 initial_n :: Int initial_n = 128 -- The first character after the ASCII characters -- decode will implement the following pseudo code from Section 6.2 of RFC 3492: -- -- let n = initial_n -- let i = 0 -- let bias = initial_bias -- let output = an empty string indexed from 0 -- consume all code points before the last delimiter (if there is one) -- and copy them to output, fail on any non-basic code point -- if more than zero code points were consumed then consume one more -- (which will be the last delimiter) -- while the input is not exhausted do begin -- let oldi = i -- let w = 1 -- for k = base to infinity in steps of base do begin -- consume a code point, or fail if there was none to consume -- let digit = the code point's digit-value, fail if it has none -- let i = i + digit * w, fail on overflow -- let t = tmin if k <= bias {+ tmin}, or -- tmax if k >= bias + tmax, or k - bias otherwise -- if digit < t then break -- let w = w * (base - t), fail on overflow -- end -- let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?) -- let n = n + i div (length(output) + 1), fail on overflow -- let i = i mod (length(output) + 1) -- {if n is a basic code point then fail} -- insert n into output at position i -- increment i -- end -- | -- Encodes a Unicode label to a Punycode-encoded ASCII string. The encoding does -- not prepend the IDNA @\"xn--\"@ prefix. The output is lower-cased. -- TODO: Support for mixed cases (see Appendix A in the RFC) -- encode implements the following pseudo code from Section 6.3 of RFC 3492: -- -- let n = initial_n -- let delta = 0 -- let bias = initial_bias -- let h = b = the number of basic code points in the input -- copy them to the output in order, followed by a delimiter if b > 0 -- {if the input contains a non-basic code point < n then fail} -- while h < length(input) do begin -- let m = the minimum {non-basic} code point >= n in the input -- let delta = delta + (m - n) * (h + 1), fail on overflow -- let n = m -- for each code point c in the input (in order) do begin -- if c < n {or c is basic} then increment delta, fail on overflow -- if c == n then begin -- let q = delta -- for k = base to infinity in steps of base do begin -- let t = tmin if k <= bias {+ tmin}, or -- tmax if k >= bias + tmax, or k - bias otherwise -- if q < t then break -- output the code point for digit t + ((q - t) mod (base - t)) -- let q = (q - t) div (base - t) -- end -- output the code point for digit q -- let bias = adapt(delta, h + 1, test h equals b?) -- let delta = 0 -- increment h -- end -- end -- increment delta and n -- end encode :: String -> Maybe String encode i = let n = initial_n delta = 0 bias = initial_bias h = b output = if (length $ basic_codepoints) == 0 then "" else basic_codepoints ++ ['-'] -- TODO: Fail for non-basic code point < n in if b == length i then Just i else loop output n delta bias h where b :: Int b = length basic_codepoints basic_codepoints :: String basic_codepoints = filter isAsciiAlphaNum i -- while h < length(input) loop :: String -> Int -> Int -> Int -> Int -> Maybe String loop output n delta bias h | h < (length i) = let m = minimum $ filter (>= n) $ map ord i delta' = delta + (m - n) * (h + 1) -- TODO: Overflow? n' = m (output', delta'', h') = loop' i output n' delta' bias h in loop output' (n' + 1) (delta'' + 1) bias h' | otherwise = Just output -- for each code point loop' :: String -> String -> Int -> Int -> Int -> Int -> (String, Int, Int) loop' [] output _ delta _ h = (output, delta, h) loop' (c:cs) output n delta bias h | ord c < n = loop' cs output n delta bias h | ord c == n = let q = delta (output', q') = loop'' base output bias q -- TODO: output'' seems to add an unnecessary -- character... I'll investigate this more -- tomorrow output'' = output' ++ [toDigitValue q'] in loop' cs output'' n 0 (adapt delta (h + 1) (h == b)) (h + 1) -- for k = base... loop'' :: Int -> String -> Int -> Int -> (String, Int) loop'' k output bias q | q < t = (output, q) | otherwise = let output' = output ++ [toDigitValue (t + ((q - t) `mod` (base - t)))] q' = (q - t) `div` (base - t) in loop'' (k + base) output' bias q' where t = if k <= bias + tmin then tmin else if k >= bias + tmax then tmax else k - bias -- TODO: ^^^^ -- Returns true if the Char is ASCII, alpha-numeric, or a hyphen. -- TODO: Ugly function isAsciiAlphaNum :: Char -> Bool isAsciiAlphaNum '-' = True isAsciiAlphaNum c = isAscii c && isAlphaNum c -- adapt implements the following pseudo code from Section 6.1 of RFC 3492: -- -- function adapt(delta,numpoints,firsttime): -- if firsttime then let delta = delta div damp -- else let delta = delta div 2 -- let delta = delta + (delta div numpoints) -- let k = 0 -- while delta > ((base - tmin) * tmax) div 2 do begin -- let delta = delta div (base - tmin) -- let k = k + base -- end -- return k + (((base - tmin + 1) * delta) div (delta + skew)) adapt :: Int -> Int -> Bool -> Int adapt d n f = loop 0 d' where d' :: Int d' = if f then d `div` damp else d `div` 2 + d `div` n loop :: Int -> Int -> Int loop k d'' | d'' > ((base - tmin) * tmax) `div` 2 = loop (k + base) (d'' `div` (base - tmin)) | otherwise = k + ((base - tmin + 1) * d'') `div` (d'' + skew) -- Maps a character to its Punycode digit value; 0-25 for characters, 26-35 for -- digits. -- toDigitValue :: Char -> Int -- -- toDigitValue c | c `elem` ['0'..'9'] = digitToInt c + 30 -- | toUpper c `elem` ['A'..'Z'] = ord (toUpper c) - 65 -- The 0-25 digits are mapped to 'a'-'z', 26-35 to '0'-'9'. toDigitValue :: Int -> Char toDigitValue i | i >= 0 && i <= 25 = chr $ 97 + i | i >= 26 && i <= 35 = intToDigit $ i - 26