Text/IDNA.hs

{-# LANGUAGE OverloadedStrings #-}
-- | This module implements the two algorithms from RFC 3490. (<http://tools.ietf.org/html/rfc3490>)
module Text.IDNA (acePrefix, toASCII, toUnicode)
where

import Text.StringPrep
import Text.StringPrep.Profiles
import qualified Data.Text as Text
import Data.Text (Text)
import qualified Data.Text.Punycode as Puny
import Data.Text.Encoding as E

-- | The ASCII Compatible Encoding prefix (currently \'@xn--@\').
acePrefix :: Text
acePrefix = "xn--"

-- | Implements the ToASCII algorithm.
toASCII :: Bool -- ^ Whether to allow unassigned code points (in RFC: AllowUnassigned).
	-> Bool -- ^ Whether to disallow certain ASCII characters (in RFC: UseSTD3ASCIIRules). 
	-> Text -- ^ The text to transform.
	-> Maybe Text
toASCII allowUnassigned useSTD3ASCIIRules t = do
		step2 <- if Text.any (>'\x7f') t
			then runStringPrep (namePrepProfile allowUnassigned) t
			else return t

		step3 <- if (useSTD3ASCIIRules && (Text.any isLDHascii step2 ||	Text.head step2 == '-' || Text.last step2 == '-'))
			then Nothing
			else return step2

		step7 <- if (Text.any (>'\x7f') step2)
				then if acePrefix `Text.isPrefixOf` step3
					then Nothing
					else case return (Puny.encode step3) of -- TODO: this can fail?
						Left _ -> Nothing
						Right t' -> return $ acePrefix `Text.append` E.decodeUtf8 t'
				else return step3
		
		if Text.length step7 <= 63
			then return step7
			else Nothing

isLDHascii :: Char -> Bool
isLDHascii c =
	'\x0' <= c && c <= '\x2c' ||
	'\x2e' <= c && c <= '\x2f' ||
	'\x3a' <= c && c <= '\x40' ||
	'\x5b' <= c && c <= '\x60' ||
	'\x7b' <= c && c <= '\x7f' 

toUnicode :: Bool -- ^ Whether to allow unassigned code points (in RFC: AllowUnassigned).
	-> Bool -- ^ Whether to disallow certain ASCII characters (in RFC: UseSTD3ASCIIRules). 
	-> Text -- ^ The text to transform.
	-> Text
toUnicode allowUnassigned useSTD3ASCIIRules t = mergeEither $ do
	step2 <- if Text.any (>'\x7f') t
		then case runStringPrep (namePrepProfile allowUnassigned) t of
			Nothing -> Left t
			Just t' -> return t'
		else return t

	step3 <- if not $ acePrefix `Text.isPrefixOf` step2
		then Left step2
		else return step2
	
	let step4 = Text.drop (Text.length acePrefix) step3
	step5 <- case Puny.decode $ E.encodeUtf8 step4 of
		Left _ -> Left step3
		Right s -> return s

	case toASCII allowUnassigned useSTD3ASCIIRules step5 of
		Nothing -> return step3
		Just t' -> if t' == step3
			then return step5
			else return step3

mergeEither :: Either a a -> a
mergeEither (Left x) = x
mergeEither (Right y) = y

tests :: [Text]
tests = ["Bücher","tūdaliņ"]