{-# LANGUAGE OverloadedStrings #-} -- | High-level API for Internationalized Domain Names (IDN) processing. -- -- This module provides functions for converting between Unicode and ASCII -- domain names, following the IDNA2008 specification (RFC 5890-5893). -- -- == Overview -- -- Internationalized Domain Names allow domain names to contain non-ASCII -- characters. This library implements IDNA2008, which uses Punycode encoding -- to represent Unicode characters in ASCII-compatible form. -- -- == Example -- -- >>> toASCII "münchen.de" -- Right "xn--mnchen-3ya.de" -- -- >>> toUnicode "xn--mnchen-3ya.de" -- Right "münchen.de" -- -- >>> validateLabel "example" -- Right () -- -- == See Also -- -- * 'Data.Text.Punycode' for low-level Punycode encoding/decoding -- * RFC 5890-5893 for the full IDNA2008 specification module Data.Text.IDN ( -- * Main API toASCII , toUnicode , validateLabel -- * Types , module Data.Text.IDN.Types ) where import Data.Text (Text) import qualified Data.Text as T import Data.Text.IDN.Types import Data.Text.IDN.Internal.Unicode import Data.Text.IDN.Internal.Validation import Data.Text.IDN.Internal.Punycode (encodePunycode, decodePunycode) import Data.List.NonEmpty (NonEmpty) import qualified Data.List.NonEmpty as NE import Data.Char (ord, isAscii) -- | Convert a Unicode domain name to ASCII (A-label) form per RFC 5891. -- -- Converts each label in the domain name to its ASCII-compatible form. -- Unicode labels are encoded using Punycode and prefixed with "xn--". -- ASCII labels are validated and passed through unchanged. -- -- === Examples -- -- >>> toASCII "münchen.de" -- Right "xn--mnchen-3ya.de" -- -- >>> toASCII "example.com" -- Right "example.com" -- -- === Errors -- -- Returns 'Left' with 'IDNError' if: -- * Any label is empty or too long -- * Any label contains disallowed code points -- * Bidirectional text rules are violated -- * Contextual validation fails -- -- @since 0.1.0.0 toASCII :: Text -> Either IDNError Text toASCII input = do domain <- mkDomainName input processedLabels <- traverse processLabelToASCII (labels domain) return $ T.intercalate "." $ map labelText (NE.toList processedLabels) where labelText (ULabel t) = t labelText (ALabel t) = t -- | Convert an ASCII domain name to Unicode (U-label) form per RFC 5891. -- -- Converts each label in the domain name to its Unicode form. -- Punycode-encoded labels (starting with "xn--") are decoded to Unicode. -- Pure ASCII labels are validated and passed through unchanged. -- -- === Examples -- -- >>> toUnicode "xn--mnchen-3ya.de" -- Right "münchen.de" -- -- >>> toUnicode "example.com" -- Right "example.com" -- -- === Errors -- -- Returns 'Left' with 'IDNError' if: -- * Any label is empty or too long -- * Punycode decoding fails -- * Any label contains disallowed code points -- * Bidirectional text rules are violated -- -- @since 0.1.0.0 toUnicode :: Text -> Either IDNError Text toUnicode input = do domain <- mkDomainName input processedLabels <- traverse processLabelToUnicode (labels domain) return $ T.intercalate "." $ map labelText (NE.toList processedLabels) where labelText (ULabel t) = t labelText (ALabel t) = t -- | Process a single label to ASCII form. processLabelToASCII :: Label -> Either IDNError Label processLabelToASCII (ALabel t) | "xn--" `T.isPrefixOf` t = do -- It's a Punycode A-label, validate it by decoding and re-encoding -- This uses processLabelToUnicode which validates Punycode properly ulabel <- processLabelToUnicode (ALabel t) -- Then re-process back to ASCII to get a validated A-label processLabelToASCII ulabel | otherwise = -- Regular ASCII label, just validate validateLabel t >> return (ALabel t) processLabelToASCII (ULabel t) | isAsciiLabel t = do -- Pure ASCII label, validate and return as-is validateLabel t return (ULabel t) | otherwise = do -- Unicode label, needs Punycode encoding validateLabel t encoded <- case encodePunycode t of Left err -> Left (PunycodeDecodeError t (T.pack (show err))) Right e -> Right e let aLabel = "xn--" <> encoded -- Validate length constraints if T.length aLabel > 63 then Left (LabelTooLong (T.length aLabel) 63) else return (ALabel aLabel) -- | Process a single label to Unicode form. processLabelToUnicode :: Label -> Either IDNError Label processLabelToUnicode (ULabel t) = -- Already Unicode, validate and return validateLabel t >> return (ULabel t) processLabelToUnicode (ALabel t) | "xn--" `T.isPrefixOf` t = do -- Punycode A-label, decode it -- RFC 5891 Section 5.4: A-labels must be lowercase let encoded = T.drop 4 t encodedLower = T.toLower encoded -- Check if original was already lowercase (uppercase in Punycode is invalid) if encoded /= encodedLower then Left (PunycodeDecodeError encoded "A-label must be lowercase") else pure () decoded <- case decodePunycode encodedLower of Left err -> Left (PunycodeDecodeError encoded (T.pack (show err))) Right d -> Right d -- Punycode-encoded labels must contain at least one non-ASCII character -- If the decoded result is pure ASCII, it shouldn't have been encoded if isAsciiLabel decoded then Left (PunycodeDecodeError encoded "Punycode used for pure ASCII label") else pure () -- RFC 5891 Section 4.4: Verify round-trip consistency -- Re-encode and check it matches the lowercase normalized form reencoded <- case encodePunycode decoded of Left err -> Left (PunycodeDecodeError decoded (T.pack (show err))) Right e -> Right e if reencoded /= encodedLower then Left (PunycodeDecodeError encoded "Round-trip re-encoding failed") else do validateLabel decoded return (ULabel decoded) | otherwise = -- Pure ASCII label, validate and return validateLabel t >> return (ALabel t) -- | Validate a single domain label per IDNA2008 rules. -- -- Performs comprehensive validation including: -- * Length constraints (max 63 characters) -- * Hyphen position rules -- * Code point validity (PVALID, CONTEXTJ, CONTEXTO) -- * Contextual rule validation -- * Bidirectional text rules (RFC 5893) -- -- === Examples -- -- >>> validateLabel "example" -- Right () -- -- >>> validateLabel "" -- Left EmptyLabel -- -- === Errors -- -- Returns 'Left' with 'IDNError' if validation fails. See 'IDNError' -- constructors for specific error types. -- -- @since 0.1.0.0 validateLabel :: Text -> Either IDNError () validateLabel label = do -- Check empty if T.null label then Left EmptyLabel else pure () -- Check length (before encoding) if T.length label > 63 then Left (LabelTooLong (T.length label) 63) else pure () -- Check hyphen positions checkHyphens label -- Check combining marks at start case T.uncons label of Just (c, _) | isCombiningMark c -> Left (StartsWithCombiningMark c (ord c)) _ -> pure () -- Validate each code point mapM_ validateCodePoint (T.unpack label) -- Validate contextual rules for CONTEXTJ/CONTEXTO validateContextualRules label -- Validate bidi rules validateBidiRules label return () -- | Check hyphen position rules. checkHyphens :: Text -> Either IDNError () checkHyphens label = do case T.uncons label of Just ('-', _) -> Left (InvalidHyphenPosition StartsWithHyphen) _ -> pure () case T.unsnoc label of Just (_, '-') -> Left (InvalidHyphenPosition EndsWithHyphen) _ -> pure () -- Check for "--" in positions 3-4 (unless it's an A-label with xn--) if T.length label >= 4 && not ("xn--" `T.isPrefixOf` label) then if T.index label 2 == '-' && T.index label 3 == '-' then Left (InvalidHyphenPosition HyphensAt3And4NotPunycode) else pure () else pure () -- | Validate a single code point. validateCodePoint :: Char -> Either IDNError () validateCodePoint c = case codePointStatus c of PVALID -> pure () DISALLOWED -> Left (DisallowedCodePoint c (ord c)) CONTEXTJ -> pure () -- Will be checked in context validation CONTEXTO -> pure () -- Will be checked in context validation UNASSIGNED -> Left (DisallowedCodePoint c (ord c)) -- | Check if a label is pure ASCII. isAsciiLabel :: Text -> Bool isAsciiLabel = T.all isAscii