{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE OverloadedStrings #-}

-- | Internal Unicode property lookups for IDN validation.
--
-- This module provides Unicode property lookups used internally for IDN
-- validation. For public API, see 'Data.Text.IDN'.
--
-- = Implementation Note: IntMap/IntSet vs CharSet
--
-- This module uses 'IntMap' and 'IntSet' from containers rather than 'CharSet'
-- from the charset package. Benchmarking showed that:
--
-- * Map lookups (codePointStatus, bidiClass, scriptOf, lookupContextRule)
--   are 2-23% slower with CharSet, as it provides no benefit for map operations
-- * Set membership tests (isCombiningMark, isVirama) show mixed results:
--     - CharSet is 36% faster for negative lookups (non-combining chars)
--     - CharSet is 2-6% slower for positive lookups and mixed workloads
-- * Overall: IntMap/IntSet is faster for our workload which is dominated by
--   map lookups rather than pure set membership tests
module Data.Text.IDN.Internal.Unicode
  ( -- * Property Lookup Functions
    codePointStatus
  , bidiClass
  , isCombiningMark
  , isVirama
  , scriptOf
  , lookupContextRule
  
    -- * Unicode Normalization
  , isNFCQuickCheck
  ) where

import Data.Text.IDN.Types
import qualified Data.IntMap.Strict as IM
import qualified Data.IntSet as IS
import Data.IntMap.Strict (IntMap)
import Data.IntSet (IntSet)
import Data.Char (ord)
import qualified Data.Text as T
import qualified Data.Text.Read as TR
import qualified Data.Text.Encoding as TE
import qualified Data.ByteString as BS
import Data.FileEmbed (embedFile)
import System.IO.Unsafe (unsafePerformIO)

-- Embed the Unicode data files at compile time
bidiClassData :: BS.ByteString
bidiClassData = $(embedFile "data/unicode/DerivedBidiClass.txt")

unicodeData :: BS.ByteString
unicodeData = $(embedFile "data/unicode/UnicodeData.txt")

joiningTypeData :: BS.ByteString
joiningTypeData = $(embedFile "data/unicode/DerivedJoiningType.txt")

scriptsData :: BS.ByteString
scriptsData = $(embedFile "data/unicode/Scripts.txt")

-- Parse data files once at module load time
{-# NOINLINE codePointStatusTable #-}
codePointStatusTable :: IntMap CodePointStatus
codePointStatusTable = deriveCodePointStatus (TE.decodeUtf8 unicodeData)

{-# NOINLINE bidiClassTable #-}
bidiClassTable :: IntMap BidiClass
bidiClassTable = unsafePerformIO $ parseBidiClasses (TE.decodeUtf8 bidiClassData)

{-# NOINLINE combiningMarksSet #-}
combiningMarksSet :: IntSet
combiningMarksSet = parseCombiningMarks (TE.decodeUtf8 unicodeData)

{-# NOINLINE viramasSet #-}
viramasSet :: IntSet
viramasSet = parseViramas (TE.decodeUtf8 joiningTypeData)

{-# NOINLINE scriptTable #-}
scriptTable :: IntMap Script
scriptTable = case parseScripts (TE.decodeUtf8 scriptsData) of
  Left err -> error $ "Failed to parse scripts: " ++ err
  Right m -> m

{-# NOINLINE contextRuleTable #-}
contextRuleTable :: IntMap ContextRule
contextRuleTable = IM.fromList $
  [ (0x200C, ZWNJRule)
  , (0x200D, ZWJRule)
  , (0x00B7, MiddleDotRule)
  , (0x0375, GreekKeraiaRule)
  , (0x05F3, HebrewGereshRule)
  , (0x05F4, HebrewGershayimRule)
  , (0x30FB, KatakanaMiddleDotRule)
  ] ++ [(cp, ArabicIndicDigitsRule) | cp <- [0x0660..0x0669]]
    ++ [(cp, ExtendedArabicIndicDigitsRule) | cp <- [0x06F0..0x06F9]]

-- Lookup functions
codePointStatus :: Char -> CodePointStatus
codePointStatus c = IM.findWithDefault PVALID (ord c) codePointStatusTable
{-# INLINE codePointStatus #-}

bidiClass :: Char -> BidiClass
bidiClass c = IM.findWithDefault L (ord c) bidiClassTable
{-# INLINE bidiClass #-}

isCombiningMark :: Char -> Bool
isCombiningMark c = IS.member (ord c) combiningMarksSet
{-# INLINE isCombiningMark #-}

isVirama :: Char -> Bool
isVirama c = IS.member (ord c) viramasSet
{-# INLINE isVirama #-}

scriptOf :: Char -> Maybe Script
scriptOf c = IM.lookup (ord c) scriptTable
{-# INLINE scriptOf #-}

lookupContextRule :: Char -> Maybe ContextRule
lookupContextRule c = IM.lookup (ord c) contextRuleTable
{-# INLINE lookupContextRule #-}

isNFCQuickCheck :: String -> Bool
isNFCQuickCheck = all checkChar
  where
    checkChar c = 
      let cp = ord c
      in cp < 0x0300 || (cp > 0x0370 && cp < 0x1AB0)
{-# INLINE isNFCQuickCheck #-}

-- Parsing functions
parseBidiClasses :: T.Text -> IO (IntMap BidiClass)
parseBidiClasses text = do
  let dataLines = filter (not . isComment) (T.lines text)
      entries = mapM parseBidiLine dataLines
  case entries of
    Left err -> error $ "Failed to parse bidi classes: " ++ err
    Right xs -> return $ IM.fromList (concat xs)

parseBidiLine :: T.Text -> Either String [(Int, BidiClass)]
parseBidiLine line = do
  let content = T.strip $ T.takeWhile (/= '#') line
  case T.words content of
    [] -> Right []
    [cpRange, ";", bidiClassStr] -> do
      (startCP, endCP) <- parseCodePointRange (T.strip cpRange)
      bidiClassVal <- parseBidiClass (T.strip bidiClassStr)
      return [(cp, bidiClassVal) | cp <- [startCP..endCP]]
    [cpRange, bidiClassStr] -> do  -- Handle without semicolon
      (startCP, endCP) <- parseCodePointRange (T.strip cpRange)
      bidiClassVal <- parseBidiClass (T.strip bidiClassStr)
      return [(cp, bidiClassVal) | cp <- [startCP..endCP]]
    _ -> Left $ "Invalid bidi line: " ++ T.unpack line

parseCodePointRange :: T.Text -> Either String (Int, Int)
parseCodePointRange t = case T.splitOn ".." t of
  [single] -> case TR.hexadecimal t of
    Right (cp, _) -> Right (cp, cp)
    Left err -> Left err
  [start, end] -> do
    (startCP, _) <- TR.hexadecimal start
    (endCP, _) <- TR.hexadecimal end
    return (startCP, endCP)
  _ -> Left $ "Invalid range: " ++ T.unpack t

parseBidiClass :: T.Text -> Either String BidiClass
parseBidiClass "L" = Right L
parseBidiClass "R" = Right R
parseBidiClass "AL" = Right AL
parseBidiClass "AN" = Right AN
parseBidiClass "EN" = Right EN
parseBidiClass "ES" = Right ES
parseBidiClass "ET" = Right ET
parseBidiClass "CS" = Right CS
parseBidiClass "NSM" = Right NSM
parseBidiClass "BN" = Right BN
parseBidiClass "B" = Right B
parseBidiClass "S" = Right S
parseBidiClass "WS" = Right WS
parseBidiClass "ON" = Right ON
parseBidiClass "LRE" = Right LRE
parseBidiClass "LRO" = Right LRO
parseBidiClass "RLE" = Right RLE
parseBidiClass "RLO" = Right RLO
parseBidiClass "PDF" = Right PDF
parseBidiClass "LRI" = Right LRI
parseBidiClass "RLI" = Right RLI
parseBidiClass "FSI" = Right FSI
parseBidiClass "PDI" = Right PDI
parseBidiClass t = Left $ "Unknown bidi class: " ++ T.unpack t

-- | Derive code point status according to RFC 5892
-- This implements the IDNA2008 derivation algorithm based on Unicode properties
deriveCodePointStatus :: T.Text -> IntMap CodePointStatus
deriveCodePointStatus text =
  let dataLines = filter (not . T.null) $ T.lines text
      entries = mapMaybe parseUnicodeDataLine dataLines
  in IM.fromList entries
  where
    mapMaybe f = foldr (\x acc -> case f x of Just y -> y : acc; Nothing -> acc) []

data UnicodeEntry = UnicodeEntry
  { ueCodePoint :: Int
  , ueName :: T.Text
  , ueCategory :: T.Text
  , ueCombiningClass :: Int
  }

parseUnicodeDataLine :: T.Text -> Maybe (Int, CodePointStatus)
parseUnicodeDataLine line =
  case T.splitOn ";" line of
    (cpHex:name:cat:ccc:_rest) -> do
      (cp, _) <- either (const Nothing) Just $ TR.hexadecimal (T.strip cpHex)
      let category = T.strip cat
          status = deriveStatus cp category
      Just (cp, status)
    _ -> Nothing

-- | RFC 5892 derivation rules
deriveStatus :: Int -> T.Text -> CodePointStatus
deriveStatus cp cat
  -- Explicit CONTEXTJ code points
  | cp == 0x200C = CONTEXTJ  -- Zero Width Non-Joiner
  | cp == 0x200D = CONTEXTJ  -- Zero Width Joiner
  -- Explicit CONTEXTO code points  
  | cp == 0x00B7 = CONTEXTO  -- Middle Dot
  | cp == 0x0375 = CONTEXTO  -- Greek Lower Numeral Sign (Keraia)
  | cp == 0x05F3 = CONTEXTO  -- Hebrew Punctuation Geresh
  | cp == 0x05F4 = CONTEXTO  -- Hebrew Punctuation Gershayim
  | cp == 0x30FB = CONTEXTO  -- Katakana Middle Dot
  | cp >= 0x0660 && cp <= 0x0669 = CONTEXTO  -- Arabic-Indic Digits
  | cp >= 0x06F0 && cp <= 0x06F9 = CONTEXTO  -- Extended Arabic-Indic Digits
  -- ASCII exceptions that are DISALLOWED (per RFC 5892)
  | cp `elem` [0x0020, 0x007F] = DISALLOWED  -- Space and DEL
  -- ASCII printable range (0x21-0x7E) is mostly PVALID
  | cp >= 0x0021 && cp <= 0x007E = PVALID
  -- Letters and marks are generally PVALID
  | cat `elem` ["Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Mc"] = PVALID
  -- Decimal numbers are PVALID
  | cat == "Nd" = PVALID
  -- Control characters and format characters are DISALLOWED
  | cat `elem` ["Cc", "Cf"] = DISALLOWED
  -- Unassigned
  | cat == "Cn" = UNASSIGNED
  -- Symbols, punctuation, separators, and other categories are DISALLOWED
  | cat `elem` ["Zs", "Zl", "Zp", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", 
                "Sm", "Sc", "Sk", "So", "Me", "No", "Nl"] = DISALLOWED
  -- Default to DISALLOWED for safety
  | otherwise = DISALLOWED

parseCombiningMarks :: T.Text -> IntSet
parseCombiningMarks text =
  let dataLines = filter (not . T.null) $ T.lines text
      marks = mapMaybe parseCombiningMarkLine dataLines
  in IS.fromList marks
  where
    mapMaybe f = foldr (\x acc -> case f x of Just y -> y : acc; Nothing -> acc) []

parseCombiningMarkLine :: T.Text -> Maybe Int
parseCombiningMarkLine line =
  case T.splitOn ";" line of
    (cpHex:_name:cat:ccc:_rest) ->
      if T.strip cat == "Mn" || T.strip cat == "Mc" || T.strip cat == "Me" ||
         T.strip ccc /= "0"
      then case TR.hexadecimal (T.strip cpHex) of
        Right (cp, _) -> Just cp
        Left _ -> Nothing
      else Nothing
    _ -> Nothing

parseViramas :: T.Text -> IntSet
parseViramas text =
  let dataLines = filter (not . isComment) (T.lines text)
      viramas = mapMaybe parseViramaLine dataLines
  in IS.fromList viramas
  where
    mapMaybe f = foldr (\x acc -> case f x of Just y -> y : acc; Nothing -> acc) []

parseViramaLine :: T.Text -> Maybe Int
parseViramaLine line =
  let content = T.strip $ T.takeWhile (/= '#') line
  in case T.splitOn ";" content of
    [cpRange, jType] | T.strip jType == "T" ->
      case parseCodePointRange (T.strip cpRange) of
        Right (cp, _) -> Just cp  -- Just take first of range
        Left _ -> Nothing
    _ -> Nothing

parseScripts :: T.Text -> Either String (IntMap Script)
parseScripts text = do
  let dataLines = filter (not . isComment) (T.lines text)
  entries <- mapM parseScriptLine dataLines
  return $ IM.fromList (concat entries)

parseScriptLine :: T.Text -> Either String [(Int, Script)]
parseScriptLine line =
  let content = T.strip $ T.takeWhile (/= '#') line
  in case T.splitOn ";" content of
    [] -> Right []
    [cpRange, scriptName] -> do
      (startCP, endCP) <- parseCodePointRange (T.strip cpRange)
      script <- parseScriptName (T.strip scriptName)
      return [(cp, script) | cp <- [startCP..endCP]]
    _ -> case T.words content of
      [] -> Right []
      [cpRange, scriptName] -> do
        (startCP, endCP) <- parseCodePointRange (T.strip cpRange)
        script <- parseScriptName (T.strip scriptName)
        return [(cp, script) | cp <- [startCP..endCP]]
      _ -> Left $ "Invalid script line: " ++ T.unpack line

parseScriptName :: T.Text -> Either String Script
parseScriptName "Arabic" = Right Arabic
parseScriptName "Hebrew" = Right Hebrew
parseScriptName "Greek" = Right Greek
parseScriptName "Hiragana" = Right Hiragana
parseScriptName "Katakana" = Right Katakana
parseScriptName "Hangul" = Right Hangul
parseScriptName "Latin" = Right Latin
parseScriptName "Cyrillic" = Right Cyrillic
parseScriptName "Devanagari" = Right Devanagari
parseScriptName "Thai" = Right Thai
parseScriptName _ = Right OtherScript

isComment :: T.Text -> Bool
isComment line = T.null (T.strip line) || "#" `T.isPrefixOf` T.strip line