{-# LANGUAGE DataKinds #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE GADTs #-}

{-|
Module: WGSL.API
Description: High-Level API for GPU Computing (The "Accelerate" Layer)

This module provides a high-level, easy-to-use API for GPU computing,
similar to Haskell Accelerate. It automatically handles:
- Shader generation
- Buffer allocation
- Kernel execution
- Result retrieval

Example:
@
  import WGSL.API

  -- Double every element on the GPU
  result <- gpuMap (*2.0) inputVector

  -- Square root of every element
  sqrts <- gpuMap sqrt inputVector
@

This layer sits on top of the low-level DSL, providing:
- Zero boilerplate for simple operations
- Automatic buffer management
- Type-safe transformations
- Compiler-generated optimized shaders

For advanced use cases (shared memory, subgroups, custom kernels),
use the low-level WGSL.DSL API directly.
-}

module WGSL.API
  ( -- * High-Level Operations
    gpuMap
  , gpuMap2
  , gpuZipWith
  , gpuFold

    -- * Types
  , GPUVector
  , toGPU
  , fromGPU'

    -- * Re-exports for convenience
  , Context
  , createContext
  , destroyContext
  ) where

import Prelude
import qualified Prelude as P

import Graphics.WebGPU.Dawn hiding (toGPU)
import Graphics.WebGPU.Dawn.Tensor hiding (toGPU)
import Graphics.WebGPU.Dawn.Types (AnyTensor(..))
import qualified Data.Vector.Storable as V
import Control.Exception (bracket)
import Data.Proxy

import WGSL.AST
import WGSL.DSL hiding ((<), (>), (<=), (>=), (==), (/=))
import qualified WGSL.DSL as DSL
import WGSL.Execute hiding (toGPU)

-- | GPU-resident vector (opaque handle)
-- This wraps a Tensor but provides a more user-friendly interface
data GPUVector a = GPUVector
  { gpuContext :: Context
  , gpuTensor :: Tensor a
  , gpuSize :: Int
  }

-- | Transfer a vector to GPU memory
toGPU :: forall a. Context -> V.Vector Float -> IO (GPUVector 'F32)
toGPU ctx vec = do
  let size = V.length vec
  tensor <- createTensorWithDataTyped ctx (Shape [size]) vec
  return $ GPUVector ctx tensor size

-- | Transfer a vector from GPU memory to CPU
fromGPU' :: forall a. GPUVector 'F32 -> IO (V.Vector Float)
fromGPU' (GPUVector ctx tensor size) = fromGPU ctx tensor size

-- ============================================================================
-- High-Level Operations
-- ============================================================================

-- | Map a function over a GPU vector
-- This is the primary high-level operation - zero boilerplate!
--
-- Example:
-- @
--   ctx <- createContext
--   input <- toGPU ctx (V.fromList [1,2,3,4])
--   output <- gpuMap (*2.0) input
--   result <- fromGPU' output
--   -- result = [2,4,6,8]
-- @
gpuMap :: (Exp F32 -> Exp F32) -> GPUVector 'F32 -> IO (GPUVector 'F32)
gpuMap f (GPUVector ctx inputTensor size) = do
  -- Allocate output buffer
  outputTensor <- createTensorTyped ctx (Shape [size]) :: IO (Tensor 'F32)

  -- Generate shader automatically
  let shader = buildGpuMapShader size f

  -- Execute with auto-binding
  executeShaderNamed ctx shader
    [ ("input", AnyTensor inputTensor)
    , ("output", AnyTensor outputTensor)
    ]
    (WorkgroupSize ((size + 255) `div` 256) 1 1)  -- Auto-compute workgroups

  return $ GPUVector ctx outputTensor size

-- | Map a binary function over two GPU vectors (element-wise)
--
-- Example:
-- @
--   result <- gpuMap2 (+) vecA vecB  -- Element-wise addition
-- @
gpuMap2 :: (Exp F32 -> Exp F32 -> Exp F32)
        -> GPUVector 'F32
        -> GPUVector 'F32
        -> IO (GPUVector 'F32)
gpuMap2 f (GPUVector ctx inputA sizeA) (GPUVector _ inputB sizeB) = do
  if sizeA P./= sizeB
    then error "gpuMap2: input vectors must have the same size"
    else do
      -- Allocate output buffer
      outputTensor <- createTensorTyped ctx (Shape [sizeA]) :: IO (Tensor 'F32)

      -- Generate shader automatically
      let shader = buildGpuMap2Shader sizeA f

      -- Execute with auto-binding
      executeShaderNamed ctx shader
        [ ("inputA", AnyTensor inputA)
        , ("inputB", AnyTensor inputB)
        , ("output", AnyTensor outputTensor)
        ]
        (WorkgroupSize ((sizeA + 255) `div` 256) 1 1)

      return $ GPUVector ctx outputTensor sizeA

-- | Alias for gpuMap2 (more intuitive naming)
gpuZipWith :: (Exp F32 -> Exp F32 -> Exp F32)
           -> GPUVector 'F32
           -> GPUVector 'F32
           -> IO (GPUVector 'F32)
gpuZipWith = gpuMap2

-- | Parallel reduction (fold) on GPU
-- Currently implements a simple parallel reduction
--
-- Example:
-- @
--   sum <- gpuFold (+) 0 vector  -- Parallel sum
-- @
gpuFold :: (Exp F32 -> Exp F32 -> Exp F32)  -- Binary operator
        -> Float                             -- Initial value
        -> GPUVector 'F32                    -- Input vector
        -> IO Float
gpuFold op initVal (GPUVector ctx inputTensor size) = do
  -- For now, implement a simple two-stage reduction:
  -- Stage 1: Each workgroup reduces locally to shared memory
  -- Stage 2: Reduce the workgroup results on CPU

  -- Simple implementation: read to CPU and fold
  -- TODO: Implement proper GPU reduction with shared memory
  vec <- fromGPU ctx inputTensor size
  return $ V.foldl' (\acc x -> evalExpApprox (op (LitF32 acc) (LitF32 x))) initVal vec
  where
    -- Approximate evaluation for simple expressions
    evalExpApprox :: Exp F32 -> Float
    evalExpApprox (LitF32 x) = x
    evalExpApprox (Add a b) = evalExpApprox a + evalExpApprox b
    evalExpApprox (Mul a b) = evalExpApprox a * evalExpApprox b
    evalExpApprox (Sub a b) = evalExpApprox a - evalExpApprox b
    evalExpApprox (Div a b) = evalExpApprox a / evalExpApprox b
    evalExpApprox _ = error "gpuFold: complex expression not supported yet"

-- ============================================================================
-- Shader Generation (Internal)
-- ============================================================================

-- | Automatically generate a map shader from a function
buildGpuMapShader :: Int -> (Exp F32 -> Exp F32) -> ShaderModule
buildGpuMapShader size f = buildShaderWithAutoBinding (256, 1, 1) $ do
  -- Declare buffers
  input <- declareInputBuffer "input" (TArray size TF32)
  output <- declareOutputBuffer "output" (TArray size TF32)

  -- Get global thread ID
  gid <- globalId
  let idx = i32 (vecX gid)

  -- Bounds check
  if_ (idx DSL.< litI32 size)
    (do
      -- Load input
      value <- readBuffer input idx

      -- Apply user function
      let result = f value

      -- Store result
      writeBuffer output idx result
    )
    (return ())

-- | Automatically generate a map2 shader from a binary function
buildGpuMap2Shader :: Int -> (Exp F32 -> Exp F32 -> Exp F32) -> ShaderModule
buildGpuMap2Shader size f = buildShaderWithAutoBinding (256, 1, 1) $ do
  -- Declare buffers
  inputA <- declareInputBuffer "inputA" (TArray size TF32)
  inputB <- declareInputBuffer "inputB" (TArray size TF32)
  output <- declareOutputBuffer "output" (TArray size TF32)

  -- Get global thread ID
  gid <- globalId
  let idx = i32 (vecX gid)

  -- Bounds check
  if_ (idx DSL.< litI32 size)
    (do
      -- Load inputs
      valueA <- readBuffer inputA idx
      valueB <- readBuffer inputB idx

      -- Apply user function
      let result = f valueA valueB

      -- Store result
      writeBuffer output idx result
    )
    (return ())