{-# LANGUAGE NumericUnderscores #-}
{-# LANGUAGE DataKinds #-}

{-|
Async Pipeline Demo - Demonstrating CPU/GPU Overlap

This demo shows how the async pipeline enables latency hiding by
overlapping CPU command encoding with GPU execution.

Usage: cabal run async-pipeline-demo

Expected output:
- Performance comparison: sequential vs pipelined execution
- Speedup should be ~1.5-2x for the pipelined version
-}
module Main where

import Prelude
  ( Int, Double, Float, IO
  , ($), (++), (+), (-), (*), (/), (>>=), (>>)
  , show, fromIntegral, putStrLn, return
  )
import qualified Prelude as P

import Graphics.WebGPU.Dawn
import Graphics.WebGPU.Dawn.Types (Shape(..), AnyTensor(..))
import WGSL.DSL
import WGSL.Execute (executeShaderNamed)
import WGSL.Async.Pipeline

import System.Clock (Clock(..), getTime, diffTimeSpec, toNanoSecs)
import Text.Printf (printf)
import qualified Data.Vector.Storable as V
import Control.Monad (forM_)

-- | Number of iterations for the benchmark
numIterations :: Int
numIterations = 20

-- | Vector size for testing
vectorSize :: Int
vectorSize = 1_000_000

-- | Simple vector-addition shader for testing.
--
-- Note: this is a simplified shader just for exercising the async
-- pipeline. The computation itself is a placeholder - the focus is on
-- pipeline performance, not shader complexity.
vectorAddShader :: ShaderM ()
vectorAddShader = do
  _vecA <- declareInputBuffer "A" (TArray vectorSize TF32)
  _vecB <- declareInputBuffer "B" (TArray vectorSize TF32)
  _vecC <- declareOutputBuffer "C" (TArray vectorSize TF32)
  gid <- globalId
  let _idx = vecX gid
  -- Placeholder: emit a trivial computation. This measures pipeline
  -- overhead, not shader complexity.
  result <- var TF32 (litF32 1.0)
  assign result (litF32 2.0 + litF32 3.0)
  return ()

main :: IO ()
main = do
  putStrLn "=== Async Pipeline Demo ==="
  putStrLn ""
  putStrLn "[Config] Comparing sequential vs pipelined execution"
  printf "[Config] Iterations: %d, Vector size: %d\n" numIterations vectorSize
  putStrLn ""

  -- Create GPU context
  ctx <- createContext

  -- Create test data
  let vecA = V.generate vectorSize (\i -> fromIntegral i :: Float)
      vecB = V.generate vectorSize (\i -> fromIntegral (i + 1) :: Float)

  -- Create input tensors (shared across both tests)
  tensorA <- createTensorWithData ctx (Shape [vectorSize]) vecA
  tensorB <- createTensorWithData ctx (Shape [vectorSize]) vecB

  -- Build the shader once; dispatch enough 256-thread workgroups to
  -- cover the whole vector (rounding up)
  let shaderModule = buildShaderWithAutoBinding (256, 1, 1) vectorAddShader
      workgroupSize = WorkgroupSize ((vectorSize + 255) `P.div` 256) 1 1

  putStrLn "[Phase 1] Sequential execution (baseline)"
  putStrLn "--------------------------------------------"
  timeSeq <- benchmarkSequential ctx shaderModule tensorA tensorB workgroupSize
  putStrLn ""

  putStrLn "[Phase 2] Pipelined execution (async)"
  putStrLn "--------------------------------------------"
  timeAsync <- benchmarkAsync ctx shaderModule tensorA tensorB workgroupSize
  putStrLn ""

  putStrLn "[Comparison]"
  putStrLn "--------------------------------------------"
  let speedup = timeSeq / timeAsync
      reduction = (timeSeq - timeAsync) / timeSeq * 100.0
  printf "  Sequential: %.2f ms\n" timeSeq
  printf "  Pipelined:  %.2f ms\n" timeAsync
  printf "  Speedup:    %.2fx faster\n" speedup
  printf "  Reduction:  %.1f%% less time\n" reduction
  putStrLn ""

  if speedup P.> 1.2
    then putStrLn "✓ Pipeline working! Significant speedup achieved."
    else putStrLn "⚠ Warning: low speedup. Pipeline may not be overlapping effectively."

  -- Cleanup
  destroyTensor tensorA
  destroyTensor tensorB
  destroyContext ctx

  putStrLn ""
  putStrLn "Demo complete!"
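{- An illustrative timeline of what the two phases measure (a rough
   sketch only; it assumes CPU encode and GPU execution costs are of
   the same order, and actual proportions depend on the workload):

     Sequential:  CPU  [encode]        [encode]        [encode]
                  GPU          [exec ]         [exec ]         [exec ]

     Pipelined:   CPU  [encode][encode][encode]
                  GPU          [exec ][exec ][exec ]

   In the pipelined case the CPU encodes command N+1 while the GPU is
   still executing command N, which is where the expected ~1.5-2x
   speedup comes from.
-}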
-- | Benchmark sequential execution
benchmarkSequential :: Context -> ShaderModule -> Tensor 'F32 -> Tensor 'F32
                    -> WorkgroupSize -> IO Double
benchmarkSequential ctx shader tensorA tensorB wgSize = do
  -- Warm-up
  tensorC <- createTensor ctx (Shape [vectorSize]) F32
  executeShaderNamed ctx shader
    [ ("A", AnyTensor tensorA)
    , ("B", AnyTensor tensorB)
    , ("C", AnyTensor tensorC)
    ] wgSize
  destroyTensor tensorC

  -- Benchmark
  start <- getTime Monotonic
  forM_ [1 .. numIterations] $ \_ -> do
    tensorC' <- createTensor ctx (Shape [vectorSize]) F32
    executeShaderNamed ctx shader
      [ ("A", AnyTensor tensorA)
      , ("B", AnyTensor tensorB)
      , ("C", AnyTensor tensorC')
      ] wgSize
    destroyTensor tensorC'
  end <- getTime Monotonic

  let diff = diffTimeSpec end start
      timeMs = fromIntegral (toNanoSecs diff) / 1_000_000.0
  printf "  Total time:    %.2f ms\n" timeMs
  printf "  Per iteration: %.2f ms\n" (timeMs / fromIntegral numIterations)
  return timeMs

-- | Benchmark async pipelined execution
benchmarkAsync :: Context -> ShaderModule -> Tensor 'F32 -> Tensor 'F32
               -> WorkgroupSize -> IO Double
benchmarkAsync ctx shader tensorA tensorB wgSize = do
  -- Create async pipeline
  pipeline <- createAsyncPipeline ctx

  -- Warm-up
  submitCommand pipeline $ do
    tensorC <- createTensor ctx (Shape [vectorSize]) F32
    executeShaderNamed ctx shader
      [ ("A", AnyTensor tensorA)
      , ("B", AnyTensor tensorB)
      , ("C", AnyTensor tensorC)
      ] wgSize
    destroyTensor tensorC
  awaitPipeline pipeline

  -- Benchmark
  start <- getTime Monotonic
  forM_ [1 .. numIterations] $ \_ ->
    -- Submit command (non-blocking) - the CPU continues immediately
    submitCommand pipeline $ do
      tensorC <- createTensor ctx (Shape [vectorSize]) F32
      executeShaderNamed ctx shader
        [ ("A", AnyTensor tensorA)
        , ("B", AnyTensor tensorB)
        , ("C", AnyTensor tensorC)
        ] wgSize
      destroyTensor tensorC
  -- Wait for all commands to complete
  awaitPipeline pipeline
  end <- getTime Monotonic

  -- Get stats
  stats <- getPipelineStats pipeline
  printf "  Commands submitted: %d\n" (statSubmitted stats)
  printf "  Commands executed:  %d\n" (statExecuted stats)
  printf "  Errors:             %d\n" (statErrors stats)

  let diff = diffTimeSpec end start
      timeMs = fromIntegral (toNanoSecs diff) / 1_000_000.0
  printf "  Total time:    %.2f ms\n" timeMs
  printf "  Per iteration: %.2f ms\n" (timeMs / fromIntegral numIterations)

  -- Cleanup pipeline
  destroyAsyncPipeline pipeline
  return timeMs
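-- | Optional helper sketch: both benchmarks above repeat the same
-- wall-clock timing pattern, which could be factored out as below.
-- This function is our own illustration ('timeActionMs' is not part of
-- any library API) and is not used by the demo itself.
timeActionMs :: IO () -> IO Double
timeActionMs action = do
  start <- getTime Monotonic
  action                -- run the measured work to completion
  end <- getTime Monotonic
  -- Convert the elapsed TimeSpec to milliseconds
  return (fromIntegral (toNanoSecs (diffTimeSpec end start)) / 1_000_000.0)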