module TestSuite.Test.Convolution.CUDNN (tests) where

import Data.Type.Nat
import Data.Vec.Lazy (Vec(..))
import Foreign
import Foreign.C
import System.IO.Unsafe (unsafePerformIO)
import Test.Tasty
import Test.Tasty.HUnit
import Test.Tasty.QuickCheck

import Test.Tensor (Tensor(..))
import Test.Tensor qualified as Tensor
import Test.Tensor.TestValue

import TestSuite.Test.Convolution.Examples3B1B
import TestSuite.Util.TestKernel

{-------------------------------------------------------------------------------
  Lists of tests
-------------------------------------------------------------------------------}

tests :: TestTree
tests = testGroup "TestSuite.Test.Convolution.CUDNN" [
      testGroup "Sanity" [
            testCase "bindingVersion" test_bindingVersion
          , testCase "libraryVersion" test_libraryVersion
        ]
    , testGroup "Examples" [
          testCase "weightedMovingAverage" example_weightedMovingAverage
        ]
    , testGroup "Properties" [
          testGroup "matchesModel" [
              testGroup "1d" [
                  testProperty "kernelSize2" $ prop_matchesModel_1d @Nat2
                , testProperty "kernelSize3" $ prop_matchesModel_1d @Nat3
                , testProperty "kernelSize4" $ prop_matchesModel_1d @Nat4
                ]
            , testProperty "4d" prop_matchesModel
            ]
        ]
        , testProperty "mode" prop_mode
    ]

{-------------------------------------------------------------------------------
  Sanity checks
-------------------------------------------------------------------------------}

-- | Confirm that basic FFI interaction works as expected
test_bindingVersion :: Assertion
test_bindingVersion =
    assertEqual "" 1 $
      c_test_cudnn_binding_version

-- | Confirm cuDNN version (we expect at least version 9.0)
test_libraryVersion :: Assertion
test_libraryVersion =
    if c_test_cudnn_library_version >= 90000
      then return ()
      else assertFailure "Expect cuDNN version 9.0 or higher"

{-------------------------------------------------------------------------------
  Examples
-------------------------------------------------------------------------------}

example_weightedMovingAverage :: Assertion
example_weightedMovingAverage =
    assertEqual "" (Tensor.dim1 $ movingWeightedAverageResult @TestValue) $
      convolveCUDNN_1d
        (Tensor.dim1 movingWeightedAverageKernel)
        (Tensor.padWith 0 2 $ Tensor.dim1 $ movingAverageInput @TestValue)

{-------------------------------------------------------------------------------
  Properties

  NOTE: cuDNN does not like it when the size of the image is smaller than
  the size of the kernel.
-------------------------------------------------------------------------------}

-- | Compare our implementation against cuDNN, 1D case
prop_matchesModel_1d :: forall w.
     SNatI w
  => TestKernel '[w] TestValue  -- ^ Kernel
  -> Tensor Nat1 TestValue      -- ^ Input
  -> Property
prop_matchesModel_1d (testKernel -> kernel) input =
    Tensor.sizeAtLeast (minWidth ::: VNil) input ==>
          convolveCUDNN_1d kernel input
      === Tensor.convolve kernel input
  where
    minWidth :: Int
    minWidth = fromIntegral $ snatToNatural (snat @w)

-- | Compare our implementation against cuDNN, general case
prop_matchesModel :: ConvolutionParams TestValue -> Property
prop_matchesModel params =
        convolveCUDNN c_mode_cross_correlation stride kernels input
    === convolve_cuDNN_style params
  where
    ConvolutionParams{stride, input, kernels} = params

prop_mode :: ConvolutionParams TestValue -> Property
prop_mode params =
        convolveCUDNN
          c_mode_cross_correlation
          stride
          kernels
          input
    === convolveCUDNN
          c_mode_convolution
          stride
          ( Tensor.foreach kernels $ \outputFeature ->
              Tensor.foreach outputFeature $ \inputFeature ->
                Tensor.rotate inputFeature
          )
          input
  where
    ConvolutionParams{stride, input, kernels} = params

{-------------------------------------------------------------------------------
  Model
-------------------------------------------------------------------------------}

-- | cuDNN-style convolutions, but using our implementation
convolve_cuDNN_style :: Real a => ConvolutionParams a -> Tensor Nat4 a
convolve_cuDNN_style params =
    Tensor.foreach input $ \channels -> Tensor [
        -- Both the input and the kernel have 3 channels, so the result must
        -- be a singleton "channel".
        case Tensor.convolveWithStride stride' inputFeatures channels of
          Tensor [result] -> result
          _otherwise -> error "unexpected result"
      | inputFeatures <- Tensor.getTensor kernels
      ]
  where
    ConvolutionParams{stride = (sv, sh), input, kernels} = params

    stride' :: Vec Nat3 Int
    stride' = 1 ::: sv ::: sh ::: VNil

-- | Convolution parameters
--
-- Although both the input and the output are 4D tensors, their structure is
-- different:
--
-- * The input is NCHW:
--   - N images
--   - each image has C channels
--   - height H and width W
--
-- * The output is KCRS:
--   - K "output features"
--   - C "input features"
--   - height R and width S
--
-- For every input image we compute K output images ("channels"); each output
-- image results from applying C 2D kernels to each channel, adding up the
-- results. The result is an N*K*H*W tensor.
data ConvolutionParams a = ConvolutionParams {
      stride  :: (Int, Int)
    , input   :: Tensor Nat4 a
    , kernels :: Tensor Nat4 a
    }
  deriving stock (Show)

instance (Arbitrary a, Num a, Eq a) => Arbitrary (ConvolutionParams a) where
  arbitrary = sized $ \n -> do
      numImages      <- choose (1, max 1 n)
      inputFeatures  <- choose (1, 3)
      outputFeatures <- choose (1, 3)
      kernelHeight   <- choose (1, 5)
      kernelWidth    <- choose (1, 5)
      inputHeight    <- choose (kernelHeight, max kernelHeight n)
      inputWidth     <- choose (kernelWidth,  max kernelWidth  n)

      let inputSize :: Tensor.Size Nat4
          inputSize = numImages
                  ::: inputFeatures
                  ::: inputHeight
                  ::: inputWidth
                  ::: VNil

      let kernelSize :: Tensor.Size Nat4
          kernelSize = outputFeatures
                   ::: inputFeatures
                   ::: kernelHeight
                   ::: kernelWidth
                   ::: VNil

      stride  <- (,) <$> choose (1, 5) <*> choose (1, 5)
      input   <- Tensor.arbitraryOfSize inputSize arbitrary
      kernels <- Tensor.arbitraryOfSize kernelSize arbitrary

      return ConvolutionParams {stride, input, kernels}

  -- Shrinking is a bit complicated, because we need to maintain consistency
  -- between the kernels and the input
  shrink params = concat [
        -- Shrink stride
        [ params{stride = (sv', sh')}
        | (sv', sh') <- shrink stride
        , sv' > 0
        , sh' > 0
        ]

        -- Shrink input size
      , [ params{input = input', kernels = kernels'}
        | axe <- Tensor.allAxes (Tensor.size input)
        , let input'   = Tensor.axeWith axe input
        , let kernels' = adjustKernels axe kernels

          -- Image should not be smaller than the kernel
        , let (_ ::: _ ::: ih ::: iw ::: VNil) = Tensor.size input'
        , let (_ ::: _ ::: kh ::: kw ::: VNil) = Tensor.size kernels'
        , ih >= kh
        , iw >= kw
        ]

        -- Shrink the kernel
      , [ params{input = input', kernels = kernels'}
        | axe <- Tensor.allAxes (Tensor.size kernels)
        , let kernels' = Tensor.axeWith axe kernels
        , let input'   = adjustInput axe input
        ]

        -- Shrink input elements
      , [ params{input = images'}
        | images' <- Tensor.shrinkElem (Just Tensor.zero) shrink input
        ]

        -- Shrink kernel element
      , [ params{kernels = outputFeatures'}
        | outputFeatures' <- Tensor.shrinkElem Nothing shrink kernels
        ]
      ]
    where
      ConvolutionParams{stride, input, kernels} = params

      -- Adjust each kernel after we axe some part of the input
      adjustKernels :: Tensor.Axe Nat4 -> Tensor Nat4 a -> Tensor Nat4 a
      adjustKernels (Tensor.AxeHere _) =
          -- We dropped some images; kernel is unaffected
          id
      adjustKernels axe@(Tensor.AxeNested (Tensor.AxeHere _)) =
          -- We dropped some input channels; also drop the corresponding
          -- input features from the kernel
          Tensor.axeWith axe
      adjustKernels _otherwise =
          -- We reduced image height or width; kernel is unaffected
          -- (though we must check that the image is large enough now)
          id

      -- Adjust the input after we axe some of the kernels
      adjustInput :: Tensor.Axe Nat4 -> Tensor Nat4 a -> Tensor Nat4 a
      adjustInput (Tensor.AxeHere _) =
          -- We dropped some output features; input is unaffected
          id
      adjustInput axe@(Tensor.AxeNested (Tensor.AxeHere _)) =
          -- We dropped some input features; drop the corresponding channels
          Tensor.axeWith axe
      adjustInput _otherwise =
          -- We shrunk the kernel size (height or width), input is unaffected
          id

{-------------------------------------------------------------------------------
  Compute convolution using cuDNN
-------------------------------------------------------------------------------}

convolveCUDNN_1d :: forall a.
     (Fractional a, Real a)
  => Tensor Nat1 a -> Tensor Nat1 a -> Tensor Nat1 a
convolveCUDNN_1d kernel input = extract1d $
    convolveCUDNN
      c_mode_cross_correlation
      (1, 1)
      (Tensor [Tensor [Tensor [kernel]]])
      (Tensor [Tensor [Tensor [input]]])
  where
    extract1d :: Tensor Nat4 a -> Tensor Nat1 a
    extract1d (Tensor [Tensor [Tensor [output]]]) = output
    extract1d _ = error "convolveCUDNN_1d: unexpected output"

convolveCUDNN ::
     (Fractional a, Real a)
  => CudnnConvolutionMode
  -> (Int, Int)    -- ^ vertical and horizontal stride
  -> Tensor Nat4 a -- ^ kernel
  -> Tensor Nat4 a -- ^ input
  -> Tensor Nat4 a
convolveCUDNN mode (sv, sh) kernels input = unsafePerformIO $
    Tensor.unsafeWithCArray (realToFrac <$> kernels) $ \kernelsPtr ->
    Tensor.unsafeWithCArray (realToFrac <$> input)   $ \inputPtr   ->
    alloca $ \outputHeightPtr ->
    alloca $ \outputWidthPtr  -> do
      outputPtr <-
        c_test_cudnn_convolve
          mode
          (fromIntegral sv)
          (fromIntegral sh)
          (fromIntegral k)
          (fromIntegral kh)
          (fromIntegral kw)
          kernelsPtr
          (fromIntegral n)
          (fromIntegral c)
          (fromIntegral ih)
          (fromIntegral iw)
          inputPtr
          outputHeightPtr
          outputWidthPtr
      oh <- fromIntegral <$> peek outputHeightPtr
      ow <- fromIntegral <$> peek outputWidthPtr
      outputFPtr <- newForeignPtr finalizerFree outputPtr
      let outputSize = n ::: k ::: oh ::: ow ::: VNil
      return $ realToFrac <$> Tensor.unsafeFromCArray outputSize outputFPtr
  where
    n ::: c ::: ih ::: iw ::: VNil = Tensor.size input
    k ::: _ ::: kh ::: kw ::: VNil = Tensor.size kernels

{-------------------------------------------------------------------------------
  FFI imports
-------------------------------------------------------------------------------}

type CudnnConvolutionMode = CInt

foreign import capi unsafe "test-cudnn.h test_cudnn_binding_version"
  c_test_cudnn_binding_version :: Int

foreign import capi unsafe "test-cudnn.h test_cudnn_library_version"
  c_test_cudnn_library_version :: Int

foreign import capi unsafe "cudnn.h value CUDNN_CONVOLUTION"
  c_mode_convolution :: CudnnConvolutionMode

foreign import capi unsafe "cudnn.h value CUDNN_CROSS_CORRELATION"
  c_mode_cross_correlation :: CudnnConvolutionMode

foreign import capi unsafe "test-cudnn.h test_cudnn_convolve"
  c_test_cudnn_convolve ::
       CudnnConvolutionMode
    -> CInt       -- ^ vertical_stride
    -> CInt       -- ^ horizontal_stride
    -> CInt       -- ^ num_kernels
    -> CInt       -- ^ kernel_height
    -> CInt       -- ^ kernel_width
    -> Ptr Float  -- ^ kernel
    -> CInt       -- ^ num_images
    -> CInt       -- ^ input_channels
    -> CInt       -- ^ input_height
    -> CInt       -- ^ input_width
    -> Ptr Float  -- ^ input
    -> Ptr CInt   -- ^ output_height
    -> Ptr CInt   -- ^ output_width
    -> IO (Ptr Float)