{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE FlexibleContexts #-}
module LLVM.Extra.Vector (
   Simple (shuffleMatch, extract), C (insert),
   Element, Size,
   Canonical, Construct,

   size, sizeInTuple,
   replicate, iterate, assemble,

   shuffle,
   rotateUp, rotateDown, reverse,
   shiftUp, shiftDown,
   shiftUpMultiZero, shiftDownMultiZero,

   shuffleMatchTraversable,
   shuffleMatchAccess,
   shuffleMatchPlain1,
   shuffleMatchPlain2,

   insertTraversable,
   extractTraversable,
   extractAll,

   Constant, constant,

   insertChunk, modify,
   map, mapChunks, zipChunksWith,
   chop, concat, select,
   signedFraction,
   cumulate1, umul32to64,
   Arithmetic
      (sum, sumToPair, sumInterleavedToPair,
       cumulate, dotProduct, mul),
   Real
      (min, max, abs, signum,
       truncate, floor, fraction),
   ) where

import qualified LLVM.Extra.Extension.X86Auto as X86A
import qualified LLVM.Extra.ExtensionCheck.X86 as X86C
import qualified LLVM.Extra.Extension.X86 as X86
import qualified LLVM.Extra.Extension as Ext

import qualified LLVM.Extra.Class as Class
import qualified LLVM.Extra.Monad as M
import qualified LLVM.Extra.ArithmeticPrivate as A

import qualified LLVM.Core as LLVM
import LLVM.Util.Loop (Phi(phis, addPhis), )
import LLVM.Core
   (Value, ConstValue, valueOf, value, constOf, undef,
    Vector, insertelement, extractelement, constVector,
    IsConst, IsArithmetic, IsFloating,
    IsPrimitive,
    CodeGenFunction, )

import Types.Data.Num (D4, (:+:), )
import qualified Types.Data.Num as TypeNum
import Control.Monad.HT ((<=<), )
import Control.Monad (liftM2, liftM3, foldM, )
import Data.Tuple.HT (uncurry3, )
import qualified Data.List.HT as ListHT
import qualified Data.List as List

import Control.Applicative (liftA2, )
import qualified Control.Applicative as App
import qualified Data.Traversable as Trav
import qualified Data.Foldable as Fold

-- import qualified Data.Bits as Bit
import Data.Int  (Int8, Int16, Int32, Int64, )
import Data.Word (Word8, Word16, Word32, Word64, )

import Prelude hiding
          (Real, truncate, floor, round,
           map, zipWith, iterate, replicate, reverse, concat, sum, )


-- * target independent functions

{- |
Allow to work on records of vectors as if they are vectors of records.
This is a reasonable approach for records of different element types
since processor vectors can only be built from elements of the same type.
But also, say, for chunked stereo signal this makes sense.
In this case we would work on @Stereo (Value a)@.

Formerly we used a two-way dependency Vector <-> (Element, Size).
Now we have only the dependency Vector -> (Element, Size).
This means that we need some more type annotations
as in umul32to64/assemble,
on the other hand we can allow multiple vector types
with respect to the same element type.
E.g. we can provide a vector type with pair elements
where the pair elements are interleaved in the vector.
-}
class (Simple v) => C v where
   insert :: Value Word32 -> Element v -> v -> CodeGenFunction r v

class
   (TypeNum.PositiveT (Size v), Phi v, Class.Undefined v) =>
      Simple v where

   type Element v :: *
   type Size v :: *

   shuffleMatch ::
      ConstValue (Vector (Size v) Word32) -> v -> CodeGenFunction r v

   extract :: Value Word32 -> v -> CodeGenFunction r (Element v)


instance
   (TypeNum.PositiveT n, LLVM.IsPrimitive a) =>
      Simple (Value (Vector n a)) where

   type Element (Value (Vector n a)) = Value a
   type Size (Value (Vector n a)) = n

   shuffleMatch is v = shuffleMatchPlain1 v is
   extract k v = extractelement v k

instance
   (TypeNum.PositiveT n, LLVM.IsPrimitive a) =>
      C (Value (Vector n a)) where

   insert k a v = insertelement v a k


instance
   (Simple v0, Simple v1, Size v0 ~ Size v1) =>
      Simple (v0, v1) where

   type Element (v0, v1) = (Element v0, Element v1)
   type Size (v0, v1) = Size v0

   shuffleMatch is (v0,v1) =
      liftM2 (,)
         (shuffleMatch is v0)
         (shuffleMatch is v1)

   extract k (v0,v1) =
      liftM2 (,)
         (extract k v0)
         (extract k v1)

instance
   (C v0, C v1, Size v0 ~ Size v1) =>
      C (v0, v1) where

   insert k (a0,a1) (v0,v1) =
      liftM2 (,)
         (insert k a0 v0)
         (insert k a1 v1)


instance
   (Simple v0, Simple v1, Simple v2, Size v0 ~ Size v1, Size v1 ~ Size v2) =>
      Simple (v0, v1, v2) where

   type Element (v0, v1, v2) = (Element v0, Element v1, Element v2)
   type Size (v0, v1, v2) = Size v0

   shuffleMatch is (v0,v1,v2) =
      liftM3 (,,)
         (shuffleMatch is v0)
         (shuffleMatch is v1)
         (shuffleMatch is v2)

   extract k (v0,v1,v2) =
      liftM3 (,,)
         (extract k v0)
         (extract k v1)
         (extract k v2)

instance
   (C v0, C v1, C v2, Size v0 ~ Size v1, Size v1 ~ Size v2) =>
      C (v0, v1, v2) where

   insert k (a0,a1,a2) (v0,v1,v2) =
      liftM3 (,,)
         (insert k a0 v0)
         (insert k a1 v1)
         (insert k a2 v2)


newtype Constant n a = Constant a

constant :: (TypeNum.PositiveT n) => a -> Constant n a
constant = Constant

instance Functor (Constant n) where
   {-# INLINE fmap #-}
   fmap f (Constant a) = Constant (f a)

instance App.Applicative (Constant n) where
   {-# INLINE pure #-}
   pure = Constant
   {-# INLINE (<*>) #-}
   Constant f <*> Constant a = Constant (f a)

instance Fold.Foldable (Constant n) where
   {-# INLINE foldMap #-}
   foldMap = Trav.foldMapDefault

instance Trav.Traversable (Constant n) where
   {-# INLINE sequenceA #-}
   sequenceA (Constant a) = fmap Constant a

instance (Phi a) => Phi (Constant n a) where
   phis = Class.phisTraversable
   addPhis = Class.addPhisFoldable

instance (Class.Undefined a) => Class.Undefined (Constant n a) where
   undefTuple = Class.undefTuplePointed

instance (TypeNum.PositiveT n, Phi a, Class.Undefined a) => Simple (Constant n a) where

   type Element (Constant n a) = a
   type Size (Constant n a) = n

   shuffleMatch _ = return
   extract _ (Constant a) = return a


class (n ~ Size (Construct n a), a ~ Element (Construct n a),
       C (Construct n a)) =>
         Canonical n a where
   type Construct n a :: *

instance
   (TypeNum.PositiveT n, LLVM.IsPrimitive a) =>
      Canonical n (Value a) where
   type Construct n (Value a) = Value (Vector n a)

instance (Canonical n a0, Canonical n a1) => Canonical n (a0, a1) where
   type Construct n (a0, a1) = (Construct n a0, Construct n a1)

instance (Canonical n a0, Canonical n a1, Canonical n a2) => Canonical n (a0, a1, a2) where
   type Construct n (a0, a1, a2) = (Construct n a0, Construct n a1, Construct n a2)


size ::
   (TypeNum.PositiveT n) =>
   Value (Vector n a) -> Int
size =
   let sz :: (TypeNum.PositiveT n) => n -> Value (Vector n a) -> Int
       sz n _ = TypeNum.fromIntegerT n
   in  sz undefined

{- |
Manually assemble a vector of equal values.
Better use ScalarOrVector.replicate.
-}
replicate ::
   (C v) =>
   Element v -> CodeGenFunction r v
replicate = replicateCore undefined

replicateCore ::
   (C v) =>
   Size v -> Element v -> CodeGenFunction r v
replicateCore n =
   assemble . List.replicate (TypeNum.fromIntegerT n)

{- |
construct a vector out of single elements

You must assert that the length of the list matches the vector size.

This can be considered the inverse of 'extractAll'.
-}
assemble ::
   (C v) =>
   [Element v] -> CodeGenFunction r v
assemble =
   foldM (\v (k,x) -> insert (valueOf k) x v) Class.undefTuple .
   List.zip [0..]
{- sends GHC into an infinite loop
   foldM (\(k,x) -> insert (valueOf k) x) Class.undefTuple .
   List.zip [0..]
-}

insertChunk ::
   (C c, C v, Element c ~ Element v) =>
   Int -> c ->
   v -> CodeGenFunction r v
insertChunk k x =
   M.chain $
   List.zipWith
      (\i j -> \v ->
          extract (valueOf i) x >>= \e ->
          insert (valueOf j) e v)
      (take (sizeInTuple x) [0..])
      [fromIntegral k ..]

iterate ::
   (C v) =>
   (Element v -> CodeGenFunction r (Element v)) ->
   Element v -> CodeGenFunction r v
iterate f x =
   fmap snd $
   iterateCore f x Class.undefTuple

iterateCore ::
   (C v) =>
   (Element v -> CodeGenFunction r (Element v)) ->
   Element v -> v ->
   CodeGenFunction r (Element v, v)
iterateCore f x0 v0 =
   foldM
      (\(x,v) k ->
         liftM2 (,) (f x)
            (insert (valueOf k) x v))
      (x0,v0)
      (take (sizeInTuple v0) [0..])

{- |
Manually implement vector shuffling using insertelement and extractelement.
In contrast to LLVM's built-in instruction it supports distinct vector sizes,
but it allows only one input vector
(or a tuple of vectors, but we cannot shuffle between them).
For more complex shuffling we recommend 'extractAll' and 'assemble'.
-}
shuffle ::
   (C v, C w, Element v ~ Element w) =>
   v ->
   ConstValue (Vector (Size w) Word32) ->
   CodeGenFunction r w
shuffle x i =
   assemble =<<
   mapM
      (flip extract x <=< extractelement (value i) . valueOf)
      (take (size (value i)) [0..])


sizeInTuple :: Simple v => v -> Int
sizeInTuple =
   let sz :: Simple v => Size v -> v -> Int
       sz n _ = TypeNum.fromIntegerT n
   in  sz undefined

{- |
Rotate one element towards the higher elements.

I don't want to call it rotateLeft or rotateRight,
because there is no prefered layout for the vector elements.
In Intel's instruction manual vector
elements are indexed like the bits,
that is from right to left.
However, when working with Haskell list and enumeration syntax,
the start index is left.
-}
rotateUp ::
   (Simple v) =>
   v -> CodeGenFunction r v
rotateUp x =
   shuffleMatch
      (constVector $ List.map constOf $
       (fromIntegral (sizeInTuple x) - 1) : [0..]) x

rotateDown ::
   (Simple v) =>
   v -> CodeGenFunction r v
rotateDown x =
   shuffleMatch
      (constVector $ List.map constOf $
       List.take (sizeInTuple x - 1) [1..] ++ [0]) x

reverse ::
   (Simple v) =>
   v -> CodeGenFunction r v
reverse x =
   shuffleMatch
      (constVector $ List.map constOf $
       List.reverse $
       List.take (sizeInTuple x) [0..]) x

shiftUp ::
   (C v) =>
   Element v -> v -> CodeGenFunction r (Element v, v)
shiftUp x0 x = do
   y <-
      shuffleMatch
         (constVector $ undef : List.map constOf [0..]) x
   liftM2 (,)
      (extract (LLVM.valueOf (fromIntegral (sizeInTuple x) - 1)) x)
      (insert (value LLVM.zero) x0 y)

shiftDown ::
   (C v) =>
   Element v -> v -> CodeGenFunction r (Element v, v)
shiftDown x0 x = do
   y <-
      shuffleMatch
         (constVector $
          List.map constOf (List.take (sizeInTuple x - 1) [1..]) ++ [undef]) x
   liftM2 (,)
      (extract (value LLVM.zero) x)
      (insert (LLVM.valueOf (fromIntegral (sizeInTuple x) - 1)) x0 y)

shiftUpMultiZero ::
   (C v, Class.Zero (Element v)) =>
   Int -> v -> LLVM.CodeGenFunction r v
shiftUpMultiZero n v =
   assemble . take (sizeInTuple v) .
   (List.replicate n Class.zeroTuple ++) =<< extractAll v

shiftDownMultiZero ::
   (C v, Class.Zero (Element v)) =>
   Int -> v -> LLVM.CodeGenFunction r v
shiftDownMultiZero n v =
   assemble . take (sizeInTuple v) .
   (++ List.repeat Class.zeroTuple) . List.drop n
      =<< extractAll v


shuffleMatchTraversable ::
   (Simple v, Trav.Traversable f) =>
   ConstValue (Vector (Size v) Word32) -> f v -> CodeGenFunction r (f v)
shuffleMatchTraversable is v =
   Trav.mapM (shuffleMatch is) v

{- |
Implement the 'shuffleMatch' method using the methods of the 'C' class.
-}
shuffleMatchAccess ::
   (C v) =>
   ConstValue (Vector (Size v) Word32) -> v -> CodeGenFunction r v
shuffleMatchAccess is v =
   assemble =<<
   mapM
      (flip extract v <=<
       flip extract (value is) . valueOf)
      (take (size (value is)) [0..])


shuffleMatchPlain1 ::
   (TypeNum.PositiveT n, IsPrimitive a) =>
   Value (Vector n a) ->
   ConstValue (Vector n Word32) ->
   CodeGenFunction r (Value (Vector n a))
shuffleMatchPlain1 x =
   shuffleMatchPlain2 x (value undef)

shuffleMatchPlain2 ::
   (TypeNum.PositiveT n, IsPrimitive a) =>
   Value (Vector n a) ->
   Value (Vector n a) ->
   ConstValue (Vector n Word32) ->
   CodeGenFunction r (Value (Vector n a))
shuffleMatchPlain2 =
   LLVM.shufflevector


insertTraversable ::
   (C v, Trav.Traversable f, App.Applicative f) =>
   Value Word32 -> f (Element v) -> f v -> CodeGenFunction r (f v)
insertTraversable n a v =
   Trav.sequence (liftA2 (insert n) a v)

extractTraversable ::
   (Simple v, Trav.Traversable f) =>
   Value Word32 -> f v -> CodeGenFunction r (f (Element v))
extractTraversable n v =
   Trav.mapM (extract n) v

{- |
provide the elements of a vector as a list of individual virtual registers

This can be considered the inverse of 'assemble'.
-}
extractAll ::
   (Simple v) =>
   v -> LLVM.CodeGenFunction r [Element v]
extractAll x =
   mapM
      (flip extract x . LLVM.valueOf)
      (take (sizeInTuple x) [0..])


modify ::
   (C v) =>
   Value Word32 ->
   (Element v -> CodeGenFunction r (Element v)) ->
   (v -> CodeGenFunction r v)
modify k f v =
   flip (insert k) v =<< f =<< extract k v

{- |
Like LLVM.Util.Loop.mapVector but the loop is unrolled,
which is faster since it can be packed by the code generator.
-}
map, _mapByFold ::
   (C v, C w, Size v ~ Size w) =>
   (Element v -> CodeGenFunction r (Element w)) ->
   (v -> CodeGenFunction r w)
map f =
   assemble <=< mapM f <=< extractAll

_mapByFold f a =
   foldM
      (\b n ->
         extract (valueOf n) a >>=
         f >>=
         flip (insert (valueOf n)) b)
      Class.undefTuple
      (take (sizeInTuple a) [0..])

mapChunks ::
   (C ca, C cb, Size ca ~ Size cb,
    C va, C vb, Size va ~ Size vb,
    Element ca ~ Element va, Element cb ~ Element vb) =>
   (ca -> CodeGenFunction r cb) ->
   (va -> CodeGenFunction r vb)
mapChunks f a =
   foldM
      (\b (am,k) ->
         am >>= \ac ->
         f ac >>= \bc ->
         insertChunk (k * sizeInTuple ac) bc b)
      Class.undefTuple $
   List.zip (chop a) [0..]

zipChunksWith ::
   (C ca, C cb, C cc, Size ca ~ Size cb, Size cb ~ Size cc,
    C va, C vb, C vc, Size va ~ Size vb, Size vb ~ Size vc,
    Element ca ~ Element va, Element cb ~ Element vb, Element cc ~ Element vc) =>
   (ca -> cb -> CodeGenFunction r cc) ->
   (va -> vb -> CodeGenFunction r vc)
zipChunksWith f a b =
   mapChunks (uncurry f) (a,b)


mapChunks2 ::
   (C ca, C cb, Size ca ~ Size cb,
    C la, C lb, Size la ~ Size lb,
    C va, C vb, Size va ~ Size vb,
    Element ca ~ Element va, Element la ~ Element va,
    Element cb ~ Element vb, Element lb ~ Element vb) =>
   (ca -> CodeGenFunction r cb) ->
   (la -> CodeGenFunction r lb) ->
   (va -> CodeGenFunction r vb)
mapChunks2 f g a = do
   let chunkSize :: C ca => (ca -> cgf) -> Size ca -> Int
       chunkSize _ = TypeNum.fromIntegerT
   xs <- extractAll a
   case ListHT.viewR $
        ListHT.sliceVertical (chunkSize g undefined) xs of
      Nothing -> assemble []
      Just (cs,c) -> do
         ds <- mapM (extractAll <=< g <=< assemble) cs
         d <-
            if List.length c <= chunkSize f undefined
              then fmap List.concat $
                   mapM (extractAll <=< f <=< assemble) $
                   ListHT.sliceVertical (chunkSize f undefined) c
              else extractAll =<< g =<< assemble c
         assemble $ List.concat ds ++ d

zipChunks2With ::
   (C ca, C cb, C cc, Size ca ~ Size cb, Size cb ~ Size cc,
    C la, C lb, C lc, Size la ~ Size lb, Size lb ~ Size lc,
    C va, C vb, C vc, Size va ~ Size vb, Size vb ~ Size vc,
    Element ca ~ Element va, Element la ~ Element va,
    Element cb ~ Element vb, Element lb ~ Element vb,
    Element cc ~ Element vc, Element lc ~ Element vc) =>
   (ca -> cb -> CodeGenFunction r cc) ->
   (la -> lb -> CodeGenFunction r lc) ->
   (va -> vb -> CodeGenFunction r vc)
zipChunks2With f g a b =
   mapChunks2 (uncurry f) (uncurry g) (a,b)


infixl 1 `withRound`

withRound ::
   (IsPrimitive a, IsPrimitive b,
    TypeNum.PositiveT k, TypeNum.PositiveT m, TypeNum.PositiveT n) =>
   CodeGenFunction r x ->
   Ext.T (Value (Vector m a) -> Value Word32 -> CodeGenFunction r (Value (Vector m b))) ->
   Ext.T (Value (Vector k a) -> Value Word32 -> CodeGenFunction r (Value (Vector k b))) ->
   (Value (Vector n b) -> CodeGenFunction r x) ->
   Word32 ->
   Value (Vector n a) -> CodeGenFunction r x
withRound generic roundSmallExt _roundLargeExt post mode x =
   generic
   `Ext.run`
   (Ext.with roundSmallExt $ \round ->
       post =<< mapChunks (flip round (valueOf mode)) x)
{- crashes LLVM-3.1 in JIT mode
Stack dump:
0.      Running pass 'X86 DAG->DAG Instruction Selection' on function '@_fun1'
segmentation fault

   `Ext.run`
   (Ext.with2 roundSmallExt roundLargeExt $ \round roundLarge ->
       post =<< mapChunks2 (flip round (valueOf mode)) (flip roundLarge (valueOf mode)) x)
-}


{- |
Ideally on ix86 with SSE41 this would be translated to 'dpps'.
-}
dotProductPartial ::
   (TypeNum.PositiveT n, LLVM.IsPrimitive a, LLVM.IsArithmetic a) =>
   Int ->
   Value (Vector n a) ->
   Value (Vector n a) ->
   CodeGenFunction r (Value a)
dotProductPartial n x y =
   sumPartial n =<< A.mul x y

sumPartial ::
   (TypeNum.PositiveT n, LLVM.IsPrimitive a, LLVM.IsArithmetic a) =>
   Int ->
   Value (Vector n a) ->
   CodeGenFunction r (Value a)
sumPartial n x =
   foldl1
      {- quite the same as (+) using LLVM.Arithmetic instances,
         but requires less type constraints -}
      (M.liftR2 A.add)
      (List.map (LLVM.extractelement x . valueOf) $ take n $ [0..])


{- |
If the target vector type is a native type
then the chop operation produces no actual machine instruction. (nop)
If the vector cannot be evenly divided into chunks
the last chunk will be padded with undefined values.
-}
chop ::
   (C c, C v, Element c ~ Element v) =>
   v -> [CodeGenFunction r c]
chop = chopCore undefined

chopCore ::
   (C c, C v, Element c ~ Element v) =>
   Size c -> v -> [CodeGenFunction r c]
chopCore m x =
   List.map (shuffle x . constVector) $
   ListHT.sliceVertical (TypeNum.fromIntegerT m) $
   List.map constOf $
   take (sizeInTuple x) [0..]

{- |
The target size is determined by the type.
If the chunk list provides more data, the exceeding data is dropped.
If the chunk list provides too few data,
the target vector is filled with undefined elements.
-}
concat ::
   (C c, C v, Element c ~ Element v) =>
   [c] -> CodeGenFunction r v
concat xs =
   foldM
      (\v0 (js,c) ->
         foldM
            (\v (i,j) -> do
               x <- extract (valueOf i) c
               insert (valueOf j) x v)
            v0 $
         List.zip [0..] js)
      Class.undefTuple $
   List.zip
      (ListHT.sliceVertical (sizeInTuple (head xs)) [0..])
      xs


getLowestPair ::
   (TypeNum.PositiveT n) =>
   Value (Vector n a) ->
   CodeGenFunction r (Value a, Value a)
getLowestPair x =
   liftM2 (,)
      (extractelement x (valueOf 0))
      (extractelement x (valueOf 1))


_reduceAddInterleaved ::
   (IsArithmetic a, IsPrimitive a,
    TypeNum.PositiveT n, TypeNum.PositiveT m, (m :+: m) ~ n) =>
   m ->
   Value (Vector n a) ->
   CodeGenFunction r (Value (Vector m a))
_reduceAddInterleaved tm v = do
   let m = TypeNum.fromIntegerT tm
   x <- shuffle v (constVector $ List.map constOf $ take m [0..])
   y <- shuffle v (constVector $ List.map constOf $ take m [fromIntegral m ..])
   A.add x y

sumGeneric ::
   (IsArithmetic a, IsPrimitive a, TypeNum.PositiveT n) =>
   Value (Vector n a) ->
   CodeGenFunction r (Value a)
sumGeneric =
   flip extractelement (valueOf 0) <=<
   reduceSumInterleaved 1

sumToPairGeneric ::
   (Arithmetic a, TypeNum.PositiveT n) =>
   Value (Vector n a) ->
   CodeGenFunction r (Value a, Value a)
sumToPairGeneric v =
   let n2 = div (size v) 2
   in  sumInterleavedToPair =<<
       shuffleMatchPlain1 v
          (constVector $
           List.map (constOf . fromIntegral) $
           concatMap (\k -> [k, k+n2]) $
           take n2 [0..])

{- |
We partition a vector of size n into chunks of size m
and add these chunks using vector additions.
We do this by repeated halving of the vector,
since this way we do not need assumptions about the native vector size.

We reduce the vector size only virtually,
that is we maintain the vector size and fill with undefined values.
This is reasonable
since LLVM-2.5 and LLVM-2.6 does not allow shuffling between vectors of different size
and because it likes to do computations on Vector D2 Float
in MMX registers on ix86 CPU's,
which interacts badly with FPU usage.
Since we fill the vector with undefined values,
LLVM actually treats the vectors like vectors of smaller size.
-}
reduceSumInterleaved ::
   (IsArithmetic a, IsPrimitive a, TypeNum.PositiveT n) =>
   Int ->
   Value (Vector n a) ->
   CodeGenFunction r (Value (Vector n a))
reduceSumInterleaved m x0 =
   let go ::
          (IsArithmetic a, IsPrimitive a, TypeNum.PositiveT n) =>
          Int ->
          Value (Vector n a) ->
          CodeGenFunction r (Value (Vector n a))
       go n x =
          if m==n
            then return x
            else
               let n2 = div n 2
               in  go n2
                      =<< A.add x
                      =<< shuffleMatchPlain1 x
                             (constVector $ List.map constOf (take n2 [fromIntegral n2 ..])
                                 ++ List.repeat undef)
   in  go (size x0) x0

cumulateGeneric, _cumulateSimple ::
   (IsArithmetic a, IsPrimitive a, TypeNum.PositiveT n) =>
   Value a -> Value (Vector n a) ->
   CodeGenFunction r (Value a, Value (Vector n a))
_cumulateSimple a x =
   foldM
      (\(a0,y0) k -> do
         a1 <- A.add a0 =<< extract (valueOf k) x
         y1 <- insert (valueOf k) a0 y0
         return (a1,y1))
      (a, Class.undefTuple)
      (take (sizeInTuple x) $ [0..])

cumulateGeneric =
   cumulateFrom1 cumulate1

cumulateFrom1 ::
   (IsArithmetic a, IsPrimitive a, TypeNum.PositiveT n) =>
   (Value (Vector n a) ->
    CodeGenFunction r (Value (Vector n a))) ->
   Value a -> Value (Vector n a) ->
   CodeGenFunction r (Value a, Value (Vector n a))
cumulateFrom1 cum a x0 = do
   (b,x1) <- shiftUp a x0
   y <- cum x1
   z <- A.add b =<< extract (valueOf (fromIntegral (sizeInTuple x0) - 1)) y
   return (z,y)


{- |
Needs (log n) vector additions
-}
cumulate1 ::
   (IsArithmetic a, IsPrimitive a, TypeNum.PositiveT n) =>
   Value (Vector n a) ->
   CodeGenFunction r (Value (Vector n a))
cumulate1 x =
   foldM
      (\y k -> A.add y =<< shiftUpMultiZero k y)
      x
      (takeWhile (<sizeInTuple x) $ List.iterate (2*) 1)


inttofp ::
   (LLVM.PositiveT n,
    IsPrimitive a, IsPrimitive b,
    LLVM.IsInteger a, IsFloating b) =>
   Value (Vector n a) -> CodeGenFunction r (Value (Vector n b))
inttofp = LLVM.inttofp

{-
Can be used for both integer and float types,
but we need it only for Float types,
because LLVM produces ugly code for Float and even more ugly code for Double.
-}
signumLogical ::
   (TypeNum.PositiveT n,
    IsPrimitive a, IsPrimitive b, IsArithmetic b) =>
   (Value (Vector n a) ->
    Value (Vector n a) ->
    CodeGenFunction r (Value (Vector n b))) ->
   Value (Vector n a) ->
   CodeGenFunction r (Value (Vector n b))
signumLogical gt x = do
   let zero = LLVM.value LLVM.zero
   negative <- gt zero x
   positive <- gt x zero
   A.sub negative positive


{-
{- |
This one does not use vectorized select.
Cf. the outcommented signumInt.
-}
signumInt ::
   (TypeNum.PositiveT n,
    IsPrimitive a, IsArithmetic a, IsConst a, Num a,
    LLVM.CmpRet a, LLVM.CmpResult a ~ b,
    IsPrimitive b, LLVM.IsInteger b) =>
   Value (Vector n a) ->
   CodeGenFunction r (Value (Vector n a))
signumInt x = do
   let zero = LLVM.value LLVM.zero
   negative <- A.cmp LLVM.CmpLT x zero
   positive <- A.cmp LLVM.CmpGT x zero
   map
      (\(n,p) ->
         LLVM.select n (valueOf (-1))
            =<< LLVM.select p (valueOf 1) (LLVM.value LLVM.zero))
      (negative, positive)

signumWord ::
   (TypeNum.PositiveT n,
    IsPrimitive a, IsArithmetic a, IsConst a, Num a,
    LLVM.CmpRet a, LLVM.CmpResult a ~ b,
    IsPrimitive b, LLVM.IsInteger b) =>
   Value (Vector n a) ->
   CodeGenFunction r (Value (Vector n a))
signumWord x = do
   positive <- A.cmp LLVM.CmpGT x (LLVM.value LLVM.zero)
   map
      (\p -> LLVM.select p (valueOf 1) (LLVM.value LLVM.zero))
      positive
-}

signumIntGeneric ::
   (TypeNum.PositiveT n,
    {- TypeNum.PositiveT (n :*: LLVM.SizeOf a), -}
    IsPrimitive a, LLVM.IsInteger a,
    LLVM.CmpRet a, LLVM.CmpResult a ~ b,
    IsPrimitive b, LLVM.IsInteger b) =>
   Value (Vector n a) ->
   CodeGenFunction r (Value (Vector n a))
signumIntGeneric x = do
   let zero = LLVM.value LLVM.zero
   negative <- LLVM.sadapt =<< A.cmp LLVM.CmpLT x zero
   positive <- LLVM.sadapt =<< A.cmp LLVM.CmpGT x zero
   A.sub positive negative

signumWordGeneric ::
   (TypeNum.PositiveT n,
    IsPrimitive a, LLVM.IsInteger a,
    LLVM.CmpRet a, LLVM.CmpResult a ~ b,
    IsPrimitive b, LLVM.IsInteger b) =>
   Value (Vector n a) ->
   CodeGenFunction r (Value (Vector n a))
signumWordGeneric x =
   LLVM.zadapt =<< A.cmp LLVM.CmpGT x (LLVM.value LLVM.zero)

signumFloatGeneric ::
   (TypeNum.PositiveT n,
    IsPrimitive a, IsArithmetic a, IsFloating a,
    LLVM.CmpRet a, LLVM.CmpResult a ~ b,
    IsPrimitive b, LLVM.IsInteger b) =>
   Value (Vector n a) ->
   CodeGenFunction r (Value (Vector n a))
signumFloatGeneric x = do
   let zero = LLVM.value LLVM.zero
   negative <- LLVM.sitofp =<< A.cmp LLVM.CmpLT x zero
   positive <- LLVM.sitofp =<< A.cmp LLVM.CmpGT x zero
   A.sub negative positive


signedFraction ::
   (IsFloating a, IsConst a, Real a, TypeNum.PositiveT n) =>
   Value (Vector n a) ->
   CodeGenFunction r (Value (Vector n a))
signedFraction x =
   A.sub x =<< truncate x

floorGeneric ::
   (IsFloating a, IsConst a, Real a, TypeNum.PositiveT n) =>
   Value (Vector n a) ->
   CodeGenFunction r (Value (Vector n a))
floorGeneric = floorLogical A.fcmp

{- |
On LLVM-2.6 and X86 this produces branch-free
but even slower code than 'fractionSelect',
since the comparison to booleans and
back to a floating point number is translated literally
to elementwise comparison, conversion to a 0 or -1 byte
and then to a floating point number.
-}
fractionGeneric ::
   (IsFloating a, IsConst a, Real a, TypeNum.PositiveT n) =>
   Value (Vector n a) ->
   CodeGenFunction r (Value (Vector n a))
fractionGeneric = fractionLogical A.fcmp


{-
These should be replaced by A.min, A.max, A.abs
when they work, eventually.
-}
class (LLVM.IsSized a, LLVM.IsSized (Mask a),
       LLVM.SizeOf a ~ LLVM.SizeOf (Mask a),
       LLVM.IsPrimitive a, LLVM.IsPrimitive (Mask a),
       LLVM.IsInteger (Mask a)) =>
         Maskable a where
   type Mask a :: *

instance Maskable Int8   where type Mask Int8   = Int8
instance Maskable Int16  where type Mask Int16  = Int16
instance Maskable Int32  where type Mask Int32  = Int32
instance Maskable Int64  where type Mask Int64  = Int64
instance Maskable Word8  where type Mask Word8  = Int8
instance Maskable Word16 where type Mask Word16 = Int16
instance Maskable Word32 where type Mask Word32 = Int32
instance Maskable Word64 where type Mask Word64 = Int64
instance Maskable Float  where type Mask Float  = Int32
instance Maskable Double where type Mask Double = Int64

makeMask ::
   (Maskable a, TypeNum.PositiveT n) =>
   Value (Vector n a) ->
   Value (Vector n Bool) ->
   CodeGenFunction r (Value (Vector n (Mask a)))
makeMask _ = LLVM.sadapt


minGeneric, maxGeneric ::
   (IsConst a, Real a, Maskable a, TypeNum.PositiveT n) =>
   Value (Vector n a) ->
   Value (Vector n a) ->
   CodeGenFunction r (Value (Vector n a))

minGeneric x y = do
   b <- makeMask x =<< A.cmp LLVM.CmpLT x y
   selectLogical b x y

maxGeneric x y = do
   b <- makeMask x =<< A.cmp LLVM.CmpGT x y
   selectLogical b x y

absGeneric ::
   (IsConst a, Real a, Maskable a, TypeNum.PositiveT n) =>
   Value (Vector n a) ->
   CodeGenFunction r (Value (Vector n a))
absGeneric x =
   maxGeneric x =<< LLVM.neg x

absAuto ::
   (TypeNum.PositiveT n, TypeNum.PositiveT m, TypeNum.PositiveT k,
    IsConst a, Real a, Maskable a) =>
   Ext.T (Value (Vector m a) -> CodeGenFunction r (Value (Vector m a))) ->
   Ext.T (Value (Vector k a) -> CodeGenFunction r (Value (Vector k a))) ->
   Value (Vector n a) ->
   CodeGenFunction r (Value (Vector n a))
absAuto byChunk byLargeChunk x =
   absGeneric x
   `Ext.run`
   (Ext.with byChunk $ \f -> mapChunks f x)
   `Ext.run`
   (Ext.with2 byChunk byLargeChunk $
    \ f g -> mapChunks2 f g x)


{- |
LLVM.select on boolean vectors cannot be translated to X86 code in LLVM-2.6,
thus I code my own version that calls select on all elements.
This is slow but works.
When this issue is fixed, this function will be replaced by LLVM.select.
-}
select ::
   (LLVM.IsFirstClass a, IsPrimitive a, TypeNum.PositiveT n,
    LLVM.CmpRet a, LLVM.CmpResult a ~ Bool) =>
   Value (Vector n Bool) ->
   Value (Vector n a) ->
   Value (Vector n a) ->
   CodeGenFunction r (Value (Vector n a))
select b x y =
   map (uncurry3 LLVM.select) (b, x, y)

{- |
'floor' implemented using 'select'.
This will need jumps.
-}
_floorSelect ::
   (Num a, IsFloating a, IsConst a, Real a, TypeNum.PositiveT n) =>
   Value (Vector n a) ->
   CodeGenFunction r (Value (Vector n a))
_floorSelect x =
   do xr <- truncate x
      b <- A.fcmp LLVM.FPOLE xr x
      select b xr =<< A.sub xr =<< replicate (valueOf 1)

{- |
'fraction' implemented using 'select'.
This will need jumps.
-}
_fractionSelect ::
   (Num a, IsFloating a, IsConst a, Real a, TypeNum.PositiveT n) =>
   Value (Vector n a) ->
   CodeGenFunction r (Value (Vector n a))
_fractionSelect x =
   do xf <- signedFraction x
      b <- A.fcmp LLVM.FPOGE xf (value LLVM.zero)
      select b xf =<< A.add xf =<< replicate (valueOf 1)


{- |
Another implementation of 'select',
this time in terms of binary logical operations.
The selecting integers must be
(-1) for selecting an element from the first operand
and 0 for selecting an element from the second operand.
This leads to optimal code.

On SSE41 this could be done with blendvps or blendvpd.
-}
selectLogical ::
   (LLVM.IsFirstClass a, IsPrimitive a,
    LLVM.IsInteger i, IsPrimitive i,
    LLVM.IsSized a, LLVM.IsSized i,
    LLVM.SizeOf a ~ LLVM.SizeOf i,
    TypeNum.PositiveT n) =>
   Value (Vector n i) ->
   Value (Vector n a) ->
   Value (Vector n a) ->
   CodeGenFunction r (Value (Vector n a))
selectLogical b x y = do
--   bneg <- A.xor b
   bneg <- LLVM.inv b
   xm <- A.and b    =<< LLVM.bitcastElements x
   ym <- A.and bneg =<< LLVM.bitcastElements y
   LLVM.bitcastElements =<< A.or xm ym


floorLogical ::
   (IsFloating a, IsConst a, Real a,
    IsPrimitive i, LLVM.IsInteger i, TypeNum.PositiveT n) =>
   (LLVM.FPPredicate ->
    Value (Vector n a) ->
    Value (Vector n a) ->
    CodeGenFunction r (Value (Vector n i))) ->
   Value (Vector n a) ->
   CodeGenFunction r (Value (Vector n a))
floorLogical cmp x =
   do xr <- truncate x
      b <- cmp LLVM.FPOGT xr x
      A.add xr =<< LLVM.sitofp b

fractionLogical ::
   (IsFloating a, IsConst a, Real a,
    IsPrimitive i, LLVM.IsInteger i, TypeNum.PositiveT n) =>
   (LLVM.FPPredicate ->
    Value (Vector n a) ->
    Value (Vector n a) ->
    CodeGenFunction r (Value (Vector n i))) ->
   Value (Vector n a) ->
   CodeGenFunction r (Value (Vector n a))
fractionLogical cmp x =
   do xf <- signedFraction x
      b <- cmp LLVM.FPOLT xf (value LLVM.zero)
      A.sub xf =<< LLVM.sitofp b


order ::
   (TypeNum.PositiveT n, TypeNum.PositiveT m, TypeNum.PositiveT k,
    LLVM.IsFirstClass a, IsPrimitive a) =>
   (Value (Vector n a) -> Value (Vector n a) -> CodeGenFunction r (Value (Vector n a))) ->
   Ext.T (Value (Vector m a) -> Value (Vector m a) -> CodeGenFunction r (Value (Vector m a))) ->
   Ext.T (Value (Vector k a) -> Value (Vector k a) -> CodeGenFunction r (Value (Vector k a))) ->
   (Value (Vector n a) -> Value (Vector n a) -> CodeGenFunction r (Value (Vector n a)))
order f byChunk byLargeChunk x y =
   f x y
   `Ext.run`
   (Ext.with byChunk $ \psel -> zipChunksWith psel x y)
   `Ext.run`
   (Ext.with2 byChunk byLargeChunk $
    \ psel plsel -> zipChunks2With psel plsel x y)


-- * target independent functions with target dependent optimizations

{- |
The order of addition is chosen for maximum efficiency.
We do not try to prevent cancelations.
-}
class (IsArithmetic a, IsPrimitive a) => Arithmetic a where
   sum ::
      (TypeNum.PositiveT n) =>
      Value (Vector n a) ->
      CodeGenFunction r (Value a)
   sum = sumGeneric

   {- |
   The first result value is the sum of all vector elements from 0 to @div n 2 + 1@
   and the second result value is the sum of vector elements from @div n 2@ to @n-1@.
   n must be at least D2.
   -}
   sumToPair ::
      (TypeNum.PositiveT n) =>
      Value (Vector n a) ->
      CodeGenFunction r (Value a, Value a)
   sumToPair = sumToPairGeneric

   {- |
   Treat the vector as concatenation of pairs and all these pairs are added.
   Useful for stereo signal processing.
   n must be at least D2.
   -}
   sumInterleavedToPair ::
      (TypeNum.PositiveT n) =>
      Value (Vector n a) ->
      CodeGenFunction r (Value a, Value a)
   sumInterleavedToPair v =
      getLowestPair =<< reduceSumInterleaved 2 v

   cumulate ::
      (TypeNum.PositiveT n) =>
      Value a -> Value (Vector n a) ->
      CodeGenFunction r (Value a, Value (Vector n a))
   cumulate = cumulateGeneric

   dotProduct ::
      (TypeNum.PositiveT n) =>
      Value (Vector n a) ->
      Value (Vector n a) ->
      CodeGenFunction r (Value a)
   dotProduct x y =
      dotProductPartial (size x) x y

   mul ::
      (TypeNum.PositiveT n) =>
      Value (Vector n a) ->
      Value (Vector n a) ->
      CodeGenFunction r (Value (Vector n a))
   mul = A.mul

instance Arithmetic Float where
   sum x =
      Ext.runWhen (size x >= 4) (sumGeneric x) $
      Ext.with X86A.haddps $ \haddp ->
          {-
          We can make use of the following facts:
          SSE3 has Float vectors of size 4,
          there is an instruction for horizontal add.
          -}
          do chunkSum <-
                foldl1 (M.liftR2 A.add) $ chop x
             y <- haddp chunkSum (value undef)
             z <- haddp y        (value undef)
{-
             y <- haddp chunkSum chunkSum
             z <- haddp y y
-}
             extractelement z (valueOf 0)

   sumToPair x =
      Ext.runWhen (size x >= 4) (getLowestPair x) $
      Ext.with X86A.haddps $ \haddp ->
          let {-
              reduce ::
                 [CodeGenFunction r (Value (Vector D4 Float))] ->
                 [CodeGenFunction r (Value (Vector D4 Float))]
              -}
              reduce [] = []
              reduce [_] = error "vector must have size power of two"
              reduce (x0:x1:xs) =
                 M.liftR2 haddp x0 x1 : reduce xs
              go []  = error "vector must not be empty"
              go [c] =
                 getLowestPair
                    =<< flip haddp (value undef)
                    =<< c
              go cs  = go (reduce cs)
          in  go $ chop x

{-
The haddps based implementation cumulate is slower than the generic one.
However, one day the x86 processors may implement a cumulative sum
which we could employ with this frame.

   cumulate a x =
      Ext.runWhen (size x >= 4) (cumulateGeneric a x) $
      Ext.with X86.cumulate1s $ \cumulate1s -> do
         (b,ys) <-
            foldr
               (\chunk0 cont a0 -> do
                  (a1,chunk1) <- cumulateFrom1 cumulate1s a0 =<< chunk0
                  fmap (mapSnd (chunk1:)) (cont a1))
               (\a0 -> return (a0,[]))
               (chop x)
               a
         y <- concat ys
         return (b,y)
-}

   dotProduct x y =
      Ext.run (sum =<< A.mul x y) $
      Ext.with X86A.dpps $ \dpp ->
         foldl1 (M.liftR2 A.add) $
         List.zipWith
            (\mx my -> do
               cx <- mx
               cy <- my
               flip extractelement (valueOf 0)
                =<< dpp cx cy (valueOf 0xF1))
            (chop x)
            (chop y)

instance Arithmetic Double where

instance Arithmetic Int8   where
instance Arithmetic Int16  where
instance Arithmetic Int32  where
instance Arithmetic Int64  where
instance Arithmetic Word8  where
instance Arithmetic Word16 where
instance Arithmetic Word64 where

instance Arithmetic Word32 where
   mul x y =
      A.mul x y
      `Ext.run`
      (Ext.with X86A.pmuludq128 $ \pmul ->
         zipChunksWith
            (\cx cy -> do
               evenX <- shuffleMatchPlain1 cx
                  (constVector [constOf 0, undef, constOf 2, undef])
               evenY <- shuffleMatchPlain1 cy
                  (constVector [constOf 0, undef, constOf 2, undef])
               evenZ64 <- pmul evenX evenY
               evenZ <- LLVM.bitcast evenZ64
               oddX <- shuffleMatchPlain1 cx
                  (constVector [constOf 1, undef, constOf 3, undef])
               oddY <- shuffleMatchPlain1 cy
                  (constVector [constOf 1, undef, constOf 3, undef])
               oddZ64 <- pmul oddX oddY
               oddZ <- LLVM.bitcast oddZ64
               shuffleMatchPlain2 evenZ oddZ
                  (constVector [constOf 0, constOf 4, constOf 2, constOf 6]))
            x y)
      `Ext.run`
      Ext.wrap X86C.sse41 (A.mul x y)


umul32to64 ::
   (TypeNum.PositiveT n) =>
   Value (Vector n Word32) ->
   Value (Vector n Word32) ->
   CodeGenFunction r (Value (Vector n Word64))
umul32to64 x y =
   (do x64 <- map LLVM.zext x
       y64 <- map LLVM.zext y
       A.mul x64 y64)
   `Ext.run`
   (Ext.with X86A.pmuludq128 $ \pmul ->
      zipChunksWith
         -- save an initial shuffle
         (\cx cy -> do
            evenX <- shuffleMatchPlain1 cx
               (constVector [constOf 0, undef, constOf 2, undef])
            evenY <- shuffleMatchPlain1 cy
               (constVector [constOf 0, undef, constOf 2, undef])
            evenZ <- pmul evenX evenY
            oddX <- shuffleMatchPlain1 cx
               (constVector [constOf 1, undef, constOf 3, undef])
            oddY <- shuffleMatchPlain1 cy
               (constVector [constOf 1, undef, constOf 3, undef])
            oddZ <- pmul oddX oddY
{-
            shuffleMatchPlain2 evenZ oddZ
               (constVector [constOf 0, constOf 2, constOf 1, constOf 3])
-}
            assemble =<< (sequence $
               extract (valueOf 0) evenZ :
               extract (valueOf 0) oddZ :
               extract (valueOf 1) evenZ :
               extract (valueOf 1) oddZ :
               []) :: CodeGenFunction r (Value (Vector D4 Word64)))
{-
         -- save the final shuffle
         (\cx cy -> do
            lowerX <- shuffleMatchPlain1 cx
               (constVector [constOf 0, undef, constOf 1, undef])
            lowerY <- shuffleMatchPlain1 cy
               (constVector [constOf 0, undef, constOf 1, undef])
            lowerZ <- pmul lowerX lowerY
            upperX <- shuffleMatchPlain1 cx
               (constVector [constOf 2, undef, constOf 3, undef])
            upperY <- shuffleMatchPlain1 cy
               (constVector [constOf 2, undef, constOf 3, undef])
            upperZ <- pmul upperX upperY
{-
            shuffleMatchPlain2 lowerZ upperZ
               (constVector [constOf 0, constOf 1, constOf 2, constOf 3])
-}
            concat [lowerZ, upperZ])
-}
         x y)


{- |
Attention:
The rounding and fraction functions only work
for floating point values with maximum magnitude of @maxBound :: Int32@.
This way we safe expensive handling of possibly seldom cases.
-}
class (Arithmetic a, LLVM.CmpRet a, LLVM.CmpResult a ~ Bool, IsConst a) =>
         Real a where
   min, max ::
      (TypeNum.PositiveT n) =>
      Value (Vector n a) ->
      Value (Vector n a) ->
      CodeGenFunction r (Value (Vector n a))

   abs ::
      (TypeNum.PositiveT n) =>
      Value (Vector n a) ->
      CodeGenFunction r (Value (Vector n a))

   signum ::
      (TypeNum.PositiveT n) =>
      Value (Vector n a) ->
      CodeGenFunction r (Value (Vector n a))

   truncate, floor, fraction ::
      (TypeNum.PositiveT n) =>
      Value (Vector n a) ->
      CodeGenFunction r (Value (Vector n a))

instance Real Float where
   min = order minGeneric X86A.minps X86A.minps256
   max = order maxGeneric X86A.maxps X86A.maxps256
   abs x = Ext.run (absGeneric x) (Ext.with X86.absps ($x))
   signum x =
      signumFloatGeneric x
      `Ext.run`
      (Ext.with X86.cmpps $ \cmp ->
          inttofp =<< mapChunks (signumLogical (cmp LLVM.FPOGT)) x)
{- crashes LLVM-3.1 in JIT mode
Stack dump:
0.      Running pass 'X86 DAG->DAG Instruction Selection' on function '@_fun1'
segmentation fault

      `Ext.run`
      (Ext.with X86.cmpps256 $ \cmp ->
          mapChunks (signumLogical
             (\a b -> LLVM.sitofp =<< cmp LLVM.FPOGT a b)) x)
-}
{- crashes LLVM-3.1 in JIT mode
   only efficient in AVX2, where large integer vector subtraction is available
      `Ext.run`
      (Ext.with X86.cmpps256 $ \cmp ->
          inttofp =<< mapChunks (signumLogical (cmp LLVM.FPOGT)) x)
-}
   {-
   An IEEE specific implementation could do some bit manipulation:
   s eeeeeeee mmmmmmmmmmmmmmmmmmmmmmm
   Generate a pure power of two by clearing mantissa:
   s eeeeeeee 00000000000000000000000
   Now subtract 1 in order to get the required bit mask for the mantissa
   s eeeeeeee 11111111110000000000000
   multiply with 2 in order to correct exponent
   and then do bitwise AND of the mask with the original number.
   This method only works for numbers from 1 to 2^23-1,
   that is the range is even more smaller
   than that for the rounding via Int32.
   -}
   truncate x =
      withRound
         ((LLVM.inttofp .
           (id :: Value (Vector n Int32) -> Value (Vector n Int32))
           <=< LLVM.fptoint) x)
         X86A.roundps X86A.roundps256 return 3 x
   floor x =
      withRound
         (floorGeneric x
          `Ext.run`
          (Ext.with X86.cmpps $ \cmp ->
              mapChunks (floorLogical cmp) x)
{- LLVM-2.6 rearranges the MXCSR manipulations in an invalid way
          `Ext.run`
          (Ext.with2 (X86.withMXCSR (Bit.shiftL 1 13)) X86.cvtps2dq $
              \ with cvtps2dq -> with $
                 LLVM.inttofp =<< mapChunks cvtps2dq x)
-}
          )
         X86A.roundps X86A.roundps256 return 1 x
   fraction x =
      withRound
         (fractionGeneric x
          `Ext.run`
          (Ext.with X86.cmpps $ \cmp ->
              mapChunks (fractionLogical cmp) x)
{-
          `Ext.run`
          (Ext.with2 (X86.withMXCSR (Bit.shiftL 1 13)) X86.cvtps2dq $
              \ with cvtps2dq -> with $
                 A.sub x =<< LLVM.inttofp =<< mapChunks cvtps2dq x)
-}
          )
         X86A.roundps X86A.roundps256 (A.sub x) 1 x

instance Real Double where
   min = order minGeneric X86A.minpd X86A.minpd256
   max = order maxGeneric X86A.maxpd X86A.maxpd256
   abs x = Ext.run (absGeneric x) (Ext.with X86.abspd ($x))
   signum x =
      signumFloatGeneric x
      `Ext.run`
      (Ext.with2 X86.cmppd X86A.cvtdq2pd $ \cmp tofp ->
          mapChunks (signumLogical
             (\a b -> do
                c <- LLVM.bitcast =<< cmp LLVM.FPOGT a b
                c0 <- extract (valueOf 0) (c :: Value (Vector D4 Int32))
                c1 <- extract (valueOf 2) c
                tofp =<< assemble [c0,c1])) x)
{- crashes LLVM-3.1 in JIT mode
      `Ext.run`
      -- we could still optimize using mapChunks2
      (Ext.with2 X86.cmppd256 X86A.cvtdq2pd256 $ \cmp tofp ->
          mapChunks (signumLogical
             (\a b -> do
                c <- LLVM.bitcast =<< cmp LLVM.FPOGT a b
                c0 <- extract (valueOf 0) (c :: Value (Vector D8 Int32))
                c1 <- extract (valueOf 2) c
                c2 <- extract (valueOf 4) c
                c3 <- extract (valueOf 6) c
                tofp =<< assemble [c0,c1,c2,c3])) x)
-}
   truncate x =
      withRound
         ((LLVM.inttofp .
           (id :: Value (Vector n Int64) -> Value (Vector n Int64))
           <=< LLVM.fptoint) x)
         X86A.roundpd X86A.roundpd256 return 3 x
   floor x =
      withRound
         (floorGeneric x
          `Ext.run`
          (Ext.with X86.cmppd $ \cmp ->
              mapChunks (floorLogical cmp) x))
         X86A.roundpd X86A.roundpd256 return 1 x
   fraction x =
      withRound
         (fractionGeneric x
          `Ext.run`
          (Ext.with X86.cmppd $ \cmp ->
              mapChunks (fractionLogical cmp) x))
         X86A.roundpd X86A.roundpd256 (A.sub x) 1 x

instance Real Int8 where
   min = order minGeneric X86A.pminsb128 X86A.pminsb256
   max = order maxGeneric X86A.pmaxsb128 X86A.pmaxsb256
   abs = absAuto X86A.pabsb128 X86A.pabsb256
   signum = signumIntGeneric
   truncate = return
   floor = return
   fraction = const $ return (value LLVM.zero)

instance Real Int16 where
   min = order minGeneric X86A.pminsw128 X86A.pminsw256
   max = order maxGeneric X86A.pmaxsw128 X86A.pmaxsw256
   abs = absAuto X86A.pabsw128 X86A.pabsw256
   signum = signumIntGeneric
   truncate = return
   floor = return
   fraction = const $ return (value LLVM.zero)

instance Real Int32 where
   min = order minGeneric X86A.pminsd128 X86A.pminsd256
   max = order maxGeneric X86A.pmaxsd128 X86A.pmaxsd256
   abs = absAuto X86A.pabsd128 X86A.pabsd256
   signum = signumIntGeneric
   truncate = return
   floor = return
   fraction = const $ return (value LLVM.zero)

instance Real Int64 where
   min = minGeneric
   max = maxGeneric
   abs = absGeneric
   signum = signumIntGeneric
   truncate = return
   floor = return
   fraction = const $ return (value LLVM.zero)

instance Real Word8 where
   min = order minGeneric X86A.pminub128 X86A.pminub256
   max = order maxGeneric X86A.pmaxub128 X86A.pmaxub256
   abs = return
   signum = signumWordGeneric
   truncate = return
   floor = return
   fraction = const $ return (value LLVM.zero)

instance Real Word16 where
   min = order minGeneric X86A.pminuw128 X86A.pminuw256
   max = order maxGeneric X86A.pmaxuw128 X86A.pmaxuw256
   abs = return
   signum = signumWordGeneric
   truncate = return
   floor = return
   fraction = const $ return (value LLVM.zero)

instance Real Word32 where
   min = order minGeneric X86A.pminud128 X86A.pminud256
   max = order maxGeneric X86A.pmaxud128 X86A.pmaxud256
   abs = return
   signum = signumWordGeneric
   truncate = return
   floor = return
   fraction = const $ return (value LLVM.zero)

instance Real Word64 where
   min = minGeneric
   max = maxGeneric
   abs = return
   signum = signumWordGeneric
   truncate = return
   floor = return
   fraction = const $ return (value LLVM.zero)