module Language.Paraiso.Generator.PlanTrans (
translate
) where
import qualified Algebra.Additive as Additive
import Data.Char
import Data.Dynamic
import qualified Data.Graph.Inductive as FGL
import Data.List (sortBy)
import qualified Data.ListLike.String as LL
import Data.ListLike.Text ()
import qualified Data.Foldable as F
import Data.Maybe
import qualified Data.Set as Set
import Data.Tensor.TypeLevel
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Language.Paraiso.Annotation as Anot
import qualified Language.Paraiso.Annotation.SyncThreads as Sync
import qualified Language.Paraiso.Generator.Claris as C
import qualified Language.Paraiso.Generator.Native as Native
import qualified Language.Paraiso.Generator.Plan as Plan
import qualified Language.Paraiso.OM.Arithmetic as Arith
import qualified Language.Paraiso.OM.DynValue as DVal
import qualified Language.Paraiso.OM.Graph as OM
import qualified Language.Paraiso.OM.Realm as Realm
import qualified Language.Paraiso.Optimization.Graph as Opt
import Language.Paraiso.Name
import Language.Paraiso.Prelude hiding (Boolean(..))
import NumericPrelude hiding ((++))
type AnAn = Anot.Annotation
data Env v g = Env (Native.Setup v g) (Plan.Plan v g AnAn)
translate :: Opt.Ready v g => Native.Setup v g -> Plan.Plan v g AnAn -> C.Program
translate setup plan =
C.Program
{ C.progName = name plan,
C.topLevel =
map include stlHeaders ++
library env ++
comments ++
subHelperFuncs ++
[ C.ClassDef $ C.Class (name plan) $
storageVars ++
constructorDef ++
memberFuncForSize env ++
subMemberFuncs ++ memberFuncs
]
}
where
env = Env setup plan
comments = (:[]) $ C.Comment $ LL.unlines [
"",
"lowerMargin = " ++ showT (Plan.lowerMargin plan),
"upperMargin = " ++ showT (Plan.upperMargin plan)
]
constructorDef =
(:[]) $
C.MemberFunc C.Public True $
(C.function C.ConstructorType (name plan))
{ C.funcMemberInitializer =
concat $
flip map storageVars $ \memb -> case memb of
C.MemberVar _ var ->
[C.FuncCallUsr (name var) [C.FuncCallStd "memorySize" []]]
_ -> []
}
memberFuncs =
V.toList $
V.imap (\idx ker -> makeFunc env idx ker) $
Plan.kernels plan
subHelperFuncs = concat $ V.toList $ V.map snd $ subKernelFuncs
subMemberFuncs = concat $ V.toList $ V.map fst $ subKernelFuncs
subKernelFuncs = V.map (makeSubFunc env) $ Plan.subKernels plan
include = C.Exclusive C.HeaderFile . C.StmtPrpr . C.PrprInclude C.Chevron
stlHeaders = case Native.language setup of
Native.CPlusPlus -> ["algorithm", "cmath", "vector"]
Native.CUDA -> ["thrust/device_vector.h", "thrust/host_vector.h", "thrust/functional.h", "thrust/extrema.h", "thrust/reduce.h"]
storageVars =
V.toList $
V.map storageRefToMenber $
Plan.storages plan
storageRefToMenber stRef =
C.MemberVar C.Public $
C.Var
(mkCppType env $ Plan.storageType stRef)
(name stRef)
memberFuncForSize :: Opt.Ready v g => Env v g -> [C.MemberDef]
memberFuncForSize env@(Env setup plan) =
makeMami False "size" size ++
makeMami True "lowerMargin" lM ++
makeMami True "upperMargin" uM ++
makeMami False "memorySize" memorySize
where
size = F.toList $ Native.localSize setup
memorySize = F.toList $ Native.localSize setup + Plan.lowerMargin plan + Plan.upperMargin plan
lM = F.toList $ Plan.lowerMargin plan
uM = F.toList $ Plan.upperMargin plan
makeMami alone label xs =
(if not alone then (finale label xs :) else id) $
map (\(i,x) -> tiro (label ++ show i) x) $
zip [(0::Int)..] xs
tiro str ret =
C.MemberFunc C.Public True $
(C.function (C.typeOf (size!!0)) $ mkName $ T.pack str)
{ C.funcBody =
[ C.StmtReturn (C.toDyn ret)
]
}
finale str xs = tiro str (product xs)
makeFunc :: Opt.Ready v g => Env v g -> Int -> OM.Kernel v g AnAn -> C.MemberDef
makeFunc env@(Env setup plan) kerIdx ker = C.MemberFunc C.Public False $
(C.function tVoid (name ker))
{ C.funcBody = kernelCalls ++ storeInsts
}
where
graph = OM.dataflow ker
kernelCalls =
V.toList $
V.map (\subker -> callSubKer subker $ V.map findVar $
Plan.inputIdxs subker V.++ Plan.outputIdxs subker) $
V.filter ((== kerIdx) . Plan.kernelIdx) $
Plan.subKernels plan
callSubKer subker xs =
C.StmtExpr $
C.FuncCallUsr (name subker) (V.toList xs)
storeInsts =
map swapStmt $
concatMap (\(idx, nd) -> case nd of
OM.NInst (OM.Store (OM.StaticIdx statIdx))_ -> [(idx, statIdx)]
_ -> []) $
FGL.labNodes graph
swapStmt (idx, statIdx) =
let preIdx = head $ FGL.pre graph idx in
case (filter ((Plan.StaticRef statIdx==) . Plan.storageIdx) $ V.toList $ Plan.storages plan,
filter ((Plan.ManifestRef kerIdx preIdx==) . Plan.storageIdx) $ V.toList $ Plan.storages plan) of
([stRef],[maRef]) ->
C.StmtExpr $ C.Op2Infix "="
(C.VarExpr $ C.Var (mkCppType env $ Plan.storageType stRef) (name stRef) )
(C.VarExpr $ C.Var C.UnknownType (name maRef) )
_ -> error $ "mismatch in storage phase: " ++ show (idx, statIdx)
findVar idx =
let
loadIdx =
listToMaybe $
concat $
map (\jdx ->
case FGL.lab graph jdx of
Just (OM.NInst (OM.Load (OM.StaticIdx statIdx))_)-> [Plan.StaticRef statIdx]
_ -> []) $
FGL.pre graph idx
match stIdx
| stIdx == Plan.ManifestRef kerIdx idx = True
| Just stIdx == loadIdx = True
| otherwise = False
stRef = V.head $ V.filter ( match . Plan.storageIdx ) $ Plan.storages plan
in C.VarExpr $ C.Var
(mkCppType env $ Plan.storageType stRef)
(name stRef)
makeSubFunc :: Opt.Ready v g
=> Env v g
-> Plan.SubKernelRef v g AnAn
-> ([C.MemberDef], [C.Statement])
makeSubFunc env@(Env setup plan) subker =
case Native.language setup of
Native.CPlusPlus -> cppSolution
Native.CUDA -> cudaSolution
where
rlm = Realm.realm subker
cudaHelperName = mkName $ nameText subker ++ "_inner"
cudaBodys =
if rlm == Realm.Global
then []
else
(:[]) $
C.FuncDef $
(C.function
(C.QualifiedType [C.CudaGlobal] tVoid)
cudaHelperName)
{ C.funcArgs =
makeRawSubArg env True (Plan.labNodesIn subker) ++
makeRawSubArg env False (Plan.labNodesOut subker),
C.funcBody =
[ C.Comment $ LL.unlines
[ "",
"lowerMargin = " ++ showT (Plan.lowerBoundary subker),
"upperMargin = " ++ showT (Plan.upperBoundary subker)
]
] ++ loopMaker env rlm subker
}
(gridDim, blockDim) = Native.cudaGridSize setup
cudaSolution =
(,cudaBodys) $
(:[]) $
C.MemberFunc C.Public False $
(C.function tVoid (name subker))
{ C.funcArgs =
makeSubArg env True (Plan.labNodesIn subker) ++
makeSubArg env False (Plan.labNodesOut subker),
C.funcBody =
if rlm == Realm.Global
then loopMaker env rlm subker
else
[ C.Comment $ LL.unlines
[ "",
"lowerMargin = " ++ showT (Plan.lowerBoundary subker),
"upperMargin = " ++ showT (Plan.upperBoundary subker)
],
C.RawStatement "{static bool fst = false;",
C.StmtExpr $ C.FuncCallStd "if (fst) cudaFuncSetCacheConfig"
[mkVarExpr $ nameText cudaHelperName, mkVarExpr "cudaFuncCachePreferL1"],
C.RawStatement "fst = true;}",
C.StmtExpr $ C.CudaFuncCallUsr cudaHelperName (C.toDyn gridDim) (C.toDyn blockDim) $
map takeRaw $
(V.toList $ Plan.labNodesIn subker) ++ (V.toList $ Plan.labNodesOut subker)
]
}
takeRaw (idx, nd)=
case nd of
OM.NValue typ _ ->
extractor typ $
C.VarExpr $
C.Var (mkCppType env typ) (nodeNameUniversal idx)
_ -> error "NValue expected"
extractor typ = case Realm.realm typ of
Realm.Local ->
C.FuncCallStd "thrust::raw_pointer_cast" .
(:[]) .
C.Op1Prefix "&*" .
flip C.MemberAccess (C.FuncCallStd "begin" [])
Realm.Global ->
id
cppSolution =
(,[]) $
(:[]) $
C.MemberFunc C.Public False $
(C.function tVoid (name subker))
{ C.funcArgs =
makeSubArg env True (Plan.labNodesIn subker) ++
makeSubArg env False (Plan.labNodesOut subker),
C.funcBody =
if rlm == Realm.Global
then loopMaker env rlm subker
else
[ C.Comment $ LL.unlines
[ "",
"lowerMargin = " ++ showT (Plan.lowerBoundary subker),
"upperMargin = " ++ showT (Plan.upperBoundary subker)
]
] ++ loopMaker env rlm subker
}
makeSubArg :: Opt.Ready v g => Env v g -> Bool -> V.Vector (FGL.LNode (OM.Node v g AnAn)) -> [C.Var]
makeSubArg env isConst lnodes =
let f = (if isConst then C.Const else id) . C.RefOf
in
map (\(idx,nd)-> case nd of
OM.NValue typ _ -> C.Var (f $ mkCppType env typ) (nodeNameUniversal idx)
_ -> error "NValue expected" ) $
V.toList lnodes
makeRawSubArg :: Opt.Ready v g => Env v g -> Bool -> V.Vector (FGL.LNode (OM.Node v g AnAn)) -> [C.Var]
makeRawSubArg env isConst lnodes =
let f = (if isConst then C.Const else id)
in
map (\(idx,nd)-> case nd of
OM.NValue typ _ ->
C.Var (f $ mkCudaRawType env typ) (nodeNameUniversal idx)
_ -> error "NValue expected" ) $
V.toList lnodes
loopMaker :: Opt.Ready v g => Env v g -> Realm.Realm -> Plan.SubKernelRef v g AnAn -> [C.Statement]
loopMaker env@(Env setup plan) realm subker = case realm of
Realm.Local ->
pragma ++
[ C.StmtFor
(C.VarDefSub loopCounter loopBegin)
(C.Op2Infix "<" (C.VarExpr loopCounter) loopEnd)
(C.Op2Infix "+=" (C.VarExpr loopCounter) loopStride) $
[C.VarDefSub addrCounter codecAddr] ++
loopContent
]
Realm.Global -> loopContent
where
pragma =
if Native.language setup == Native.CPlusPlus
then [C.StmtPrpr $ C.PrprPragma "omp parallel for"]
else []
(loopBegin, loopEnd, loopStride) = case Native.language setup of
Native.CPlusPlus -> (intImm 0, C.toDyn (product boundarySize), intImm 1)
Native.CUDA -> (loopBeginCuda, C.toDyn (product boundarySize), loopStrideCuda)
loopBeginCuda = mkVarExpr "blockIdx.x * blockDim.x + threadIdx.x"
loopStrideCuda = mkVarExpr "blockDim.x * gridDim.x"
loopCounter = C.Var tSizet (mkName "i")
memorySize = F.toList $ Native.localSize setup + Plan.lowerMargin plan + Plan.upperMargin plan
boundarySize = F.toList $ Native.localSize setup + Plan.lowerMargin plan + Plan.upperMargin plan
Plan.lowerBoundary subker Plan.upperBoundary subker
codecDiv =
[ if idx == 0 then (C.VarExpr loopCounter) else C.Op2Infix "/" (C.VarExpr loopCounter) (C.toDyn $ product $ take idx boundarySize)
| idx <- [0..length boundarySize1]]
codecMod =
[ if idx == length codecDiv1 then x else C.Op2Infix "%" x (C.toDyn $ boundarySize !! idx)
| (idx, x) <- zip [0..] codecDiv]
codecModAdd =
[ C.Op2Infix "+" x (C.toDyn $ Plan.lowerMargin plan ! (Axis idx))
| (idx, x) <- zip [0..] codecMod]
codecAddr =
if memorySize == boundarySize
then C.VarExpr loopCounter
else foldl1 (C.Op2Infix "+")
[ C.Op2Infix "*" x (C.toDyn $ product $ take idx memorySize)
| (idx, x) <- zip [0..] codecModAdd]
codecLoadIndex =
[ C.Op2Infix "-" x (C.toDyn ((Plan.lowerMargin plan Plan.lowerBoundary subker) ! (Axis idx) ))
| (idx, x) <- zip [0..] codecMod]
codecLoadSize =
[ C.toDyn (Native.localSize setup ! (Axis idx) )
| (idx, _) <- zip [0..] codecMod]
codecCursor cursor =
(C.Op2Infix "+" (C.VarExpr addrCounter) (C.toDyn summa))
where
summa = sum $
[ cursor ! (Axis idx) * product (take idx memorySize)
| (idx, _) <- zip [0..] memorySize]
addrCounter = C.Var tSizet (mkName "addr_origin")
loopContent =
concat $
map buildExprs $
filterVal $
Set.toList allIdxSet
buildExprs (idx, val@(DVal.DynValue r c)) =
addSyncFunctions idx $
map (\cursor ->
lhs cursor
(fst $ rhsAndRequest env idx cursor)
) $
Set.toList $ lhsCursors V.! idx
where
lhs cursor expr =
if Set.member idx outputIdxSet
then C.StmtExpr $ flip (C.Op2Infix "=") expr $ case realm of
Realm.Local ->
(C.ArrayAccess (C.VarExpr $ C.Var (C.UnitType c) (nodeNameUniversal idx)) (C.VarExpr addrCounter))
Realm.Global ->
C.VarExpr $ C.Var (C.UnitType c) (nodeNameUniversal idx)
else flip C.VarDefSub expr
(C.Var (C.UnitType c) $ nodeNameCursored env idx cursor)
addSyncFunctions :: FGL.Node -> [C.Statement] -> [C.Statement]
addSyncFunctions idx =
if realm /= Realm.Local ||
Native.language setup /= Native.CUDA
then id
else foldl (.) id $ map adder anot
where
anot :: [Sync.Timing]
anot = (maybeToList $ FGL.lab graph idx) >>= (Anot.toList . OM.getA)
adder :: Sync.Timing -> [C.Statement] -> [C.Statement]
adder Sync.Pre = ([C.RawStatement "__syncthreads();"] ++ )
adder Sync.Post = ( ++ [C.RawStatement "__syncthreads();"])
lhsCursors = V.generate idxSize f
where
f idx
| not (Set.member idx allIdxSet) = Set.empty
| Set.member idx outputIdxSet = Set.singleton Additive.zero
| otherwise = lhsRequest idx
lhsRequest idx =
Set.fromList $
map snd $
filter ((==idx) . fst) $
concat $
[snd $ rhsAndRequest env jdx cur|
jdx <- Set.toList allIdxSet,
jdx > idx,
cur <- Set.toList $ lhsCursors V.! jdx
]
rhsAndRequest env' idx cursor =
let (idxInst,inst) = case preInst idx of
found:_ -> found
_ -> error $ "right hand side is not inst:" ++ show idx
prepre = map fst $ sortBy (\x y -> compare (snd x) (snd y)) $ FGL.lpre graph idxInst
isInput = Set.member idx inputIdxSet
creatVar idx' = C.VarExpr $ C.Var C.UnknownType (nodeNameUniversal idx')
in case inst of
_ | isInput -> case realm of
Realm.Local -> (C.ArrayAccess (creatVar idx) (codecCursor cursor), [])
Realm.Global -> (creatVar idx, [])
OM.Imm dyn -> (C.Imm dyn, [])
OM.Arith op -> (rhsArith env' op (map (nodeToRhs env' cursor) prepre),
map (,cursor) prepre)
OM.Shift v -> case prepre of
[pre1] -> (nodeToRhs env' cursor' pre1, [(pre1,cursor')]) where cursor' = cursor v
_ -> error $ "shift has not 1 pre!" ++ show idxInst ++ show prepre
OM.LoadIndex ax -> (codecLoadIndex !! axisIndex ax, [])
OM.LoadSize ax -> (codecLoadSize !! axisIndex ax, [])
OM.Reduce op -> let fname = T.pack ("reduce_" ++ map toLower (show op)) in
(C.FuncCallStd fname (map creatVar prepre), [])
OM.Broadcast -> let fname = "broadcast" in
(C.FuncCallStd fname (map creatVar prepre), [])
_ -> (C.CommentExpr ("TODO : " ++ showT inst) (C.toDyn (42::Int)), [])
nodeToRhs env' cursor idx = C.VarExpr $ C.Var C.UnknownType $ nodeNameCursored env' idx cursor
preVal = filterVal . FGL.pre graph
preInst = filterInst . FGL.pre graph
sucVal = filterVal . FGL.suc graph
sucInst = filterInst . FGL.suc graph
filterVal = concat . map (\(i,(xs,ys))-> map(i,)xs) . shiwake
filterInst = concat . map (\(i,(xs,ys))-> map(i,)ys) . shiwake
shiwake indices =
map (\idx -> (idx,) $ case FGL.lab graph idx of
Just (OM.NValue dval _) -> ([dval], [])
Just (OM.NInst inst _) -> ([], [inst])
Nothing -> error $ "not in graph:" ++ show idx) $
indices
idxSize = FGL.noNodes graph
allIdxSet = Set.unions [inputIdxSet, outputIdxSet, calcIdxSet]
inputIdxSet = Set.fromList $ V.toList $ Plan.inputIdxs subker
outputIdxSet = Set.fromList $ V.toList $ Plan.outputIdxs subker
calcIdxSet = Set.fromList $ V.toList $ Plan.calcIdxs subker
graph = Plan.dataflow subker
mkCppType :: Opt.Ready v g => Env v g -> DVal.DynValue -> C.TypeRep
mkCppType env x = case x of
DVal.DynValue Realm.Global c -> C.UnitType c
DVal.DynValue Realm.Local c -> containerType env c
containerType :: Env v g -> TypeRep -> C.TypeRep
containerType (Env setup _) c = case Native.language setup of
Native.CPlusPlus -> C.TemplateType "std::vector" [C.UnitType c]
Native.CUDA -> C.TemplateType "thrust::device_vector" [C.UnitType c]
mkCudaRawType :: Opt.Ready v g => Env v g -> DVal.DynValue -> C.TypeRep
mkCudaRawType env x = case x of
DVal.DynValue Realm.Global c -> C.UnitType c
DVal.DynValue Realm.Local c -> containerRawType env c
containerRawType :: Env v g -> TypeRep -> C.TypeRep
containerRawType (Env setup _) c = case Native.language setup of
Native.CPlusPlus -> C.PtrOf $ C.UnitType c
Native.CUDA -> C.PtrOf $ C.UnitType c
nodeNameUniversal :: FGL.Node -> Name
nodeNameUniversal idx = mkName $ "a" ++ showT idx
nodeNameCursored :: Opt.Ready v g => Env v g -> FGL.Node -> v g -> Name
nodeNameCursored env idx cursor = mkName $ "a" ++ showT idx ++ "_" ++
cursorToText env cursor
cursorToText :: Opt.Ready v g => Env v g -> v g -> T.Text
cursorToText _ cursor = cursorT
where
cursorT :: T.Text
cursorT = F.foldl1 connector $ compose (\i -> T.map sanitize $ showT (cursor ! i))
connector a b = a ++ "_" ++ b
sanitize c
| isDigit c = c
| c == '-' = 'm'
| c == '.' = 'd'
| otherwise = 'k'
intImm :: Int -> C.Expr
intImm = C.toDyn
tInt :: C.TypeRep
tInt = C.typeOf (undefined :: Int)
tSizet :: C.TypeRep
tSizet = C.typeOf (undefined :: Int)
mkVarExpr :: Text -> C.Expr
mkVarExpr = C.VarExpr . C.Var C.UnknownType . mkName
tVoid :: C.TypeRep
tVoid = C.typeOf ()
tHostVecInt :: C.TypeRep
tHostVecInt = C.TemplateType "thrust::host_vector" [tInt]
tDeviceVecInt :: C.TypeRep
tDeviceVecInt = C.TemplateType "thrust::device_vector" [tInt]
rhsArith :: Opt.Ready v g => Env v g -> Arith.Operator -> [C.Expr] -> C.Expr
rhsArith (Env setup _) op argExpr = case (op, argExpr) of
(Arith.Identity, [x]) -> x
(Arith.Add , [x,y]) -> C.Op2Infix "+" x y
(Arith.Sub , [x,y]) -> C.Op2Infix "-" x y
(Arith.Neg , [x]) -> C.Op1Prefix "-" x
(Arith.Mul , [x,y]) -> C.Op2Infix "*" x y
(Arith.Div , [x,y]) -> C.Op2Infix "/" x y
(Arith.Mod , [x,y]) -> C.Op2Infix "%" x y
(Arith.Inv , [x]) -> C.Op1Prefix "1/" x
(Arith.Not , [x]) -> C.Op1Prefix "!" x
(Arith.And , [x,y]) -> C.Op2Infix "&&" x y
(Arith.Or , [x,y]) -> C.Op2Infix "||" x y
(Arith.EQ , [x,y]) -> C.Op2Infix "==" x y
(Arith.NE , [x,y]) -> C.Op2Infix "!=" x y
(Arith.LT , [x,y]) -> C.Op2Infix "<" x y
(Arith.LE , [x,y]) -> C.Op2Infix "<=" x y
(Arith.GT , [x,y]) -> C.Op2Infix ">" x y
(Arith.GE , [x,y]) -> C.Op2Infix ">=" x y
(Arith.Select , [x,y,z]) -> C.Op3Infix "?" ":" x y z
(Arith.Max , [x,y]) -> C.FuncCallStd (nmsp "std::max" "max") [x,y]
(Arith.Min , [x,y]) -> C.FuncCallStd (nmsp "std::min" "min") [x,y]
(Arith.Abs , [x]) -> C.FuncCallStd "abs" [x]
(Arith.Sqrt , [x]) -> C.FuncCallStd "sqrt" [x]
(Arith.Exp , [x]) -> C.FuncCallStd "exp" [x]
(Arith.Log , [x]) -> C.FuncCallStd "log" [x]
(Arith.Sin , [x]) -> C.FuncCallStd "sin" [x]
(Arith.Cos , [x]) -> C.FuncCallStd "cos" [x]
(Arith.Tan , [x]) -> C.FuncCallStd "tan" [x]
(Arith.Asin , [x]) -> C.FuncCallStd "asin" [x]
(Arith.Acos , [x]) -> C.FuncCallStd "acos" [x]
(Arith.Atan , [x]) -> C.FuncCallStd "atan" [x]
(Arith.Atan2 , [x,y]) -> C.FuncCallStd "atan2" [x,y]
_ -> C.FuncCallStd (T.map toLower $ showT op) argExpr
where
nmsp a b = case Native.language setup of
Native.CPlusPlus -> a
Native.CUDA -> b
library :: Opt.Ready v g => Env v g -> [C.Statement]
library (Env setup _) = (:[]) $ C.Exclusive C.SourceFile $ C.RawStatement $ lib
where
lib = case Native.language setup of
Native.CPlusPlus -> cpuLib
Native.CUDA -> gpuLib
cpuLib = "template <class T> T broadcast (const T& x) {\n return x;\n}\ntemplate <class T> T reduce_sum (const std::vector<T> &xs) {\n T ret = 0;\n for (int i = 0; i < xs.size(); ++i) ret+=xs[i];\n return ret;\n}\ntemplate <class T> T reduce_min (const std::vector<T> &xs) {\n T ret = xs[0];\n for (int i = 1; i < xs.size(); ++i) ret=std::min(ret,xs[i]);\n return ret;\n}\ntemplate <class T> T reduce_max (const std::vector<T> &xs) {\n T ret = xs[0];\n for (int i = 1; i < xs.size(); ++i) ret=std::max(ret,xs[i]);\n return ret;\n}\n"
gpuLib = "template <class T>\n__device__ __host__\nT broadcast (const T& x) {\n return x;\n}\ntemplate <class T> T reduce_sum (const std::vector<T> &xs) {\n T ret = 0;\n for (int i = 0; i < xs.size(); ++i) ret+=xs[i];\n return ret;\n}\ntemplate <class T> T reduce_min (const std::vector<T> &xs) {\n T ret = xs[0];\n for (int i = 1; i < xs.size(); ++i) ret=std::min(ret,xs[i]);\n return ret;\n}\ntemplate <class T> T reduce_max (const std::vector<T> &xs) {\n T ret = xs[0];\n for (int i = 1; i < xs.size(); ++i) ret=std::max(ret,xs[i]);\n return ret;\n}\n" ++ "template <class T> T reduce_sum (const thrust::device_vector<T> &xs) {\n return thrust::reduce(xs.begin(), xs.end(), 0, thrust::plus<T>());\n}\ntemplate <class T> T reduce_min (const thrust::device_vector<T> &xs) {\n return *(thrust::min_element(xs.begin(), xs.end()));\n}\ntemplate <class T> T reduce_max (const thrust::device_vector<T> &xs) {\n return *(thrust::max_element(xs.begin(), xs.end()));\n}\n\n"