{-# LANGUAGE CPP, DeriveDataTypeable, ExistentialQuantification, NoImplicitPrelude, OverloadedStrings, TupleSections #-} {-# OPTIONS -Wall #-} module Language.Paraiso.Generator.PlanTrans ( translate ) where import qualified Algebra.Additive as Additive import Data.Char import Data.Dynamic import qualified Data.Graph.Inductive as FGL import Data.List (sortBy) import qualified Data.ListLike.String as LL import Data.ListLike.Text () import qualified Data.Foldable as F import Data.Maybe import qualified Data.Set as Set import Data.Tensor.TypeLevel import qualified Data.Text as T import qualified Data.Vector as V import qualified Language.Paraiso.Annotation as Anot import qualified Language.Paraiso.Annotation.SyncThreads as Sync import qualified Language.Paraiso.Generator.Claris as C import qualified Language.Paraiso.Generator.Native as Native import qualified Language.Paraiso.Generator.Plan as Plan import qualified Language.Paraiso.OM.Arithmetic as Arith import qualified Language.Paraiso.OM.DynValue as DVal import qualified Language.Paraiso.OM.Graph as OM import qualified Language.Paraiso.OM.Realm as Realm import qualified Language.Paraiso.Optimization.Graph as Opt import Language.Paraiso.Name import Language.Paraiso.Prelude hiding (Boolean(..)) import NumericPrelude hiding ((++)) type AnAn = Anot.Annotation data Env v g = Env (Native.Setup v g) (Plan.Plan v g AnAn) -- translate the plan to Claris translate :: Opt.Ready v g => Native.Setup v g -> Plan.Plan v g AnAn -> C.Program translate setup plan = C.Program { C.progName = name plan, C.topLevel = map include stlHeaders ++ library env ++ comments ++ subHelperFuncs ++ [ C.ClassDef $ C.Class (name plan) $ storageVars ++ constructorDef ++ memberFuncForSize env ++ subMemberFuncs ++ memberFuncs ] } where env = Env setup plan comments = (:[]) $ C.Comment $ LL.unlines [ "", "lowerMargin = " ++ showT (Plan.lowerMargin plan), "upperMargin = " ++ showT (Plan.upperMargin plan) ] constructorDef = (:[]) $ C.MemberFunc C.Public True $ (C.function C.ConstructorType (name plan)) { C.funcMemberInitializer = -- allocate memory for storage members concat $ flip map storageVars $ \memb -> case memb of C.MemberVar _ var -> [C.FuncCallUsr (name var) [C.FuncCallStd "memorySize" []]] _ -> [] } memberFuncs = V.toList $ V.imap (\idx ker -> makeFunc env idx ker) $ Plan.kernels plan subHelperFuncs = concat $ V.toList $ V.map snd $ subKernelFuncs subMemberFuncs = concat $ V.toList $ V.map fst $ subKernelFuncs subKernelFuncs = V.map (makeSubFunc env) $ Plan.subKernels plan include = C.Exclusive C.HeaderFile . C.StmtPrpr . C.PrprInclude C.Chevron stlHeaders = case Native.language setup of Native.CPlusPlus -> ["algorithm", "cmath", "vector"] Native.CUDA -> ["thrust/device_vector.h", "thrust/host_vector.h", "thrust/functional.h", "thrust/extrema.h", "thrust/reduce.h"] storageVars = V.toList $ V.map storageRefToMenber $ Plan.storages plan storageRefToMenber stRef = C.MemberVar C.Public $ C.Var (mkCppType env $ Plan.storageType stRef) (name stRef) -- Generate member functions that returns the sizes of the mesh memberFuncForSize :: Opt.Ready v g => Env v g -> [C.MemberDef] memberFuncForSize env@(Env setup plan) = makeMami False "size" size ++ makeMami True "lowerMargin" lM ++ makeMami True "upperMargin" uM ++ makeMami False "memorySize" memorySize where size = F.toList $ Native.localSize setup memorySize = F.toList $ Native.localSize setup + Plan.lowerMargin plan + Plan.upperMargin plan lM = F.toList $ Plan.lowerMargin plan uM = F.toList $ Plan.upperMargin plan makeMami alone label xs = (if not alone then (finale label xs :) else id) $ map (\(i,x) -> tiro (label ++ show i) x) $ zip [(0::Int)..] xs tiro str ret = C.MemberFunc C.Public True $ (C.function (C.typeOf (size!!0)) $ mkName $ T.pack str) { C.funcBody = [ C.StmtReturn (C.toDyn ret) ] } finale str xs = tiro str (product xs) makeFunc :: Opt.Ready v g => Env v g -> Int -> OM.Kernel v g AnAn -> C.MemberDef makeFunc env@(Env setup plan) kerIdx ker = C.MemberFunc C.Public False $ (C.function tVoid (name ker)) { C.funcBody = kernelCalls ++ storeInsts } where graph = OM.dataflow ker kernelCalls = V.toList $ V.map (\subker -> callSubKer subker $ V.map findVar $ Plan.inputIdxs subker V.++ Plan.outputIdxs subker) $ V.filter ((== kerIdx) . Plan.kernelIdx) $ Plan.subKernels plan callSubKer subker xs = C.StmtExpr $ C.FuncCallUsr (name subker) (V.toList xs) storeInsts = map swapStmt $ concatMap (\(idx, nd) -> case nd of OM.NInst (OM.Store (OM.StaticIdx statIdx))_ -> [(idx, statIdx)] _ -> []) $ FGL.labNodes graph swapStmt (idx, statIdx) = let preIdx = head $ FGL.pre graph idx in case (filter ((Plan.StaticRef statIdx==) . Plan.storageIdx) $ V.toList $ Plan.storages plan, filter ((Plan.ManifestRef kerIdx preIdx==) . Plan.storageIdx) $ V.toList $ Plan.storages plan) of ([stRef],[maRef]) -> C.StmtExpr $ C.Op2Infix "=" (C.VarExpr $ C.Var (mkCppType env $ Plan.storageType stRef) (name stRef) ) (C.VarExpr $ C.Var C.UnknownType (name maRef) ) _ -> error $ "mismatch in storage phase: " ++ show (idx, statIdx) findVar idx = let loadIdx = listToMaybe $ concat $ map (\jdx -> case FGL.lab graph jdx of Just (OM.NInst (OM.Load (OM.StaticIdx statIdx))_)-> [Plan.StaticRef statIdx] _ -> []) $ FGL.pre graph idx match stIdx | stIdx == Plan.ManifestRef kerIdx idx = True | Just stIdx == loadIdx = True | otherwise = False stRef = V.head $ V.filter ( match . Plan.storageIdx ) $ Plan.storages plan in C.VarExpr $ C.Var (mkCppType env $ Plan.storageType stRef) (name stRef) -- | Create a subKernel: a member function that performs a portion of -- actual calculations. It may also generate some helper functions -- called from the subKernel body. makeSubFunc :: Opt.Ready v g => Env v g -> Plan.SubKernelRef v g AnAn -> ([C.MemberDef], [C.Statement]) makeSubFunc env@(Env setup plan) subker = case Native.language setup of Native.CPlusPlus -> cppSolution Native.CUDA -> cudaSolution where rlm = Realm.realm subker cudaHelperName = mkName $ nameText subker ++ "_inner" cudaBodys = if rlm == Realm.Global then [] else (:[]) $ C.FuncDef $ (C.function (C.QualifiedType [C.CudaGlobal] tVoid) cudaHelperName) { C.funcArgs = makeRawSubArg env True (Plan.labNodesIn subker) ++ makeRawSubArg env False (Plan.labNodesOut subker), C.funcBody = [ C.Comment $ LL.unlines [ "", "lowerMargin = " ++ showT (Plan.lowerBoundary subker), "upperMargin = " ++ showT (Plan.upperBoundary subker) ] ] ++ loopMaker env rlm subker } (gridDim, blockDim) = Native.cudaGridSize setup cudaSolution = (,cudaBodys) $ (:[]) $ C.MemberFunc C.Public False $ (C.function tVoid (name subker)) { C.funcArgs = makeSubArg env True (Plan.labNodesIn subker) ++ makeSubArg env False (Plan.labNodesOut subker), C.funcBody = if rlm == Realm.Global then loopMaker env rlm subker else [ C.Comment $ LL.unlines [ "", "lowerMargin = " ++ showT (Plan.lowerBoundary subker), "upperMargin = " ++ showT (Plan.upperBoundary subker) ], C.RawStatement "{static bool fst = false;", C.StmtExpr $ C.FuncCallStd "if (fst) cudaFuncSetCacheConfig" [mkVarExpr $ nameText cudaHelperName, mkVarExpr "cudaFuncCachePreferL1"], C.RawStatement "fst = true;}", C.StmtExpr $ C.CudaFuncCallUsr cudaHelperName (C.toDyn gridDim) (C.toDyn blockDim) $ map takeRaw $ (V.toList $ Plan.labNodesIn subker) ++ (V.toList $ Plan.labNodesOut subker) ] } takeRaw (idx, nd)= case nd of OM.NValue typ _ -> extractor typ $ C.VarExpr $ C.Var (mkCppType env typ) (nodeNameUniversal idx) _ -> error "NValue expected" extractor typ = case Realm.realm typ of Realm.Local -> C.FuncCallStd "thrust::raw_pointer_cast" . (:[]) . C.Op1Prefix "&*" . flip C.MemberAccess (C.FuncCallStd "begin" []) Realm.Global -> id cppSolution = (,[]) $ (:[]) $ C.MemberFunc C.Public False $ (C.function tVoid (name subker)) { C.funcArgs = makeSubArg env True (Plan.labNodesIn subker) ++ makeSubArg env False (Plan.labNodesOut subker), C.funcBody = if rlm == Realm.Global then loopMaker env rlm subker else [ C.Comment $ LL.unlines [ "", "lowerMargin = " ++ showT (Plan.lowerBoundary subker), "upperMargin = " ++ showT (Plan.upperBoundary subker) ] ] ++ loopMaker env rlm subker } -- | make a subroutine argument list. makeSubArg :: Opt.Ready v g => Env v g -> Bool -> V.Vector (FGL.LNode (OM.Node v g AnAn)) -> [C.Var] makeSubArg env isConst lnodes = let f = (if isConst then C.Const else id) . C.RefOf in map (\(idx,nd)-> case nd of OM.NValue typ _ -> C.Var (f $ mkCppType env typ) (nodeNameUniversal idx) _ -> error "NValue expected" ) $ V.toList lnodes -- | make a subroutine argument list, using raw pointers. makeRawSubArg :: Opt.Ready v g => Env v g -> Bool -> V.Vector (FGL.LNode (OM.Node v g AnAn)) -> [C.Var] makeRawSubArg env isConst lnodes = let f = (if isConst then C.Const else id) in map (\(idx,nd)-> case nd of OM.NValue typ _ -> C.Var (f $ mkCudaRawType env typ) (nodeNameUniversal idx) _ -> error "NValue expected" ) $ V.toList lnodes -- | implement the loop for each subroutine loopMaker :: Opt.Ready v g => Env v g -> Realm.Realm -> Plan.SubKernelRef v g AnAn -> [C.Statement] loopMaker env@(Env setup plan) realm subker = case realm of Realm.Local -> pragma ++ [ C.StmtFor (C.VarDefSub loopCounter loopBegin) (C.Op2Infix "<" (C.VarExpr loopCounter) loopEnd) (C.Op2Infix "+=" (C.VarExpr loopCounter) loopStride) $ [C.VarDefSub addrCounter codecAddr] ++ loopContent ] Realm.Global -> loopContent where pragma = if Native.language setup == Native.CPlusPlus then [C.StmtPrpr $ C.PrprPragma "omp parallel for"] else [] (loopBegin, loopEnd, loopStride) = case Native.language setup of Native.CPlusPlus -> (intImm 0, C.toDyn (product boundarySize), intImm 1) Native.CUDA -> (loopBeginCuda, C.toDyn (product boundarySize), loopStrideCuda) loopBeginCuda = mkVarExpr "blockIdx.x * blockDim.x + threadIdx.x" loopStrideCuda = mkVarExpr "blockDim.x * gridDim.x" loopCounter = C.Var tSizet (mkName "i") memorySize = F.toList $ Native.localSize setup + Plan.lowerMargin plan + Plan.upperMargin plan boundarySize = F.toList $ Native.localSize setup + Plan.lowerMargin plan + Plan.upperMargin plan - Plan.lowerBoundary subker - Plan.upperBoundary subker codecDiv = [ if idx == 0 then (C.VarExpr loopCounter) else C.Op2Infix "/" (C.VarExpr loopCounter) (C.toDyn $ product $ take idx boundarySize) | idx <- [0..length boundarySize-1]] codecMod = [ if idx == length codecDiv-1 then x else C.Op2Infix "%" x (C.toDyn $ boundarySize !! idx) | (idx, x) <- zip [0..] codecDiv] codecModAdd = [ C.Op2Infix "+" x (C.toDyn $ Plan.lowerMargin plan ! (Axis idx)) | (idx, x) <- zip [0..] codecMod] codecAddr = if memorySize == boundarySize then C.VarExpr loopCounter else foldl1 (C.Op2Infix "+") [ C.Op2Infix "*" x (C.toDyn $ product $ take idx memorySize) | (idx, x) <- zip [0..] codecModAdd] codecLoadIndex = [ C.Op2Infix "-" x (C.toDyn ((Plan.lowerMargin plan - Plan.lowerBoundary subker) ! (Axis idx) )) | (idx, x) <- zip [0..] codecMod] codecLoadSize = [ C.toDyn (Native.localSize setup ! (Axis idx) ) | (idx, _) <- zip [0..] codecMod] codecCursor cursor = (C.Op2Infix "+" (C.VarExpr addrCounter) (C.toDyn summa)) where summa = sum $ [ cursor ! (Axis idx) * product (take idx memorySize) | (idx, _) <- zip [0..] memorySize] addrCounter = C.Var tSizet (mkName "addr_origin") loopContent = concat $ map buildExprs $ filterVal $ Set.toList allIdxSet buildExprs (idx, val@(DVal.DynValue r c)) = addSyncFunctions idx $ map (\cursor -> lhs cursor (fst $ rhsAndRequest env idx cursor) ) $ Set.toList $ lhsCursors V.! idx where lhs cursor expr = if Set.member idx outputIdxSet then C.StmtExpr $ flip (C.Op2Infix "=") expr $ case realm of Realm.Local -> (C.ArrayAccess (C.VarExpr $ C.Var (C.UnitType c) (nodeNameUniversal idx)) (C.VarExpr addrCounter)) Realm.Global -> C.VarExpr $ C.Var (C.UnitType c) (nodeNameUniversal idx) else flip C.VarDefSub expr (C.Var (C.UnitType c) $ nodeNameCursored env idx cursor) addSyncFunctions :: FGL.Node -> [C.Statement] -> [C.Statement] addSyncFunctions idx = if realm /= Realm.Local || Native.language setup /= Native.CUDA then id else foldl (.) id $ map adder anot where anot :: [Sync.Timing] anot = (maybeToList $ FGL.lab graph idx) >>= (Anot.toList . OM.getA) adder :: Sync.Timing -> [C.Statement] -> [C.Statement] adder Sync.Pre = ([C.RawStatement "__syncthreads();"] ++ ) adder Sync.Post = ( ++ [C.RawStatement "__syncthreads();"]) -- lhsCursors :: (Opt.Ready v g) => V.Vector(Set.Set(v g)) lhsCursors = V.generate idxSize f where f idx | not (Set.member idx allIdxSet) = Set.empty | Set.member idx outputIdxSet = Set.singleton Additive.zero | otherwise = lhsRequest idx -- lhsRequests :: FGL.Node -> (Set.Set (v g)) lhsRequest idx = Set.fromList $ map snd $ filter ((==idx) . fst) $ concat $ [snd $ rhsAndRequest env jdx cur| jdx <- Set.toList allIdxSet, jdx > idx, cur <- Set.toList $ lhsCursors V.! jdx ] -- rhsAndRequest :: (Opt.Ready v g) => Env v g -> FGL.Node -> v g -> (C.Expr,[(Int, v g)]) rhsAndRequest env' idx cursor = let (idxInst,inst) = case preInst idx of found:_ -> found _ -> error $ "right hand side is not inst:" ++ show idx prepre = map fst $ sortBy (\x y -> compare (snd x) (snd y)) $ FGL.lpre graph idxInst isInput = Set.member idx inputIdxSet creatVar idx' = C.VarExpr $ C.Var C.UnknownType (nodeNameUniversal idx') in case inst of _ | isInput -> case realm of Realm.Local -> (C.ArrayAccess (creatVar idx) (codecCursor cursor), []) Realm.Global -> (creatVar idx, []) OM.Imm dyn -> (C.Imm dyn, []) OM.Arith op -> (rhsArith env' op (map (nodeToRhs env' cursor) prepre), map (,cursor) prepre) OM.Shift v -> case prepre of [pre1] -> (nodeToRhs env' cursor' pre1, [(pre1,cursor')]) where cursor' = cursor - v _ -> error $ "shift has not 1 pre!" ++ show idxInst ++ show prepre OM.LoadIndex ax -> (codecLoadIndex !! axisIndex ax, []) OM.LoadSize ax -> (codecLoadSize !! axisIndex ax, []) OM.Reduce op -> let fname = T.pack ("reduce_" ++ map toLower (show op)) in (C.FuncCallStd fname (map creatVar prepre), []) OM.Broadcast -> let fname = "broadcast" in (C.FuncCallStd fname (map creatVar prepre), []) _ -> (C.CommentExpr ("TODO : " ++ showT inst) (C.toDyn (42::Int)), []) nodeToRhs env' cursor idx = C.VarExpr $ C.Var C.UnknownType $ nodeNameCursored env' idx cursor preVal = filterVal . FGL.pre graph preInst = filterInst . FGL.pre graph sucVal = filterVal . FGL.suc graph sucInst = filterInst . FGL.suc graph filterVal = concat . map (\(i,(xs,ys))-> map(i,)xs) . shiwake filterInst = concat . map (\(i,(xs,ys))-> map(i,)ys) . shiwake shiwake indices = map (\idx -> (idx,) $ case FGL.lab graph idx of Just (OM.NValue dval _) -> ([dval], []) Just (OM.NInst inst _) -> ([], [inst]) Nothing -> error $ "not in graph:" ++ show idx) $ indices idxSize = FGL.noNodes graph allIdxSet = Set.unions [inputIdxSet, outputIdxSet, calcIdxSet] inputIdxSet = Set.fromList $ V.toList $ Plan.inputIdxs subker outputIdxSet = Set.fromList $ V.toList $ Plan.outputIdxs subker calcIdxSet = Set.fromList $ V.toList $ Plan.calcIdxs subker graph = Plan.dataflow subker -- | convert a DynValue to C type representation mkCppType :: Opt.Ready v g => Env v g -> DVal.DynValue -> C.TypeRep mkCppType env x = case x of DVal.DynValue Realm.Global c -> C.UnitType c DVal.DynValue Realm.Local c -> containerType env c containerType :: Env v g -> TypeRep -> C.TypeRep containerType (Env setup _) c = case Native.language setup of Native.CPlusPlus -> C.TemplateType "std::vector" [C.UnitType c] Native.CUDA -> C.TemplateType "thrust::device_vector" [C.UnitType c] -- | convert a DynValue to raw-pointer type representation -- for example used within CUDA kernel mkCudaRawType :: Opt.Ready v g => Env v g -> DVal.DynValue -> C.TypeRep mkCudaRawType env x = case x of DVal.DynValue Realm.Global c -> C.UnitType c DVal.DynValue Realm.Local c -> containerRawType env c containerRawType :: Env v g -> TypeRep -> C.TypeRep containerRawType (Env setup _) c = case Native.language setup of Native.CPlusPlus -> C.PtrOf $ C.UnitType c Native.CUDA -> C.PtrOf $ C.UnitType c -- | a universal naming rule for a node. nodeNameUniversal :: FGL.Node -> Name nodeNameUniversal idx = mkName $ "a" ++ showT idx nodeNameCursored :: Opt.Ready v g => Env v g -> FGL.Node -> v g -> Name nodeNameCursored env idx cursor = mkName $ "a" ++ showT idx ++ "_" ++ cursorToText env cursor cursorToText :: Opt.Ready v g => Env v g -> v g -> T.Text cursorToText _ cursor = cursorT where cursorT :: T.Text cursorT = F.foldl1 connector $ compose (\i -> T.map sanitize $ showT (cursor ! i)) connector a b = a ++ "_" ++ b sanitize c | isDigit c = c | c == '-' = 'm' | c == '.' = 'd' | otherwise = 'k' -- | Utility Types intImm :: Int -> C.Expr intImm = C.toDyn tInt :: C.TypeRep tInt = C.typeOf (undefined :: Int) tSizet :: C.TypeRep tSizet = C.typeOf (undefined :: Int) mkVarExpr :: Text -> C.Expr mkVarExpr = C.VarExpr . C.Var C.UnknownType . mkName tVoid :: C.TypeRep tVoid = C.typeOf () tHostVecInt :: C.TypeRep tHostVecInt = C.TemplateType "thrust::host_vector" [tInt] tDeviceVecInt :: C.TypeRep tDeviceVecInt = C.TemplateType "thrust::device_vector" [tInt] rhsArith :: Opt.Ready v g => Env v g -> Arith.Operator -> [C.Expr] -> C.Expr rhsArith (Env setup _) op argExpr = case (op, argExpr) of (Arith.Identity, [x]) -> x (Arith.Add , [x,y]) -> C.Op2Infix "+" x y (Arith.Sub , [x,y]) -> C.Op2Infix "-" x y (Arith.Neg , [x]) -> C.Op1Prefix "-" x (Arith.Mul , [x,y]) -> C.Op2Infix "*" x y (Arith.Div , [x,y]) -> C.Op2Infix "/" x y (Arith.Mod , [x,y]) -> C.Op2Infix "%" x y (Arith.Inv , [x]) -> C.Op1Prefix "1/" x (Arith.Not , [x]) -> C.Op1Prefix "!" x (Arith.And , [x,y]) -> C.Op2Infix "&&" x y (Arith.Or , [x,y]) -> C.Op2Infix "||" x y (Arith.EQ , [x,y]) -> C.Op2Infix "==" x y (Arith.NE , [x,y]) -> C.Op2Infix "!=" x y (Arith.LT , [x,y]) -> C.Op2Infix "<" x y (Arith.LE , [x,y]) -> C.Op2Infix "<=" x y (Arith.GT , [x,y]) -> C.Op2Infix ">" x y (Arith.GE , [x,y]) -> C.Op2Infix ">=" x y (Arith.Select , [x,y,z]) -> C.Op3Infix "?" ":" x y z (Arith.Max , [x,y]) -> C.FuncCallStd (nmsp "std::max" "max") [x,y] (Arith.Min , [x,y]) -> C.FuncCallStd (nmsp "std::min" "min") [x,y] (Arith.Abs , [x]) -> C.FuncCallStd "abs" [x] (Arith.Sqrt , [x]) -> C.FuncCallStd "sqrt" [x] (Arith.Exp , [x]) -> C.FuncCallStd "exp" [x] (Arith.Log , [x]) -> C.FuncCallStd "log" [x] (Arith.Sin , [x]) -> C.FuncCallStd "sin" [x] (Arith.Cos , [x]) -> C.FuncCallStd "cos" [x] (Arith.Tan , [x]) -> C.FuncCallStd "tan" [x] (Arith.Asin , [x]) -> C.FuncCallStd "asin" [x] (Arith.Acos , [x]) -> C.FuncCallStd "acos" [x] (Arith.Atan , [x]) -> C.FuncCallStd "atan" [x] (Arith.Atan2 , [x,y]) -> C.FuncCallStd "atan2" [x,y] _ -> C.FuncCallStd (T.map toLower $ showT op) argExpr where nmsp a b = case Native.language setup of Native.CPlusPlus -> a Native.CUDA -> b library :: Opt.Ready v g => Env v g -> [C.Statement] library (Env setup _) = (:[]) $ C.Exclusive C.SourceFile $ C.RawStatement $ lib where lib = case Native.language setup of Native.CPlusPlus -> cpuLib Native.CUDA -> gpuLib cpuLib = "template T broadcast (const T& x) {\n return x;\n}\ntemplate T reduce_sum (const std::vector &xs) {\n T ret = 0;\n for (int i = 0; i < xs.size(); ++i) ret+=xs[i];\n return ret;\n}\ntemplate T reduce_min (const std::vector &xs) {\n T ret = xs[0];\n for (int i = 1; i < xs.size(); ++i) ret=std::min(ret,xs[i]);\n return ret;\n}\ntemplate T reduce_max (const std::vector &xs) {\n T ret = xs[0];\n for (int i = 1; i < xs.size(); ++i) ret=std::max(ret,xs[i]);\n return ret;\n}\n" gpuLib = "template \n__device__ __host__\nT broadcast (const T& x) {\n return x;\n}\ntemplate T reduce_sum (const std::vector &xs) {\n T ret = 0;\n for (int i = 0; i < xs.size(); ++i) ret+=xs[i];\n return ret;\n}\ntemplate T reduce_min (const std::vector &xs) {\n T ret = xs[0];\n for (int i = 1; i < xs.size(); ++i) ret=std::min(ret,xs[i]);\n return ret;\n}\ntemplate T reduce_max (const std::vector &xs) {\n T ret = xs[0];\n for (int i = 1; i < xs.size(); ++i) ret=std::max(ret,xs[i]);\n return ret;\n}\n" ++ "template T reduce_sum (const thrust::device_vector &xs) {\n return thrust::reduce(xs.begin(), xs.end(), 0, thrust::plus());\n}\ntemplate T reduce_min (const thrust::device_vector &xs) {\n return *(thrust::min_element(xs.begin(), xs.end()));\n}\ntemplate T reduce_max (const thrust::device_vector &xs) {\n return *(thrust::max_element(xs.begin(), xs.end()));\n}\n\n"