StreamExecutor: Optimize kernel argument packing

Create a single class to hold all kernel arguments, and optimize how arguments are added to it.
Change: 140556725
author: Peter Hawkins <phawkins@google.com> 2016-11-29 18:55:46 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> 2016-11-29 19:05:55 -0800
commit: bada4a5339d4567419e993b7736eb23a6f3535c4 (patch)
tree: d793b3a70518a2ab63fbc24c0e41d72eb7d7d97f /tensorflow/stream_executor/stream_executor_pimpl.h
parent: 347d3ef2a871d8212f39c7ea2b7defe63468dfbc (diff)
1 file changed, 3 insertions, 7 deletions
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index 2b5a70f807..83fd27599e 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -392,7 +392,7 @@ class StreamExecutor {
   // implementation in StreamExecutorInterface::Launch().
   bool Launch(Stream *stream, const ThreadDim &thread_dims,
               const BlockDim &block_dims, const KernelBase &kernel,
-              const std::vector<KernelArg> &args);
+              const KernelArgsArrayBase &args);
 
   // Gets-or-creates (creates with memoization) a FftSupport datatype that can
   // be used to execute FFT routines on the current platform.
@@ -427,10 +427,6 @@ class StreamExecutor {
   // previously registered.
   bool UnregisterTraceListener(TraceListener* listener);
 
-  // Converts a DeviceMemory object into a KernelArg object for passing to the
-  // device driver for kernel launch.
-  KernelArg DeviceMemoryToKernelArg(const DeviceMemoryBase &gpu_mem) const;
-
  private:
   template <typename BeginCallT, typename CompleteCallT,
             typename ReturnT, typename... BeginArgsT>
@@ -758,9 +754,9 @@ inline Stream &Stream::ThenLaunch(ThreadDim thread_dims, BlockDim block_dims,
     // we pack the variadic parameters passed as ...args into the desired
     // tuple form and pass that packed form to the StreamExecutor::Launch()
     // implementation.
-    std::vector<KernelArg> kernel_args;
-    kernel_args.reserve(kernel.Arity());
+    KernelArgsArray<sizeof...(args)> kernel_args;
     kernel.PackParams(&kernel_args, args...);
+    DCHECK(parent_ != nullptr);
     bool ok =
         parent_->Launch(this, thread_dims, block_dims, kernel, kernel_args);
     if (!ok) {
author	Peter Hawkins <phawkins@google.com>	2016-11-29 18:55:46 -0800
committer	TensorFlower Gardener <gardener@tensorflow.org>	2016-11-29 19:05:55 -0800
commit	bada4a5339d4567419e993b7736eb23a6f3535c4 (patch)
tree	d793b3a70518a2ab63fbc24c0e41d72eb7d7d97f /tensorflow/stream_executor/stream_executor_pimpl.h
parent	347d3ef2a871d8212f39c7ea2b7defe63468dfbc (diff)