aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/stream_executor/stream_executor.h
blob: 3bccaec5e3ee6c24e622f5dc7b2afb52e740603d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
// The StreamExecutor is a single-device abstraction for:
//
// * Loading/launching data-parallel kernels
// * Invoking pre-canned high-performance library routines (like matrix
//   multiply)
//
// The appropriately-typed kernel and "loader spec" are automatically generated
// for the user within a namespace by the gcudacc compiler output, so typical
// use looks like so:
//
//    namespace gpu = ::perftools::gputools;
//    namespace gcudacc = ::platforms::gpus::gcudacc;
//
//    gpu::StreamExecutor stream_exec{PlatformKind::kCuda};
//    gcudacc::kernel::MyKernel my_kernel{&stream_exec};
//    bool ok = stream_exec.GetKernel(gcudacc::spec::MyKernelSpec(),
//                                    &my_kernel);
//    if (!ok) { ... }
//    gpu::DeviceMemory<int> result = stream_exec.AllocateZeroed<int>();
//    if (result == nullptr) { ... }
//    int host_result;
//    gpu::Stream my_stream{&stream_exec};
//    my_stream
//      .Init()
//      .ThenLaunch(ThreadDim{1024}, BlockDim{1}, my_kernel, result)
//      .ThenMemcpy(&host_result, result, sizeof(host_result))
//      .BlockHostUntilDone();
//    if (!my_stream.ok()) { ... }
//    printf("%d\n", host_result);
//
// Since the device may operate asynchronously to the host, the
// Stream::BlockHostUntilDone() call forces the calling host thread to wait for
// the chain of commands specified for the Stream to complete execution.

#ifndef TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_H_
#define TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_H_

#include "tensorflow/stream_executor/device_description.h"  // IWYU pragma: export
#include "tensorflow/stream_executor/device_memory.h"    // IWYU pragma: export
#include "tensorflow/stream_executor/device_options.h"  // IWYU pragma: export
#include "tensorflow/stream_executor/event.h"           // IWYU pragma: export
#include "tensorflow/stream_executor/kernel.h"       // IWYU pragma: export
#include "tensorflow/stream_executor/kernel_spec.h"  // IWYU pragma: export
#include "tensorflow/stream_executor/launch_dim.h"   // IWYU pragma: export
#include "tensorflow/stream_executor/platform.h"     // IWYU pragma: export
#include "tensorflow/stream_executor/stream.h"       // IWYU pragma: export
#include "tensorflow/stream_executor/stream_executor_pimpl.h"  // IWYU pragma: export
#include "tensorflow/stream_executor/timer.h"            // IWYU pragma: export

#endif  // TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_H_