aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/stream_executor/kernel.cc
diff options
context:
space:
mode:
Diffstat (limited to 'tensorflow/stream_executor/kernel.cc')
-rw-r--r--tensorflow/stream_executor/kernel.cc95
1 files changed, 95 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/kernel.cc b/tensorflow/stream_executor/kernel.cc
new file mode 100644
index 0000000000..5e7fe95627
--- /dev/null
+++ b/tensorflow/stream_executor/kernel.cc
@@ -0,0 +1,95 @@
+// Implementation of the pointer-to-implementation wrapper for the data-parallel
+// kernel abstraction. KernelBase just delegates to the internal
+// platform-specific implementation instance.
+
+#include "tensorflow/stream_executor/kernel.h"
+
+#include "tensorflow/stream_executor/platform/port.h"
+
+#include "tensorflow/stream_executor/lib/demangle.h"
+#include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace perftools {
+namespace gputools {
+
+bool KernelMetadata::registers_per_thread(int *registers_per_thread) const {
+ if (has_registers_per_thread_) {
+ *registers_per_thread = registers_per_thread_;
+ return true;
+ }
+
+ return false;
+}
+
+void KernelMetadata::set_registers_per_thread(int registers_per_thread) {
+ registers_per_thread_ = registers_per_thread;
+ has_registers_per_thread_ = true;
+}
+
+bool KernelMetadata::shared_memory_bytes(int *shared_memory_bytes) const {
+ if (has_shared_memory_bytes_) {
+ *shared_memory_bytes = shared_memory_bytes_;
+ return true;
+ }
+
+ return false;
+}
+
+void KernelMetadata::set_shared_memory_bytes(int shared_memory_bytes) {
+ shared_memory_bytes_ = shared_memory_bytes;
+ has_shared_memory_bytes_ = true;
+}
+
+static internal::KernelInterface *KernelImplementationFromPlatformKind(
+ PlatformKind platform_kind) {
+ if (platform_kind == PlatformKind::kCuda) {
+ return (*internal::MakeCUDAKernelImplementation())();
+ } else if (platform_kind == PlatformKind::kOpenCL ||
+ platform_kind == PlatformKind::kOpenCLAltera) {
+ return (*internal::MakeOpenCLKernelImplementation())();
+ } else {
+ LOG(FATAL) << "cannot create kernel implementation for platform kind: "
+ << PlatformKindString(platform_kind);
+ }
+}
+
+KernelBase::KernelBase(StreamExecutor *parent)
+ : implementation_(
+ KernelImplementationFromPlatformKind(parent->platform_kind())),
+ parent_(parent) {
+ DCHECK(parent_ != nullptr);
+}
+
+KernelBase::KernelBase(StreamExecutor *parent,
+ internal::KernelInterface *implementation)
+ : implementation_(implementation), parent_(parent) {}
+
+KernelBase::~KernelBase() {}
+
+unsigned KernelBase::Arity() const { return implementation_->Arity(); }
+
+void KernelBase::SetPreferredCacheConfig(KernelCacheConfig config) {
+ return implementation_->SetPreferredCacheConfig(config);
+}
+
+KernelCacheConfig KernelBase::GetPreferredCacheConfig() const {
+ return implementation_->GetPreferredCacheConfig();
+}
+
+// Prefix stub functions emitted by the CUDA splitter.
+static const char *kStubPrefix = "__device_stub_";
+
+void KernelBase::set_name(port::StringPiece name) {
+ name_ = name.ToString();
+ port::StringPiece stubless_name = name;
+ if (name.starts_with(kStubPrefix)) {
+ stubless_name.remove_prefix(strlen(kStubPrefix));
+ }
+ demangled_name_ = port::Demangle(stubless_name.data());
+}
+
+} // namespace gputools
+} // namespace perftools