diff options
-rw-r--r-- | tensorflow/core/platform/default/stream_executor.h | 2 | ||||
-rw-r--r-- | tensorflow/core/platform/default/stream_executor_no_cuda.h | 2 | ||||
-rw-r--r-- | tensorflow/stream_executor/BUILD | 12 | ||||
-rw-r--r-- | tensorflow/stream_executor/host/host_gpu_executor.cc | 263 | ||||
-rw-r--r-- | tensorflow/stream_executor/host/host_gpu_executor.h | 215 | ||||
-rw-r--r-- | tensorflow/stream_executor/host/host_platform.cc | 120 | ||||
-rw-r--r-- | tensorflow/stream_executor/host/host_platform.h | 88 | ||||
-rw-r--r-- | tensorflow/stream_executor/host/host_platform_id.cc | 26 | ||||
-rw-r--r-- | tensorflow/stream_executor/host/host_platform_id.h | 36 | ||||
-rw-r--r-- | tensorflow/stream_executor/host/host_stream.cc | 57 | ||||
-rw-r--r-- | tensorflow/stream_executor/host/host_stream.h | 58 | ||||
-rw-r--r-- | tensorflow/stream_executor/host/host_timer.cc | 52 | ||||
-rw-r--r-- | tensorflow/stream_executor/host/host_timer.h | 63 | ||||
-rw-r--r-- | tensorflow/stream_executor/lib/env.h | 1 | ||||
-rw-r--r-- | tensorflow/stream_executor/multi_platform_manager.cc | 10 | ||||
-rw-r--r-- | tensorflow/stream_executor/multi_platform_manager.h | 7 |
16 files changed, 1001 insertions, 11 deletions
diff --git a/tensorflow/core/platform/default/stream_executor.h b/tensorflow/core/platform/default/stream_executor.h index f96bf7dc94..3c178ffce0 100644 --- a/tensorflow/core/platform/default/stream_executor.h +++ b/tensorflow/core/platform/default/stream_executor.h @@ -20,10 +20,12 @@ limitations under the License. // IWYU pragma: friend third_party/tensorflow/core/platform/stream_executor.h #include "tensorflow/core/platform/default/from_stream_executor_status.h" +#include "tensorflow/stream_executor/cuda/cuda_platform_id.h" #include "tensorflow/stream_executor/device_memory.h" #include "tensorflow/stream_executor/dnn.h" #include "tensorflow/stream_executor/dso_loader.h" #include "tensorflow/stream_executor/event.h" +#include "tensorflow/stream_executor/host/host_platform_id.h" #include "tensorflow/stream_executor/lib/status.h" #include "tensorflow/stream_executor/multi_platform_manager.h" #include "tensorflow/stream_executor/platform.h" diff --git a/tensorflow/core/platform/default/stream_executor_no_cuda.h b/tensorflow/core/platform/default/stream_executor_no_cuda.h index 445f63a9e7..faa65256f9 100644 --- a/tensorflow/core/platform/default/stream_executor_no_cuda.h +++ b/tensorflow/core/platform/default/stream_executor_no_cuda.h @@ -20,10 +20,12 @@ limitations under the License. // IWYU pragma: friend third_party/tensorflow/core/platform/stream_executor_no_cuda.h #include "tensorflow/core/platform/default/from_stream_executor_status.h" +#include "tensorflow/stream_executor/cuda/cuda_platform_id.h" #include "tensorflow/stream_executor/device_memory.h" #include "tensorflow/stream_executor/dnn.h" #include "tensorflow/stream_executor/dso_loader.h" #include "tensorflow/stream_executor/event.h" +#include "tensorflow/stream_executor/host/host_platform_id.h" #include "tensorflow/stream_executor/lib/status.h" #include "tensorflow/stream_executor/multi_platform_manager.h" #include "tensorflow/stream_executor/platform.h" diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD index 256b128750..a84f9d5f9d 100644 --- a/tensorflow/stream_executor/BUILD +++ b/tensorflow/stream_executor/BUILD @@ -7,6 +7,8 @@ cc_library( srcs = glob( [ "*.cc", + "host/*.cc", + "cuda/cuda_platform_id.cc", "lib/*.cc", "platform/default/*.cc", ], @@ -14,13 +16,17 @@ cc_library( "**/*_test.cc", ], ) + if_cuda( - glob([ - "cuda/*.cc", - ]), + glob( + [ + "cuda/*.cc", + ], + exclude = ["cuda/cuda_platform_id.cc"], + ), ), hdrs = glob([ "*.h", "cuda/*.h", + "host/*.h", "lib/*.h", "lib/gtl/*.h", "platform/**/*.h", diff --git a/tensorflow/stream_executor/host/host_gpu_executor.cc b/tensorflow/stream_executor/host/host_gpu_executor.cc new file mode 100644 index 0000000000..ff07432bb7 --- /dev/null +++ b/tensorflow/stream_executor/host/host_gpu_executor.cc @@ -0,0 +1,263 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Implementation of HostExecutor class [of those methods not defined in the +// class declaration]. +#include "tensorflow/stream_executor/host/host_gpu_executor.h" + +#include <string.h> + +#include "tensorflow/stream_executor/host/host_platform_id.h" +#include "tensorflow/stream_executor/host/host_stream.h" +#include "tensorflow/stream_executor/host/host_timer.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/plugin_registry.h" + +namespace gpu = ::perftools::gputools; + +namespace perftools { +namespace gputools { +namespace host { + +HostStream *AsHostStream(Stream *stream) { + DCHECK(stream != nullptr); + return dynamic_cast<HostStream *>(stream->implementation()); +} + +HostExecutor::HostExecutor(const PluginConfig &plugin_config) + : plugin_config_(plugin_config) {} + +HostExecutor::~HostExecutor() {} + +void *HostExecutor::Allocate(uint64 size) { return new char[size]; } + +void *HostExecutor::AllocateSubBuffer(DeviceMemoryBase *parent, + uint64 offset_bytes, uint64 size_bytes) { + return reinterpret_cast<char *>(parent->opaque()) + offset_bytes; +} + +void HostExecutor::Deallocate(DeviceMemoryBase *mem) { + if (!mem->is_sub_buffer()) { + delete[] static_cast<char *>(mem->opaque()); + } +} + +bool HostExecutor::SynchronousMemZero(DeviceMemoryBase *location, uint64 size) { + memset(location->opaque(), 0, size); + return true; +} + +bool HostExecutor::SynchronousMemSet(DeviceMemoryBase *location, int value, + uint64 size) { + memset(location->opaque(), value, size); + return true; +} + +bool HostExecutor::Memcpy(Stream *stream, void *host_dst, + const DeviceMemoryBase &gpu_src, uint64 size) { + // Enqueue the [asynchronous] memcpy on the stream (HostStream) associated + // with the HostExecutor. + void *src_mem = const_cast<void *>(gpu_src.opaque()); + AsHostStream(stream)->EnqueueTask( + [host_dst, src_mem, size]() { memcpy(host_dst, src_mem, size); }); + return true; +} + +bool HostExecutor::Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, + const void *host_src, uint64 size) { + void *dst_mem = gpu_dst->opaque(); + // Enqueue the [asynchronous] memcpy on the stream (HostStream) associated + // with the HostExecutor. + AsHostStream(stream)->EnqueueTask( + [dst_mem, host_src, size]() { memcpy(dst_mem, host_src, size); }); + return true; +} + +bool HostExecutor::MemcpyDeviceToDevice(Stream *stream, + DeviceMemoryBase *gpu_dst, + const DeviceMemoryBase &gpu_src, + uint64 size) { + void *dst_mem = gpu_dst->opaque(); + void *src_mem = const_cast<void *>(gpu_src.opaque()); + // Enqueue this [asynchronous] "device-to-device" (i.e., host-to-host, given + // the nature of the HostExecutor) memcpy on the stream (HostStream) + // associated with the HostExecutor. + AsHostStream(stream)->EnqueueTask( + [src_mem, dst_mem, size]() { memcpy(src_mem, dst_mem, size); }); + return true; +} + +bool HostExecutor::MemZero(Stream *stream, DeviceMemoryBase *location, + uint64 size) { + void *gpu_mem = location->opaque(); + // Enqueue the [asynchronous] memzero on the stream (HostStream) associated + // with the HostExecutor. + AsHostStream(stream)->EnqueueTask( + [gpu_mem, size]() { memset(gpu_mem, 0, size); }); + return true; +} + +bool HostExecutor::Memset(Stream *stream, DeviceMemoryBase *location, + uint8 pattern, uint64 size) { + void *gpu_mem = location->opaque(); + // Enqueue the [asynchronous] memzero on the stream (HostStream) associated + // with the HostExecutor. + AsHostStream(stream)->EnqueueTask( + [gpu_mem, size, pattern]() { memset(gpu_mem, pattern, size); }); + return true; +} + +bool HostExecutor::Memset32(Stream *stream, DeviceMemoryBase *location, + uint32 pattern, uint64 size) { + void *gpu_mem = location->opaque(); + // Enqueue the [asynchronous] memzero on the stream (HostStream) associated + // with the HostExecutor. + AsHostStream(stream)->EnqueueTask( + [gpu_mem, size, pattern]() { memset(gpu_mem, pattern, size); }); + return true; +} + +bool HostExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst, + const void *host_src, uint64 size) { + memcpy(gpu_dst->opaque(), host_src, size); + return true; +} + +bool HostExecutor::SynchronousMemcpy(void *host_dst, + const DeviceMemoryBase &gpu_src, + uint64 size) { + memcpy(host_dst, gpu_src.opaque(), size); + return true; +} + +bool HostExecutor::SynchronousMemcpyDeviceToDevice( + DeviceMemoryBase *gpu_dst, const DeviceMemoryBase &gpu_src, uint64 size) { + memcpy(gpu_dst->opaque(), gpu_src.opaque(), size); + return true; +} + +bool HostExecutor::HostCallback(Stream *stream, + std::function<void()> callback) { + AsHostStream(stream)->EnqueueTask(callback); + return true; +} + +bool HostExecutor::AllocateStream(Stream *stream) { return true; } + +void HostExecutor::DeallocateStream(Stream *stream) {} + +bool HostExecutor::CreateStreamDependency(Stream *dependent, Stream *other) { + AsHostStream(dependent)->EnqueueTask( + [other]() { other->BlockHostUntilDone(); }); + AsHostStream(dependent)->BlockUntilDone(); + return true; +} + +bool HostExecutor::StartTimer(Stream *stream, Timer *timer) { + dynamic_cast<HostTimer *>(timer->implementation())->Start(stream); + return true; +} + +bool HostExecutor::StopTimer(Stream *stream, Timer *timer) { + dynamic_cast<HostTimer *>(timer->implementation())->Stop(stream); + return true; +} + +bool HostExecutor::BlockHostUntilDone(Stream *stream) { + AsHostStream(stream)->BlockUntilDone(); + return true; +} + +DeviceDescription *HostExecutor::PopulateDeviceDescription() const { + internal::DeviceDescriptionBuilder builder; + + builder.set_device_address_bits(64); + + // TODO(rspringer): How to report a value that's based in reality but that + // doesn't result in thrashing or other badness? 4GiB chosen arbitrarily. + builder.set_device_memory_size(static_cast<uint64>(4) * 1024 * 1024 * 1024); + + builder.set_clock_rate_ghz(static_cast<float>(CLOCKS_PER_SEC) / 1e9); + + auto built = builder.Build(); + return built.release(); +} + +bool HostExecutor::SupportsBlas() const { + return PluginRegistry::Instance() + ->GetFactory<PluginRegistry::BlasFactory>(kHostPlatformId, + plugin_config_.blas()) + .ok(); +} + +blas::BlasSupport *HostExecutor::CreateBlas() { + PluginRegistry *registry = PluginRegistry::Instance(); + port::StatusOr<PluginRegistry::BlasFactory> status = + registry->GetFactory<PluginRegistry::BlasFactory>(kHostPlatformId, + plugin_config_.blas()); + if (!status.ok()) { + LOG(ERROR) << "Unable to retrieve BLAS factory: " + << status.status().error_message(); + return nullptr; + } + + return status.ValueOrDie()(this); +} + +bool HostExecutor::SupportsFft() const { + return PluginRegistry::Instance() + ->GetFactory<PluginRegistry::FftFactory>(kHostPlatformId, + plugin_config_.fft()) + .ok(); +} + +fft::FftSupport *HostExecutor::CreateFft() { + PluginRegistry *registry = PluginRegistry::Instance(); + port::StatusOr<PluginRegistry::FftFactory> status = + registry->GetFactory<PluginRegistry::FftFactory>(kHostPlatformId, + plugin_config_.fft()); + if (!status.ok()) { + LOG(ERROR) << "Unable to retrieve FFT factory: " + << status.status().error_message(); + return nullptr; + } + + return status.ValueOrDie()(this); +} + +bool HostExecutor::SupportsRng() const { + return PluginRegistry::Instance() + ->GetFactory<PluginRegistry::RngFactory>(kHostPlatformId, + plugin_config_.rng()) + .ok(); +} + +rng::RngSupport *HostExecutor::CreateRng() { + PluginRegistry *registry = PluginRegistry::Instance(); + port::StatusOr<PluginRegistry::RngFactory> status = + registry->GetFactory<PluginRegistry::RngFactory>(kHostPlatformId, + plugin_config_.rng()); + if (!status.ok()) { + LOG(ERROR) << "Unable to retrieve RNG factory: " + << status.status().error_message(); + return nullptr; + } + + return status.ValueOrDie()(this); +} + +} // namespace host +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/host/host_gpu_executor.h b/tensorflow/stream_executor/host/host_gpu_executor.h new file mode 100644 index 0000000000..f217f7947f --- /dev/null +++ b/tensorflow/stream_executor/host/host_gpu_executor.h @@ -0,0 +1,215 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Declares the HostExecutor class, which is a CPU-only implementation of +// the StreamExecutor interface. For now, this is used for testing and to +// examine the performance of host-based StreamExecutor code. +#ifndef TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_GPU_EXECUTOR_H_ +#define TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_GPU_EXECUTOR_H_ + +#include "tensorflow/stream_executor/blas.h" +#include "tensorflow/stream_executor/host/host_stream.h" +#include "tensorflow/stream_executor/host/host_timer.h" +#include "tensorflow/stream_executor/lib/error.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/rng.h" +#include "tensorflow/stream_executor/stream_executor.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" + +namespace perftools { +namespace gputools { +namespace host { + +// An implementation of StreamExecutor that does no communication or interaction +// with a device, but DOES perform memory operations backed by the host. +// Plugin routines (RNG, BLAS) are also supported and functional. +// Kernel invocations will fail, but host callbacks may be enqueued on this +// executor and its associated stream, and should follow standard ordering +// semantics. +// +// This is useful for evaluating the performance of host-based or fallback +// routines executed under the context of a GPU executor. +// See stream_executor.h for description of the below operations. +class HostExecutor : public internal::StreamExecutorInterface { + public: + explicit HostExecutor(const PluginConfig &plugin_config); + ~HostExecutor() override; + + port::Status Init(int device_ordinal, DeviceOptions device_options) override { + return port::Status::OK(); + } + + bool GetKernel(const MultiKernelLoaderSpec &spec, + KernelBase *kernel) override { + return false; + } + bool Launch(Stream *stream, const ThreadDim &thread_dims, + const BlockDim &block_dims, const KernelBase &kernel, + const KernelArgsArrayBase &args) override { + return false; + } + + void *Allocate(uint64 size) override; + void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes, + uint64 size_bytes) override; + void Deallocate(DeviceMemoryBase *mem) override; + + void *HostMemoryAllocate(uint64 size) override { return new char[size]; } + void HostMemoryDeallocate(void *mem) override { + delete[] static_cast<char *>(mem); + } + bool HostMemoryRegister(void *mem, uint64 size) override { return true; } + bool HostMemoryUnregister(void *mem) override { return true; } + + bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &gpu_src, + uint64 size) override; + bool Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, const void *host_src, + uint64 size) override; + bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst, + const DeviceMemoryBase &host_src, + uint64 size) override; + + bool MemZero(Stream *stream, DeviceMemoryBase *location, + uint64 size) override; + bool Memset(Stream *stream, DeviceMemoryBase *location, uint8 pattern, + uint64 size) override; + bool Memset32(Stream *stream, DeviceMemoryBase *location, uint32 pattern, + uint64 size) override; + + // No "synchronize all activity" implemented for this platform at the moment. + bool SynchronizeAllActivity() override { return false; } + bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override; + + bool SynchronousMemSet(DeviceMemoryBase *location, int value, + uint64 size) override; + + bool SynchronousMemcpy(DeviceMemoryBase *gpu_dst, const void *host_src, + uint64 size) override; + bool SynchronousMemcpy(void *host_dst, const DeviceMemoryBase &gpu_src, + uint64 size) override; + bool SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *gpu_dst, + const DeviceMemoryBase &gpu_src, + uint64 size) override; + + bool HostCallback(Stream *stream, std::function<void()> callback) override; + + port::Status AllocateEvent(Event *event) override { + return port::Status{port::error::UNIMPLEMENTED, ""}; + } + + port::Status DeallocateEvent(Event *event) override { + return port::Status{port::error::UNIMPLEMENTED, ""}; + } + + port::Status RecordEvent(Stream *stream, Event *event) override { + return port::Status{port::error::UNIMPLEMENTED, ""}; + } + + port::Status WaitForEvent(Stream *stream, Event *event) override { + return port::Status{port::error::UNIMPLEMENTED, ""}; + } + + Event::Status PollForEventStatus(Event *event) override { + return Event::Status::kError; + } + + bool AllocateStream(Stream *stream) override; + void DeallocateStream(Stream *stream) override; + bool CreateStreamDependency(Stream *dependent, Stream *other) override; + + // No special initialization is necessary for host timers. + bool AllocateTimer(Timer *timer) override { return true; } + + void DeallocateTimer(Timer *timer) override {} + + bool StartTimer(Stream *stream, Timer *timer) override; + + bool StopTimer(Stream *stream, Timer *timer) override; + + bool BlockHostUntilDone(Stream *stream) override; + + int PlatformDeviceCount() override { return 1; } + + bool DeviceMemoryUsage(int64 *free, int64 *total) const override { + return false; + } + + DeviceDescription *PopulateDeviceDescription() const override; + + port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override { + return port::Status::OK(); + } + + bool CanEnablePeerAccessTo(StreamExecutorInterface *other) override { + return true; + } + + SharedMemoryConfig GetDeviceSharedMemoryConfig() override { + LOG(INFO) << "Shared memory configuration is unsupported for host " + << "executors."; + return SharedMemoryConfig::kDefault; + } + + port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override { + string error_msg{ + "Shared memory configuration is unsupported for host " + "executors."}; + LOG(INFO) << error_msg; + return port::Status{port::error::UNIMPLEMENTED, error_msg}; + } + + bool SupportsBlas() const override; + blas::BlasSupport *CreateBlas() override; + + bool SupportsDnn() const override { return false; } + dnn::DnnSupport *CreateDnn() override { return nullptr; } + + bool SupportsFft() const override; + fft::FftSupport *CreateFft() override; + + bool SupportsRng() const override; + rng::RngSupport *CreateRng() override; + + std::unique_ptr<internal::EventInterface> CreateEventImplementation() + override { + LOG(WARNING) << "Events not currently supported by HostExecutor."; + return nullptr; + } + + std::unique_ptr<internal::KernelInterface> CreateKernelImplementation() + override { + return nullptr; + } + + std::unique_ptr<internal::StreamInterface> GetStreamImplementation() + override { + return std::unique_ptr<internal::StreamInterface>(new HostStream()); + } + + std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override { + return std::unique_ptr<internal::TimerInterface>(new HostTimer()); + } + + void *CudaContextHack() override { return nullptr; } + + private: + const PluginConfig plugin_config_; +}; + +} // namespace host +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_GPU_EXECUTOR_H_ diff --git a/tensorflow/stream_executor/host/host_platform.cc b/tensorflow/stream_executor/host/host_platform.cc new file mode 100644 index 0000000000..1fa4dfce84 --- /dev/null +++ b/tensorflow/stream_executor/host/host_platform.cc @@ -0,0 +1,120 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/stream_executor/host/host_platform.h" + +#include <thread> + +#include "tensorflow/stream_executor/host/host_gpu_executor.h" +#include "tensorflow/stream_executor/host/host_platform_id.h" +#include "tensorflow/stream_executor/lib/error.h" +#include "tensorflow/stream_executor/lib/initialize.h" +#include "tensorflow/stream_executor/lib/ptr_util.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/status_macros.h" +#include "tensorflow/stream_executor/lib/stringprintf.h" + +namespace gpu = ::perftools::gputools; + +namespace perftools { +namespace gputools { +namespace host { + +HostPlatform::HostPlatform() : name_("Host") {} + +HostPlatform::~HostPlatform() {} + +Platform::Id HostPlatform::id() const { return kHostPlatformId; } + +int HostPlatform::VisibleDeviceCount() const { + return std::thread::hardware_concurrency(); +} + +const string& HostPlatform::Name() const { return name_; } + +port::StatusOr<StreamExecutor*> HostPlatform::ExecutorForDevice(int ordinal) { + StreamExecutorConfig config; + config.ordinal = ordinal; + config.plugin_config = PluginConfig(); + config.device_options = DeviceOptions::Default(); + return GetExecutor(config); +} + +port::StatusOr<StreamExecutor*> HostPlatform::ExecutorForDeviceWithPluginConfig( + int device_ordinal, const PluginConfig& plugin_config) { + StreamExecutorConfig config; + config.ordinal = device_ordinal; + config.plugin_config = plugin_config; + config.device_options = DeviceOptions::Default(); + return GetExecutor(config); +} + +port::StatusOr<StreamExecutor*> HostPlatform::GetExecutor( + const StreamExecutorConfig& config) { + mutex_lock lock(executors_mutex_); + + port::StatusOr<StreamExecutor*> status = executor_cache_.Get(config); + if (status.ok()) { + return status.ValueOrDie(); + } + + port::StatusOr<std::unique_ptr<StreamExecutor>> executor = + GetUncachedExecutor(config); + if (!executor.ok()) { + return executor.status(); + } + + StreamExecutor* naked_executor = executor.ValueOrDie().get(); + SE_RETURN_IF_ERROR( + executor_cache_.Insert(config, executor.ConsumeValueOrDie())); + return naked_executor; +} + +port::StatusOr<std::unique_ptr<StreamExecutor>> +HostPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) { + auto executor = port::MakeUnique<StreamExecutor>( + this, new HostExecutor(config.plugin_config)); + auto init_status = executor->Init(config.ordinal, config.device_options); + if (!init_status.ok()) { + return port::Status{ + port::error::INTERNAL, + port::Printf( + "failed initializing StreamExecutor for device ordinal %d: %s", + config.ordinal, init_status.ToString().c_str())}; + } + + return std::move(executor); +} + +void HostPlatform::RegisterTraceListener( + std::unique_ptr<TraceListener> listener) { + LOG(FATAL) << "not yet implemented: register host trace listener"; +} + +void HostPlatform::UnregisterTraceListener(TraceListener* listener) { + LOG(FATAL) << "not yet implemented: unregister host trace listener"; +} + +static void InitializeHostPlatform() { + std::unique_ptr<gpu::Platform> platform(new gpu::host::HostPlatform); + SE_CHECK_OK(gpu::MultiPlatformManager::RegisterPlatform(std::move(platform))); +} + +} // namespace host +} // namespace gputools +} // namespace perftools + +REGISTER_MODULE_INITIALIZER( + host_platform, perftools::gputools::host::InitializeHostPlatform()); diff --git a/tensorflow/stream_executor/host/host_platform.h b/tensorflow/stream_executor/host/host_platform.h new file mode 100644 index 0000000000..86805ef3e3 --- /dev/null +++ b/tensorflow/stream_executor/host/host_platform.h @@ -0,0 +1,88 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Declares the "host" platform, which is a CPU-only implementation of the +// StreamExecutor. The host platform only supports memory operations and plugin +// routines, and is primarily used for testing. +#ifndef TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_PLATFORM_H_ +#define TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_PLATFORM_H_ + +#include <memory> +#include <string> +#include <vector> + +#include "tensorflow/stream_executor/executor_cache.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/multi_platform_manager.h" +#include "tensorflow/stream_executor/platform.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/platform/thread_annotations.h" +#include "tensorflow/stream_executor/stream_executor_pimpl.h" +#include "tensorflow/stream_executor/trace_listener.h" + +namespace perftools { +namespace gputools { +namespace host { + +// Host (CPU) platform plugin, registered as a singleton value via module +// initializer. +class HostPlatform : public Platform { + public: + HostPlatform(); + ~HostPlatform() override; + + Platform::Id id() const override; + + // Device count is less clear-cut for CPUs than accelerators. This call + // currently returns the number of thread units in the host, as reported by + // base::NumCPUs(). + int VisibleDeviceCount() const override; + + const string& Name() const override; + + port::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) override; + + port::StatusOr<StreamExecutor*> ExecutorForDeviceWithPluginConfig( + int ordinal, const PluginConfig& config) override; + + port::StatusOr<StreamExecutor*> GetExecutor( + const StreamExecutorConfig& config) override; + + port::StatusOr<std::unique_ptr<StreamExecutor>> GetUncachedExecutor( + const StreamExecutorConfig& config) override; + + void RegisterTraceListener(std::unique_ptr<TraceListener> listener) override; + + void UnregisterTraceListener(TraceListener* listener) override; + + private: + // This platform's name. + string name_; + + // mutex that guards the ordinal-to-executor map. + mutable mutex executors_mutex_; + + // Cache of created StreamExecutors. + ExecutorCache executor_cache_; + + SE_DISALLOW_COPY_AND_ASSIGN(HostPlatform); +}; + +} // namespace host +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_PLATFORM_H_ diff --git a/tensorflow/stream_executor/host/host_platform_id.cc b/tensorflow/stream_executor/host/host_platform_id.cc new file mode 100644 index 0000000000..69a203f298 --- /dev/null +++ b/tensorflow/stream_executor/host/host_platform_id.cc @@ -0,0 +1,26 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/stream_executor/host/host_platform_id.h" + +namespace perftools { +namespace gputools { +namespace host { + +PLATFORM_DEFINE_ID(kHostPlatformId); + +} // namespace host +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/host/host_platform_id.h b/tensorflow/stream_executor/host/host_platform_id.h new file mode 100644 index 0000000000..61d84ea2e2 --- /dev/null +++ b/tensorflow/stream_executor/host/host_platform_id.h @@ -0,0 +1,36 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_PLATFORM_ID_H_ +#define TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_PLATFORM_ID_H_ + +#include "tensorflow/stream_executor/platform.h" + +namespace perftools { +namespace gputools { +namespace host { + +// Opaque and unique identifier for the host platform. +// This is needed so that plugins can refer to/identify this platform without +// instantiating a HostPlatform object. +// This is broken out here to avoid a circular dependency between HostPlatform +// and HostStreamExecutor. +extern const Platform::Id kHostPlatformId; + +} // namespace host +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_PLATFORM_ID_H_ diff --git a/tensorflow/stream_executor/host/host_stream.cc b/tensorflow/stream_executor/host/host_stream.cc new file mode 100644 index 0000000000..c81ca406b8 --- /dev/null +++ b/tensorflow/stream_executor/host/host_stream.cc @@ -0,0 +1,57 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Class method definitions for HostStream, the Stream implementation for +// the HostExecutor implementation. +#include "tensorflow/stream_executor/host/host_stream.h" + +namespace perftools { +namespace gputools { +namespace host { + +HostStream::HostStream() + : host_executor_(new port::ThreadPool(port::Env::Default(), + port::ThreadOptions(), + "host_executor", kExecutorThreads)) {} + +HostStream::~HostStream() {} + +bool HostStream::EnqueueTask(std::function<void()> task) { + { + mutex_lock lock(mu_); + ++pending_tasks_; + } + host_executor_->Schedule([this, task]() { + task(); + { + mutex_lock lock(mu_); + --pending_tasks_; + } + completion_condition_.notify_all(); + }); + return true; +} + +void HostStream::BlockUntilDone() { + mutex_lock lock(mu_); + completion_condition_.wait(lock, [this]() { + return pending_tasks_ == 0; + }); +} + +} // namespace host + +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/host/host_stream.h b/tensorflow/stream_executor/host/host_stream.h new file mode 100644 index 0000000000..9894d17feb --- /dev/null +++ b/tensorflow/stream_executor/host/host_stream.h @@ -0,0 +1,58 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Class declaration for Stream type that enqueues tasks onto a host/CPU-based +// execution context (as opposed to a GPU device), HostExecutor. +#ifndef TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_STREAM_H_ +#define TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_STREAM_H_ + +#include <functional> +#include <memory> + +#include "tensorflow/stream_executor/lib/threadpool.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" + +namespace perftools { +namespace gputools { +namespace host { + +class HostStream : public internal::StreamInterface { + public: + HostStream(); + ~HostStream() override; + + bool EnqueueTask(std::function<void()> task); + + void *CudaStreamHack() override { return nullptr; } + void **CudaStreamMemberHack() override { return nullptr; } + + void BlockUntilDone(); + + private: + // Use only one thread and own task queue to preserve FIFO ordering + // for the operations enqueued by any given stream. + static const int kExecutorThreads = 1; + std::unique_ptr<port::ThreadPool> host_executor_; + + mutex mu_; + int pending_tasks_ GUARDED_BY(mu_) = 0; + condition_variable completion_condition_; +}; + +} // namespace host +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_STREAM_H_ diff --git a/tensorflow/stream_executor/host/host_timer.cc b/tensorflow/stream_executor/host/host_timer.cc new file mode 100644 index 0000000000..187db9f0c2 --- /dev/null +++ b/tensorflow/stream_executor/host/host_timer.cc @@ -0,0 +1,52 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/stream_executor/host/host_timer.h" + +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/stream.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" + +namespace gpu = ::perftools::gputools; + +namespace perftools { +namespace gputools { +namespace host { + +using std::chrono::duration_cast; + +bool HostTimer::Start(Stream* stream) { + return stream->ThenDoHostCallback([this]() { this->StartNow(); }).ok(); +} + +bool HostTimer::Stop(Stream* stream) { + return stream->ThenDoHostCallback([this]() { this->StopNow(); }).ok(); +} + +uint64 HostTimer::Microseconds() const { + return duration_cast<std::chrono::microseconds>(duration_).count(); +} + +uint64 HostTimer::Nanoseconds() const { + return duration_cast<std::chrono::nanoseconds>(duration_).count(); +} + +void HostTimer::StartNow() { start_time_ = clock::now(); } + +void HostTimer::StopNow() { duration_ = clock::now() - start_time_; } + +} // namespace host +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/host/host_timer.h b/tensorflow/stream_executor/host/host_timer.h new file mode 100644 index 0000000000..17af7c0521 --- /dev/null +++ b/tensorflow/stream_executor/host/host_timer.h @@ -0,0 +1,63 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_TIMER_H_ +#define TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_TIMER_H_ + +#include <chrono> + +#include "tensorflow/stream_executor/stream_executor_internal.h" + +namespace perftools { +namespace gputools { +namespace host { + +class HostTimer : public internal::TimerInterface { + public: + HostTimer() {} + ~HostTimer() override {} + + // Begins the timer at the present point in the stream. + bool Start(Stream *stream); + + // Stops the timer at the present point in the stream. + bool Stop(Stream *stream); + + // Returns the most recent value recorded for a start/stopcycle, in + // microseconds. + uint64 Microseconds() const override; + + // Returns the most recent value recorded for a start/stopcycle, in + // nanoseconds. + uint64 Nanoseconds() const override; + + private: + using clock = std::chrono::high_resolution_clock; + + clock::time_point start_time_; + clock::duration duration_; + + // Actually starts (rather than enqueues starting) the timer. + void StartNow(); + + // Actually stops (rather than enqueues stopping) the timer. + void StopNow(); +}; + +} // namespace host +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_TIMER_H_ diff --git a/tensorflow/stream_executor/lib/env.h b/tensorflow/stream_executor/lib/env.h index f11a4cb25b..9bdc46b3dc 100644 --- a/tensorflow/stream_executor/lib/env.h +++ b/tensorflow/stream_executor/lib/env.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_STREAM_EXECUTOR_LIB_ENV_H_ #include "tensorflow/core/platform/env.h" +#include "tensorflow/stream_executor/lib/status.h" #include "tensorflow/stream_executor/lib/stringpiece.h" #include "tensorflow/stream_executor/platform/port.h" diff --git a/tensorflow/stream_executor/multi_platform_manager.cc b/tensorflow/stream_executor/multi_platform_manager.cc index 6f3ac8bda1..b1f25ab5d3 100644 --- a/tensorflow/stream_executor/multi_platform_manager.cc +++ b/tensorflow/stream_executor/multi_platform_manager.cc @@ -22,13 +22,11 @@ limitations under the License. namespace perftools { namespace gputools { -/* static */ mutex MultiPlatformManager::platforms_mutex_(LINKER_INITIALIZED); - /* static */ port::Status MultiPlatformManager::RegisterPlatform( std::unique_ptr<Platform> platform) { CHECK(platform != nullptr); string key = port::Lowercase(platform->Name()); - mutex_lock lock(platforms_mutex_); + mutex_lock lock(GetPlatformsMutex()); if (GetPlatformMap()->find(key) != GetPlatformMap()->end()) { return port::Status(port::error::INTERNAL, "platform is already registered with name: \"" + @@ -46,7 +44,7 @@ namespace gputools { /* static */ port::StatusOr<Platform*> MultiPlatformManager::PlatformWithName( const string& target) { - mutex_lock lock(platforms_mutex_); + mutex_lock lock(GetPlatformsMutex()); auto it = GetPlatformMap()->find(port::Lowercase(target)); if (it == GetPlatformMap()->end()) { @@ -60,7 +58,7 @@ namespace gputools { /* static */ port::StatusOr<Platform*> MultiPlatformManager::PlatformWithId( const Platform::Id& id) { - mutex_lock lock(platforms_mutex_); + mutex_lock lock(GetPlatformsMutex()); auto it = GetPlatformByIdMap()->find(id); if (it == GetPlatformByIdMap()->end()) { return port::Status( @@ -72,7 +70,7 @@ namespace gputools { } /* static */ void MultiPlatformManager::ClearPlatformRegistry() { - mutex_lock lock(platforms_mutex_); + mutex_lock lock(GetPlatformsMutex()); GetPlatformMap()->clear(); GetPlatformByIdMap()->clear(); } diff --git a/tensorflow/stream_executor/multi_platform_manager.h b/tensorflow/stream_executor/multi_platform_manager.h index e49cb73848..ea6155b482 100644 --- a/tensorflow/stream_executor/multi_platform_manager.h +++ b/tensorflow/stream_executor/multi_platform_manager.h @@ -123,13 +123,16 @@ class MultiPlatformManager { // Provides access to the available set of platforms under a lock. static port::Status WithPlatforms( std::function<port::Status(PlatformMap*)> callback) { - mutex_lock lock(platforms_mutex_); + mutex_lock lock(GetPlatformsMutex()); return callback(GetPlatformMap()); } private: // mutex that guards the platform map. - static mutex platforms_mutex_; + static mutex& GetPlatformsMutex() { + static mutex* platforms_mutex = new mutex; + return *platforms_mutex; + } // TODO(b/22689637): Clean up these two maps; make sure they coexist nicely. // TODO(b/22689637): Move this (whatever the final/"official" map is) to |