diff options
author | 2015-12-08 09:58:59 -0800 | |
---|---|---|
committer | 2015-12-08 09:58:59 -0800 | |
commit | ddd4aaf5286de24ba70402ee0ec8b836d3aed8c7 (patch) | |
tree | 4efdf6cf4d69b45041fd2a02cd2b7327ea9f1f58 /tensorflow/stream_executor/stream_executor_pimpl.cc | |
parent | cd53f3c3302c9312c1840389a9988a879b8b9dd5 (diff) |
TensorFlow: upstream changes to git.
Change 109695551
Update FAQ
Change 109694725
Add a gradient for resize_bilinear op.
Change 109694505
Don't mention variables module in docs
variables.Variable should be tf.Variable.
Change 109658848
Adding an option to create a new thread-pool for each session.
Change 109640570
Take the snapshot of stream-executor.
+ Expose an interface for scratch space allocation in the interface.
Change 109638559
Let image_summary accept uint8 input
This allows users to do their own normalization / scaling if the default
(very weird) behavior of image_summary is undesired.
This required a slight tweak to fake_input.cc to make polymorphically typed
fake inputs infer if their type attr is not set but has a default.
Unfortunately, adding a second valid type to image_summary *disables* automatic
implicit conversion from np.float64 to tf.float32, so this change is slightly
backwards incompatible.
Change 109636969
Add serialization operations for SparseTensor.
Change 109636644
Update generated Op docs.
Change 109634899
TensorFlow: add a markdown file for producing release notes for our
releases. Seed with 0.5.0 with a boring but accurate description.
Change 109634502
Let histogram_summary take any realnumbertype
It used to take only floats, now it understands ints.
Change 109634434
TensorFlow: update locations where we mention python 3 support, update
them to current truth.
Change 109632108
Move HSV <> RGB conversions, grayscale conversions, and adjust_* ops back to tensorflow
- make GPU-capable version of RGBToHSV and HSVToRGB, allows only float input/output
- change docs to reflect new size constraints
- change HSV format to be [0,1] for all components
- add automatic dtype conversion for all adjust_* and grayscale conversion ops
- fix up docs
Change 109631077
Improve optimizer exceptions
1. grads_and_vars is now a tuple, so must be wrapped when passed to format.
2. Use '%r' instead of '%s' for dtype formatting
Base CL: 109697989
Diffstat (limited to 'tensorflow/stream_executor/stream_executor_pimpl.cc')
-rw-r--r-- | tensorflow/stream_executor/stream_executor_pimpl.cc | 79 |
1 file changed, 48 insertions, 31 deletions
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc index e496deaf9d..acaa0efcb2 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.cc +++ b/tensorflow/stream_executor/stream_executor_pimpl.cc @@ -26,6 +26,8 @@ limitations under the License. #include "tensorflow/stream_executor/lib/env.h" #include "tensorflow/stream_executor/lib/error.h" #include "tensorflow/stream_executor/lib/notification.h" +#include "tensorflow/stream_executor/lib/stacktrace.h" +#include "tensorflow/stream_executor/lib/str_util.h" #include "tensorflow/stream_executor/lib/stringprintf.h" #include "tensorflow/stream_executor/lib/threadpool.h" #include "tensorflow/stream_executor/platform/port.h" @@ -40,6 +42,14 @@ namespace perftools { namespace gputools { namespace { +string StackTraceIfVLOG10() { + if (VLOG_IS_ON(10)) { + return port::StrCat(" ", port::CurrentStackTrace(), "\n"); + } else { + return ""; + } +} + // Maximum stack depth to report when generating backtrace on mem allocation // (for GPU memory leak checker) static const int kMaxStackDepth = 256; @@ -66,9 +76,6 @@ internal::StreamExecutorInterface *StreamExecutorImplementationFromPlatformKind( case PlatformKind::kOpenCL: factory = *internal::MakeOpenCLExecutorImplementation(); break; - case PlatformKind::kOpenCLAltera: - factory = *internal::MakeOpenCLAlteraExecutorImplementation(); - break; case PlatformKind::kHost: factory = internal::MakeHostExecutorImplementation; break; @@ -148,7 +155,8 @@ MakeScopedTracer(StreamExecutor *stream_exec, BeginCallT begin_call, StreamExecutor::StreamExecutor(PlatformKind platform_kind, const PluginConfig &plugin_config) - : implementation_(StreamExecutorImplementationFromPlatformKind( + : platform_(nullptr), + implementation_(StreamExecutorImplementationFromPlatformKind( platform_kind, plugin_config)), platform_kind_(platform_kind), device_ordinal_(-1), @@ -160,16 +168,21 @@ StreamExecutor::StreamExecutor(PlatformKind 
platform_kind, } StreamExecutor::StreamExecutor( - PlatformKind platform_kind, - internal::StreamExecutorInterface *implementation) - : implementation_(implementation), - platform_kind_(platform_kind), + const Platform *platform, internal::StreamExecutorInterface *implementation) + : platform_(platform), + implementation_(implementation), device_ordinal_(-1), background_threads_(new port::ThreadPool( port::Env::Default(), "stream_executor", kNumBackgroundThreads)), live_stream_count_(0), tracing_enabled_(false) { - CheckPlatformKindIsValid(platform_kind); + if (port::Lowercase(platform_->Name()) == "cuda") { + platform_kind_ = PlatformKind::kCuda; + } else if (port::Lowercase(platform_->Name()) == "opencl") { + platform_kind_ = PlatformKind::kOpenCL; + } else if (port::Lowercase(platform_->Name()) == "host") { + platform_kind_ = PlatformKind::kHost; + } } StreamExecutor::~StreamExecutor() { @@ -208,7 +221,7 @@ bool StreamExecutor::GetKernel(const MultiKernelLoaderSpec &spec, void StreamExecutor::Deallocate(DeviceMemoryBase *mem) { VLOG(1) << "Called StreamExecutor::Deallocate(mem=" << mem->opaque() - << ") mem->size()=" << mem->size(); + << ") mem->size()=" << mem->size() << StackTraceIfVLOG10(); if (mem->opaque() != nullptr) { EraseAllocRecord(mem->opaque()); @@ -333,8 +346,8 @@ bool StreamExecutor::BlockHostUntilDone(Stream *stream) { void *StreamExecutor::Allocate(uint64 size) { void *buf = implementation_->Allocate(size); - VLOG(1) << "Called StreamExecutor::Allocate(size=" << size - << ") returns " << buf; + VLOG(1) << "Called StreamExecutor::Allocate(size=" << size << ") returns " + << buf << StackTraceIfVLOG10(); CreateAllocRecord(buf, size); return buf; @@ -348,20 +361,20 @@ bool StreamExecutor::GetSymbol(const string &symbol_name, void **mem, void *StreamExecutor::HostMemoryAllocate(uint64 size) { void *buffer = implementation_->HostMemoryAllocate(size); VLOG(1) << "Called StreamExecutor::HostMemoryAllocate(size=" << size - << ") returns " << buffer; + << 
") returns " << buffer << StackTraceIfVLOG10(); return buffer; } void StreamExecutor::HostMemoryDeallocate(void *location) { - VLOG(1) << "Called StreamExecutor::HostMemoryDeallocate(location=" - << location << ")"; + VLOG(1) << "Called StreamExecutor::HostMemoryDeallocate(location=" << location + << ")" << StackTraceIfVLOG10(); return implementation_->HostMemoryDeallocate(location); } bool StreamExecutor::HostMemoryRegister(void *location, uint64 size) { VLOG(1) << "Called StreamExecutor::HostMemoryRegister(location=" << location - << ", size=" << size << ")"; + << ", size=" << size << ")" << StackTraceIfVLOG10(); if (location == nullptr || size == 0) { LOG(WARNING) << "attempting to register null or zero-sized memory: " << location << "; size " << size; @@ -371,12 +384,13 @@ bool StreamExecutor::HostMemoryRegister(void *location, uint64 size) { bool StreamExecutor::HostMemoryUnregister(void *location) { VLOG(1) << "Called StreamExecutor::HostMemoryUnregister(location=" << location - << ")"; + << ")" << StackTraceIfVLOG10(); return implementation_->HostMemoryUnregister(location); } bool StreamExecutor::SynchronizeAllActivity() { - VLOG(1) << "Called StreamExecutor::SynchronizeAllActivity()"; + VLOG(1) << "Called StreamExecutor::SynchronizeAllActivity()" + << StackTraceIfVLOG10(); bool ok = implementation_->SynchronizeAllActivity(); // This should all be quick and infallible work, so we can perform the @@ -388,16 +402,17 @@ bool StreamExecutor::SynchronizeAllActivity() { bool StreamExecutor::SynchronousMemZero(DeviceMemoryBase *location, uint64 size) { - VLOG(1) << "Called StreamExecutor::SynchronousMemZero(location=" - << location << ", size=" << size << ")"; + VLOG(1) << "Called StreamExecutor::SynchronousMemZero(location=" << location + << ", size=" << size << ")" << StackTraceIfVLOG10(); return implementation_->SynchronousMemZero(location, size); } bool StreamExecutor::SynchronousMemSet(DeviceMemoryBase *location, int value, uint64 size) { - VLOG(1) << "Called 
StreamExecutor::SynchronousMemSet(location=" - << location << ", value=" << value << ", size=" << size << ")"; + VLOG(1) << "Called StreamExecutor::SynchronousMemSet(location=" << location + << ", value=" << value << ", size=" << size << ")" + << StackTraceIfVLOG10(); return implementation_->SynchronousMemSet(location, value, size); } @@ -406,7 +421,7 @@ bool StreamExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst, const void *host_src, uint64 size) { VLOG(1) << "Called StreamExecutor::SynchronousMemcpy(gpu_dst=" << gpu_dst->opaque() << ", host_src=" << host_src << ", size=" << size - << ") H2D"; + << ") H2D" << StackTraceIfVLOG10(); // Tracing overloaded methods is very difficult due to issues with type // inference on template args. Since use of these overloaded methods is @@ -417,9 +432,9 @@ bool StreamExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst, bool StreamExecutor::SynchronousMemcpy(void *host_dst, const DeviceMemoryBase &gpu_src, uint64 size) { - VLOG(1) << "Called StreamExecutor::SynchronousMemcpy(host_dst=" - << host_dst << ", gpu_src=" << gpu_src.opaque() << ", size=" << size - << ") D2H"; + VLOG(1) << "Called StreamExecutor::SynchronousMemcpy(host_dst=" << host_dst + << ", gpu_src=" << gpu_src.opaque() << ", size=" << size << ") D2H" + << StackTraceIfVLOG10(); return implementation_->SynchronousMemcpy(host_dst, gpu_src, size); } @@ -428,8 +443,8 @@ bool StreamExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst, const DeviceMemoryBase &gpu_src, uint64 size) { VLOG(1) << "Called StreamExecutor::SynchronousMemcpy(gpu_dst=" - << gpu_dst->opaque() << ", gpu_src=" << gpu_src.opaque() << ", size=" << size - << ") D2D"; + << gpu_dst->opaque() << ", gpu_src=" << gpu_src.opaque() + << ", size=" << size << ") D2D" << StackTraceIfVLOG10(); return implementation_->SynchronousMemcpyDeviceToDevice(gpu_dst, gpu_src, size); @@ -438,7 +453,8 @@ bool StreamExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst, port::Status 
StreamExecutor::SynchronousMemcpyD2H( const DeviceMemoryBase &gpu_src, int64 size, void *host_dst) { VLOG(1) << "Called StreamExecutor::SynchronousMemcpyD2H(gpu_src=" - << gpu_src.opaque() << ", size=" << size << ", host_dst=" << host_dst << ")"; + << gpu_src.opaque() << ", size=" << size << ", host_dst=" << host_dst + << ")" << StackTraceIfVLOG10(); port::Status result{port::Status::OK()}; SCOPED_TRACE(TraceListener::SynchronousMemcpyD2H, @@ -459,8 +475,9 @@ port::Status StreamExecutor::SynchronousMemcpyD2H( port::Status StreamExecutor::SynchronousMemcpyH2D(const void *host_src, int64 size, DeviceMemoryBase *gpu_dst) { - VLOG(1) << "Called StreamExecutor::SynchronousMemcpyH2D(host_src=" - << host_src << ", size=" << size << ", gpu_dst" << gpu_dst->opaque() << ")"; + VLOG(1) << "Called StreamExecutor::SynchronousMemcpyH2D(host_src=" << host_src + << ", size=" << size << ", gpu_dst" << gpu_dst->opaque() << ")" + << StackTraceIfVLOG10(); port::Status result{port::Status::OK()}; SCOPED_TRACE(TraceListener::SynchronousMemcpyH2D, |