author    Vijay Vasudevan <vrv@google.com>  2015-12-08 09:58:59 -0800
committer Vijay Vasudevan <vrv@google.com>  2015-12-08 09:58:59 -0800
commit    ddd4aaf5286de24ba70402ee0ec8b836d3aed8c7
tree      4efdf6cf4d69b45041fd2a02cd2b7327ea9f1f58 /tensorflow/stream_executor/stream_executor_pimpl.cc
parent    cd53f3c3302c9312c1840389a9988a879b8b9dd5
TensorFlow: upstream changes to git.
Change 109695551: Update FAQ.
Change 109694725: Add a gradient for resize_bilinear op.
Change 109694505: Don't mention the variables module in docs; variables.Variable should be tf.Variable.
Change 109658848: Add an option to create a new thread-pool for each session.
Change 109640570: Take a snapshot of stream-executor and expose an interface for scratch space allocation in the interface.
Change 109638559: Let image_summary accept uint8 input. This allows users to do their own normalization / scaling if the default (very weird) behavior of image_summary is undesired. It required a slight tweak to fake_input.cc to make polymorphically typed fake inputs infer their type attr when it is not set but has a default. Unfortunately, adding a second valid type to image_summary *disables* automatic implicit conversion from np.float64 to tf.float32, so this change is slightly backwards incompatible.
Change 109636969: Add serialization operations for SparseTensor.
Change 109636644: Update generated Op docs.
Change 109634899: TensorFlow: add a markdown file for producing release notes for our releases. Seed it with 0.5.0 and a boring but accurate description.
Change 109634502: Let histogram_summary take any realnumbertype. It used to take only floats; now it also understands ints.
Change 109634434: TensorFlow: update the locations where we mention Python 3 support to reflect the current state.
Change 109632108: Move HSV <-> RGB conversions, grayscale conversions, and adjust_* ops back to TensorFlow:
  - make GPU-capable versions of RGBToHSV and HSVToRGB that allow only float input/output
  - change docs to reflect the new size constraints
  - change the HSV format to be [0, 1] for all components
  - add automatic dtype conversion for all adjust_* and grayscale conversion ops
  - fix up docs
Change 109631077: Improve optimizer exceptions:
  1. grads_and_vars is now a tuple, so it must be wrapped when passed to format.
  2. Use '%r' instead of '%s' for dtype formatting.

Base CL: 109697989
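Of the changes listed above, only the stream-executor snapshot (Change 109640570) touches this file: the hunks below thread a stack-trace helper through the existing VLOG(1) allocation and memcpy logging, and replace the PlatformKind-based second constructor with one that takes a Platform*. What follows is a minimal sketch of that stack-trace-at-high-verbosity pattern, written against plain glog rather than the port:: wrappers (port::StrCat, port::CurrentStackTrace) used in the real file; CurrentStackTrace() here is a stub stand-in, not the real implementation.

// Sketch only: glog stands in for the port:: logging layer, and
// CurrentStackTrace() is stubbed out rather than producing a real backtrace.
#include <string>

#include <glog/logging.h>

// Stand-in for port::CurrentStackTrace(); a real implementation would walk
// the stack and symbolize the frames.
std::string CurrentStackTrace() { return "<stack trace>"; }

// Returns a formatted stack trace only when verbose logging is at level 10
// or higher, so routine VLOG(1) output stays compact.
std::string StackTraceIfVLOG10() {
  if (VLOG_IS_ON(10)) {
    return " " + CurrentStackTrace() + "\n";
  }
  return "";
}

// Example of appending the helper to an existing VLOG(1) statement, as the
// diff does for Allocate/Deallocate and the memcpy entry points.
void LogAllocation(const void* buf, unsigned long long size) {
  VLOG(1) << "Allocate(size=" << size << ") returns " << buf
          << StackTraceIfVLOG10();
}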
Diffstat (limited to 'tensorflow/stream_executor/stream_executor_pimpl.cc')
-rw-r--r--    tensorflow/stream_executor/stream_executor_pimpl.cc    79
1 file changed, 48 insertions, 31 deletions
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index e496deaf9d..acaa0efcb2 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -26,6 +26,8 @@ limitations under the License.
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/notification.h"
+#include "tensorflow/stream_executor/lib/stacktrace.h"
+#include "tensorflow/stream_executor/lib/str_util.h"
#include "tensorflow/stream_executor/lib/stringprintf.h"
#include "tensorflow/stream_executor/lib/threadpool.h"
#include "tensorflow/stream_executor/platform/port.h"
@@ -40,6 +42,14 @@ namespace perftools {
namespace gputools {
namespace {
+string StackTraceIfVLOG10() {
+ if (VLOG_IS_ON(10)) {
+ return port::StrCat(" ", port::CurrentStackTrace(), "\n");
+ } else {
+ return "";
+ }
+}
+
// Maximum stack depth to report when generating backtrace on mem allocation
// (for GPU memory leak checker)
static const int kMaxStackDepth = 256;
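The helper above only produces output when the process runs with verbose logging at level 10 or higher. A tiny way to exercise it, assuming glog-style flags (in the open-source TensorFlow build the same effect usually comes from the TF_CPP_MIN_VLOG_LEVEL environment variable rather than --v):

#include <glog/logging.h>

int main(int argc, char* argv[]) {
  google::InitGoogleLogging(argv[0]);
  FLAGS_logtostderr = true;  // print to stderr so the message is visible
  FLAGS_v = 10;              // same effect as passing --v=10 on the command line
  VLOG(1) << "allocation logging"
          << (VLOG_IS_ON(10) ? " <stack trace would be appended here>" : "");
  return 0;
}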
@@ -66,9 +76,6 @@ internal::StreamExecutorInterface *StreamExecutorImplementationFromPlatformKind(
case PlatformKind::kOpenCL:
factory = *internal::MakeOpenCLExecutorImplementation();
break;
- case PlatformKind::kOpenCLAltera:
- factory = *internal::MakeOpenCLAlteraExecutorImplementation();
- break;
case PlatformKind::kHost:
factory = internal::MakeHostExecutorImplementation;
break;
@@ -148,7 +155,8 @@ MakeScopedTracer(StreamExecutor *stream_exec, BeginCallT begin_call,
StreamExecutor::StreamExecutor(PlatformKind platform_kind,
const PluginConfig &plugin_config)
- : implementation_(StreamExecutorImplementationFromPlatformKind(
+ : platform_(nullptr),
+ implementation_(StreamExecutorImplementationFromPlatformKind(
platform_kind, plugin_config)),
platform_kind_(platform_kind),
device_ordinal_(-1),
@@ -160,16 +168,21 @@ StreamExecutor::StreamExecutor(PlatformKind platform_kind,
}
StreamExecutor::StreamExecutor(
- PlatformKind platform_kind,
- internal::StreamExecutorInterface *implementation)
- : implementation_(implementation),
- platform_kind_(platform_kind),
+ const Platform *platform, internal::StreamExecutorInterface *implementation)
+ : platform_(platform),
+ implementation_(implementation),
device_ordinal_(-1),
background_threads_(new port::ThreadPool(
port::Env::Default(), "stream_executor", kNumBackgroundThreads)),
live_stream_count_(0),
tracing_enabled_(false) {
- CheckPlatformKindIsValid(platform_kind);
+ if (port::Lowercase(platform_->Name()) == "cuda") {
+ platform_kind_ = PlatformKind::kCuda;
+ } else if (port::Lowercase(platform_->Name()) == "opencl") {
+ platform_kind_ = PlatformKind::kOpenCL;
+ } else if (port::Lowercase(platform_->Name()) == "host") {
+ platform_kind_ = PlatformKind::kHost;
+ }
}
StreamExecutor::~StreamExecutor() {
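In the new Platform*-based constructor above, platform_kind_ is derived from the platform's name by lowercased string comparison, and the old CheckPlatformKindIsValid call is gone, so a name other than "cuda", "opencl", or "host" simply leaves platform_kind_ unset. A standalone sketch of that mapping, with a local Lowercase() standing in for port::Lowercase and kInvalid used purely as an illustrative "no match" value:

#include <algorithm>
#include <cctype>
#include <string>

enum class PlatformKind { kInvalid, kCuda, kOpenCL, kHost };

// Local stand-in for port::Lowercase.
std::string Lowercase(std::string s) {
  std::transform(s.begin(), s.end(), s.begin(),
                 [](unsigned char c) { return std::tolower(c); });
  return s;
}

// Mirrors the name -> kind mapping performed in the constructor body.
PlatformKind PlatformKindFromName(const std::string& name) {
  const std::string lower = Lowercase(name);
  if (lower == "cuda") return PlatformKind::kCuda;
  if (lower == "opencl") return PlatformKind::kOpenCL;
  if (lower == "host") return PlatformKind::kHost;
  return PlatformKind::kInvalid;  // unrecognized platform name
}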
@@ -208,7 +221,7 @@ bool StreamExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
void StreamExecutor::Deallocate(DeviceMemoryBase *mem) {
VLOG(1) << "Called StreamExecutor::Deallocate(mem=" << mem->opaque()
- << ") mem->size()=" << mem->size();
+ << ") mem->size()=" << mem->size() << StackTraceIfVLOG10();
if (mem->opaque() != nullptr) {
EraseAllocRecord(mem->opaque());
@@ -333,8 +346,8 @@ bool StreamExecutor::BlockHostUntilDone(Stream *stream) {
void *StreamExecutor::Allocate(uint64 size) {
void *buf = implementation_->Allocate(size);
- VLOG(1) << "Called StreamExecutor::Allocate(size=" << size
- << ") returns " << buf;
+ VLOG(1) << "Called StreamExecutor::Allocate(size=" << size << ") returns "
+ << buf << StackTraceIfVLOG10();
CreateAllocRecord(buf, size);
return buf;
@@ -348,20 +361,20 @@ bool StreamExecutor::GetSymbol(const string &symbol_name, void **mem,
void *StreamExecutor::HostMemoryAllocate(uint64 size) {
void *buffer = implementation_->HostMemoryAllocate(size);
VLOG(1) << "Called StreamExecutor::HostMemoryAllocate(size=" << size
- << ") returns " << buffer;
+ << ") returns " << buffer << StackTraceIfVLOG10();
return buffer;
}
void StreamExecutor::HostMemoryDeallocate(void *location) {
- VLOG(1) << "Called StreamExecutor::HostMemoryDeallocate(location="
- << location << ")";
+ VLOG(1) << "Called StreamExecutor::HostMemoryDeallocate(location=" << location
+ << ")" << StackTraceIfVLOG10();
return implementation_->HostMemoryDeallocate(location);
}
bool StreamExecutor::HostMemoryRegister(void *location, uint64 size) {
VLOG(1) << "Called StreamExecutor::HostMemoryRegister(location=" << location
- << ", size=" << size << ")";
+ << ", size=" << size << ")" << StackTraceIfVLOG10();
if (location == nullptr || size == 0) {
LOG(WARNING) << "attempting to register null or zero-sized memory: "
<< location << "; size " << size;
@@ -371,12 +384,13 @@ bool StreamExecutor::HostMemoryRegister(void *location, uint64 size) {
bool StreamExecutor::HostMemoryUnregister(void *location) {
VLOG(1) << "Called StreamExecutor::HostMemoryUnregister(location=" << location
- << ")";
+ << ")" << StackTraceIfVLOG10();
return implementation_->HostMemoryUnregister(location);
}
bool StreamExecutor::SynchronizeAllActivity() {
- VLOG(1) << "Called StreamExecutor::SynchronizeAllActivity()";
+ VLOG(1) << "Called StreamExecutor::SynchronizeAllActivity()"
+ << StackTraceIfVLOG10();
bool ok = implementation_->SynchronizeAllActivity();
// This should all be quick and infallible work, so we can perform the
@@ -388,16 +402,17 @@ bool StreamExecutor::SynchronizeAllActivity() {
bool StreamExecutor::SynchronousMemZero(DeviceMemoryBase *location,
uint64 size) {
- VLOG(1) << "Called StreamExecutor::SynchronousMemZero(location="
- << location << ", size=" << size << ")";
+ VLOG(1) << "Called StreamExecutor::SynchronousMemZero(location=" << location
+ << ", size=" << size << ")" << StackTraceIfVLOG10();
return implementation_->SynchronousMemZero(location, size);
}
bool StreamExecutor::SynchronousMemSet(DeviceMemoryBase *location, int value,
uint64 size) {
- VLOG(1) << "Called StreamExecutor::SynchronousMemSet(location="
- << location << ", value=" << value << ", size=" << size << ")";
+ VLOG(1) << "Called StreamExecutor::SynchronousMemSet(location=" << location
+ << ", value=" << value << ", size=" << size << ")"
+ << StackTraceIfVLOG10();
return implementation_->SynchronousMemSet(location, value, size);
}
@@ -406,7 +421,7 @@ bool StreamExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
const void *host_src, uint64 size) {
VLOG(1) << "Called StreamExecutor::SynchronousMemcpy(gpu_dst="
<< gpu_dst->opaque() << ", host_src=" << host_src << ", size=" << size
- << ") H2D";
+ << ") H2D" << StackTraceIfVLOG10();
// Tracing overloaded methods is very difficult due to issues with type
// inference on template args. Since use of these overloaded methods is
@@ -417,9 +432,9 @@ bool StreamExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
bool StreamExecutor::SynchronousMemcpy(void *host_dst,
const DeviceMemoryBase &gpu_src,
uint64 size) {
- VLOG(1) << "Called StreamExecutor::SynchronousMemcpy(host_dst="
- << host_dst << ", gpu_src=" << gpu_src.opaque() << ", size=" << size
- << ") D2H";
+ VLOG(1) << "Called StreamExecutor::SynchronousMemcpy(host_dst=" << host_dst
+ << ", gpu_src=" << gpu_src.opaque() << ", size=" << size << ") D2H"
+ << StackTraceIfVLOG10();
return implementation_->SynchronousMemcpy(host_dst, gpu_src, size);
}
@@ -428,8 +443,8 @@ bool StreamExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
const DeviceMemoryBase &gpu_src,
uint64 size) {
VLOG(1) << "Called StreamExecutor::SynchronousMemcpy(gpu_dst="
- << gpu_dst->opaque() << ", gpu_src=" << gpu_src.opaque() << ", size=" << size
- << ") D2D";
+ << gpu_dst->opaque() << ", gpu_src=" << gpu_src.opaque()
+ << ", size=" << size << ") D2D" << StackTraceIfVLOG10();
return implementation_->SynchronousMemcpyDeviceToDevice(gpu_dst, gpu_src,
size);
@@ -438,7 +453,8 @@ bool StreamExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
port::Status StreamExecutor::SynchronousMemcpyD2H(
const DeviceMemoryBase &gpu_src, int64 size, void *host_dst) {
VLOG(1) << "Called StreamExecutor::SynchronousMemcpyD2H(gpu_src="
- << gpu_src.opaque() << ", size=" << size << ", host_dst=" << host_dst << ")";
+ << gpu_src.opaque() << ", size=" << size << ", host_dst=" << host_dst
+ << ")" << StackTraceIfVLOG10();
port::Status result{port::Status::OK()};
SCOPED_TRACE(TraceListener::SynchronousMemcpyD2H,
@@ -459,8 +475,9 @@ port::Status StreamExecutor::SynchronousMemcpyD2H(
port::Status StreamExecutor::SynchronousMemcpyH2D(const void *host_src,
int64 size,
DeviceMemoryBase *gpu_dst) {
- VLOG(1) << "Called StreamExecutor::SynchronousMemcpyH2D(host_src="
- << host_src << ", size=" << size << ", gpu_dst" << gpu_dst->opaque() << ")";
+ VLOG(1) << "Called StreamExecutor::SynchronousMemcpyH2D(host_src=" << host_src
+ << ", size=" << size << ", gpu_dst" << gpu_dst->opaque() << ")"
+ << StackTraceIfVLOG10();
port::Status result{port::Status::OK()};
SCOPED_TRACE(TraceListener::SynchronousMemcpyH2D,