path: root/tensorflow/core/kernels/stack_ops.cc
author    Yuan Yu <yuanbyu@google.com>  2016-03-07 15:43:13 -0800
committer TensorFlower Gardener <gardener@tensorflow.org>  2016-03-08 17:16:17 -0800
commit    204400d34666d09f92cbc0df93fd0504ad4450b8 (patch)
tree      67447361b656c976085c02e0813ff1fa9e33099e /tensorflow/core/kernels/stack_ops.cc
parent    624eafe2fb91ac6ae0fba5cfef0cda6e3a79dcfa (diff)
Further improvements for training long sequences. Here is the current performance of rnn.dynamic_rnn() on various sequence lengths. The last column is the time per time step. It stays constant until len=600, when GPU/CPU memory swapping kicks in, and then increases gradually as more tensors have to be swapped out of GPU memory. The overhead reaches about 7% at len=1000. The statically unrolled LSTM (rnn.rnn()) runs out of memory at len=600. So if you hit OOM problems due to long sequences or large batch sizes, you should probably switch to the while-based dynamic_rnn() (see the usage sketch below).
Calculation: Long LSTM Sequence

  batch  len   units  dynamic  elapsed_t  elapsed_t/len
  512    100   512    True     0.654122   0.006541
  512    200   512    True     1.303605   0.006518
  512    300   512    True     1.956359   0.006521
  512    400   512    True     2.609437   0.006524
  512    500   512    True     3.261797   0.006524
  512    600   512    True     3.984582   0.006641   <-- swapping started
  512    700   512    True     4.782934   0.006833
  512    800   512    True     5.520320   0.006900
  512    900   512    True     6.274871   0.006972
  512    1000  512    True     7.021808   0.007022

Below are the graph creation times in seconds for various sequence lengths.

Graph Creation: Static Unroll vs. Dynamic Unroll LSTM

  len   dt(static)  dt(dynamic)  dt(dynamic)/dt(static)
  1     0.928402    1.502915     1.618819
  25    12.294692   1.745810     0.141997
  50    25.619513   2.103737     0.082115
  100   52.181274   2.670684     0.051181
  200   105.721542  3.737968     0.035357

This CL adds a few optimizations:

1. Defer the capturing of a forward tensor until it is actually used by a node in backprop. Previously, we captured a forward tensor at the time it was *referenced* in backprop, which led to capturing some tensors that are never actually used in backprop.

2. Move certain computations defined in gradient functions to the forward pass. This is particularly important for reduction operations (e.g., shape), since performing them early can significantly reduce memory consumption (see the sketch below).

3. Avoid capturing a tensor that can be determined to be a loop invariant.

Change: 116596407
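As the message above suggests, switching to the while-based dynamic RNN with memory swapping is a user-level change. Below is a minimal, hypothetical usage sketch assuming the TensorFlow 1.x-style Python API (tf.nn.dynamic_rnn and its swap_memory flag); the module path of the LSTM cell moved between releases, so treat these names as illustrative rather than as part of this commit.

    import tensorflow as tf

    # Shapes chosen to match the benchmark above: batch=512, len=1000, units=512.
    batch_size, max_len, num_units = 512, 1000, 512
    inputs = tf.placeholder(tf.float32, [batch_size, max_len, num_units])

    cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)
    # swap_memory=True allows the underlying while-loop (and the StackPush/StackPop
    # ops patched in this CL) to move activations from GPU to CPU memory when the
    # GPU allocator is nearly full, at the cost of some copy overhead per step.
    outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32,
                                             swap_memory=True)

To see why optimization 2 above targets reductions such as shape: the gradient of a reduction typically needs only the shape of its input, not the input itself, so computing that shape in the forward pass lets backprop capture a handful of integers instead of a large activation. A small illustration of the dependency (again TF 1.x-style; this shows the idea, not the internal implementation):

    x = tf.random_normal([512, 1000, 512])  # large forward tensor inside a loop body
    y = tf.reduce_sum(x)
    # The gradient of reduce_sum broadcasts dy back to x's shape, i.e. it only needs
    # tf.shape(x); capturing that small shape tensor early avoids keeping x alive.
    dx = tf.gradients(y, x)[0]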
Diffstat (limited to 'tensorflow/core/kernels/stack_ops.cc')
-rw-r--r--  tensorflow/core/kernels/stack_ops.cc | 75
1 file changed, 40 insertions, 35 deletions
diff --git a/tensorflow/core/kernels/stack_ops.cc b/tensorflow/core/kernels/stack_ops.cc
index 2ba571bcdb..4bddcd7e98 100644
--- a/tensorflow/core/kernels/stack_ops.cc
+++ b/tensorflow/core/kernels/stack_ops.cc
@@ -181,47 +181,52 @@ class StackPushOp : public AsyncOpKernel {
// Push the tensor onto the stack. Swap the tensor to CPU if instructed.
const Tensor& tensor = ctx->input(1);
AllocatorAttributes alloc_attrs = ctx->input_alloc_attr(1);
- DeviceContext* device_ctxt = ctx->op_device_context();
- auto device = static_cast<tensorflow::Device*>(ctx->device());
- Allocator* allocator = device->GetAllocator(alloc_attrs);
- AllocatorStats stats;
- allocator->GetStats(&stats);
+ // For now, we use a simple heuristic for swapping: A GPU tensor is moved
+ // to CPU if the tensor has more than kCopyThreshold bytes and the GPU
+ // allocator says more than kOccupancy of the memory is in use.
static constexpr int kCopyThreshold = 2048;
static constexpr double kOccupancy = 0.7;
if (swap_memory_ && !alloc_attrs.on_host() &&
std::is_same<Device, GPUDevice>::value &&
- stats.bytes_in_use > (stats.bytes_limit * kOccupancy) &&
tensor.TotalBytes() > kCopyThreshold) {
- // Asynchronously copy the tensor from GPU to CPU memory.
- // TODO(yuanbyu): Swap the oldest tensor first.
- AllocatorAttributes host_alloc_attrs;
- host_alloc_attrs.set_gpu_compatible(true);
- host_alloc_attrs.set_on_host(true);
- Allocator* cpu_allocator = device->GetAllocator(host_alloc_attrs);
- Tensor* cpu_tensor =
- new Tensor(cpu_allocator, tensor.dtype(), tensor.shape());
- device_ctxt->CopyDeviceTensorToCPU(
- &tensor, "StackPush", device, cpu_tensor,
- [cpu_tensor, stack, ctx, done](const Status& s) {
- ctx->SetStatus(s);
- if (s.ok()) {
- AllocatorAttributes alloc_attrs = ctx->input_alloc_attr(1);
- ctx->SetStatus(stack->Push(
- {PersistentTensor(*cpu_tensor), alloc_attrs, true}));
- }
- if (ctx->status().ok()) {
- ctx->set_output(0, *cpu_tensor);
- }
- done();
- delete cpu_tensor;
- });
- } else {
- // Execute synchronously if not swapped.
- OP_REQUIRES_OK(
- ctx, stack->Push({PersistentTensor(tensor), alloc_attrs, false}));
- ctx->set_output(0, tensor);
- done();
+ DeviceContext* device_ctxt = ctx->op_device_context();
+ auto device = static_cast<tensorflow::Device*>(ctx->device());
+ Allocator* allocator = device->GetAllocator(alloc_attrs);
+ AllocatorStats stats;
+ allocator->GetStats(&stats);
+ if (stats.bytes_in_use > (stats.bytes_limit * kOccupancy)) {
+ // Asynchronously copy the tensor from GPU to CPU memory.
+ // TODO(yuanbyu): Swap the oldest tensor first.
+ AllocatorAttributes host_alloc_attrs;
+ host_alloc_attrs.set_gpu_compatible(true);
+ host_alloc_attrs.set_on_host(true);
+ Allocator* cpu_allocator = device->GetAllocator(host_alloc_attrs);
+ Tensor* cpu_tensor =
+ new Tensor(cpu_allocator, tensor.dtype(), tensor.shape());
+ device_ctxt->CopyDeviceTensorToCPU(
+ &tensor, "StackPush", device, cpu_tensor,
+ [cpu_tensor, stack, ctx, done](const Status& s) {
+ ctx->SetStatus(s);
+ if (s.ok()) {
+ AllocatorAttributes alloc_attrs = ctx->input_alloc_attr(1);
+ ctx->SetStatus(stack->Push(
+ {PersistentTensor(*cpu_tensor), alloc_attrs, true}));
+ }
+ if (ctx->status().ok()) {
+ ctx->set_output(0, *cpu_tensor);
+ }
+ done();
+ delete cpu_tensor;
+ });
+ return;
+ }
}
+
+ // Execute synchronously if not swapped.
+ OP_REQUIRES_OK(ctx,
+ stack->Push({PersistentTensor(tensor), alloc_attrs, false}));
+ ctx->set_output(0, tensor);
+ done();
}
bool IsExpensive() override { return false; }
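For reference, the swap decision added in this hunk boils down to a small predicate. The following is only an illustrative Python restatement of the C++ heuristic (the constant values mirror kCopyThreshold and kOccupancy); the function name and arguments are hypothetical, not a TensorFlow API.

    def should_swap_to_cpu(tensor_bytes, bytes_in_use, bytes_limit,
                           swap_memory=True, on_host=False,
                           copy_threshold=2048, occupancy=0.7):
        """Swap a GPU tensor to CPU only if swapping was requested, the tensor is
        on the GPU, it is larger than copy_threshold bytes, and the GPU allocator
        reports more than `occupancy` of its memory limit in use."""
        return (swap_memory and not on_host
                and tensor_bytes > copy_threshold
                and bytes_in_use > bytes_limit * occupancy)

    # Example: a 4 MB tensor while the GPU is at 80% of an 8 GB limit -> swap.
    print(should_swap_to_cpu(4 << 20, int(0.8 * (8 << 30)), 8 << 30))  # True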