22 files changed, 105 insertions, 242 deletions
diff --git a/tensorflow/c/eager/runtime.cc b/tensorflow/c/eager/runtime.cc
index ec34b0ea77..3a9951e14d 100644
--- a/tensorflow/c/eager/runtime.cc
+++ b/tensorflow/c/eager/runtime.cc
@@ -316,18 +316,12 @@ Status KernelAndDevice::Run(std::vector<Tensor>* input_tensors,
       allocator_pair.second->GetRecordsAndUnRef();
     }
     auto* ms = stats->mutable_memory_stats();
-    ms->set_host_temp_memory_size(context.host_temp_memory_size());
-    ms->set_device_temp_memory_size(context.device_temp_memory_size());
-    for (const auto& alloc_id : context.host_persistent_alloc_ids()) {
-      ms->mutable_host_persistent_tensor_alloc_ids()->Add(alloc_id);
+    ms->set_temp_memory_size(context.temp_memory_size());
+    for (const auto& alloc_id : context.persistent_alloc_ids()) {
+      ms->mutable_persistent_tensor_alloc_ids()->Add(alloc_id);
     }
-    for (const auto& alloc_id : context.device_persistent_alloc_ids()) {
-      ms->mutable_device_persistent_tensor_alloc_ids()->Add(alloc_id);
-    }
-    ms->set_host_persistent_memory_size(
-        context.host_persistent_memory_allocated());
-    ms->set_device_persistent_memory_size(
-        context.device_persistent_memory_allocated());
+
+    ms->set_persistent_memory_size(context.persistent_memory_allocated());
   }
   return Status::OK();
 }
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index fe1cf1b12e..9d03caff1e 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -172,17 +172,11 @@ void SetMemory(NodeExecStatsWrapper* stats, OpKernelContext* ctx) {
     stats->AddAllocation(allocator_pair.first, allocator_pair.second);
   }
   auto* ms = stats->stats()->mutable_memory_stats();
-  ms->set_host_temp_memory_size(ctx->host_temp_memory_size());
-  ms->set_device_temp_memory_size(ctx->device_temp_memory_size());
-  for (const auto& alloc_id : ctx->host_persistent_alloc_ids()) {
-    ms->mutable_host_persistent_tensor_alloc_ids()->Add(alloc_id);
+  ms->set_temp_memory_size(ctx->temp_memory_size());
+  for (const auto& alloc_id : ctx->persistent_alloc_ids()) {
+    ms->mutable_persistent_tensor_alloc_ids()->Add(alloc_id);
   }
-  for (const auto& alloc_id : ctx->device_persistent_alloc_ids()) {
-    ms->mutable_device_persistent_tensor_alloc_ids()->Add(alloc_id);
-  }
-  ms->set_host_persistent_memory_size(ctx->host_persistent_memory_allocated());
-  ms->set_device_persistent_memory_size(
-      ctx->device_persistent_memory_allocated());
+  ms->set_persistent_memory_size(ctx->persistent_memory_allocated());
 }
 
 void SetReferencedTensors(NodeExecStatsWrapper* stats,
diff --git a/tensorflow/core/framework/cost_graph.proto b/tensorflow/core/framework/cost_graph.proto
index f4837fbfc5..7885b0171a 100644
--- a/tensorflow/core/framework/cost_graph.proto
+++ b/tensorflow/core/framework/cost_graph.proto
@@ -45,10 +45,12 @@ message CostGraphDef {
     // Temporary memory used by this node.
     int64 temporary_memory_size = 6;
 
-    int64 host_temp_memory_size = 10;
-    int64 device_temp_memory_size = 11;
-    int64 host_persistent_memory_size = 12;
-    int64 device_persistent_memory_size = 16;
+    // Persistent memory used by this node.
+    int64 persistent_memory_size = 12;
+
+    int64 host_temp_memory_size = 10 [deprecated = true];
+    int64 device_temp_memory_size = 11 [deprecated = true];
+    int64 device_persistent_memory_size = 16 [deprecated = true];
 
     // Estimate of the computational cost of this node, in microseconds.
     int64 compute_cost = 9;
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 433005c8ab..c879dc6f3f 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -252,10 +252,8 @@ OpKernelContext::OpKernelContext(Params* params)
 OpKernelContext::OpKernelContext(Params* params, int num_outputs)
     : params_(params),
       outputs_(num_outputs),
-      host_temp_memory_size_(0),
-      device_temp_memory_size_(0),
-      host_persistent_memory_allocated_(0),
-      device_persistent_memory_allocated_(0) {
+      temp_memory_size_(0),
+      persistent_memory_allocated_(0) {
   Allocator* eigen_gpu_allocator = get_allocator(AllocatorAttributes());
   params_->ensure_eigen_gpu_device();
   params_->device->ReinitializeGpuDevice(this, params_->eigen_gpu_device,
@@ -668,11 +666,7 @@ Status OpKernelContext::allocate_temp(
     if (a->TracksAllocationSizes()) {
       int64 alloc_size =
           a->AllocatedSize(const_cast<char*>(out_temp->tensor_data().data()));
-      if (allocate_on_host(allocator_attr)) {
-        record_host_temp_memory_size(alloc_size);
-      } else {
-        record_device_temp_memory_size(alloc_size);
-      }
+      record_temp_memory_size(alloc_size);
     }
   }
   return s;
@@ -795,26 +789,15 @@ bool OpKernelContext::allocate_on_host(AllocatorAttributes alloc_attr) const {
   return alloc_attr.on_host() || device()->attributes().device_type() == "CPU";
 }
 
-void OpKernelContext::record_host_persistent_memory_allocation(int64 size,
-                                                               int64 alloc_id) {
-  host_persistent_memory_allocated_ += size;
-  host_persistent_alloc_ids_.push_back(alloc_id);
-}
-
-void OpKernelContext::record_device_persistent_memory_allocation(
-    int64 size, int64 alloc_id) {
-  device_persistent_memory_allocated_ += size;
-  device_persistent_alloc_ids_.push_back(alloc_id);
-}
-
-std::vector<int64> OpKernelContext::host_persistent_alloc_ids() const {
-  return std::vector<int64>(host_persistent_alloc_ids_.begin(),
-                            host_persistent_alloc_ids_.end());
+void OpKernelContext::record_persistent_memory_allocation(int64 size,
+                                                          int64 alloc_id) {
+  persistent_memory_allocated_ += size;
+  persistent_alloc_ids_.push_back(alloc_id);
 }
 
-std::vector<int64> OpKernelContext::device_persistent_alloc_ids() const {
-  return std::vector<int64>(device_persistent_alloc_ids_.begin(),
-                            device_persistent_alloc_ids_.end());
+std::vector<int64> OpKernelContext::persistent_alloc_ids() const {
+  return std::vector<int64>(persistent_alloc_ids_.begin(),
+                            persistent_alloc_ids_.end());
 }
 
 // OpKernel registration ------------------------------------------------------
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 3a9a6121c0..25150499ad 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -1033,33 +1033,21 @@ class OpKernelContext {
   bool allocate_on_host(AllocatorAttributes alloc_attr) const;
 
   // Records temporary memory sizes.
-  void record_host_temp_memory_size(int64 size) {
-    host_temp_memory_size_ += size;
-  }
-  void record_device_temp_memory_size(int64 size) {
-    device_temp_memory_size_ += size;
-  }
+  void record_temp_memory_size(int64 size) { temp_memory_size_ += size; }
 
   // Returns recorded size of temporary memory;
-  int64 host_temp_memory_size() const { return host_temp_memory_size_; }
-  int64 device_temp_memory_size() const { return device_temp_memory_size_; }
+  int64 temp_memory_size() const { return temp_memory_size_; }
 
   // Records persistent memory allocation, size can be negative indicating
   // deallocation.
-  void record_host_persistent_memory_allocation(int64 size,
-                                                int64 alloc_id = -1);
-  void record_device_persistent_memory_allocation(int64 size,
-                                                  int64 alloc_id = -1);
+  void record_persistent_memory_allocation(int64 size, int64 alloc_id = -1);
 
   // Returns recorded size and ids of persistent memory.
-  int64 host_persistent_memory_allocated() const {
-    return host_persistent_memory_allocated_;
+  int64 persistent_memory_allocated() const {
+    return persistent_memory_allocated_;
   }
-  int64 device_persistent_memory_allocated() const {
-    return device_persistent_memory_allocated_;
-  }
-  std::vector<int64> host_persistent_alloc_ids() const;
-  std::vector<int64> device_persistent_alloc_ids() const;
+
+  std::vector<int64> persistent_alloc_ids() const;
 
   bool input_is_ref(int index) const;
 
@@ -1104,12 +1092,9 @@ class OpKernelContext {
 
   bool is_output_dead_ = false;
 
-  int64 host_temp_memory_size_;
-  int64 device_temp_memory_size_;
-  gtl::InlinedVector<int64, 2> host_persistent_alloc_ids_;
-  gtl::InlinedVector<int64, 2> device_persistent_alloc_ids_;
-  int64 host_persistent_memory_allocated_;
-  int64 device_persistent_memory_allocated_;
+  int64 temp_memory_size_;
+  gtl::InlinedVector<int64, 2> persistent_alloc_ids_;
+  int64 persistent_memory_allocated_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(OpKernelContext);
 };
diff --git a/tensorflow/core/framework/step_stats.proto b/tensorflow/core/framework/step_stats.proto
index 99dee2257e..65c8089d51 100644
--- a/tensorflow/core/framework/step_stats.proto
+++ b/tensorflow/core/framework/step_stats.proto
@@ -40,12 +40,13 @@ message NodeOutput {
 
 // For memory tracking.
 message MemoryStats {
-  int64 host_temp_memory_size = 1;
-  int64 device_temp_memory_size = 2;
-  int64 host_persistent_memory_size = 3;
-  int64 device_persistent_memory_size = 4;
-  repeated int64 host_persistent_tensor_alloc_ids = 5;
-  repeated int64 device_persistent_tensor_alloc_ids = 6;
+  int64 temp_memory_size = 1;
+  int64 persistent_memory_size = 3;
+  repeated int64 persistent_tensor_alloc_ids = 5;
+
+  int64 device_temp_memory_size = 2 [deprecated = true];
+  int64 device_persistent_memory_size = 4 [deprecated = true];
+  repeated int64 device_persistent_tensor_alloc_ids = 6 [deprecated = true];
 }
 
 // Time/size stats recorded for a single execution of a graph node.
diff --git a/tensorflow/core/graph/costmodel.cc b/tensorflow/core/graph/costmodel.cc
index 3ed32068ae..b1e6cf64e8 100644
--- a/tensorflow/core/graph/costmodel.cc
+++ b/tensorflow/core/graph/costmodel.cc
@@ -291,59 +291,24 @@ Bytes CostModel::TempMemorySize(const Node* node) const {
   return max_mem_usage_[id].temp_memory_size;
 }
 
-Bytes CostModel::HostTempMemorySize(const Node* node) const {
+Bytes CostModel::PersistentMemorySize(const Node* node) const {
   const int id = Id(node);
   if (id < 0) {
     return Bytes(0);
   }
-  return max_mem_usage_[id].host_temp_memory_size;
-}
-
-Bytes CostModel::DeviceTempMemorySize(const Node* node) const {
-  const int id = Id(node);
-  if (id < 0) {
-    return Bytes(0);
-  }
-  return max_mem_usage_[id].device_temp_memory_size;
-}
-
-Bytes CostModel::HostPersistentMemorySize(const Node* node) const {
-  const int id = Id(node);
-  if (id < 0) {
-    return Bytes(0);
-  }
-  return max_mem_usage_[id].host_persistent_memory_size;
-}
-
-Bytes CostModel::DevicePersistentMemorySize(const Node* node) const {
-  const int id = Id(node);
-  if (id < 0) {
-    return Bytes(0);
-  }
-  return max_mem_usage_[id].device_persistent_memory_size;
+  return max_mem_usage_[id].persistent_memory_size;
 }
 
 void CostModel::RecordMemoryStats(const Node* node,
                                   const MemoryStats& memory_stats) {
   const int id = Id(node);
   if (id < 0) return;
-  max_mem_usage_[id].host_temp_memory_size =
-      memory_stats.host_temp_memory_size();
-  max_mem_usage_[id].device_temp_memory_size =
-      memory_stats.device_temp_memory_size();
-  max_mem_usage_[id].host_persistent_memory_size =
-      memory_stats.host_persistent_memory_size();
-  max_mem_usage_[id].device_persistent_memory_size =
-      memory_stats.device_persistent_memory_size();
-  for (int64 alloc_id : memory_stats.host_persistent_tensor_alloc_ids()) {
-    if (alloc_id > 0) {
-      host_persistent_alloc_ids_.insert(alloc_id);
-    }
-  }
-  for (int64 alloc_id : memory_stats.device_persistent_tensor_alloc_ids()) {
+  max_mem_usage_[id].temp_memory_size = memory_stats.temp_memory_size();
+  max_mem_usage_[id].persistent_memory_size =
+      memory_stats.persistent_memory_size();
+  for (int64 alloc_id : memory_stats.persistent_tensor_alloc_ids()) {
     if (alloc_id > 0) {
-      persistent_alloc_ids_by_devices_[node->assigned_device_name()].insert(
-          alloc_id);
+      persistent_alloc_ids_.insert(alloc_id);
     }
   }
 }
@@ -381,7 +346,7 @@ int64 CostModel::AllocationId(const Node* node, int slot) const {
 }
 
 bool CostModel::IsPersistentTensor(const Node* node, int64 alloc_id) const {
-  if (host_persistent_alloc_ids_.count(alloc_id) > 0) {
+  if (persistent_alloc_ids_.count(alloc_id) > 0) {
     return true;
   }
   if (persistent_alloc_ids_by_devices_.find(node->assigned_device_name()) ==
@@ -548,11 +513,8 @@ void CostModel::AddToCostGraphDef(const Graph* graph,
       cnode->add_control_input(Id(e->src()));
     }
 
-    cnode->set_host_temp_memory_size(HostTempMemorySize(n).value());
-    cnode->set_device_temp_memory_size(DeviceTempMemorySize(n).value());
-    cnode->set_host_persistent_memory_size(HostPersistentMemorySize(n).value());
-    cnode->set_device_persistent_memory_size(
-        DevicePersistentMemorySize(n).value());
+    cnode->set_temporary_memory_size(TempMemorySize(n).value());
+    cnode->set_persistent_memory_size(PersistentMemorySize(n).value());
 
     cnode->set_compute_cost(MaxExecutionTime(n).value());
 
diff --git a/tensorflow/core/graph/costmodel.h b/tensorflow/core/graph/costmodel.h
index 8afa4971ad..081eb2ff4c 100644
--- a/tensorflow/core/graph/costmodel.h
+++ b/tensorflow/core/graph/costmodel.h
@@ -133,13 +133,8 @@ class CostModel {
   // Returns the size in bytes of temporary memory consumed by "node".
   Bytes TempMemorySize(const Node* node) const;
 
-  // Returns the size in bytes of temporary memory consumed by "node".
-  Bytes HostTempMemorySize(const Node* node) const;
-  Bytes DeviceTempMemorySize(const Node* node) const;
-
   // Returns the size of persistent memory allocated by "node".
-  Bytes HostPersistentMemorySize(const Node* node) const;
-  Bytes DevicePersistentMemorySize(const Node* node) const;
+  Bytes PersistentMemorySize(const Node* node) const;
 
   // Records memory stats such as temp momory and persistent memory.
   void RecordMemoryStats(const Node* node, const MemoryStats& memory_stats);
@@ -210,21 +205,11 @@ class CostModel {
 
   // Maximum memory usage
   struct MemUsage {
-    MemUsage()
-        : temp_memory_size(-1),
-          host_temp_memory_size(0),
-          device_temp_memory_size(0),
-          host_persistent_memory_size(0),
-          device_persistent_memory_size(0) {}
+    MemUsage() : temp_memory_size(0), persistent_memory_size(0) {}
 
     // TODO(yuefengz): temp_memory_size is not being used, remove it.
     Bytes temp_memory_size;
-
-    Bytes host_temp_memory_size;
-    Bytes device_temp_memory_size;
-
-    Bytes host_persistent_memory_size;
-    Bytes device_persistent_memory_size;
+    Bytes persistent_memory_size;
 
     gtl::InlinedVector<Bytes, 2> output_port_mem;
     gtl::InlinedVector<TensorShapeProto, 2> output_port_shape;
@@ -234,7 +219,7 @@ class CostModel {
 
   std::vector<gtl::InlinedVector<int64, 2> > output_port_alloc_ids_;
 
-  std::set<int64> host_persistent_alloc_ids_;
+  std::set<int64> persistent_alloc_ids_;
   std::map<string, std::set<int64>> persistent_alloc_ids_by_devices_;
 
   TensorShapeProto unknown_shape_;
diff --git a/tensorflow/core/grappler/clusters/single_machine_test.cc b/tensorflow/core/grappler/clusters/single_machine_test.cc
index f6b394a860..c6352c1448 100644
--- a/tensorflow/core/grappler/clusters/single_machine_test.cc
+++ b/tensorflow/core/grappler/clusters/single_machine_test.cc
@@ -467,13 +467,11 @@ TEST_F(SingleMachineTest, PersistentMemory) {
       found_hashtable = true;
       // Persistent memory usage should be 0 since it's recorded as part of the
       // initialize_table op.
-      EXPECT_EQ(0, node.host_persistent_memory_size());
-      EXPECT_EQ(0, node.device_persistent_memory_size());
+      EXPECT_EQ(0, node.persistent_memory_size());
     } else if (node.name() == "initialize_table") {
       found_table_init = true;
       // Persistent memory should hold 2 keys and 2 values.
-      EXPECT_LE(4 * sizeof(int64), node.host_persistent_memory_size());
-      EXPECT_EQ(0, node.device_persistent_memory_size());
+      EXPECT_LE(4 * sizeof(int64), node.persistent_memory_size());
     }
   }
   EXPECT_TRUE(found_table_init);
diff --git a/tensorflow/core/grappler/costs/op_performance_data.proto b/tensorflow/core/grappler/costs/op_performance_data.proto
index 1a111b71dc..1d623b8db8 100644
--- a/tensorflow/core/grappler/costs/op_performance_data.proto
+++ b/tensorflow/core/grappler/costs/op_performance_data.proto
@@ -96,13 +96,12 @@ message OpPerformance {
     // The output information may have memory usage and output shapes.
     repeated int64 output_memory = 1;
 
-    // Temporary memory allocated by this node.
-    int64 host_temp_memory = 2;
-    int64 device_temp_memory = 3;
+    // Temp and persistent memory allocated by this node.
+    int64 temp_memory = 2;
+    int64 persistent_memory = 4;
 
-    // The persisted_memory doesn't include outputs.
-    int64 host_persistent_memory = 4;
-    int64 device_persistent_memory = 5;
+    int64 device_temp_memory = 3 [deprecated = true];
+    int64 device_persistent_memory = 5 [deprecated = true];
   }
   OpMemory op_memory = 9;
 }
diff --git a/tensorflow/core/grappler/costs/utils.cc b/tensorflow/core/grappler/costs/utils.cc
index ade0ad53fb..602f69f12e 100644
--- a/tensorflow/core/grappler/costs/utils.cc
+++ b/tensorflow/core/grappler/costs/utils.cc
@@ -285,14 +285,10 @@ OpPerformanceList CostGraphToOpPerformanceData(const CostGraphDef& cost_graph,
       perf->mutable_op_memory()->add_output_memory(output_info.size());
     }
 
-    perf->mutable_op_memory()->set_host_temp_memory(
-        cost_node->host_temp_memory_size());
-    perf->mutable_op_memory()->set_device_temp_memory(
-        cost_node->device_temp_memory_size());
-    perf->mutable_op_memory()->set_host_persistent_memory(
-        cost_node->host_persistent_memory_size());
-    perf->mutable_op_memory()->set_device_persistent_memory(
-        cost_node->device_persistent_memory_size());
+    perf->mutable_op_memory()->set_temp_memory(
+        cost_node->temporary_memory_size());
+    perf->mutable_op_memory()->set_persistent_memory(
+        cost_node->persistent_memory_size());
   }
   return ret;
 }
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index 0af889f886..d7d07ee7a5 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -984,21 +984,12 @@ Costs VirtualScheduler::Summary(RunMetadata* metadata) {
             nodestate.time_scheduled.asMicroSeconds().count());
         auto* mem_stats = node_stats->mutable_memory_stats();
         // VirtualScheduler does not specify scratch pad memory usage.
-        mem_stats->set_host_temp_memory_size(0);
-        mem_stats->set_device_temp_memory_size(0);
-        int64 host_persistent_memory_size = 0;
-        int64 device_persistent_memory_size = 0;
+        mem_stats->set_temp_memory_size(0);
+        int64 persistent_memory_size = 0;
         if (IsPersistentNode(node_def)) {
-          if (device.first.find("cpu") != string::npos ||
-              device.first.find("CPU") != string::npos) {
-            host_persistent_memory_size = total_output_size;
-          } else {
-            device_persistent_memory_size = total_output_size;
-          }
+          persistent_memory_size = total_output_size;
         }
-        mem_stats->set_host_persistent_memory_size(host_persistent_memory_size);
-        mem_stats->set_device_persistent_memory_size(
-            device_persistent_memory_size);
+        mem_stats->set_persistent_memory_size(persistent_memory_size);
         *device_partition_graph->add_node() = *node_def;
       }
     }
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index c8bfb26859..59f9f69315 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -57,12 +57,7 @@ ConstantOp::ConstantOp(OpKernelConstruction* ctx)
 void ConstantOp::Compute(OpKernelContext* ctx) {
   ctx->set_output(0, tensor_);
   if (TF_PREDICT_FALSE(ctx->track_allocations())) {
-    AllocatorAttributes attr;
-    if (ctx->allocate_on_host(attr)) {
-      ctx->record_host_persistent_memory_allocation(tensor_.AllocatedBytes());
-    } else {
-      ctx->record_device_persistent_memory_allocation(tensor_.AllocatedBytes());
-    }
+    ctx->record_persistent_memory_allocation(tensor_.AllocatedBytes());
   }
 }
 
diff --git a/tensorflow/core/kernels/constant_op_test.cc b/tensorflow/core/kernels/constant_op_test.cc
index 62cc67c736..7a05d9371d 100644
--- a/tensorflow/core/kernels/constant_op_test.cc
+++ b/tensorflow/core/kernels/constant_op_test.cc
@@ -72,9 +72,9 @@ void ConstantOpTest::PersistentMemoryTrackingTest(bool on_gpu) {
   TF_EXPECT_OK(ctx.status());
 
   if (on_gpu) {
-    EXPECT_EQ(ctx.device_persistent_memory_allocated(), 512);
+    EXPECT_EQ(ctx.persistent_memory_allocated(), 512);
   } else {
-    EXPECT_EQ(ctx.host_persistent_memory_allocated(), 480);
+    EXPECT_EQ(ctx.persistent_memory_allocated(), 480);
   }
 
   // Remove memry leak errors.
diff --git a/tensorflow/core/kernels/lookup_table_init_op.cc b/tensorflow/core/kernels/lookup_table_init_op.cc
index 38adcada6d..b352dd257c 100644
--- a/tensorflow/core/kernels/lookup_table_init_op.cc
+++ b/tensorflow/core/kernels/lookup_table_init_op.cc
@@ -82,8 +82,8 @@ class InitializeTableOp : public OpKernel {
     }
     OP_REQUIRES_OK(ctx, table->Initialize(iter));
     if (ctx->track_allocations()) {
-      ctx->record_host_persistent_memory_allocation(table->MemoryUsed() -
-                                                    memory_used_before);
+      ctx->record_persistent_memory_allocation(table->MemoryUsed() -
+                                               memory_used_before);
     }
   }
 
@@ -144,8 +144,8 @@ class InitializeTableFromTextFileOp : public OpKernel {
                             vocab_filename, vocab_size_, delimiter_, key_index_,
                             value_index_, ctx->env(), table));
     if (ctx->track_allocations()) {
-      ctx->record_host_persistent_memory_allocation(table->MemoryUsed() -
-                                                    memory_used_before);
+      ctx->record_persistent_memory_allocation(table->MemoryUsed() -
+                                               memory_used_before);
     }
   }
 
diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc
index 418d9dcc61..e3872fee0e 100644
--- a/tensorflow/core/kernels/lookup_table_op.cc
+++ b/tensorflow/core/kernels/lookup_table_op.cc
@@ -709,8 +709,8 @@ class LookupTableInsertOp : public OpKernel {
     }
     OP_REQUIRES_OK(ctx, table->Insert(ctx, keys, values));
     if (ctx->track_allocations()) {
-      ctx->record_host_persistent_memory_allocation(table->MemoryUsed() -
-                                                    memory_used_before);
+      ctx->record_persistent_memory_allocation(table->MemoryUsed() -
+                                               memory_used_before);
     }
   }
 };
@@ -786,8 +786,8 @@ class LookupTableImportOp : public OpKernel {
     }
     OP_REQUIRES_OK(ctx, table->ImportValues(ctx, keys, values));
     if (ctx->track_allocations()) {
-      ctx->record_host_persistent_memory_allocation(table->MemoryUsed() -
-                                                    memory_used_before);
+      ctx->record_persistent_memory_allocation(table->MemoryUsed() -
+                                               memory_used_before);
     }
   }
 };
diff --git a/tensorflow/core/kernels/lookup_table_op.h b/tensorflow/core/kernels/lookup_table_op.h
index ff23a09a24..5ba9b936e4 100644
--- a/tensorflow/core/kernels/lookup_table_op.h
+++ b/tensorflow/core/kernels/lookup_table_op.h
@@ -64,7 +64,7 @@ class LookupTableOp : public OpKernel {
         return ctx->status();
       }
       if (ctx->track_allocations()) {
-        ctx->record_host_persistent_memory_allocation(
+        ctx->record_persistent_memory_allocation(
             container->MemoryUsed() + table_handle_.AllocatedBytes());
       }
       *ret = container;
diff --git a/tensorflow/core/kernels/queue_op.h b/tensorflow/core/kernels/queue_op.h
index 2d68ac7a29..ad606803ee 100644
--- a/tensorflow/core/kernels/queue_op.h
+++ b/tensorflow/core/kernels/queue_op.h
@@ -44,8 +44,7 @@ class QueueOp : public ResourceOpKernel<QueueInterface> {
   void Compute(OpKernelContext* context) override {
     ResourceOpKernel<QueueInterface>::Compute(context);
     if (resource_ && context->track_allocations()) {
-      context->record_host_persistent_memory_allocation(
-          resource_->MemoryUsed());
+      context->record_persistent_memory_allocation(resource_->MemoryUsed());
     }
   }
 
diff --git a/tensorflow/core/kernels/reduction_ops_common.h b/tensorflow/core/kernels/reduction_ops_common.h
index 9da992ccd1..d7bebfb24c 100644
--- a/tensorflow/core/kernels/reduction_ops_common.h
+++ b/tensorflow/core/kernels/reduction_ops_common.h
@@ -240,14 +240,7 @@ class ReductionOp : public OpKernel {
       ctx->SetStatus(errors::Internal("Error during reduction copy."));
     }
     if (ctx->track_allocations()) {
-      // The temporary memory becomes the output memory.
-      if (ctx->allocate_on_host(alloc_attr)) {
-        ctx->record_host_temp_memory_size(
-            -static_cast<int64>(out.AllocatedBytes()));
-      } else {
-        ctx->record_device_temp_memory_size(
-            -static_cast<int64>(out.AllocatedBytes()));
-      }
+      ctx->record_temp_memory_size(-static_cast<int64>(out.AllocatedBytes()));
     }
     ctx->set_output(0, out);
   }
diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc
index 1b7079dcba..10ccc85b7c 100644
--- a/tensorflow/core/kernels/variable_ops.cc
+++ b/tensorflow/core/kernels/variable_ops.cc
@@ -76,13 +76,7 @@ void VariableOp::Compute(OpKernelContext* ctx) {
     AllocatorAttributes attr;
     attr.set_gpu_compatible(true);
     attr.set_nic_compatible(true);
-    if (ctx->allocate_on_host(attr)) {
-      ctx->record_host_persistent_memory_allocation(
-          var->tensor()->AllocatedBytes());
-    } else {
-      ctx->record_device_persistent_memory_allocation(
-          var->tensor()->AllocatedBytes());
-    }
+    ctx->record_persistent_memory_allocation(var->tensor()->AllocatedBytes());
   }
   var->Unref();
 }
@@ -113,14 +107,8 @@ class TemporaryVariableOp : public OpKernel {
                                        var_name_, tmp_var));
     context->set_output_ref(0, &tmp_var->mu, &tmp_var->val);
     if (context->track_allocations()) {
-      AllocatorAttributes attr;
-      if (context->allocate_on_host(attr)) {
-        context->record_host_persistent_memory_allocation(
-            tmp_var->val.AllocatedBytes());
-      } else {
-        context->record_device_persistent_memory_allocation(
-            tmp_var->val.AllocatedBytes());
-      }
+      context->record_persistent_memory_allocation(
+          tmp_var->val.AllocatedBytes());
     }
   }
 
@@ -163,13 +151,8 @@ class DestroyTemporaryVariableOp : public OpKernel {
     OP_REQUIRES_OK(context, rm->Delete<TemporaryVariableOp::TmpVar>(
                                 context->step_container()->name(), var_name_));
     if (context->track_allocations()) {
-      if (context->allocate_on_host(AllocatorAttributes())) {
-        context->record_host_persistent_memory_allocation(
-            -static_cast<int64>(tmpvar.AllocatedBytes()));
-      } else {
-        context->record_device_persistent_memory_allocation(
-            -static_cast<int64>(tmpvar.AllocatedBytes()));
-      }
+      context->record_persistent_memory_allocation(
+          -static_cast<int64>(tmpvar.AllocatedBytes()));
     }
   }
 
diff --git a/tensorflow/core/profiler/internal/testdata/run_meta b/tensorflow/core/profiler/internal/testdata/run_meta
index ae76acb743..eaea62b06c 100644
--- a/tensorflow/core/profiler/internal/testdata/run_meta
+++ b/tensorflow/core/profiler/internal/testdata/run_meta
diff --git a/tensorflow/core/profiler/internal/tfprof_node.cc b/tensorflow/core/profiler/internal/tfprof_node.cc
index 2945c9510f..86cb20de7b 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.cc
+++ b/tensorflow/core/profiler/internal/tfprof_node.cc
@@ -133,18 +133,21 @@ void ExecStep::AddMemoryStats(const string& dev,
   exec_mem.set_output_bytes(total_output_bytes);
 
   if (step_stat.has_memory_stats()) {
-    exec_mem.set_host_temp_bytes(
-        exec_mem.host_temp_bytes() +
-        step_stat.memory_stats().host_temp_memory_size());
-    exec_mem.set_host_persistent_bytes(
-        exec_mem.host_persistent_bytes() +
-        step_stat.memory_stats().host_persistent_memory_size());
-    exec_mem.set_accelerator_temp_bytes(
-        exec_mem.accelerator_temp_bytes() +
-        step_stat.memory_stats().device_temp_memory_size());
-    exec_mem.set_accelerator_persistent_bytes(
-        exec_mem.accelerator_persistent_bytes() +
-        step_stat.memory_stats().device_persistent_memory_size());
+    if (IsPlacedOnCPU(dev)) {
+      // Attribute by op placement; for now assume GPU ops use only GPU memory.
+      exec_mem.set_host_temp_bytes(exec_mem.host_temp_bytes() +
+                                   step_stat.memory_stats().temp_memory_size());
+      exec_mem.set_host_persistent_bytes(
+          exec_mem.host_persistent_bytes() +
+          step_stat.memory_stats().persistent_memory_size());
+    } else {
+      exec_mem.set_accelerator_temp_bytes(
+          exec_mem.accelerator_temp_bytes() +
+          step_stat.memory_stats().temp_memory_size());
+      exec_mem.set_accelerator_persistent_bytes(
+          exec_mem.accelerator_persistent_bytes() +
+          step_stat.memory_stats().persistent_memory_size());
+    }
   }
 
   // TODO(xpan): Make this more accurate: