1 files changed, 11 insertions, 3 deletions
diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc
index e4f66c3b4b..67e10f7c05 100644
--- a/tensorflow/core/common_runtime/gpu/process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/process_state.cc
@@ -187,9 +187,17 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
     gpu::Platform* gpu_platform = GPUMachineManager();
     gpu::StreamExecutor* se = gpu_platform->ExecutorForDevice(0).ValueOrDie();
     CHECK(se);
-    Allocator* allocator = new PoolAllocator(
-        100 /*pool_size_limit*/, true /*auto_resize*/,
-        new CUDAHostAllocator(se), new Pow2Rounder, "cuda_host");
+    Allocator* allocator = nullptr;
+    static constexpr bool kCudaHostMemoryUseBFC = true;
+    if (kCudaHostMemoryUseBFC) {
+      allocator =
+          new BFCAllocator(new CUDAHostAllocator(se), 1LL << 36 /*64GB max*/,
+                           true /*allow_growth*/, "cuda_host_bfc" /*name*/);
+    } else {
+      allocator = new PoolAllocator(
+          100 /*pool_size_limit*/, true /*auto_resize*/,
+          new CUDAHostAllocator(se), new Pow2Rounder, "cuda_host");
+    }
     if (LogMemory::IsEnabled()) {
       // Wrap the allocator to track allocation ids for better logging
       // at the cost of performance.