diff options
author | 2018-09-28 19:49:23 -0700 | |
---|---|---|
committer | 2018-09-28 19:49:23 -0700 | |
commit | d936d819752916d3122f02def571ecac9e995029 (patch) | |
tree | 5b86fe5b82d0ddea4569afe36fb4dbd668b736da /tensorflow/core/common_runtime | |
parent | a287961cffcb9ae1a0675f4e18d14674dfae130a (diff) |
Lower the MKLCpuAllocator priority so that the default allocator can be used when MKL is disabled, along with some minor changes
Diffstat (limited to 'tensorflow/core/common_runtime')
-rw-r--r-- | tensorflow/core/common_runtime/mkl_cpu_allocator.h | 54 | ||||
-rw-r--r-- | tensorflow/core/common_runtime/process_util.cc | 37 | ||||
-rw-r--r-- | tensorflow/core/common_runtime/threadpool_device.cc | 4 |
3 files changed, 36 insertions(+), 59 deletions(-)
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h index 516138d28d..429b19599b 100644 --- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h +++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h @@ -27,7 +27,6 @@ limitations under the License. #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/mem.h" -#include "tensorflow/core/util/util.h" #include "tensorflow/core/platform/numa.h" #ifndef INTEL_MKL_DNN_ONLY @@ -164,12 +163,6 @@ class MklCPUAllocator : public Allocator { } Status Initialize() { - if (DisableMKL()) { - VLOG(1) << "TF-MKL: Disabling pool allocator"; - tf_disable_pool_allocator_flag_ = true; - return Status::OK(); - } - VLOG(2) << "MklCPUAllocator: In MklCPUAllocator"; // Set upper bound on memory allocation to physical RAM available on the @@ -224,10 +217,6 @@ class MklCPUAllocator : public Allocator { inline string Name() override { return kName; } inline void* AllocateRaw(size_t alignment, size_t num_bytes) override { - if (tf_disable_pool_allocator_flag_) { - return port::AlignedMalloc(num_bytes, alignment); - } - // If the allocation size is less than threshold, call small allocator, // otherwise call large-size allocator (BFC). We found that BFC allocator // does not deliver good performance for small allocations when @@ -238,10 +227,6 @@ class MklCPUAllocator : public Allocator { } inline void DeallocateRaw(void* ptr) override { - if (tf_disable_pool_allocator_flag_) { - port::AlignedFree(ptr); - return; - } // Check if ptr is for "small" allocation. If it is, then call Free // directly. Otherwise, call BFC to handle free. 
if (small_size_allocator_->IsSmallSizeAllocation(ptr)) { @@ -252,30 +237,26 @@ class MklCPUAllocator : public Allocator { } void GetStats(AllocatorStats* stats) override { - if (!tf_disable_pool_allocator_flag_) { - AllocatorStats l_stats, s_stats; - small_size_allocator_->GetStats(&s_stats); - large_size_allocator_->GetStats(&l_stats); - - // Combine statistics from small-size and large-size allocator. - stats->num_allocs = l_stats.num_allocs + s_stats.num_allocs; - stats->bytes_in_use = l_stats.bytes_in_use + s_stats.bytes_in_use; - stats->max_bytes_in_use = - l_stats.max_bytes_in_use + s_stats.max_bytes_in_use; - - // Since small-size allocations go to MklSmallSizeAllocator, - // max_alloc_size from large_size_allocator would be the maximum - // size allocated by MklCPUAllocator. - stats->max_alloc_size = l_stats.max_alloc_size; - stats->bytes_limit = std::max(s_stats.bytes_limit, l_stats.bytes_limit); - } + AllocatorStats l_stats, s_stats; + small_size_allocator_->GetStats(&s_stats); + large_size_allocator_->GetStats(&l_stats); + + // Combine statistics from small-size and large-size allocator. + stats->num_allocs = l_stats.num_allocs + s_stats.num_allocs; + stats->bytes_in_use = l_stats.bytes_in_use + s_stats.bytes_in_use; + stats->max_bytes_in_use = + l_stats.max_bytes_in_use + s_stats.max_bytes_in_use; + + // Since small-size allocations go to MklSmallSizeAllocator, + // max_alloc_size from large_size_allocator would be the maximum + // size allocated by MklCPUAllocator. 
+ stats->max_alloc_size = l_stats.max_alloc_size; + stats->bytes_limit = std::max(s_stats.bytes_limit, l_stats.bytes_limit); } void ClearStats() override { - if (!tf_disable_pool_allocator_flag_) { - small_size_allocator_->ClearStats(); - large_size_allocator_->ClearStats(); - } + small_size_allocator_->ClearStats(); + large_size_allocator_->ClearStats(); } private: @@ -314,7 +295,6 @@ class MklCPUAllocator : public Allocator { // The alignment that we need for the allocations static constexpr const size_t kAlignment = 64; - bool tf_disable_pool_allocator_flag_ = false; Allocator* large_size_allocator_; // owned by this class MklSmallSizeAllocator* small_size_allocator_; // owned by this class. diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc index 60fa601907..b3064a4c08 100644 --- a/tensorflow/core/common_runtime/process_util.cc +++ b/tensorflow/core/common_runtime/process_util.cc @@ -57,28 +57,25 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) { const int32 inter_op = options.config.inter_op_parallelism_threads(); if (inter_op != 0) return inter_op; #ifdef INTEL_MKL - // Early return if MKL is disabled - if (DisableMKL()) - return port::NumSchedulableCPUs(); - - // MKL library executes ops in parallel using OMP threads - // Set inter_op conservatively to avoid thread oversubscription that could - // lead to severe perf degradations and OMP resource exhaustion - int mkl_intra_op = 1; -#ifdef _OPENMP - mkl_intra_op = omp_get_max_threads(); -#endif // _OPENMP - CHECK_GE(mkl_intra_op, 1); - const int32 mkl_inter_op = std::max( - (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2); - VLOG(0) << "Creating new thread pool with default inter op setting: " - << mkl_inter_op - << ". 
Tune using inter_op_parallelism_threads for best performance."; - return mkl_inter_op; -#else + if (!DisableMKL()) { + // MKL library executes ops in parallel using OMP threads + // Set inter_op conservatively to avoid thread oversubscription that could + // lead to severe perf degradations and OMP resource exhaustion + int mkl_intra_op = 1; + #ifdef _OPENMP + mkl_intra_op = omp_get_max_threads(); + #endif // _OPENMP + CHECK_GE(mkl_intra_op, 1); + const int32 mkl_inter_op = std::max( + (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2); + VLOG(0) << "Creating new thread pool with default inter op setting: " + << mkl_inter_op + << ". Tune using inter_op_parallelism_threads for best performance."; + return mkl_inter_op; + } +#endif // INTEL_MKL // Default to using the number of cores available in the process. return port::NumSchedulableCPUs(); -#endif // INTEL_MKL } thread::ThreadPool* NewThreadPoolFromSessionOptions( diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc index 29c01d7f72..f188016610 100644 --- a/tensorflow/core/common_runtime/threadpool_device.cc +++ b/tensorflow/core/common_runtime/threadpool_device.cc @@ -50,7 +50,7 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options, allocator_(allocator), scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) { #ifdef INTEL_MKL - // Eearly return when MKL is disabled + // Early return when MKL is disabled if (DisableMKL()) return; #ifdef _OPENMP @@ -118,7 +118,7 @@ class MklCPUAllocatorFactory : public AllocatorFactory { }; #ifdef ENABLE_MKL -REGISTER_MEM_ALLOCATOR("MklCPUAllocator", 200, MklCPUAllocatorFactory); +REGISTER_MEM_ALLOCATOR("MklCPUAllocator", (DisableMKL() ? 50 : 200), MklCPUAllocatorFactory); #endif // ENABLE_MKL } // namespace |