diff options
author | 2018-06-06 11:26:43 -0700 | |
---|---|---|
committer | 2018-06-06 11:29:18 -0700 | |
commit | 879fc3440495d9388754cb7d1878caf034d03d61 (patch) | |
tree | eea14054c8f73365f1c185b3d86f5eca698ab722 /tensorflow/python/lib | |
parent | 980c390941853649bb56c4940a46f474eb97ed80 (diff) |
Use memmove instead of memcpy for the large tensors on Linux.
Issue: #17246
~1.7x speedup for fetching a variable
Before:
fetch_cpu_variable : 5.5 GB/sec, min: 14.56, median: 15.05, mean: 15.14
fetch_cpu_variable_add: 11.0 GB/sec, min: 7.29, median: 12.03, mean: 12.56
fetch_cpu_variable_concat: 11.6 GB/sec, min: 6.92, median: 13.78, mean: 14.76
After:
fetch_cpu_variable : 9.2 GB/sec, min: 8.71, median: 8.79, mean: 8.80
fetch_cpu_variable_add: 12.5 GB/sec, min: 6.41, median: 7.20, mean: 7.51
fetch_cpu_variable_concat: 12.7 GB/sec, min: 6.32, median: 6.54
PiperOrigin-RevId: 199497691
Diffstat (limited to 'tensorflow/python/lib')
-rw-r--r-- | tensorflow/python/lib/core/ndarray_tensor.cc | 38 |
1 files changed, 36 insertions, 2 deletions
diff --git a/tensorflow/python/lib/core/ndarray_tensor.cc b/tensorflow/python/lib/core/ndarray_tensor.cc
index 9df38d464c..2acab92764 100644
--- a/tensorflow/python/lib/core/ndarray_tensor.cc
+++ b/tensorflow/python/lib/core/ndarray_tensor.cc
@@ -312,6 +312,40 @@ Status GetPyArrayDescrForTensor(const TF_Tensor* tensor,
   return Status::OK();
 }
 
+
+inline void FastMemcpy(void* dst, const void* src, size_t size) {
+  // clang-format off
+  switch (size) {
+    // Most compilers will generate inline code for fixed sizes,
+    // which is significantly faster for small copies.
+    case 1: memcpy(dst, src, 1); break;
+    case 2: memcpy(dst, src, 2); break;
+    case 3: memcpy(dst, src, 3); break;
+    case 4: memcpy(dst, src, 4); break;
+    case 5: memcpy(dst, src, 5); break;
+    case 6: memcpy(dst, src, 6); break;
+    case 7: memcpy(dst, src, 7); break;
+    case 8: memcpy(dst, src, 8); break;
+    case 9: memcpy(dst, src, 9); break;
+    case 10: memcpy(dst, src, 10); break;
+    case 11: memcpy(dst, src, 11); break;
+    case 12: memcpy(dst, src, 12); break;
+    case 13: memcpy(dst, src, 13); break;
+    case 14: memcpy(dst, src, 14); break;
+    case 15: memcpy(dst, src, 15); break;
+    case 16: memcpy(dst, src, 16); break;
+#if defined(PLATFORM_GOOGLE) || defined(PLATFORM_POSIX) && \
+    !defined(IS_MOBILE_PLATFORM)
+    // On Linux, memmove appears to be faster than memcpy for
+    // large sizes, strangely enough.
+    default: memmove(dst, src, size); break;
+#else
+    default: memcpy(dst, src, size); break;
+#endif
+  }
+  // clang-format on
+}
+
 }  // namespace
 
 // Converts the given TF_Tensor to a numpy ndarray.
@@ -362,8 +396,8 @@ Status TF_TensorToPyArray(Safe_TF_TensorPtr tensor, PyObject** out_ndarray) {
                             " bytes but TF_Tensor was ",
                             TF_TensorByteSize(tensor.get()), " bytes");
   } else {
-    memcpy(PyArray_DATA(py_array), TF_TensorData(tensor.get()),
-           PyArray_NBYTES(py_array));
+    FastMemcpy(PyArray_DATA(py_array), TF_TensorData(tensor.get()),
+               PyArray_NBYTES(py_array));
   }
 
   // PyArray_Return turns rank 0 arrays into numpy scalars