1 files changed, 327 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.cc b/tensorflow/stream_executor/cuda/cuda_fft.cc
new file mode 100644
index 0000000000..59c3159895
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_fft.cc
@@ -0,0 +1,327 @@
+#include "tensorflow/stream_executor/cuda/cuda_fft.h"
+
+#include <dlfcn.h>
+
+#include <complex>
+
+#include "tensorflow/stream_executor/cuda/cuda_activation.h"
+#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
+#include "tensorflow/stream_executor/cuda/cuda_helpers.h"
+#include "tensorflow/stream_executor/cuda/cuda_platform.h"
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/dso_loader.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+// Plugin id under which the cuFFT factory is registered later in this file.
+PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuFftPlugin);
+
+namespace dynload {
+
+// PERFTOOLS_GPUTOOLS_CUFFT_WRAP(__name) defines a callable object named after
+// the cuFFT routine __name. On first use it resolves the routine with dlsym()
+// from the cached cuFFT DSO handle in a thread-safe manner (function-local
+// statics) and CHECK-fails if it is missing; each call activates the context of
+// `parent` first. This avoids a DSO dependency on a possibly absent vendor lib.
+#define PERFTOOLS_GPUTOOLS_CUFFT_WRAP(__name)                              \
+  struct DynLoadShim__##__name {                                           \
+    static const char *kName;                                              \
+    using FuncPointerT = std::add_pointer<decltype(::__name)>::type;       \
+    static void *GetDsoHandle() {                                          \
+      static auto status = internal::CachedDsoLoader::GetCufftDsoHandle(); \
+      return status.ValueOrDie();                                          \
+    }                                                                      \
+    static FuncPointerT DynLoad() {                                        \
+      static void *f = dlsym(GetDsoHandle(), kName);                       \
+      CHECK(f != nullptr) << "could not find " << kName                    \
+                          << " in cuFFT DSO; dlerror: " << dlerror();      \
+      return reinterpret_cast<FuncPointerT>(f);                            \
+    }                                                                      \
+    template <typename... Args>                                            \
+    cufftResult operator()(CUDAExecutor * parent, Args... args) {          \
+      cuda::ScopedActivateExecutorContext sac{parent};                     \
+      return DynLoad()(args...);                                           \
+    }                                                                      \
+  } __name;                                                                \
+  const char *DynLoadShim__##__name::kName = #__name;
+// X-macro: applies __macro to each cuFFT routine that this file calls.
+#define CUFFT_ROUTINE_EACH(__macro)                                         \
+  __macro(cufftDestroy) __macro(cufftSetStream) __macro(cufftPlan1d)        \
+      __macro(cufftPlan2d) __macro(cufftPlan3d) __macro(cufftPlanMany)      \
+          __macro(cufftExecD2Z) __macro(cufftExecZ2D) __macro(cufftExecC2C) \
+              __macro(cufftExecC2R) __macro(cufftExecZ2Z)                   \
+                  __macro(cufftExecR2C)
+
+CUFFT_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUFFT_WRAP)
+
+}  // namespace dynload
+
+namespace {
+
+// Translates a gpu_fft transform type into the matching cuFFT transform kind.
+// Forward and inverse complex-to-complex variants map to the same cuFFT type;
+// an unrecognized value is fatal.
+cufftType CUDAFftType(fft::Type type) {
+  // Complex-to-complex: the direction is not part of the cuFFT type.
+  if (type == fft::Type::kC2CForward || type == fft::Type::kC2CInverse) {
+    return CUFFT_C2C;
+  }
+  if (type == fft::Type::kZ2ZForward || type == fft::Type::kZ2ZInverse) {
+    return CUFFT_Z2Z;
+  }
+
+  // Transforms between real and complex data map one-to-one.
+  if (type == fft::Type::kC2R) return CUFFT_C2R;
+  if (type == fft::Type::kR2C) return CUFFT_R2C;
+  if (type == fft::Type::kZ2D) return CUFFT_Z2D;
+  if (type == fft::Type::kD2Z) return CUFFT_D2Z;
+
+  // Nothing matched: not a known fft::Type.
+  LOG(FATAL) << "Invalid value of fft::Type.";
+}
+
+// Binds `plan` to the CUDA stream underlying `stream`. Returns true on success;
+// otherwise logs the cuFFT status and returns false.
+bool SetStream(CUDAExecutor *parent, cufftHandle plan, Stream *stream) {
+  const cufftResult status =
+      dynload::cufftSetStream(parent, plan, AsCUDAStreamValue(stream));
+  if (status == CUFFT_SUCCESS) return true;
+  LOG(ERROR) << "failed to run cuFFT routine cufftSetStream: " << status;
+  return false;
+}
+
+}  // namespace
+
+// Creates a single-batch 1d cuFFT plan; a creation failure is only logged.
+CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, fft::Type type)
+    : parent_(parent), fft_type_(type) {
+  const cufftType kind = CUDAFftType(type);
+  const auto status = dynload::cufftPlan1d(parent, &plan_, num_x, kind, 1);
+  if (status == CUFFT_SUCCESS) return;
+  LOG(ERROR) << "failed to create cuFFT 1d plan:" << status;
+}
+
+// Creates a 2d cuFFT plan of num_x by num_y elements; failure is only logged.
+CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, uint64 num_y,
+                         fft::Type type)
+    : parent_(parent), fft_type_(type) {
+  const cufftType kind = CUDAFftType(type);
+  const auto status = dynload::cufftPlan2d(parent, &plan_, num_x, num_y, kind);
+  if (status == CUFFT_SUCCESS) return;
+  LOG(ERROR) << "failed to create cuFFT 2d plan:" << status;
+}
+
+// Creates a 3d cuFFT plan (num_x by num_y by num_z); failure is only logged.
+CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, uint64 num_y,
+                         uint64 num_z, fft::Type type)
+    : parent_(parent), fft_type_(type) {
+  const cufftResult status = dynload::cufftPlan3d(
+      parent, &plan_, num_x, num_y, num_z, CUDAFftType(type));
+  if (status == CUFFT_SUCCESS) return;
+  LOG(ERROR) << "failed to create cuFFT 3d plan:" << status;
+}
+
+// Creates a batched plan. cuFFT takes int arrays, so the uint64 dimensions are
+// narrowed into 3-element buffers; a rank that does not fit them is fatal
+// rather than an overrun. Null embed arrays stay null. Failure is only logged.
+CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, int rank, uint64 *elem_count,
+                         uint64 *input_embed, uint64 input_stride,
+                         uint64 input_distance, uint64 *output_embed,
+                         uint64 output_stride, uint64 output_distance,
+                         fft::Type type, int batch_count)
+    : parent_(parent), fft_type_(type) {
+  constexpr int kMaxRank = 3;
+  CHECK(rank <= kMaxRank) << "cuFFT batched plan rank too large: " << rank;
+  int elem_count_[kMaxRank], input_embed_[kMaxRank], output_embed_[kMaxRank];
+  for (int i = 0; i < rank; ++i) {
+    elem_count_[i] = static_cast<int>(elem_count[i]);
+    if (input_embed) input_embed_[i] = static_cast<int>(input_embed[i]);
+    if (output_embed) output_embed_[i] = static_cast<int>(output_embed[i]);
+  }
+  auto ret = dynload::cufftPlanMany(
+      parent, &plan_, rank, elem_count_, input_embed ? input_embed_ : nullptr,
+      input_stride, input_distance, output_embed ? output_embed_ : nullptr,
+      output_stride, output_distance, CUDAFftType(type), batch_count);
+  if (ret == CUFFT_SUCCESS) return;
+  LOG(ERROR) << "failed to create cuFFT batched plan:" << ret;
+}
+// Destroys the cuFFT plan handle; the status cufftDestroy returns is ignored.
+CUDAFftPlan::~CUDAFftPlan() { dynload::cufftDestroy(parent_, plan_); }
+
+// Returns the cuFFT direction implied by this plan's transform type: forward
+// for the *Forward, R2C and D2Z types, inverse for the *Inverse, C2R and Z2D
+// types. Any other value is fatal.
+int CUDAFftPlan::GetFftDirection() const {
+  const auto t = fft_type_;
+  if (t == fft::Type::kC2CForward || t == fft::Type::kZ2ZForward ||
+      t == fft::Type::kR2C || t == fft::Type::kD2Z) {
+    return CUFFT_FORWARD;
+  }
+  if (t == fft::Type::kC2CInverse || t == fft::Type::kZ2ZInverse ||
+      t == fft::Type::kC2R || t == fft::Type::kZ2D) {
+    return CUFFT_INVERSE;
+  }
+
+  LOG(FATAL) << "Invalid value of fft::Type.";
+}
+
+// Builds a 1d plan on this executor; `stream` and `in_place_fft` are unused.
+std::unique_ptr<fft::Plan> CUDAFft::Create1dPlan(Stream *stream, uint64 num_x,
+                                                 fft::Type type,
+                                                 bool in_place_fft) {
+  return std::unique_ptr<fft::Plan>(new CUDAFftPlan(parent_, num_x, type));
+}
+
+std::unique_ptr<fft::Plan> CUDAFft::Create2dPlan(Stream *stream, uint64 num_x,
+                                                 uint64 num_y, fft::Type type,
+                                                 bool in_place_fft) {
+  fft::Plan *raw = new CUDAFftPlan(parent_, num_x, num_y, type);
+  return std::unique_ptr<fft::Plan>(raw);  // stream, in_place_fft are unused
+}
+
+// Builds a 3d plan on this executor; `stream` and `in_place_fft` are unused.
+std::unique_ptr<fft::Plan> CUDAFft::Create3dPlan(Stream *stream, uint64 num_x,
+                                                 uint64 num_y, uint64 num_z,
+                                                 fft::Type type,
+                                                 bool in_place_fft) {
+  return std::unique_ptr<fft::Plan>(
+      new CUDAFftPlan(parent_, num_x, num_y, num_z, type));
+}
+
+// Builds a batched plan here; `stream` and `in_place_fft` are unused.
+std::unique_ptr<fft::Plan> CUDAFft::CreateBatchedPlan(
+    Stream *stream, int rank, uint64 *elem_count, uint64 *input_embed,
+    uint64 input_stride, uint64 input_distance, uint64 *output_embed,
+    uint64 output_stride, uint64 output_distance, fft::Type type,
+    bool in_place_fft, int batch_count) {
+  return std::unique_ptr<fft::Plan>(new CUDAFftPlan(
+      parent_, rank, elem_count, input_embed, input_stride, input_distance,
+      output_embed, output_stride, output_distance, type, batch_count));
+}
+
+// Runs `cufftExec`, a routine taking no direction argument, for `plan` on
+// `stream`. Returns false, after logging, when `plan` is not a CUDAFftPlan,
+// when the stream cannot be attached, or when the routine reports an error.
+template <typename FuncT, typename InputT, typename OutputT>
+bool CUDAFft::DoFftInternal(Stream *stream, fft::Plan *plan, FuncT cufftExec,
+                            const DeviceMemory<InputT> &input,
+                            DeviceMemory<OutputT> *output) {
+  auto *typed_plan = dynamic_cast<CUDAFftPlan *>(plan);
+  if (!typed_plan) {
+    LOG(ERROR) << "the passed-in plan is not a CUDAFftPlan object.";
+    return false;
+  }
+  if (!SetStream(parent_, typed_plan->GetPlan(), stream)) {
+    return false;
+  }
+
+  // Constness is cast away from the input before it is handed to the routine.
+  auto in = CUDAComplex(const_cast<InputT *>(CUDAMemory(input)));
+  auto out = CUDAComplex(CUDAMemoryMutable(output));
+  const auto status = cufftExec(parent_, typed_plan->GetPlan(), in, out);
+  if (status == CUFFT_SUCCESS) return true;
+
+  LOG(ERROR) << "failed to run cuFFT routine: " << status;
+  return false;
+}
+
+// Runs `cufftExec` for `plan` on `stream`, passing the plan's FFT direction as
+// the last argument. Returns false, after logging, when `plan` is not a
+// CUDAFftPlan, the stream cannot be attached, or the routine reports an error.
+template <typename FuncT, typename InputT, typename OutputT>
+bool CUDAFft::DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan,
+                                         FuncT cufftExec,
+                                         const DeviceMemory<InputT> &input,
+                                         DeviceMemory<OutputT> *output) {
+  auto *typed_plan = dynamic_cast<CUDAFftPlan *>(plan);
+  if (!typed_plan) {
+    LOG(ERROR) << "the passed-in plan is not a CUDAFftPlan object.";
+    return false;
+  }
+  if (!SetStream(parent_, typed_plan->GetPlan(), stream)) {
+    return false;
+  }
+
+  // Constness is cast away from the input before it is handed to the routine.
+  auto in = CUDAComplex(const_cast<InputT *>(CUDAMemory(input)));
+  auto out = CUDAComplex(CUDAMemoryMutable(output));
+  const auto status = cufftExec(parent_, typed_plan->GetPlan(), in, out,
+                                typed_plan->GetFftDirection());
+  if (status == CUFFT_SUCCESS) return true;
+
+  LOG(ERROR) << "failed to run cuFFT routine: " << status;
+  return false;
+}
+// Defines DoFft for complex->complex, real->complex and complex->real data.
+#define PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(__type, __fft_type1, __fft_type2,   \
+                                           __fft_type3)                        \
+  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                         \
+                      const DeviceMemory<std::complex<__type>> &input,         \
+                      DeviceMemory<std::complex<__type>> *output) {            \
+    return DoFftWithDirectionInternal(                                         \
+        stream, plan, dynload::cufftExec##__fft_type1, input, output);         \
+  }                                                                            \
+  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                         \
+                      const DeviceMemory<__type> &input,                       \
+                      DeviceMemory<std::complex<__type>> *output) {            \
+    return DoFftInternal(stream, plan, dynload::cufftExec##__fft_type2, input, \
+                         output);                                              \
+  }                                                                            \
+  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                         \
+                      const DeviceMemory<std::complex<__type>> &input,         \
+                      DeviceMemory<__type> *output) {                          \
+    return DoFftInternal(stream, plan, dynload::cufftExec##__fft_type3, input, \
+                         output);                                              \
+  }
+
+PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(float, C2C, R2C, C2R)
+PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(double, Z2Z, D2Z, Z2D)
+
+#undef PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+namespace gpu = ::perftools::gputools;
+// Registers cuFFT as an FFT plugin for the CUDA platform at module init time.
+REGISTER_MODULE_INITIALIZER(register_cufft, {
+  gpu::port::Status status =
+      gpu::PluginRegistry::Instance()
+          ->RegisterFactory<gpu::PluginRegistry::FftFactory>(
+              gpu::cuda::kCudaPlatformId, gpu::cuda::kCuFftPlugin, "cuFFT",
+              [](gpu::internal::StreamExecutorInterface
+                     *parent) -> gpu::fft::FftSupport * {
+                gpu::cuda::CUDAExecutor *cuda_executor =
+                    dynamic_cast<gpu::cuda::CUDAExecutor *>(parent);
+                if (cuda_executor == nullptr) {
+                  LOG(ERROR)
+                      << "Attempting to initialize an instance of the cuFFT "
+                      << "support library with a non-CUDA StreamExecutor";
+                  return nullptr;
+                }
+
+                return new gpu::cuda::CUDAFft(cuda_executor);
+              });
+  if (!status.ok()) {
+    LOG(ERROR) << "Unable to register cuFFT factory: "
+               << status.error_message();
+  }
+
+  // Prime the cuFFT DSO. The loader will log more information.
+  auto statusor = gpu::internal::CachedDsoLoader::GetCufftDsoHandle();
+  if (!statusor.ok()) {
+    LOG(INFO) << "Unable to load cuFFT DSO.";
+  }
+  // Make cuFFT the default FFT plugin; this runs even after the failures above.
+  gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId,
+                                                     gpu::PluginKind::kFft,
+                                                     gpu::cuda::kCuFftPlugin);
+});