aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/stream_executor/cuda/cuda_fft.cc
diff options
context:
space:
mode:
Diffstat (limited to 'tensorflow/stream_executor/cuda/cuda_fft.cc')
-rw-r--r--tensorflow/stream_executor/cuda/cuda_fft.cc327
1 file changed, 327 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.cc b/tensorflow/stream_executor/cuda/cuda_fft.cc
new file mode 100644
index 0000000000..59c3159895
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_fft.cc
@@ -0,0 +1,327 @@
+#include "tensorflow/stream_executor/cuda/cuda_fft.h"
+
+#include <dlfcn.h>
+
+#include <complex>
+
+#include "tensorflow/stream_executor/cuda/cuda_activation.h"
+#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
+#include "tensorflow/stream_executor/cuda/cuda_helpers.h"
+#include "tensorflow/stream_executor/cuda/cuda_platform.h"
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/dso_loader.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuFftPlugin);
+
+namespace dynload {
+
+// This macro wraps a global identifier, given by __name, in a callable
+// structure that loads the DLL symbol out of the DSO handle in a thread-safe
+// manner on first use. This dynamic loading technique is used to avoid DSO
+// dependencies on vendor libraries which may or may not be available in the
+// deployed binary environment.
+// NOTE: dlsym() resolution happens once per wrapped routine (the result is
+// cached in a function-local static inside DynLoad()), and every call goes
+// through operator(), which activates the parent executor's CUDA context
+// before invoking the underlying cuFFT entry point. Comments cannot be
+// placed inside the macro body below without breaking the line
+// continuations.
+#define PERFTOOLS_GPUTOOLS_CUFFT_WRAP(__name) \
+ struct DynLoadShim__##__name { \
+ static const char *kName; \
+ using FuncPointerT = std::add_pointer<decltype(::__name)>::type; \
+ static void *GetDsoHandle() { \
+ static auto status = internal::CachedDsoLoader::GetCufftDsoHandle(); \
+ return status.ValueOrDie(); \
+ } \
+ static FuncPointerT DynLoad() { \
+ static void *f = dlsym(GetDsoHandle(), kName); \
+ CHECK(f != nullptr) << "could not find " << kName \
+ << " in cuFFT DSO; dlerror: " << dlerror(); \
+ return reinterpret_cast<FuncPointerT>(f); \
+ } \
+ template <typename... Args> \
+ cufftResult operator()(CUDAExecutor * parent, Args... args) { \
+ cuda::ScopedActivateExecutorContext sac{parent}; \
+ return DynLoad()(args...); \
+ } \
+ } __name; \
+ const char *DynLoadShim__##__name::kName = #__name;
+
+// Invokes __macro once for each cuFFT API routine this file uses; expanding
+// it with PERFTOOLS_GPUTOOLS_CUFFT_WRAP (defined above) stamps out one
+// lazy-loading shim per routine in namespace dynload.
+#define CUFFT_ROUTINE_EACH(__macro) \
+ __macro(cufftDestroy) __macro(cufftSetStream) __macro(cufftPlan1d) \
+ __macro(cufftPlan2d) __macro(cufftPlan3d) __macro(cufftPlanMany) \
+ __macro(cufftExecD2Z) __macro(cufftExecZ2D) __macro(cufftExecC2C) \
+ __macro(cufftExecC2R) __macro(cufftExecZ2Z) \
+ __macro(cufftExecR2C)
+
+CUFFT_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUFFT_WRAP)
+
+} // namespace dynload
+
+namespace {
+
+// Maps a gputools fft::Type onto the corresponding cuFFT transform-type
+// enum. Forward and inverse variants of the same precision map to a single
+// cuFFT type; the direction is supplied separately at execution time (see
+// CUDAFftPlan::GetFftDirection). An out-of-range value is fatal.
+cufftType CUDAFftType(fft::Type type) {
+  switch (type) {
+    case fft::Type::kR2C:
+      return CUFFT_R2C;
+    case fft::Type::kC2R:
+      return CUFFT_C2R;
+    case fft::Type::kC2CForward:
+    case fft::Type::kC2CInverse:
+      return CUFFT_C2C;
+    case fft::Type::kD2Z:
+      return CUFFT_D2Z;
+    case fft::Type::kZ2D:
+      return CUFFT_Z2D;
+    case fft::Type::kZ2ZForward:
+    case fft::Type::kZ2ZInverse:
+      return CUFFT_Z2Z;
+    default:
+      LOG(FATAL) << "Invalid value of fft::Type.";
+  }
+}
+
+// Points the given cuFFT plan at the CUDA stream underlying |stream| so
+// that subsequent executions of the plan are enqueued on that stream.
+// Logs and returns false if the cuFFT call fails.
+bool SetStream(CUDAExecutor *parent, cufftHandle plan, Stream *stream) {
+  cufftResult ret =
+      dynload::cufftSetStream(parent, plan, AsCUDAStreamValue(stream));
+  if (ret == CUFFT_SUCCESS) {
+    return true;
+  }
+  LOG(ERROR) << "failed to run cuFFT routine cufftSetStream: " << ret;
+  return false;
+}
+
+} // namespace
+
+// Builds a rank-1 cuFFT plan over num_x elements with a batch size of 1.
+// On failure the error is logged and plan_ is left as cufftPlan1d set it.
+CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, fft::Type type)
+    : parent_(parent), fft_type_(type) {
+  cufftResult ret = dynload::cufftPlan1d(parent, &plan_, num_x,
+                                         CUDAFftType(type), 1 /* = batch */);
+  if (ret != CUFFT_SUCCESS) {
+    LOG(ERROR) << "failed to create cuFFT 1d plan:" << ret;
+  }
+}
+
+CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, uint64 num_y,
+ fft::Type type)
+ : parent_(parent), fft_type_(type) {
+ auto ret =
+ dynload::cufftPlan2d(parent, &plan_, num_x, num_y, CUDAFftType(type));
+ if (ret != CUFFT_SUCCESS) {
+ LOG(ERROR) << "failed to create cuFFT 2d plan:" << ret;
+ }
+}
+
+// Builds a rank-3 (num_x by num_y by num_z) cuFFT plan. On failure the
+// error is logged and plan_ is left as cufftPlan3d set it.
+CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, uint64 num_y,
+                         uint64 num_z, fft::Type type)
+    : parent_(parent), fft_type_(type) {
+  cufftResult ret = dynload::cufftPlan3d(parent, &plan_, num_x, num_y, num_z,
+                                         CUDAFftType(type));
+  if (ret != CUFFT_SUCCESS) {
+    LOG(ERROR) << "failed to create cuFFT 3d plan:" << ret;
+  }
+}
+
+// Builds a batched (and optionally strided/embedded) cuFFT plan via
+// cufftPlanMany. The uint64 dimension arrays are narrowed into int scratch
+// arrays as required by the cuFFT C API; input_embed/output_embed may be
+// null, in which case the corresponding data layout is taken to be the
+// cuFFT "basic" (contiguous) layout and nullptr is forwarded.
+CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, int rank, uint64 *elem_count,
+                         uint64 *input_embed, uint64 input_stride,
+                         uint64 input_distance, uint64 *output_embed,
+                         uint64 output_stride, uint64 output_distance,
+                         fft::Type type, int batch_count)
+    : parent_(parent), fft_type_(type) {
+  // The scratch arrays below hold at most 3 dimensions, matching the
+  // maximum transform rank cuFFT supports; a larger rank would otherwise
+  // overrun them, so fail loudly instead.
+  CHECK_LE(rank, 3) << "cuFFT supports at most rank-3 (3d) transforms";
+  int elem_count_[3], input_embed_[3], output_embed_[3];
+  for (int i = 0; i < rank; ++i) {
+    elem_count_[i] = elem_count[i];
+    if (input_embed) {
+      input_embed_[i] = input_embed[i];
+    }
+    if (output_embed) {
+      output_embed_[i] = output_embed[i];
+    }
+  }
+  auto ret = dynload::cufftPlanMany(
+      parent, &plan_, rank, elem_count_, input_embed ? input_embed_ : nullptr,
+      input_stride, input_distance, output_embed ? output_embed_ : nullptr,
+      output_stride, output_distance, CUDAFftType(type), batch_count);
+  if (ret != CUFFT_SUCCESS) {
+    LOG(ERROR) << "failed to create cuFFT batched plan:" << ret;
+  }
+}
+
+// Releases the underlying cuFFT plan handle.
+CUDAFftPlan::~CUDAFftPlan() { dynload::cufftDestroy(parent_, plan_); }
+
+// Returns the cuFFT direction flag (CUFFT_FORWARD or CUFFT_INVERSE)
+// implied by this plan's fft::Type. Used by the C2C/Z2Z execution paths,
+// where cuFFT takes the direction as an explicit argument.
+int CUDAFftPlan::GetFftDirection() const {
+  switch (fft_type_) {
+    case fft::Type::kC2CInverse:
+    case fft::Type::kZ2ZInverse:
+    case fft::Type::kC2R:
+    case fft::Type::kZ2D:
+      return CUFFT_INVERSE;
+    case fft::Type::kC2CForward:
+    case fft::Type::kZ2ZForward:
+    case fft::Type::kR2C:
+    case fft::Type::kD2Z:
+      return CUFFT_FORWARD;
+    default:
+      LOG(FATAL) << "Invalid value of fft::Type.";
+  }
+}
+
+// Creates a rank-1 plan. Neither |stream| nor |in_place_fft| participates
+// in plan construction here; the stream is attached at execution time.
+std::unique_ptr<fft::Plan> CUDAFft::Create1dPlan(Stream *stream, uint64 num_x,
+                                                 fft::Type type,
+                                                 bool in_place_fft) {
+  return std::unique_ptr<fft::Plan>(new CUDAFftPlan(parent_, num_x, type));
+}
+
+// Creates a rank-2 plan. Neither |stream| nor |in_place_fft| participates
+// in plan construction here; the stream is attached at execution time.
+std::unique_ptr<fft::Plan> CUDAFft::Create2dPlan(Stream *stream, uint64 num_x,
+                                                 uint64 num_y, fft::Type type,
+                                                 bool in_place_fft) {
+  return std::unique_ptr<fft::Plan>(
+      new CUDAFftPlan(parent_, num_x, num_y, type));
+}
+
+// Creates a rank-3 plan. Neither |stream| nor |in_place_fft| participates
+// in plan construction here; the stream is attached at execution time.
+std::unique_ptr<fft::Plan> CUDAFft::Create3dPlan(Stream *stream, uint64 num_x,
+                                                 uint64 num_y, uint64 num_z,
+                                                 fft::Type type,
+                                                 bool in_place_fft) {
+  return std::unique_ptr<fft::Plan>(
+      new CUDAFftPlan(parent_, num_x, num_y, num_z, type));
+}
+
+// Creates a batched (cufftPlanMany-backed) plan; all layout parameters are
+// forwarded to the CUDAFftPlan constructor. |stream| and |in_place_fft| do
+// not participate in plan construction here.
+std::unique_ptr<fft::Plan> CUDAFft::CreateBatchedPlan(
+    Stream *stream, int rank, uint64 *elem_count, uint64 *input_embed,
+    uint64 input_stride, uint64 input_distance, uint64 *output_embed,
+    uint64 output_stride, uint64 output_distance, fft::Type type,
+    bool in_place_fft, int batch_count) {
+  return std::unique_ptr<fft::Plan>(
+      new CUDAFftPlan(parent_, rank, elem_count, input_embed, input_stride,
+                      input_distance, output_embed, output_stride,
+                      output_distance, type, batch_count));
+}
+
+// Shared implementation for the cuFFT execution routines whose direction is
+// implied by the transform type (R2C/C2R/D2Z/Z2D). Binds the plan to
+// |stream|, then invokes |cufftExec| on the input/output device buffers.
+// Returns false (after logging) on any failure.
+template <typename FuncT, typename InputT, typename OutputT>
+bool CUDAFft::DoFftInternal(Stream *stream, fft::Plan *plan, FuncT cufftExec,
+                            const DeviceMemory<InputT> &input,
+                            DeviceMemory<OutputT> *output) {
+  CUDAFftPlan *cuda_fft_plan = dynamic_cast<CUDAFftPlan *>(plan);
+  if (cuda_fft_plan == nullptr) {
+    LOG(ERROR) << "the passed-in plan is not a CUDAFftPlan object.";
+    return false;
+  }
+
+  cufftHandle handle = cuda_fft_plan->GetPlan();
+  if (!SetStream(parent_, handle, stream)) {
+    return false;
+  }
+
+  // cuFFT takes non-const pointers even for the input buffer.
+  auto ret = cufftExec(parent_, handle,
+                       CUDAComplex(const_cast<InputT *>(CUDAMemory(input))),
+                       CUDAComplex(CUDAMemoryMutable(output)));
+  if (ret != CUFFT_SUCCESS) {
+    LOG(ERROR) << "failed to run cuFFT routine: " << ret;
+    return false;
+  }
+  return true;
+}
+
+// Shared implementation for the complex-to-complex cuFFT execution routines
+// (C2C/Z2Z), which take an explicit direction argument derived from the
+// plan's fft::Type. Binds the plan to |stream|, then invokes |cufftExec|.
+// Returns false (after logging) on any failure.
+template <typename FuncT, typename InputT, typename OutputT>
+bool CUDAFft::DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan,
+                                         FuncT cufftExec,
+                                         const DeviceMemory<InputT> &input,
+                                         DeviceMemory<OutputT> *output) {
+  CUDAFftPlan *cuda_fft_plan = dynamic_cast<CUDAFftPlan *>(plan);
+  if (cuda_fft_plan == nullptr) {
+    LOG(ERROR) << "the passed-in plan is not a CUDAFftPlan object.";
+    return false;
+  }
+
+  cufftHandle handle = cuda_fft_plan->GetPlan();
+  if (!SetStream(parent_, handle, stream)) {
+    return false;
+  }
+
+  // cuFFT takes non-const pointers even for the input buffer.
+  auto ret = cufftExec(parent_, handle,
+                       CUDAComplex(const_cast<InputT *>(CUDAMemory(input))),
+                       CUDAComplex(CUDAMemoryMutable(output)),
+                       cuda_fft_plan->GetFftDirection());
+  if (ret != CUFFT_SUCCESS) {
+    LOG(ERROR) << "failed to run cuFFT routine: " << ret;
+    return false;
+  }
+  return true;
+}
+
+// Stamps out the three DoFft overloads for one scalar type __type:
+// complex->complex (which needs an explicit direction, hence
+// DoFftWithDirectionInternal), real->complex, and complex->real.
+// __fft_type1..3 are the matching cuFFT routine suffixes (e.g. C2C, R2C,
+// C2R). Comments cannot be placed inside the macro body without breaking
+// the line continuations.
+#define PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(__type, __fft_type1, __fft_type2, \
+ __fft_type3) \
+ bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan, \
+ const DeviceMemory<std::complex<__type>> &input, \
+ DeviceMemory<std::complex<__type>> *output) { \
+ return DoFftWithDirectionInternal( \
+ stream, plan, dynload::cufftExec##__fft_type1, input, output); \
+ } \
+ bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan, \
+ const DeviceMemory<__type> &input, \
+ DeviceMemory<std::complex<__type>> *output) { \
+ return DoFftInternal(stream, plan, dynload::cufftExec##__fft_type2, input, \
+ output); \
+ } \
+ bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan, \
+ const DeviceMemory<std::complex<__type>> &input, \
+ DeviceMemory<__type> *output) { \
+ return DoFftInternal(stream, plan, dynload::cufftExec##__fft_type3, input, \
+ output); \
+ }
+
+// Single precision uses C2C/R2C/C2R; double precision uses Z2Z/D2Z/Z2D.
+PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(float, C2C, R2C, C2R)
+PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(double, Z2Z, D2Z, Z2D)
+
+#undef PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT
+
+} // namespace cuda
+} // namespace gputools
+} // namespace perftools
+
+namespace gpu = ::perftools::gputools;
+
+// Module initializer: registers a factory that builds a CUDAFft instance
+// for a CUDA StreamExecutor, primes the cuFFT DSO loader, and installs
+// cuFFT as the default FFT plugin for the CUDA platform.
+REGISTER_MODULE_INITIALIZER(register_cufft, {
+  gpu::port::Status status =
+      gpu::PluginRegistry::Instance()
+          ->RegisterFactory<gpu::PluginRegistry::FftFactory>(
+              gpu::cuda::kCudaPlatformId, gpu::cuda::kCuFftPlugin, "cuFFT",
+              [](gpu::internal::StreamExecutorInterface
+                     *parent) -> gpu::fft::FftSupport * {
+                // The factory only supports CUDA executors; anything else
+                // yields a null FftSupport after logging.
+                gpu::cuda::CUDAExecutor *cuda_executor =
+                    dynamic_cast<gpu::cuda::CUDAExecutor *>(parent);
+                if (cuda_executor == nullptr) {
+                  LOG(ERROR)
+                      << "Attempting to initialize an instance of the cuFFT "
+                      << "support library with a non-CUDA StreamExecutor";
+                  return nullptr;
+                }
+
+                return new gpu::cuda::CUDAFft(cuda_executor);
+              });
+  // Registration failure is logged but not fatal here.
+  if (!status.ok()) {
+    LOG(ERROR) << "Unable to register cuFFT factory: "
+               << status.error_message();
+  }
+
+  // Prime the cuFFT DSO. The loader will log more information.
+  auto statusor = gpu::internal::CachedDsoLoader::GetCufftDsoHandle();
+  if (!statusor.ok()) {
+    LOG(INFO) << "Unable to load cuFFT DSO.";
+  }
+
+  // Make cuFFT the platform's default FFT implementation.
+  gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId,
+                                                     gpu::PluginKind::kFft,
+                                                     gpu::cuda::kCuFftPlugin);
+});