#include "tensorflow/stream_executor/kernel_spec.h"


namespace perftools {
namespace gputools {

KernelLoaderSpec::KernelLoaderSpec(port::StringPiece kernelname)
    : kernelname_(kernelname.ToString()) {}

OnDiskKernelLoaderSpec::OnDiskKernelLoaderSpec(port::StringPiece filename,
                                               port::StringPiece kernelname)
    : KernelLoaderSpec(kernelname), filename_(filename.ToString()) {}

CudaPtxOnDisk::CudaPtxOnDisk(port::StringPiece filename,
                             port::StringPiece kernelname)
    : OnDiskKernelLoaderSpec(filename, kernelname) {}

CudaCubinOnDisk::CudaCubinOnDisk(port::StringPiece filename,
                                 port::StringPiece kernelname)
    : OnDiskKernelLoaderSpec(filename, kernelname) {}

CudaCubinInMemory::CudaCubinInMemory(const char *bytes,
                                     port::StringPiece kernelname)
    : KernelLoaderSpec(kernelname), bytes_(bytes) {}

bool CompareComputeCapability(const std::tuple<int, int> &lhs,
                              const std::tuple<int, int> &rhs) {
  return std::get<0>(lhs) < std::get<0>(rhs) ||
         (std::get<0>(lhs) == std::get<0>(rhs) &&
          std::get<1>(lhs) < std::get<1>(rhs));
}

const std::tuple<int, int> CudaPtxInMemory::kMinimumCapability{1, 0};

CudaPtxInMemory::CudaPtxInMemory(port::StringPiece ptx,
                                 port::StringPiece kernel_name,
                                 bool ptx_compressed)
    : KernelLoaderSpec(kernel_name),
      ptx_by_compute_capability_(CompareComputeCapability) {
  if (ptx_compressed) {
    // Lazy decompression. Put an empty string in decompressed_ptx_ showing that
    // the original ptx is compressed.
    decompressed_ptx_[ptx.data()] = "";
  }
  ptx_by_compute_capability_[kMinimumCapability] = ptx.data();
}

CudaPtxInMemory::CudaPtxInMemory(
    const std::initializer_list<CudaPtxInMemory::PtxSpec> &spec_list,
    port::StringPiece kernel_name, bool ptx_compressed)
    : KernelLoaderSpec(kernel_name),
      ptx_by_compute_capability_(CompareComputeCapability) {
  for (const auto &spec : spec_list) {
    int major, minor;
    port::StringPiece ptx;
    std::tie(major, minor, ptx) = spec;
    if (ptx_compressed) {
      // Lazy decompression. Put an empty string in decompressed_ptx_ showing
      // that the original ptx is compressed.
      decompressed_ptx_[ptx.data()] = "";
    }
    ptx_by_compute_capability_[std::tuple<int, int>{major, minor}] = ptx.data();
  }
}

string CudaPtxInMemory::DecompressPtx(const char *ptx) {
  // Get the length of the PTX string from the beginning of the buffer.
  uint64 ptx_length = *reinterpret_cast<const uint64 *>(ptx);
  // Get the PTX string from the buffer with offset and length.
  string compressed_ptx(ptx + sizeof(uint64),
                        ptx + sizeof(uint64) + ptx_length);
  string decompressed_ptx;
  // Decompress the PTX string with bzip2.
  LOG(FATAL) << "bzip2 decompression is not supported yet.";
  return decompressed_ptx;
}

const char *CudaPtxInMemory::default_text() const {
  if (ptx_by_compute_capability_.empty()) {
    return nullptr;
  }

  mutex_lock lock{mu_};

  auto ptx = ptx_by_compute_capability_.begin()->second;
  // Check if there is an entry in decompressed ptx table.
  auto decompressed_ptx_iter = decompressed_ptx_.find(ptx);
  if (decompressed_ptx_iter != decompressed_ptx_.end()) {
    // If the decompressed string is empty, which means the ptx hasn't been
    // decompressed, decompress it here.
    if (decompressed_ptx_iter->second.size() == 0) {
      decompressed_ptx_iter->second = DecompressPtx(ptx);
    }
    return decompressed_ptx_iter->second.c_str();
  }
  return ptx;
}

const char *CudaPtxInMemory::original_default_text() const {
  if (ptx_by_compute_capability_.empty()) {
    return nullptr;
  }

  return ptx_by_compute_capability_.begin()->second;
}

const char *CudaPtxInMemory::text(int compute_capability_major,
                                  int compute_capability_minor) const {
  std::tuple<int, int> capability{compute_capability_major,
                                  compute_capability_minor};

  auto ptx_iter = ptx_by_compute_capability_.find(capability);
  if (ptx_iter == ptx_by_compute_capability_.end()) {
    return nullptr;
  }

  mutex_lock lock{mu_};

  // Check if there is an entry in decompressed ptx table.
  auto decompressed_ptx_iter = decompressed_ptx_.find(ptx_iter->second);
  if (decompressed_ptx_iter != decompressed_ptx_.end()) {
    // If the decompressed string is empty, which means the ptx hasn't been
    // decompressed, decompress it here.
    if (decompressed_ptx_iter->second.size() == 0) {
      decompressed_ptx_iter->second = DecompressPtx(ptx_iter->second);
    }
    return decompressed_ptx_iter->second.c_str();
  }
  return ptx_iter->second;
}

const char *CudaPtxInMemory::original_text(int compute_capability_major,
                                           int compute_capability_minor) const {
  std::tuple<int, int> capability{compute_capability_major,
                                  compute_capability_minor};

  auto ptx_iter = ptx_by_compute_capability_.find(capability);
  if (ptx_iter == ptx_by_compute_capability_.end()) {
    return nullptr;
  }

  return ptx_iter->second;
}

OpenCLTextOnDisk::OpenCLTextOnDisk(port::StringPiece filename,
                                   port::StringPiece kernelname)
    : OnDiskKernelLoaderSpec(filename, kernelname) {}

OpenCLTextInMemory::OpenCLTextInMemory(port::StringPiece text,
                                       port::StringPiece kernelname)
    : KernelLoaderSpec(kernelname), text_(text.ToString()) {}

OpenCLBinaryOnDisk::OpenCLBinaryOnDisk(port::StringPiece filename,
                                       port::StringPiece kernelname)
    : OnDiskKernelLoaderSpec(filename, kernelname) {}

MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLTextOnDisk(
    port::StringPiece filename, port::StringPiece kernelname) {
  CHECK(ocl_text_on_disk_ == nullptr);
  ocl_text_on_disk_.reset(new OpenCLTextOnDisk{filename, kernelname});
  return this;
}

MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLBinaryOnDisk(
    port::StringPiece filename, port::StringPiece kernelname) {
  CHECK(ocl_binary_on_disk_ == nullptr);
  ocl_binary_on_disk_.reset(new OpenCLBinaryOnDisk{filename, kernelname});
  return this;
}

MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLTextInMemory(
    port::StringPiece filename, port::StringPiece kernelname) {
  CHECK(ocl_text_in_memory_ == nullptr);
  ocl_text_in_memory_.reset(new OpenCLTextInMemory{filename, kernelname});
  return this;
}

MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxOnDisk(
    port::StringPiece filename, port::StringPiece kernelname) {
  CHECK(cuda_ptx_on_disk_ == nullptr);
  cuda_ptx_on_disk_.reset(new CudaPtxOnDisk{filename, kernelname});
  return this;
}

MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCubinInMemory(
    const char *bytes, port::StringPiece kernelname) {
  CHECK(cuda_cubin_in_memory_ == nullptr);
  cuda_cubin_in_memory_.reset(new CudaCubinInMemory{bytes, kernelname});
  return this;
}

MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCubinOnDisk(
    port::StringPiece filename, port::StringPiece kernelname) {
  CHECK(cuda_cubin_on_disk_ == nullptr);
  cuda_cubin_on_disk_.reset(new CudaCubinOnDisk{filename, kernelname});
  return this;
}

MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxInMemory(
    port::StringPiece ptx, port::StringPiece kernelname) {
  CHECK(cuda_ptx_in_memory_ == nullptr);
  cuda_ptx_in_memory_.reset(
      new CudaPtxInMemory{ptx, kernelname, false /* ptx_compressed */});
  return this;
}

MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCompressedPtxInMemory(
    port::StringPiece ptx, port::StringPiece kernelname) {
  CHECK(cuda_ptx_in_memory_ == nullptr);
  cuda_ptx_in_memory_.reset(
      new CudaPtxInMemory{ptx, kernelname, true /* ptx_compressed */});
  return this;
}

MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxInMemory(
    std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
    port::StringPiece kernelname) {
  CHECK(cuda_ptx_in_memory_ == nullptr);
  cuda_ptx_in_memory_.reset(
      new CudaPtxInMemory{spec_list, kernelname, false /* ptx_compressed */});
  return this;
}

MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCompressedPtxInMemory(
    std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
    port::StringPiece kernelname) {
  CHECK(cuda_ptx_in_memory_ == nullptr);
  cuda_ptx_in_memory_.reset(
      new CudaPtxInMemory{spec_list, kernelname, true /* ptx_compressed */});
  return this;
}

MultiKernelLoaderSpec::MultiKernelLoaderSpec(size_t arity) : arity_(arity) {}

}  // namespace gputools
}  // namespace perftools