/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include "tensorflow/stream_executor/stream.h" #include "tensorflow/stream_executor/platform/port.h" #include "tensorflow/stream_executor/blas.h" #include "tensorflow/stream_executor/lib/stacktrace.h" #include "tensorflow/stream_executor/lib/strcat.h" #include "tensorflow/stream_executor/platform.h" #include "tensorflow/stream_executor/platform/logging.h" #include "tensorflow/stream_executor/rng.h" #include "tensorflow/stream_executor/stream_executor_internal.h" #include "tensorflow/stream_executor/stream_executor_pimpl.h" namespace perftools { namespace gputools { namespace { // Code to turn parameters to functions on stream into strings that // will be VLOG'ed. We need overloads, instead of // e.g. BatchDescriptorToVlogString(), as the code that calls these // functions does not know what the type of the parameter is. 
string ToVlogString(const dnn::BatchDescriptor &descriptor) { return descriptor.ToShortString(); } string ToVlogString(const dnn::FilterDescriptor &descriptor) { return descriptor.ToShortString(); } string ToVlogString(const dnn::ConvolutionDescriptor &descriptor) { return descriptor.ToShortString(); } string ToVlogString(const dnn::PoolingDescriptor &descriptor) { return descriptor.ToShortString(); } string ToVlogString(const dnn::NormalizeDescriptor &descriptor) { return descriptor.ToShortString(); } string ToVlogString(dnn::ActivationMode mode) { return dnn::ActivationModeString(mode); } string ToVlogString(dnn::ElementwiseOperation op) { return dnn::ElementwiseOperationString(op); } string ToVlogString(dnn::QuantizedActivationMode mode) { return dnn::QuantizedActivationModeString(mode); } string ToVlogString(blas::Transpose t) { return blas::TransposeString(t); } string ToVlogString(blas::UpperLower ul) { return blas::UpperLowerString(ul); } string ToVlogString(blas::Diagonal d) { return blas::DiagonalString(d); } string ToVlogString(blas::Side s) { return blas::SideString(s); } string ToVlogString(const void *ptr) { if (ptr == nullptr) { return "null"; } // StrCat does not convert pointers to text. std::ostringstream out; out << ptr; return out.str(); } template string ToVlogString(const std::complex &c) { // StrCat does not convert std::complex to text. std::ostringstream out; out << c; return out.str(); } template string ToVlogString(const std::function &f) { return f == nullptr ? 
"null" : ""; } string ToVlogString(const DeviceMemoryBase &memory) { return ToVlogString(memory.opaque()); } string ToVlogString(const DeviceMemoryBase *memory) { return ToVlogString(*memory); } string ToVlogString(int i) { return port::StrCat(i); } string ToVlogString(uint32 i) { return port::StrCat(i); } string ToVlogString(uint64 i) { return port::StrCat(i); } string ToVlogString(int64 i) { return port::StrCat(i); } string ToVlogString(float f) { return port::StrCat(f); } string ToVlogString(double d) { return port::StrCat(d); } template string ToVlogString(port::ArraySlice elements) { string str = port::StrCat( ToVlogString(reinterpret_cast(elements.data())), "[", elements.size(), "]{"); const char *separator = ""; size_t max_to_show = std::numeric_limits::max(); if (!VLOG_IS_ON(2)) { max_to_show = 5; } else if (!VLOG_IS_ON(3)) { max_to_show = 20; } else if (!VLOG_IS_ON(11)) { max_to_show = 1000; } for (size_t i = 0; i < elements.size(); ++i) { if (i == max_to_show) { str += ", ..."; break; } port::StrAppend(&str, separator, ToVlogString(elements[i])); separator = ", "; } str += "}"; return str; } template string ToVlogString(port::MutableArraySlice elements) { return ToVlogString(port::ArraySlice(elements)); } // Used together with PARAM to VLOG calls made to the stream. Intended // to be used like this: // // VLOG(1) << CallStr("MyFunction", this, {PARAM(a), PARAM(b)}); // // where a and b are the parameters to MyFunction. // // See VLOG_CALL for a short-hand for this. This way of doing it saves // a tremendous amount of boilerplate code given how many functions // there are on Stream and how many parameters they each have. string CallStr(const char *function_name, Stream *stream, std::vector> params) { // Do not call this function unless VLOG is on since just // constructing all the strings in params is expensive. 
CHECK(VLOG_IS_ON(1)); string str = port::StrCat("Called Stream::", function_name, "("); const char *separator = ""; for (const auto ¶m : params) { port::StrAppend(&str, separator, param.first, "=", param.second); separator = ", "; } port::StrAppend(&str, ") stream=", ToVlogString(stream)); if (VLOG_IS_ON(10)) { port::StrAppend(&str, " ", port::CurrentStackTrace(), "\n"); } return str; } // Use this macro to avoid having to type every parameter twice to log // it with VLOG and CallStr. #define PARAM(parameter) \ { #parameter, ToVlogString(parameter) } // Use this macro to avoid having to type out the name of each // function and to save some boilerplate. Intended to be used like this: // // VLOG_CALL(PARAM(a), PARAM(b)) // // This saves a tremendous amount of boilerplate compared to the alternative: // // VLOG(1) << "Calling MyFunction(a=" << ToVlogString(a) // << ", b=" << ToVlogString(b); // // Note here that most of the parameter names are not short and that // most of the functions take many more than 2 parameters. #define VLOG_CALL(...) 
VLOG(1) << CallStr(__func__, this, {__VA_ARGS__}) } // namespace Stream::Stream(StreamExecutor *parent) : parent_(parent), implementation_(parent->implementation()->GetStreamImplementation()), allocated_(false), ok_(false), temporary_memory_manager_(this) { VLOG_CALL(PARAM(parent)); } Stream::Stream(StreamExecutor *parent, internal::StreamInterface *implementation) : parent_(parent), implementation_(implementation), allocated_(false), ok_(false), temporary_memory_manager_(this) { VLOG_CALL(PARAM(parent), PARAM(implementation)); } Stream::~Stream() { VLOG_CALL(); temporary_memory_manager_.ForceDeallocateAll(); if (allocated_) { parent_->DeallocateStream(this); } } Stream &Stream::Init() { VLOG_CALL(); mutex_lock lock{mu_}; CHECK_EQ(false, allocated_) << "stream appears to already have been initialized"; CHECK(!ok_) << "stream should be in !ok() state pre-initialization"; if (parent_->AllocateStream(this)) { // Successful initialization! allocated_ = true; ok_ = true; } else { LOG(ERROR) << "failed to allocate stream during initialization"; } return *this; } Stream &Stream::InitTimer(Timer *timer) { VLOG_CALL(PARAM(timer)); if (ok()) { CheckError(parent_->AllocateTimer(timer)); } else { LOG(INFO) << "did not allocate timer: " << timer; } return *this; } Stream &Stream::InitWithTimer(Timer *timer) { VLOG_CALL(PARAM(timer)); return Init().InitTimer(timer); } Stream &Stream::ThenRecordEvent(Event *event) { VLOG_CALL(PARAM(event)); port::Status status = parent_->RecordEvent(this, event); if (!status.ok()) { LOG(ERROR) << "Error recording event in stream: " << status.error_message() << "; not marking stream as bad, as the Event object may be " << "at fault. 
Monitor for further errors."; } return *this; } Stream &Stream::ThenConvolveWithScratch( const dnn::BatchDescriptor &input_descriptor, const DeviceMemory &input_data, const dnn::FilterDescriptor &filter_descriptor, const DeviceMemory &filter_data, const dnn::ConvolutionDescriptor &convolution_descriptor, const dnn::BatchDescriptor &output_descriptor, DeviceMemory *output, ScratchAllocator *scratch_allocator) { VLOG_CALL(PARAM(input_descriptor), PARAM(input_data), PARAM(filter_descriptor), PARAM(filter_data), PARAM(convolution_descriptor), PARAM(output_descriptor), PARAM(output)); if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { CheckError(dnn->DoConvolve( this, input_descriptor, input_data, filter_descriptor, filter_data, convolution_descriptor, output_descriptor, output, /*scratch_allocator=*/scratch_allocator, dnn::kDefaultAlgorithm, nullptr)); } else { SetError(); LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor " "without DNN support"; } } return *this; } Stream &Stream::ThenConvolveWithScratch( const dnn::BatchDescriptor &input_descriptor, const DeviceMemory &input_data, const dnn::FilterDescriptor &filter_descriptor, const DeviceMemory &filter_data, const dnn::ConvolutionDescriptor &convolution_descriptor, const dnn::BatchDescriptor &output_descriptor, DeviceMemory *output, ScratchAllocator *scratch_allocator) { VLOG_CALL(PARAM(input_descriptor), PARAM(input_data), PARAM(filter_descriptor), PARAM(filter_data), PARAM(convolution_descriptor), PARAM(output_descriptor), PARAM(output)); if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { CheckError(dnn->DoConvolve( this, input_descriptor, input_data, filter_descriptor, filter_data, convolution_descriptor, output_descriptor, output, /*scratch_allocator=*/scratch_allocator, dnn::kDefaultAlgorithm, nullptr)); } else { SetError(); LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor " "without DNN support"; } } return *this; } Stream 
&Stream::ThenConvolveWithAlgorithm( const dnn::BatchDescriptor &input_descriptor, const DeviceMemory &input_data, const dnn::FilterDescriptor &filter_descriptor, const DeviceMemory &filter_data, const dnn::ConvolutionDescriptor &convolution_descriptor, const dnn::BatchDescriptor &output_descriptor, DeviceMemory *output, ScratchAllocator *scratch_allocator, dnn::AlgorithmType algorithm, dnn::ProfileResult *output_profile_result) { VLOG_CALL(PARAM(input_descriptor), PARAM(input_data), PARAM(filter_descriptor), PARAM(filter_data), PARAM(convolution_descriptor), PARAM(output_descriptor), PARAM(output)); if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { auto status = dnn->DoConvolve( this, input_descriptor, input_data, filter_descriptor, filter_data, convolution_descriptor, output_descriptor, output, scratch_allocator, algorithm, output_profile_result); if (!status && !output_profile_result) { SetError(); } } else { SetError(); LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor " "without DNN support"; } } return *this; } Stream &Stream::ThenConvolveWithAlgorithm( const dnn::BatchDescriptor &input_descriptor, const DeviceMemory &input_data, const dnn::FilterDescriptor &filter_descriptor, const DeviceMemory &filter_data, const dnn::ConvolutionDescriptor &convolution_descriptor, const dnn::BatchDescriptor &output_descriptor, DeviceMemory *output, ScratchAllocator *scratch_allocator, dnn::AlgorithmType algorithm, dnn::ProfileResult *output_profile_result) { VLOG_CALL(PARAM(input_descriptor), PARAM(input_data), PARAM(filter_descriptor), PARAM(filter_data), PARAM(convolution_descriptor), PARAM(output_descriptor), PARAM(output)); if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { auto status = dnn->DoConvolve( this, input_descriptor, input_data, filter_descriptor, filter_data, convolution_descriptor, output_descriptor, output, scratch_allocator, algorithm, output_profile_result); if (!status && !output_profile_result) { SetError(); } } 
else { SetError(); LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor " "without DNN support"; } } return *this; } Stream &Stream::ThenConvolve( const dnn::BatchDescriptor &input_descriptor, const DeviceMemory &input_data, const dnn::FilterDescriptor &filter_descriptor, const DeviceMemory &filter_data, const dnn::ConvolutionDescriptor &convolution_descriptor, const dnn::BatchDescriptor &output_descriptor, DeviceMemory *output) { return ThenConvolveWithScratch(input_descriptor, input_data, filter_descriptor, filter_data, convolution_descriptor, output_descriptor, output, /*scratch_allocator=*/nullptr); } Stream &Stream::ThenSeparableConvolve( const dnn::BatchDescriptor &batch_descriptor, const DeviceMemory &input_data, const dnn::FilterDescriptor &filter_descriptor, int depth_multiplier, const DeviceMemory &first_weights, const DeviceMemory &second_weights, const dnn::ConvolutionDescriptor &convolution_descriptor, const dnn::BatchDescriptor &output_descriptor, DeviceMemory *output) { VLOG_CALL( PARAM(batch_descriptor), PARAM(input_data), PARAM(filter_descriptor), PARAM(depth_multiplier), PARAM(first_weights), PARAM(second_weights), PARAM(convolution_descriptor), PARAM(output_descriptor), PARAM(output)); if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { CheckError(dnn->DoSeparableConvolve( this, batch_descriptor, input_data, filter_descriptor, depth_multiplier, first_weights, second_weights, convolution_descriptor, output_descriptor, output)); } else { SetError(); LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor " "without DNN support"; } } return *this; } Stream &Stream::ThenConvolveBackwardDataWithScratch( const dnn::FilterDescriptor &filter_descriptor, const DeviceMemory &filter_data, const dnn::BatchDescriptor &output_descriptor, DeviceMemory backward_output_data, const dnn::ConvolutionDescriptor &convolution_descriptor, const dnn::BatchDescriptor &input_descriptor, DeviceMemory *backward_input_data, 
ScratchAllocator *scratch_allocator) {
  // Logs the call and, when the stream is ok, forwards to
  // DnnSupport::DoConvolveBackwardData with dnn::kDefaultAlgorithm and no
  // profile result. Sets the stream error when there is no DNN support.
  VLOG_CALL(PARAM(filter_descriptor), PARAM(filter_data),
            PARAM(output_descriptor), PARAM(backward_output_data),
            PARAM(convolution_descriptor), PARAM(input_descriptor),
            PARAM(backward_input_data));

  if (!ok()) {
    return *this;
  }

  dnn::DnnSupport *dnn_support = parent_->AsDnn();
  if (dnn_support == nullptr) {
    SetError();
    LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor "
                    "without DNN support";
    return *this;
  }

  CheckError(dnn_support->DoConvolveBackwardData(
      this, filter_descriptor, filter_data, output_descriptor,
      backward_output_data, convolution_descriptor, input_descriptor,
      backward_input_data, scratch_allocator, dnn::kDefaultAlgorithm,
      nullptr));
  return *this;
}

// NOTE(review): the DeviceMemory parameters in this stretch read as if their
// template arguments were stripped, which makes distinct overloads look like
// duplicate definitions. They are reproduced as found; confirm the element
// types against stream.h.

// Logs the call and, when the stream is ok, forwards to
// DnnSupport::DoConvolveBackwardData with the caller's algorithm and profile
// result. A failed call only marks the stream as errored when no profile
// result was requested.
Stream &Stream::ThenConvolveBackwardDataWithAlgorithm(
    const dnn::FilterDescriptor &filter_descriptor,
    const DeviceMemory &filter_data,
    const dnn::BatchDescriptor &output_descriptor,
    DeviceMemory backward_output_data,
    const dnn::ConvolutionDescriptor &convolution_descriptor,
    const dnn::BatchDescriptor &input_descriptor,
    DeviceMemory *backward_input_data, ScratchAllocator *scratch_allocator,
    dnn::AlgorithmType algorithm, dnn::ProfileResult *output_profile_result) {
  VLOG_CALL(PARAM(filter_descriptor), PARAM(filter_data),
            PARAM(output_descriptor), PARAM(backward_output_data),
            PARAM(convolution_descriptor), PARAM(input_descriptor),
            PARAM(backward_input_data));

  if (!ok()) {
    return *this;
  }

  dnn::DnnSupport *dnn_support = parent_->AsDnn();
  if (dnn_support == nullptr) {
    SetError();
    LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor "
                    "without DNN support";
    return *this;
  }

  auto status = dnn_support->DoConvolveBackwardData(
      this, filter_descriptor, filter_data, output_descriptor,
      backward_output_data, convolution_descriptor, input_descriptor,
      backward_input_data, scratch_allocator, algorithm,
      output_profile_result);
  if (!status && !output_profile_result) {
    SetError();
  }
  return *this;
}

Stream &Stream::ThenConvolveBackwardDataWithAlgorithm(
    const dnn::FilterDescriptor &filter_descriptor,
    const DeviceMemory &filter_data,
    const dnn::BatchDescriptor
&output_descriptor, DeviceMemory backward_output_data,
    const dnn::ConvolutionDescriptor &convolution_descriptor,
    const dnn::BatchDescriptor &input_descriptor,
    DeviceMemory *backward_input_data, ScratchAllocator *scratch_allocator,
    dnn::AlgorithmType algorithm, dnn::ProfileResult *output_profile_result) {
  // Logs the call and, when the stream is ok, forwards to
  // DnnSupport::DoConvolveBackwardData with the caller's algorithm and
  // profile result.
  VLOG_CALL(PARAM(filter_descriptor), PARAM(filter_data),
            PARAM(output_descriptor), PARAM(backward_output_data),
            PARAM(convolution_descriptor), PARAM(input_descriptor),
            PARAM(backward_input_data));

  if (!ok()) {
    return *this;
  }

  dnn::DnnSupport *dnn_support = parent_->AsDnn();
  if (dnn_support == nullptr) {
    SetError();
    LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor "
                    "without DNN support";
    return *this;
  }

  auto status = dnn_support->DoConvolveBackwardData(
      this, filter_descriptor, filter_data, output_descriptor,
      backward_output_data, convolution_descriptor, input_descriptor,
      backward_input_data, scratch_allocator, algorithm,
      output_profile_result);
  // A failure only marks the stream as errored when no profile result was
  // requested.
  if (!status && !output_profile_result) {
    SetError();
  }
  return *this;
}

// NOTE(review): the DeviceMemory parameters in this stretch read as if their
// template arguments were stripped, which makes distinct overloads look like
// duplicate definitions. They are reproduced as found; confirm the element
// types against stream.h.

// Logs the call and, when the stream is ok, forwards to
// DnnSupport::DoConvolveBackwardData with dnn::kDefaultAlgorithm and no
// profile result. Sets the stream error when there is no DNN support.
Stream &Stream::ThenConvolveBackwardDataWithScratch(
    const dnn::FilterDescriptor &filter_descriptor,
    const DeviceMemory &filter_data,
    const dnn::BatchDescriptor &output_descriptor,
    DeviceMemory backward_output_data,
    const dnn::ConvolutionDescriptor &convolution_descriptor,
    const dnn::BatchDescriptor &input_descriptor,
    DeviceMemory *backward_input_data, ScratchAllocator *scratch_allocator) {
  VLOG_CALL(PARAM(filter_descriptor), PARAM(filter_data),
            PARAM(output_descriptor), PARAM(backward_output_data),
            PARAM(convolution_descriptor), PARAM(input_descriptor),
            PARAM(backward_input_data));

  if (!ok()) {
    return *this;
  }

  dnn::DnnSupport *dnn_support = parent_->AsDnn();
  if (dnn_support == nullptr) {
    SetError();
    LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor "
                    "without DNN support";
    return *this;
  }

  CheckError(dnn_support->DoConvolveBackwardData(
      this, filter_descriptor, filter_data, output_descriptor,
      backward_output_data, convolution_descriptor, input_descriptor,
      backward_input_data, scratch_allocator, dnn::kDefaultAlgorithm,
      nullptr));
  return *this;
}

// Convenience wrapper: ThenConvolveBackwardDataWithScratch without a scratch
// allocator.
Stream &Stream::ThenConvolveBackwardData(
    const dnn::FilterDescriptor &filter_descriptor,
    const DeviceMemory &filter_data,
    const dnn::BatchDescriptor &output_descriptor,
    DeviceMemory backward_output_data,
    const dnn::ConvolutionDescriptor &convolution_descriptor,
    const dnn::BatchDescriptor &input_descriptor,
    DeviceMemory *backward_input_data) {
  return ThenConvolveBackwardDataWithScratch(
      filter_descriptor, filter_data, output_descriptor, backward_output_data,
      convolution_descriptor, input_descriptor, backward_input_data,
      /*scratch_allocator=*/nullptr);
}

// Logs the call and, when the stream is ok, forwards to
// DnnSupport::DoConvolveBackwardFilter with dnn::kDefaultAlgorithm and no
// profile result. Sets the stream error when there is no DNN support.
Stream &Stream::ThenConvolveBackwardFilterWithScratch(
    const dnn::BatchDescriptor &input_descriptor,
    const DeviceMemory &input_data,
    const dnn::BatchDescriptor &output_descriptor,
    DeviceMemory backward_output_data,
    const dnn::ConvolutionDescriptor &convolution_descriptor,
    const dnn::FilterDescriptor &filter_descriptor,
    DeviceMemory *backward_filter_data, ScratchAllocator *scratch_allocator) {
  VLOG_CALL(PARAM(input_descriptor), PARAM(input_data),
            PARAM(output_descriptor), PARAM(backward_output_data),
            PARAM(convolution_descriptor), PARAM(filter_descriptor),
            PARAM(backward_filter_data));

  if (!ok()) {
    return *this;
  }

  dnn::DnnSupport *dnn_support = parent_->AsDnn();
  if (dnn_support == nullptr) {
    SetError();
    LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor "
                    "without DNN support";
    return *this;
  }

  CheckError(dnn_support->DoConvolveBackwardFilter(
      this, input_descriptor, input_data, output_descriptor,
      backward_output_data, convolution_descriptor, filter_descriptor,
      backward_filter_data, scratch_allocator, dnn::kDefaultAlgorithm,
      nullptr));
  return *this;
}

Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
    const dnn::BatchDescriptor &input_descriptor,
    const DeviceMemory &input_data,
    const dnn::BatchDescriptor &output_descriptor,
    DeviceMemory backward_output_data,
    const dnn::ConvolutionDescriptor &convolution_descriptor,
    const dnn::FilterDescriptor &filter_descriptor,
    DeviceMemory *backward_filter_data,
    ScratchAllocator
*scratch_allocator, dnn::AlgorithmType algorithm,
    dnn::ProfileResult *output_profile_result) {
  // Logs the call and, when the stream is ok, forwards to
  // DnnSupport::DoConvolveBackwardFilter with the caller's algorithm and
  // profile result.
  VLOG_CALL(PARAM(input_descriptor), PARAM(input_data),
            PARAM(output_descriptor), PARAM(backward_output_data),
            PARAM(convolution_descriptor), PARAM(filter_descriptor),
            PARAM(backward_filter_data));

  if (!ok()) {
    return *this;
  }

  dnn::DnnSupport *dnn_support = parent_->AsDnn();
  if (dnn_support == nullptr) {
    SetError();
    LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor "
                    "without DNN support";
    return *this;
  }

  auto status = dnn_support->DoConvolveBackwardFilter(
      this, input_descriptor, input_data, output_descriptor,
      backward_output_data, convolution_descriptor, filter_descriptor,
      backward_filter_data, scratch_allocator, algorithm,
      output_profile_result);
  // A failure only marks the stream as errored when no profile result was
  // requested.
  if (!status && !output_profile_result) {
    SetError();
  }
  return *this;
}

// NOTE(review): the DeviceMemory parameters in this stretch read as if their
// template arguments were stripped, which makes distinct overloads look like
// duplicate definitions. They are reproduced as found; confirm the element
// types against stream.h.

// Logs the call and, when the stream is ok, forwards to
// DnnSupport::DoConvolveBackwardFilter with dnn::kDefaultAlgorithm and no
// profile result. Sets the stream error when there is no DNN support.
Stream &Stream::ThenConvolveBackwardFilterWithScratch(
    const dnn::BatchDescriptor &input_descriptor,
    const DeviceMemory &input_data,
    const dnn::BatchDescriptor &output_descriptor,
    DeviceMemory backward_output_data,
    const dnn::ConvolutionDescriptor &convolution_descriptor,
    const dnn::FilterDescriptor &filter_descriptor,
    DeviceMemory *backward_filter_data, ScratchAllocator *scratch_allocator) {
  VLOG_CALL(PARAM(input_descriptor), PARAM(input_data),
            PARAM(output_descriptor), PARAM(backward_output_data),
            PARAM(convolution_descriptor), PARAM(filter_descriptor),
            PARAM(backward_filter_data));

  if (!ok()) {
    return *this;
  }

  dnn::DnnSupport *dnn_support = parent_->AsDnn();
  if (dnn_support == nullptr) {
    SetError();
    LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor "
                    "without DNN support";
    return *this;
  }

  CheckError(dnn_support->DoConvolveBackwardFilter(
      this, input_descriptor, input_data, output_descriptor,
      backward_output_data, convolution_descriptor, filter_descriptor,
      backward_filter_data, scratch_allocator, dnn::kDefaultAlgorithm,
      nullptr));
  return *this;
}

Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
    const dnn::BatchDescriptor &input_descriptor,
    const DeviceMemory &input_data,
    const dnn::BatchDescriptor &output_descriptor,
DeviceMemory backward_output_data, const dnn::ConvolutionDescriptor &convolution_descriptor, const dnn::FilterDescriptor &filter_descriptor, DeviceMemory *backward_filter_data, ScratchAllocator *scratch_allocator, dnn::AlgorithmType algorithm, dnn::ProfileResult *output_profile_result) { VLOG_CALL(PARAM(input_descriptor), PARAM(input_data), PARAM(output_descriptor), PARAM(backward_output_data), PARAM(convolution_descriptor), PARAM(filter_descriptor), PARAM(backward_filter_data)); if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { auto status = dnn->DoConvolveBackwardFilter( this, input_descriptor, input_data, output_descriptor, backward_output_data, convolution_descriptor, filter_descriptor, backward_filter_data, scratch_allocator, algorithm, output_profile_result); if (!status && !output_profile_result) { SetError(); } } else { SetError(); LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor " "without DNN support"; } } return *this; } Stream &Stream::ThenConvolveBackwardFilter( const dnn::BatchDescriptor &input_descriptor, const DeviceMemory &input_data, const dnn::BatchDescriptor &output_descriptor, DeviceMemory backward_output_data, const dnn::ConvolutionDescriptor &convolution_descriptor, const dnn::FilterDescriptor &filter_descriptor, DeviceMemory *backward_filter_data) { return ThenConvolveBackwardFilterWithScratch( input_descriptor, input_data, output_descriptor, backward_output_data, convolution_descriptor, filter_descriptor, backward_filter_data, /*scratch_allocator=*/nullptr); } template Stream &Stream::ThenConvolveBackwardBiasImpl( const dnn::BatchDescriptor &input_descriptor, const DeviceMemory &input_data, const dnn::BatchDescriptor &bias_descriptor, DeviceMemory *backward_bias_data) { VLOG_CALL(PARAM(input_descriptor), PARAM(input_data), PARAM(bias_descriptor), PARAM(backward_bias_data)); if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { CheckError(dnn->DoConvolveBackwardBias(this, input_descriptor, input_data, 
bias_descriptor, backward_bias_data)); } else { SetError(); LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor " "without DNN support"; } } return *this; } Stream &Stream::ThenConvolveBackwardBias( const dnn::BatchDescriptor &input_descriptor, const DeviceMemory &input_data, const dnn::BatchDescriptor &bias_descriptor, DeviceMemory *backward_bias_data) { return ThenConvolveBackwardBiasImpl(input_descriptor, input_data, bias_descriptor, backward_bias_data); } Stream &Stream::ThenConvolveBackwardBias( const dnn::BatchDescriptor &input_descriptor, const DeviceMemory &input_data, const dnn::BatchDescriptor &bias_descriptor, DeviceMemory *backward_bias_data) { return ThenConvolveBackwardBiasImpl(input_descriptor, input_data, bias_descriptor, backward_bias_data); } Stream &Stream::ThenConvolveBackwardBias( const dnn::BatchDescriptor &input_descriptor, const DeviceMemory &input_data, const dnn::BatchDescriptor &bias_descriptor, DeviceMemory *backward_bias_data) { return ThenConvolveBackwardBiasImpl(input_descriptor, input_data, bias_descriptor, backward_bias_data); } Stream &Stream::ThenMatMul(const DeviceMemory &input_data, const DeviceMemory &weights, const dnn::BatchDescriptor &input_dimensions, const dnn::BatchDescriptor &output_dimensions, DeviceMemory *output_data) { VLOG_CALL(PARAM(input_data), PARAM(weights), PARAM(input_dimensions), PARAM(output_dimensions), PARAM(output_data)); if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { CheckError(dnn->DoMatMul(this, input_data, weights, input_dimensions, output_dimensions, output_data)); } else { SetError(); LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor " "without DNN support"; } } return *this; } Stream &Stream::ThenMatMulQuantized( const DeviceMemory &input_data, const DeviceMemory &weights, const DeviceMemory &weight_scales, const dnn::BatchDescriptor &input_dimensions, const dnn::BatchDescriptor &output_dimensions, DeviceMemory *output_data) { 
VLOG_CALL(PARAM(input_data), PARAM(weights), PARAM(weight_scales), PARAM(input_dimensions), PARAM(output_dimensions), PARAM(output_data)); if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { CheckError(dnn->DoMatMulQuantized(this, input_data, weights, weight_scales, input_dimensions, output_dimensions, output_data)); } else { SetError(); LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor " "without DNN support"; } } return *this; } Stream &Stream::ThenMatMulQuantized( const DeviceMemory &input_data, const DeviceMemory &weights, const DeviceMemory &weight_scales, const dnn::BatchDescriptor &input_dimensions, const dnn::BatchDescriptor &output_dimensions, DeviceMemory *output_data) { VLOG_CALL(PARAM(input_data), PARAM(weights), PARAM(weight_scales), PARAM(input_dimensions), PARAM(output_dimensions), PARAM(output_data)); if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { CheckError(dnn->DoMatMulQuantized(this, input_data, weights, weight_scales, input_dimensions, output_dimensions, output_data)); } else { SetError(); LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor " "without DNN support"; } } return *this; } Stream &Stream::ThenBiasAdd(const DeviceMemory &input_data, const DeviceMemory &biases, const dnn::BatchDescriptor &dimensions, DeviceMemory *output_data) { VLOG_CALL(PARAM(input_data), PARAM(biases), PARAM(dimensions), PARAM(output_data)); if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { CheckError( dnn->DoBiasAdd(this, input_data, biases, dimensions, output_data)); } else { SetError(); LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor " "without DNN support"; } } return *this; } Stream &Stream::ThenPoolForward( const dnn::PoolingDescriptor &pooling_dimensions, const dnn::BatchDescriptor &input_dimensions, const DeviceMemory &input_data, const dnn::BatchDescriptor &output_dimensions, DeviceMemory *output_data) { VLOG_CALL(PARAM(pooling_dimensions), 
PARAM(input_dimensions), PARAM(input_data), PARAM(output_dimensions), PARAM(output_data)); if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { CheckError(dnn->DoPoolForward(this, pooling_dimensions, input_dimensions, input_data, output_dimensions, output_data)); } else { SetError(); LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor " "without DNN support"; } } return *this; } Stream &Stream::ThenPoolBackward( const dnn::PoolingDescriptor &pooling_dimensions, const dnn::BatchDescriptor &input_dimensions, const DeviceMemory &input_data, const dnn::BatchDescriptor &output_dimensions, const DeviceMemory &output_data, const DeviceMemory &input_diff_data, DeviceMemory *output_diff_data) { VLOG_CALL(PARAM(pooling_dimensions), PARAM(input_dimensions), PARAM(input_data), PARAM(output_dimensions), PARAM(output_data), PARAM(input_diff_data), PARAM(output_diff_data)); if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { CheckError(dnn->DoPoolBackward(this, pooling_dimensions, input_dimensions, input_data, output_dimensions, output_data, input_diff_data, output_diff_data)); } else { SetError(); LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor " "without DNN support"; } } return *this; } Stream &Stream::ThenNormalize( const dnn::NormalizeDescriptor &normalize_descriptor, const DeviceMemory &input_data, DeviceMemory *output_data) { VLOG_CALL(PARAM(normalize_descriptor), PARAM(input_data), PARAM(output_data)); if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { CheckError(dnn->DoNormalize(this, normalize_descriptor, input_data, output_data)); } else { SetError(); LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor " "without DNN support"; } } return *this; } Stream &Stream::ThenActivate(dnn::ActivationMode activation_mode, const dnn::BatchDescriptor &dimensions, const DeviceMemory &input_data, DeviceMemory *output_data) { VLOG_CALL(PARAM(activation_mode), PARAM(dimensions), 
PARAM(input_data), PARAM(output_data)); if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { CheckError(dnn->DoActivate(this, activation_mode, dimensions, input_data, output_data)); } else { SetError(); LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor " "without DNN support"; } } return *this; } Stream &Stream::ThenDepthConcatenate( port::ArraySlice input_dimensions, port::ArraySlice *> input_data, DeviceMemory *output_data) { VLOG_CALL(PARAM(input_dimensions), PARAM(input_data), PARAM(output_data)); for (size_t i = 1; i < input_dimensions.size(); ++i) { if (input_dimensions[i].count() != input_dimensions[0].count() || input_dimensions[i].height() != input_dimensions[0].height() || input_dimensions[i].width() != input_dimensions[0].width()) { SetError(); LOG(ERROR) << "Incompatible dimensions for depth concatenation.\n" << "input_dimensions[0]: " << input_dimensions[0].ToString() << "input_dimensions[" << i << "]: " << input_dimensions[i].ToString(); return *this; } } if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { CheckError(dnn->DoDepthConcatenate(this, input_dimensions, input_data, output_data)); } else { SetError(); LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor " "without DNN support"; } } return *this; } Stream &Stream::ThenElementwiseOperate( dnn::ElementwiseOperation operation, port::ArraySlice input_dimensions, port::ArraySlice *> input_data, const dnn::BatchDescriptor &output_dimensions, DeviceMemory *output_data) { VLOG_CALL(PARAM(operation), PARAM(input_dimensions), PARAM(input_data), PARAM(output_dimensions), PARAM(output_data)); if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { CheckError(dnn->DoElementwiseOperate(this, operation, input_dimensions, input_data, output_dimensions, output_data)); } else { SetError(); LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor " "without DNN support"; } } return *this; } Stream &Stream::ThenXYPad(const 
dnn::BatchDescriptor &dimensions, const DeviceMemory &input_data,
                          int64 left_pad, int64 right_pad, int64 top_pad,
                          int64 bottom_pad, DeviceMemory *output_data) {
  // Logs the call and, when the stream is ok, forwards to
  // DnnSupport::DoXYPad. Sets the stream error when there is no DNN support.
  VLOG_CALL(PARAM(dimensions), PARAM(input_data), PARAM(left_pad),
            PARAM(right_pad), PARAM(top_pad), PARAM(bottom_pad),
            PARAM(output_data));

  if (!ok()) {
    return *this;
  }

  dnn::DnnSupport *dnn_support = parent_->AsDnn();
  if (dnn_support == nullptr) {
    SetError();
    LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor "
                    "without DNN support";
    return *this;
  }

  CheckError(dnn_support->DoXYPad(this, dimensions, input_data, left_pad,
                                  right_pad, top_pad, bottom_pad, output_data));
  return *this;
}

// NOTE(review): the DeviceMemory parameters in this stretch read as if their
// template arguments were stripped. They are reproduced as found; confirm the
// element types against stream.h.

// Logs the call and, when the stream is ok, forwards to
// DnnSupport::DoXYSlice. Sets the stream error when there is no DNN support.
Stream &Stream::ThenXYSlice(const dnn::BatchDescriptor &dimensions,
                            const DeviceMemory &input_data, int64 left_trim,
                            int64 right_trim, int64 top_trim,
                            int64 bottom_trim, DeviceMemory *output_data) {
  VLOG_CALL(PARAM(dimensions), PARAM(input_data), PARAM(left_trim),
            PARAM(right_trim), PARAM(top_trim), PARAM(bottom_trim),
            PARAM(output_data));

  if (!ok()) {
    return *this;
  }

  dnn::DnnSupport *dnn_support = parent_->AsDnn();
  if (dnn_support == nullptr) {
    SetError();
    LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor "
                    "without DNN support";
    return *this;
  }

  CheckError(dnn_support->DoXYSlice(this, dimensions, input_data, left_trim,
                                    right_trim, top_trim, bottom_trim,
                                    output_data));
  return *this;
}

// Logs the call and, when the stream is ok, forwards to
// DnnSupport::DoMemcpyD2HQuantized. Sets the stream error when there is no
// DNN support.
Stream &Stream::ThenMemcpyD2HQuantized(
    const DeviceMemory &gpu_unquantized_src, dnn::QuantizedActivationMode mode,
    void *host_dst, uint64 size) {
  VLOG_CALL(PARAM(gpu_unquantized_src), PARAM(mode), PARAM(host_dst),
            PARAM(size));

  if (!ok()) {
    return *this;
  }

  dnn::DnnSupport *dnn_support = parent_->AsDnn();
  if (dnn_support == nullptr) {
    SetError();
    LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor "
                    "without DNN support";
    return *this;
  }

  CheckError(dnn_support->DoMemcpyD2HQuantized(this, gpu_unquantized_src, mode,
                                               host_dst, size));
  return *this;
}

Stream &Stream::ThenMemcpyH2DQuantized(
    const void *host_src, uint64 size, dnn::QuantizedActivationMode mode,
    DeviceMemory *gpu_unquantized_dst) {
  VLOG_CALL(PARAM(host_src), PARAM(size), PARAM(mode),
PARAM(gpu_unquantized_dst)); if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { CheckError(dnn->DoMemcpyH2DQuantized(this, host_src, size, mode, gpu_unquantized_dst)); } else { SetError(); LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor " "without DNN support"; } } return *this; } Stream *Stream::GetOrCreateSubStream() { mutex_lock lock{mu_}; for (auto &stream : sub_streams_) { if (stream.second) { stream.second = false; return stream.first.get(); } } sub_streams_.emplace_back(std::unique_ptr{new Stream{parent_}}, false); Stream *sub_stream = sub_streams_.back().first.get(); sub_stream->Init(); CHECK(ok_) << "sub-stream failed to be initialized"; return sub_stream; } void Stream::ReturnSubStream(Stream *sub_stream) { mutex_lock lock{mu_}; for (auto &stream : sub_streams_) { if (stream.first.get() == sub_stream) { stream.second = true; return; } } LOG(FATAL) << "the sub-stream to be returned is not created by this stream"; } Stream &Stream::ThenStartTimer(Timer *t) { VLOG_CALL(PARAM(t)); if (ok()) { CheckError(parent_->StartTimer(this, t)); } else { LOG(INFO) << "stream " << this << " did not enqueue 'start timer': " << t; } return *this; } Stream &Stream::ThenStopTimer(Timer *t) { VLOG_CALL(PARAM(t)); if (ok()) { CheckError(parent_->StopTimer(this, t)); } else { LOG(INFO) << "stream " << this << " did not enqueue 'stop timer': " << t; } return *this; } Stream &Stream::ThenWaitFor(Stream *other) { VLOG_CALL(PARAM(other)); CHECK(this != other) << "stream cannot wait for itself"; if (ok() && other->ok()) { CheckError(parent_->CreateStreamDependency(this, other)); } else { SetError(); LOG(INFO) << "stream " << this << " did not wait for stream: " << other; } return *this; } Stream &Stream::ThenWaitFor(std::vector> *others) { VLOG_CALL(PARAM(others)); for (auto &stream : *others) { CHECK_NE(stream.get(), this); ThenWaitFor(stream.get()); } return *this; } Stream &Stream::ThenWaitFor(Event *event) { VLOG_CALL(PARAM(event)); if (ok()) { 
port::Status status = parent_->WaitForEvent(this, event); if (!status.ok()) { LOG(ERROR) << "Error waiting for event in stream: " << status.error_message() << "; not marking stream as bad, as the Event object may be " << "at fault. Monitor for further errors."; } } else { LOG(INFO) << "stream " << this << " did not wait for an event."; } return *this; } // A functor that implements ThenBlasXXX interfaces, which calls DoBlasXXX // functions and logs for errors. template struct ThenBlasImpl { // blas_func is the DoBlasXXX member function pointer, and args are its // arguments except the first one of Stream* type. Stream &operator()(Stream *stream, bool (blas::BlasSupport::*blas_func)(Stream *, Args...), Args... args); }; template Stream &ThenBlasImpl::operator()( Stream *stream, bool (blas::BlasSupport::*blas_func)(Stream *, Args...), Args... args) { if (stream->ok()) { if (blas::BlasSupport *blas = stream->parent_->AsBlas()) { stream->CheckError((blas->*blas_func)(stream, args...)); } else { stream->CheckError(false); LOG(WARNING) << "attempting to perform BLAS operation using StreamExecutor " "without BLAS support"; } } return *stream; } Stream &Stream::ThenBlasAsum(uint64 elem_count, const DeviceMemory &x, int incx, DeviceMemory *result) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); ThenBlasImpl &, int, DeviceMemory *> impl; return impl(this, &blas::BlasSupport::DoBlasAsum, elem_count, x, incx, result); } Stream &Stream::ThenBlasAsum(uint64 elem_count, const DeviceMemory &x, int incx, DeviceMemory *result) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); ThenBlasImpl &, int, DeviceMemory *> impl; return impl(this, &blas::BlasSupport::DoBlasAsum, elem_count, x, incx, result); } Stream &Stream::ThenBlasAsum(uint64 elem_count, const DeviceMemory> &x, int incx, DeviceMemory *result) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); ThenBlasImpl> &, int, DeviceMemory *> impl; return impl(this, 
&blas::BlasSupport::DoBlasAsum, elem_count, x, incx, result); } Stream &Stream::ThenBlasAsum(uint64 elem_count, const DeviceMemory> &x, int incx, DeviceMemory *result) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); ThenBlasImpl> &, int, DeviceMemory *> impl; return impl(this, &blas::BlasSupport::DoBlasAsum, elem_count, x, incx, result); } Stream &Stream::ThenBlasAxpy(uint64 elem_count, float alpha, const DeviceMemory &x, int incx, DeviceMemory *y, int incy) { VLOG_CALL(PARAM(elem_count), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy)); ThenBlasImpl &, int, DeviceMemory *, int> impl; return impl(this, &blas::BlasSupport::DoBlasAxpy, elem_count, alpha, x, incx, y, incy); } Stream &Stream::ThenBlasAxpy(uint64 elem_count, double alpha, const DeviceMemory &x, int incx, DeviceMemory *y, int incy) { VLOG_CALL(PARAM(elem_count), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy)); ThenBlasImpl &, int, DeviceMemory *, int> impl; return impl(this, &blas::BlasSupport::DoBlasAxpy, elem_count, alpha, x, incx, y, incy); } Stream &Stream::ThenBlasAxpy(uint64 elem_count, std::complex alpha, const DeviceMemory> &x, int incx, DeviceMemory> *y, int incy) { VLOG_CALL(PARAM(elem_count), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy)); ThenBlasImpl, const DeviceMemory> &, int, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasAxpy, elem_count, alpha, x, incx, y, incy); } Stream &Stream::ThenBlasAxpy(uint64 elem_count, std::complex alpha, const DeviceMemory> &x, int incx, DeviceMemory> *y, int incy) { VLOG_CALL(PARAM(elem_count), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy)); ThenBlasImpl, const DeviceMemory> &, int, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasAxpy, elem_count, alpha, x, incx, y, incy); } Stream &Stream::ThenBlasCopy(uint64 elem_count, const DeviceMemory &x, int incx, DeviceMemory *y, int incy) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), 
PARAM(y), PARAM(incy)); ThenBlasImpl &, int, DeviceMemory *, int> impl; return impl(this, &blas::BlasSupport::DoBlasCopy, elem_count, x, incx, y, incy); } Stream &Stream::ThenBlasCopy(uint64 elem_count, const DeviceMemory &x, int incx, DeviceMemory *y, int incy) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy)); ThenBlasImpl &, int, DeviceMemory *, int> impl; return impl(this, &blas::BlasSupport::DoBlasCopy, elem_count, x, incx, y, incy); } Stream &Stream::ThenBlasCopy(uint64 elem_count, const DeviceMemory> &x, int incx, DeviceMemory> *y, int incy) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy)); ThenBlasImpl> &, int, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasCopy, elem_count, x, incx, y, incy); } Stream &Stream::ThenBlasCopy(uint64 elem_count, const DeviceMemory> &x, int incx, DeviceMemory> *y, int incy) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy)); ThenBlasImpl> &, int, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasCopy, elem_count, x, incx, y, incy); } Stream &Stream::ThenBlasDot(uint64 elem_count, const DeviceMemory &x, int incx, const DeviceMemory &y, int incy, DeviceMemory *result) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(result)); ThenBlasImpl &, int, const DeviceMemory &, int, DeviceMemory *> impl; return impl(this, &blas::BlasSupport::DoBlasDot, elem_count, x, incx, y, incy, result); } Stream &Stream::ThenBlasDot(uint64 elem_count, const DeviceMemory &x, int incx, const DeviceMemory &y, int incy, DeviceMemory *result) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(result)); ThenBlasImpl &, int, const DeviceMemory &, int, DeviceMemory *> impl; return impl(this, &blas::BlasSupport::DoBlasDot, elem_count, x, incx, y, incy, result); } Stream &Stream::ThenBlasDotc(uint64 elem_count, const DeviceMemory> &x, int incx, const DeviceMemory> &y, int 
incy, DeviceMemory> *result) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(result)); ThenBlasImpl> &, int, const DeviceMemory> &, int, DeviceMemory> *> impl; return impl(this, &blas::BlasSupport::DoBlasDotc, elem_count, x, incx, y, incy, result); } Stream &Stream::ThenBlasDotc(uint64 elem_count, const DeviceMemory> &x, int incx, const DeviceMemory> &y, int incy, DeviceMemory> *result) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(result)); ThenBlasImpl> &, int, const DeviceMemory> &, int, DeviceMemory> *> impl; return impl(this, &blas::BlasSupport::DoBlasDotc, elem_count, x, incx, y, incy, result); } Stream &Stream::ThenBlasDotu(uint64 elem_count, const DeviceMemory> &x, int incx, const DeviceMemory> &y, int incy, DeviceMemory> *result) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(result)); ThenBlasImpl> &, int, const DeviceMemory> &, int, DeviceMemory> *> impl; return impl(this, &blas::BlasSupport::DoBlasDotu, elem_count, x, incx, y, incy, result); } Stream &Stream::ThenBlasDotu(uint64 elem_count, const DeviceMemory> &x, int incx, const DeviceMemory> &y, int incy, DeviceMemory> *result) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(result)); ThenBlasImpl> &, int, const DeviceMemory> &, int, DeviceMemory> *> impl; return impl(this, &blas::BlasSupport::DoBlasDotu, elem_count, x, incx, y, incy, result); } Stream &Stream::ThenBlasNrm2(uint64 elem_count, const DeviceMemory &x, int incx, DeviceMemory *result) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); ThenBlasImpl &, int, DeviceMemory *> impl; return impl(this, &blas::BlasSupport::DoBlasNrm2, elem_count, x, incx, result); } Stream &Stream::ThenBlasNrm2(uint64 elem_count, const DeviceMemory &x, int incx, DeviceMemory *result) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); ThenBlasImpl &, int, DeviceMemory *> impl; return 
impl(this, &blas::BlasSupport::DoBlasNrm2, elem_count, x, incx, result); } Stream &Stream::ThenBlasNrm2(uint64 elem_count, const DeviceMemory> &x, int incx, DeviceMemory *result) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); ThenBlasImpl> &, int, DeviceMemory *> impl; return impl(this, &blas::BlasSupport::DoBlasNrm2, elem_count, x, incx, result); } Stream &Stream::ThenBlasNrm2(uint64 elem_count, const DeviceMemory> &x, int incx, DeviceMemory *result) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); ThenBlasImpl> &, int, DeviceMemory *> impl; return impl(this, &blas::BlasSupport::DoBlasNrm2, elem_count, x, incx, result); } Stream &Stream::ThenBlasRot(uint64 elem_count, DeviceMemory *x, int incx, DeviceMemory *y, int incy, float c, float s) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(c), PARAM(s)); ThenBlasImpl *, int, DeviceMemory *, int, float, float> impl; return impl(this, &blas::BlasSupport::DoBlasRot, elem_count, x, incx, y, incy, c, s); } Stream &Stream::ThenBlasRot(uint64 elem_count, DeviceMemory *x, int incx, DeviceMemory *y, int incy, double c, double s) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(c), PARAM(s)); ThenBlasImpl *, int, DeviceMemory *, int, double, double> impl; return impl(this, &blas::BlasSupport::DoBlasRot, elem_count, x, incx, y, incy, c, s); } Stream &Stream::ThenBlasRot(uint64 elem_count, DeviceMemory> *x, int incx, DeviceMemory> *y, int incy, float c, float s) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(c), PARAM(s)); ThenBlasImpl> *, int, DeviceMemory> *, int, float, float> impl; return impl(this, &blas::BlasSupport::DoBlasRot, elem_count, x, incx, y, incy, c, s); } Stream &Stream::ThenBlasRot(uint64 elem_count, DeviceMemory> *x, int incx, DeviceMemory> *y, int incy, double c, double s) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(c), 
PARAM(s)); ThenBlasImpl> *, int, DeviceMemory> *, int, double, double> impl; return impl(this, &blas::BlasSupport::DoBlasRot, elem_count, x, incx, y, incy, c, s); } Stream &Stream::ThenBlasRotg(DeviceMemory *a, DeviceMemory *b, DeviceMemory *c, DeviceMemory *s) { VLOG_CALL(PARAM(a), PARAM(b), PARAM(c), PARAM(s)); ThenBlasImpl *, DeviceMemory *, DeviceMemory *, DeviceMemory *> impl; return impl(this, &blas::BlasSupport::DoBlasRotg, a, b, c, s); } Stream &Stream::ThenBlasRotg(DeviceMemory *a, DeviceMemory *b, DeviceMemory *c, DeviceMemory *s) { VLOG_CALL(PARAM(a), PARAM(b), PARAM(c), PARAM(s)); ThenBlasImpl *, DeviceMemory *, DeviceMemory *, DeviceMemory *> impl; return impl(this, &blas::BlasSupport::DoBlasRotg, a, b, c, s); } Stream &Stream::ThenBlasRotg(DeviceMemory> *a, DeviceMemory> *b, DeviceMemory *c, DeviceMemory> *s) { VLOG_CALL(PARAM(a), PARAM(b), PARAM(c), PARAM(s)); ThenBlasImpl> *, DeviceMemory> *, DeviceMemory *, DeviceMemory> *> impl; return impl(this, &blas::BlasSupport::DoBlasRotg, a, b, c, s); } Stream &Stream::ThenBlasRotg(DeviceMemory> *a, DeviceMemory> *b, DeviceMemory *c, DeviceMemory> *s) { VLOG_CALL(PARAM(a), PARAM(b), PARAM(c), PARAM(s)); ThenBlasImpl> *, DeviceMemory> *, DeviceMemory *, DeviceMemory> *> impl; return impl(this, &blas::BlasSupport::DoBlasRotg, a, b, c, s); } Stream &Stream::ThenBlasRotm(uint64 elem_count, DeviceMemory *x, int incx, DeviceMemory *y, int incy, const DeviceMemory ¶m) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(param)); ThenBlasImpl *, int, DeviceMemory *, int, const DeviceMemory &> impl; return impl(this, &blas::BlasSupport::DoBlasRotm, elem_count, x, incx, y, incy, param); } Stream &Stream::ThenBlasRotm(uint64 elem_count, DeviceMemory *x, int incx, DeviceMemory *y, int incy, const DeviceMemory ¶m) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(param)); ThenBlasImpl *, int, DeviceMemory *, int, const DeviceMemory &> impl; return impl(this, 
&blas::BlasSupport::DoBlasRotm, elem_count, x, incx, y, incy, param); } Stream &Stream::ThenBlasRotmg(DeviceMemory *d1, DeviceMemory *d2, DeviceMemory *x1, const DeviceMemory &y1, DeviceMemory *param) { VLOG_CALL(PARAM(d1), PARAM(d2), PARAM(x1), PARAM(y1), PARAM(param)); ThenBlasImpl *, DeviceMemory *, DeviceMemory *, const DeviceMemory &, DeviceMemory *> impl; return impl(this, &blas::BlasSupport::DoBlasRotmg, d1, d2, x1, y1, param); } Stream &Stream::ThenBlasRotmg(DeviceMemory *d1, DeviceMemory *d2, DeviceMemory *x1, const DeviceMemory &y1, DeviceMemory *param) { VLOG_CALL(PARAM(d1), PARAM(d2), PARAM(x1), PARAM(y1), PARAM(param)); ThenBlasImpl *, DeviceMemory *, DeviceMemory *, const DeviceMemory &, DeviceMemory *> impl; return impl(this, &blas::BlasSupport::DoBlasRotmg, d1, d2, x1, y1, param); } Stream &Stream::ThenBlasScal(uint64 elem_count, float alpha, DeviceMemory *x, int incx) { VLOG_CALL(PARAM(elem_count), PARAM(alpha), PARAM(x), PARAM(incx)); ThenBlasImpl *, int> impl; return impl(this, &blas::BlasSupport::DoBlasScal, elem_count, alpha, x, incx); } Stream &Stream::ThenBlasScal(uint64 elem_count, double alpha, DeviceMemory *x, int incx) { VLOG_CALL(PARAM(elem_count), PARAM(alpha), PARAM(x), PARAM(incx)); ThenBlasImpl *, int> impl; return impl(this, &blas::BlasSupport::DoBlasScal, elem_count, alpha, x, incx); } Stream &Stream::ThenBlasScal(uint64 elem_count, float alpha, DeviceMemory> *x, int incx) { VLOG_CALL(PARAM(elem_count), PARAM(alpha), PARAM(x), PARAM(incx)); ThenBlasImpl> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasScal, elem_count, alpha, x, incx); } Stream &Stream::ThenBlasScal(uint64 elem_count, double alpha, DeviceMemory> *x, int incx) { VLOG_CALL(PARAM(elem_count), PARAM(alpha), PARAM(x), PARAM(incx)); ThenBlasImpl> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasScal, elem_count, alpha, x, incx); } Stream &Stream::ThenBlasScal(uint64 elem_count, std::complex alpha, DeviceMemory> *x, int incx) { 
VLOG_CALL(PARAM(elem_count), PARAM(alpha), PARAM(x), PARAM(incx)); ThenBlasImpl, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasScal, elem_count, alpha, x, incx); } Stream &Stream::ThenBlasScal(uint64 elem_count, std::complex alpha, DeviceMemory> *x, int incx) { VLOG_CALL(PARAM(elem_count), PARAM(alpha), PARAM(x), PARAM(incx)); ThenBlasImpl, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasScal, elem_count, alpha, x, incx); } Stream &Stream::ThenBlasSwap(uint64 elem_count, DeviceMemory *x, int incx, DeviceMemory *y, int incy) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy)); ThenBlasImpl *, int, DeviceMemory *, int> impl; return impl(this, &blas::BlasSupport::DoBlasSwap, elem_count, x, incx, y, incy); } Stream &Stream::ThenBlasSwap(uint64 elem_count, DeviceMemory *x, int incx, DeviceMemory *y, int incy) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy)); ThenBlasImpl *, int, DeviceMemory *, int> impl; return impl(this, &blas::BlasSupport::DoBlasSwap, elem_count, x, incx, y, incy); } Stream &Stream::ThenBlasSwap(uint64 elem_count, DeviceMemory> *x, int incx, DeviceMemory> *y, int incy) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy)); ThenBlasImpl> *, int, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasSwap, elem_count, x, incx, y, incy); } Stream &Stream::ThenBlasSwap(uint64 elem_count, DeviceMemory> *x, int incx, DeviceMemory> *y, int incy) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy)); ThenBlasImpl> *, int, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasSwap, elem_count, x, incx, y, incy); } Stream &Stream::ThenBlasIamax(uint64 elem_count, const DeviceMemory &x, int incx, DeviceMemory *result) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); ThenBlasImpl &, int, DeviceMemory *> impl; return impl(this, &blas::BlasSupport::DoBlasIamax, 
elem_count, x, incx, result); } Stream &Stream::ThenBlasIamax(uint64 elem_count, const DeviceMemory &x, int incx, DeviceMemory *result) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); ThenBlasImpl &, int, DeviceMemory *> impl; return impl(this, &blas::BlasSupport::DoBlasIamax, elem_count, x, incx, result); } Stream &Stream::ThenBlasIamax(uint64 elem_count, const DeviceMemory> &x, int incx, DeviceMemory *result) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); ThenBlasImpl> &, int, DeviceMemory *> impl; return impl(this, &blas::BlasSupport::DoBlasIamax, elem_count, x, incx, result); } Stream &Stream::ThenBlasIamax(uint64 elem_count, const DeviceMemory> &x, int incx, DeviceMemory *result) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); ThenBlasImpl> &, int, DeviceMemory *> impl; return impl(this, &blas::BlasSupport::DoBlasIamax, elem_count, x, incx, result); } Stream &Stream::ThenBlasIamin(uint64 elem_count, const DeviceMemory &x, int incx, DeviceMemory *result) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); ThenBlasImpl &, int, DeviceMemory *> impl; return impl(this, &blas::BlasSupport::DoBlasIamin, elem_count, x, incx, result); } Stream &Stream::ThenBlasIamin(uint64 elem_count, const DeviceMemory &x, int incx, DeviceMemory *result) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); ThenBlasImpl &, int, DeviceMemory *> impl; return impl(this, &blas::BlasSupport::DoBlasIamin, elem_count, x, incx, result); } Stream &Stream::ThenBlasIamin(uint64 elem_count, const DeviceMemory> &x, int incx, DeviceMemory *result) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); ThenBlasImpl> &, int, DeviceMemory *> impl; return impl(this, &blas::BlasSupport::DoBlasIamin, elem_count, x, incx, result); } Stream &Stream::ThenBlasIamin(uint64 elem_count, const DeviceMemory> &x, int incx, DeviceMemory *result) { VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), 
PARAM(result)); ThenBlasImpl> &, int, DeviceMemory *> impl; return impl(this, &blas::BlasSupport::DoBlasIamin, elem_count, x, incx, result); } Stream &Stream::ThenBlasGbmv(blas::Transpose trans, uint64 m, uint64 n, uint64 kl, uint64 ku, float alpha, const DeviceMemory &a, int lda, const DeviceMemory &x, int incx, float beta, DeviceMemory *y, int incy) { VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(kl), PARAM(ku), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); ThenBlasImpl &, int, const DeviceMemory &, int, float, DeviceMemory *, int> impl; return impl(this, &blas::BlasSupport::DoBlasGbmv, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } Stream &Stream::ThenBlasGbmv(blas::Transpose trans, uint64 m, uint64 n, uint64 kl, uint64 ku, double alpha, const DeviceMemory &a, int lda, const DeviceMemory &x, int incx, double beta, DeviceMemory *y, int incy) { VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(kl), PARAM(ku), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); ThenBlasImpl &, int, const DeviceMemory &, int, double, DeviceMemory *, int> impl; return impl(this, &blas::BlasSupport::DoBlasGbmv, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } Stream &Stream::ThenBlasGbmv(blas::Transpose trans, uint64 m, uint64 n, uint64 kl, uint64 ku, std::complex alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &x, int incx, std::complex beta, DeviceMemory> *y, int incy) { VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(kl), PARAM(ku), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, std::complex, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasGbmv, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } Stream &Stream::ThenBlasGbmv(blas::Transpose trans, uint64 m, uint64 n, uint64 kl, uint64 ku, 
std::complex alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &x, int incx, std::complex beta, DeviceMemory> *y, int incy) { VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(kl), PARAM(ku), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, std::complex, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasGbmv, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } Stream &Stream::ThenBlasGemv(blas::Transpose trans, uint64 m, uint64 n, float alpha, const DeviceMemory &a, int lda, const DeviceMemory &x, int incx, float beta, DeviceMemory *y, int incy) { VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); ThenBlasImpl &, int, const DeviceMemory &, int, float, DeviceMemory *, int> impl; return impl(this, &blas::BlasSupport::DoBlasGemv, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } Stream &Stream::ThenBlasGemv(blas::Transpose trans, uint64 m, uint64 n, double alpha, const DeviceMemory &a, int lda, const DeviceMemory &x, int incx, double beta, DeviceMemory *y, int incy) { VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); ThenBlasImpl &, int, const DeviceMemory &, int, double, DeviceMemory *, int> impl; return impl(this, &blas::BlasSupport::DoBlasGemv, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } Stream &Stream::ThenBlasGemv(blas::Transpose trans, uint64 m, uint64 n, std::complex alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &x, int incx, std::complex beta, DeviceMemory> *y, int incy) { VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, std::complex, 
DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasGemv, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } Stream &Stream::ThenBlasGemv(blas::Transpose trans, uint64 m, uint64 n, std::complex alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &x, int incx, std::complex beta, DeviceMemory> *y, int incy) { VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, std::complex, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasGemv, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } Stream &Stream::ThenBlasGer(uint64 m, uint64 n, float alpha, const DeviceMemory &x, int incx, const DeviceMemory &y, int incy, DeviceMemory *a, int lda) { VLOG_CALL(PARAM(m), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(a), PARAM(lda)); ThenBlasImpl &, int, const DeviceMemory &, int, DeviceMemory *, int> impl; return impl(this, &blas::BlasSupport::DoBlasGer, m, n, alpha, x, incx, y, incy, a, lda); } Stream &Stream::ThenBlasGer(uint64 m, uint64 n, double alpha, const DeviceMemory &x, int incx, const DeviceMemory &y, int incy, DeviceMemory *a, int lda) { VLOG_CALL(PARAM(m), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(a), PARAM(lda)); ThenBlasImpl &, int, const DeviceMemory &, int, DeviceMemory *, int> impl; return impl(this, &blas::BlasSupport::DoBlasGer, m, n, alpha, x, incx, y, incy, a, lda); } Stream &Stream::ThenBlasGerc(uint64 m, uint64 n, std::complex alpha, const DeviceMemory> &x, int incx, const DeviceMemory> &y, int incy, DeviceMemory> *a, int lda) { VLOG_CALL(PARAM(m), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(a), PARAM(lda)); ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasGerc, m, n, 
alpha, x, incx, y, incy, a, lda); } Stream &Stream::ThenBlasGerc(uint64 m, uint64 n, std::complex alpha, const DeviceMemory> &x, int incx, const DeviceMemory> &y, int incy, DeviceMemory> *a, int lda) { VLOG_CALL(PARAM(m), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(a), PARAM(lda)); ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasGerc, m, n, alpha, x, incx, y, incy, a, lda); } Stream &Stream::ThenBlasGeru(uint64 m, uint64 n, std::complex alpha, const DeviceMemory> &x, int incx, const DeviceMemory> &y, int incy, DeviceMemory> *a, int lda) { VLOG_CALL(PARAM(m), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(a), PARAM(lda)); ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasGeru, m, n, alpha, x, incx, y, incy, a, lda); } Stream &Stream::ThenBlasGeru(uint64 m, uint64 n, std::complex alpha, const DeviceMemory> &x, int incx, const DeviceMemory> &y, int incy, DeviceMemory> *a, int lda) { VLOG_CALL(PARAM(m), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(a), PARAM(lda)); ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasGeru, m, n, alpha, x, incx, y, incy, a, lda); } Stream &Stream::ThenBlasHbmv(blas::UpperLower uplo, uint64 n, uint64 k, std::complex alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &x, int incx, std::complex beta, DeviceMemory> *y, int incy) { VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, std::complex, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasHbmv, uplo, n, k, alpha, a, lda, x, incx, beta, y, 
incy); } Stream &Stream::ThenBlasHbmv(blas::UpperLower uplo, uint64 n, uint64 k, std::complex alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &x, int incx, std::complex beta, DeviceMemory> *y, int incy) { VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, std::complex, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasHbmv, uplo, n, k, alpha, a, lda, x, incx, beta, y, incy); } Stream &Stream::ThenBlasHemv(blas::UpperLower uplo, uint64 n, std::complex alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &x, int incx, std::complex beta, DeviceMemory> *y, int incy) { VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, std::complex, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasHemv, uplo, n, alpha, a, lda, x, incx, beta, y, incy); } Stream &Stream::ThenBlasHemv(blas::UpperLower uplo, uint64 n, std::complex alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &x, int incx, std::complex beta, DeviceMemory> *y, int incy) { VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, std::complex, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasHemv, uplo, n, alpha, a, lda, x, incx, beta, y, incy); } Stream &Stream::ThenBlasHer(blas::UpperLower uplo, uint64 n, float alpha, const DeviceMemory> &x, int incx, DeviceMemory> *a, int lda) { VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(a), PARAM(lda)); ThenBlasImpl> &, int, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasHer, uplo, n, alpha, x, 
incx, a, lda); } Stream &Stream::ThenBlasHer(blas::UpperLower uplo, uint64 n, double alpha, const DeviceMemory> &x, int incx, DeviceMemory> *a, int lda) { VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(a), PARAM(lda)); ThenBlasImpl> &, int, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasHer, uplo, n, alpha, x, incx, a, lda); } Stream &Stream::ThenBlasHer2(blas::UpperLower uplo, uint64 n, std::complex alpha, const DeviceMemory> &x, int incx, const DeviceMemory> &y, int incy, DeviceMemory> *a, int lda) { VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(a), PARAM(lda)); ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasHer2, uplo, n, alpha, x, incx, y, incy, a, lda); } Stream &Stream::ThenBlasHer2(blas::UpperLower uplo, uint64 n, std::complex alpha, const DeviceMemory> &x, int incx, const DeviceMemory> &y, int incy, DeviceMemory> *a, int lda) { VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(a), PARAM(lda)); ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasHer2, uplo, n, alpha, x, incx, y, incy, a, lda); } Stream &Stream::ThenBlasHpmv(blas::UpperLower uplo, uint64 n, std::complex alpha, const DeviceMemory> &ap, const DeviceMemory> &x, int incx, std::complex beta, DeviceMemory> *y, int incy) { VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(ap), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); ThenBlasImpl, const DeviceMemory> &, const DeviceMemory> &, int, std::complex, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasHpmv, uplo, n, alpha, ap, x, incx, beta, y, incy); } Stream &Stream::ThenBlasHpmv(blas::UpperLower uplo, uint64 n, std::complex alpha, const DeviceMemory> &ap, const DeviceMemory> 
&x, int incx, std::complex beta, DeviceMemory> *y, int incy) { VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(ap), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); ThenBlasImpl, const DeviceMemory> &, const DeviceMemory> &, int, std::complex, DeviceMemory> *, int> impl; return impl(this, &blas::BlasSupport::DoBlasHpmv, uplo, n, alpha, ap, x, incx, beta, y, incy); } Stream &Stream::ThenBlasHpr(blas::UpperLower uplo, uint64 n, float alpha, const DeviceMemory> &x, int incx, DeviceMemory> *ap) { VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(ap)); ThenBlasImpl> &, int, DeviceMemory> *> impl; return impl(this, &blas::BlasSupport::DoBlasHpr, uplo, n, alpha, x, incx, ap); } Stream &Stream::ThenBlasHpr(blas::UpperLower uplo, uint64 n, double alpha, const DeviceMemory> &x, int incx, DeviceMemory> *ap) { VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(ap)); ThenBlasImpl> &, int, DeviceMemory> *> impl; return impl(this, &blas::BlasSupport::DoBlasHpr, uplo, n, alpha, x, incx, ap); } Stream &Stream::ThenBlasHpr2(blas::UpperLower uplo, uint64 n, std::complex alpha, const DeviceMemory> &x, int incx, const DeviceMemory> &y, int incy, DeviceMemory> *ap) { VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(ap)); ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, DeviceMemory> *> impl; return impl(this, &blas::BlasSupport::DoBlasHpr2, uplo, n, alpha, x, incx, y, incy, ap); } Stream &Stream::ThenBlasHpr2(blas::UpperLower uplo, uint64 n, std::complex alpha, const DeviceMemory> &x, int incx, const DeviceMemory> &y, int incy, DeviceMemory> *ap) { VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(ap)); ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, DeviceMemory> *> impl; return impl(this, &blas::BlasSupport::DoBlasHpr2, uplo, n, alpha, x, incx, y, incy, ap); } Stream 
&Stream::ThenBlasSbmv(blas::UpperLower uplo, uint64 n, uint64 k, float alpha, const DeviceMemory &a, int lda, const DeviceMemory &x, int incx, float beta, DeviceMemory *y, int incy) {
  VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy));
  ThenBlasImpl &, int, const DeviceMemory &, int, float, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasSbmv, uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
}

// ThenBlasSbmv (double alpha/beta): logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasSbmv and returns its result.
Stream &Stream::ThenBlasSbmv(blas::UpperLower uplo, uint64 n, uint64 k, double alpha, const DeviceMemory &a, int lda, const DeviceMemory &x, int incx, double beta, DeviceMemory *y, int incy) {
  VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy));
  ThenBlasImpl &, int, const DeviceMemory &, int, double, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasSbmv, uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
}

// ThenBlasSpmv (float alpha/beta): logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasSpmv and returns its result.
Stream &Stream::ThenBlasSpmv(blas::UpperLower uplo, uint64 n, float alpha, const DeviceMemory &ap, const DeviceMemory &x, int incx, float beta, DeviceMemory *y, int incy) {
  VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(ap), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy));
  ThenBlasImpl &, const DeviceMemory &, int, float, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasSpmv, uplo, n, alpha, ap, x, incx, beta, y, incy);
}

// ThenBlasSpmv (double alpha/beta): same log-then-dispatch to DoBlasSpmv.
Stream &Stream::ThenBlasSpmv(blas::UpperLower uplo, uint64 n, double alpha, const DeviceMemory &ap, const DeviceMemory &x, int incx, double beta, DeviceMemory *y, int incy) {
  VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(ap), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy));
  ThenBlasImpl &, const DeviceMemory &, int, double, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasSpmv, uplo, n, alpha, ap, x, incx, beta, y, incy);
}

// ThenBlasSpr (float alpha): logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasSpr and returns its result.
Stream
&Stream::ThenBlasSpr(blas::UpperLower uplo, uint64 n, float alpha, const DeviceMemory &x, int incx, DeviceMemory *ap) {
  VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(ap));
  ThenBlasImpl &, int, DeviceMemory *> impl;
  return impl(this, &blas::BlasSupport::DoBlasSpr, uplo, n, alpha, x, incx, ap);
}

// ThenBlasSpr (double alpha): logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasSpr and returns its result.
Stream &Stream::ThenBlasSpr(blas::UpperLower uplo, uint64 n, double alpha, const DeviceMemory &x, int incx, DeviceMemory *ap) {
  VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(ap));
  ThenBlasImpl &, int, DeviceMemory *> impl;
  return impl(this, &blas::BlasSupport::DoBlasSpr, uplo, n, alpha, x, incx, ap);
}

// ThenBlasSpr2 (float alpha): logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasSpr2 and returns its result.
Stream &Stream::ThenBlasSpr2(blas::UpperLower uplo, uint64 n, float alpha, const DeviceMemory &x, int incx, const DeviceMemory &y, int incy, DeviceMemory *ap) {
  VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(ap));
  ThenBlasImpl &, int, const DeviceMemory &, int, DeviceMemory *> impl;
  return impl(this, &blas::BlasSupport::DoBlasSpr2, uplo, n, alpha, x, incx, y, incy, ap);
}

// ThenBlasSpr2 (double alpha): same log-then-dispatch to DoBlasSpr2.
Stream &Stream::ThenBlasSpr2(blas::UpperLower uplo, uint64 n, double alpha, const DeviceMemory &x, int incx, const DeviceMemory &y, int incy, DeviceMemory *ap) {
  VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(ap));
  ThenBlasImpl &, int, const DeviceMemory &, int, DeviceMemory *> impl;
  return impl(this, &blas::BlasSupport::DoBlasSpr2, uplo, n, alpha, x, incx, y, incy, ap);
}

// ThenBlasSymv (float alpha/beta): logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasSymv and returns its result.
Stream &Stream::ThenBlasSymv(blas::UpperLower uplo, uint64 n, float alpha, const DeviceMemory &a, int lda, const DeviceMemory &x, int incx, float beta, DeviceMemory *y, int incy) {
  VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy));
  ThenBlasImpl &, int, const DeviceMemory &, int, float, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasSymv, uplo, n, alpha, a, lda, x, incx, beta, y,
incy); }

// ThenBlasSymv (double alpha/beta): logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasSymv and returns its result.
Stream &Stream::ThenBlasSymv(blas::UpperLower uplo, uint64 n, double alpha, const DeviceMemory &a, int lda, const DeviceMemory &x, int incx, double beta, DeviceMemory *y, int incy) {
  VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy));
  ThenBlasImpl &, int, const DeviceMemory &, int, double, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasSymv, uplo, n, alpha, a, lda, x, incx, beta, y, incy);
}

// ThenBlasSyr (float alpha): logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasSyr and returns its result.
Stream &Stream::ThenBlasSyr(blas::UpperLower uplo, uint64 n, float alpha, const DeviceMemory &x, int incx, DeviceMemory *a, int lda) {
  VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(a), PARAM(lda));
  ThenBlasImpl &, int, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasSyr, uplo, n, alpha, x, incx, a, lda);
}

// ThenBlasSyr (double alpha): same log-then-dispatch to DoBlasSyr.
Stream &Stream::ThenBlasSyr(blas::UpperLower uplo, uint64 n, double alpha, const DeviceMemory &x, int incx, DeviceMemory *a, int lda) {
  VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(a), PARAM(lda));
  ThenBlasImpl &, int, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasSyr, uplo, n, alpha, x, incx, a, lda);
}

// ThenBlasSyr2 (float alpha): logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasSyr2 and returns its result.
Stream &Stream::ThenBlasSyr2(blas::UpperLower uplo, uint64 n, float alpha, const DeviceMemory &x, int incx, const DeviceMemory &y, int incy, DeviceMemory *a, int lda) {
  VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(a), PARAM(lda));
  ThenBlasImpl &, int, const DeviceMemory &, int, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasSyr2, uplo, n, alpha, x, incx, y, incy, a, lda);
}

// ThenBlasSyr2 (double alpha): same log-then-dispatch to DoBlasSyr2.
Stream &Stream::ThenBlasSyr2(blas::UpperLower uplo, uint64 n, double alpha, const DeviceMemory &x, int incx, const DeviceMemory &y, int incy, DeviceMemory *a, int lda) {
  VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), PARAM(a), PARAM(lda));
  ThenBlasImpl &, int, const
DeviceMemory &, int, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasSyr2, uplo, n, alpha, x, incx, y, incy, a, lda);
}

// ThenBlasTbmv: logs the arguments with VLOG_CALL, then calls a ThenBlasImpl
// instance with blas::BlasSupport::DoBlasTbmv and returns its result.
Stream &Stream::ThenBlasTbmv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, uint64 k, const DeviceMemory &a, int lda, DeviceMemory *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(k), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx));
  ThenBlasImpl &, int, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTbmv, uplo, trans, diag, n, k, a, lda, x, incx);
}

// Further ThenBlasTbmv overload; same log-then-dispatch to DoBlasTbmv.
// NOTE(review): reads identically to the preceding overload in this text;
// presumably the DeviceMemory element type differs — confirm.
Stream &Stream::ThenBlasTbmv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, uint64 k, const DeviceMemory &a, int lda, DeviceMemory *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(k), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx));
  ThenBlasImpl &, int, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTbmv, uplo, trans, diag, n, k, a, lda, x, incx);
}

// Further ThenBlasTbmv overload; same log-then-dispatch to DoBlasTbmv.
// NOTE(review): the stray '>' after DeviceMemory suggests a nested template
// element type (possibly std::complex) was lost from this text — confirm.
Stream &Stream::ThenBlasTbmv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, uint64 k, const DeviceMemory> &a, int lda, DeviceMemory> *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(k), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx));
  ThenBlasImpl> &, int, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTbmv, uplo, trans, diag, n, k, a, lda, x, incx);
}

// Further ThenBlasTbmv overload; same log-then-dispatch to DoBlasTbmv.
// NOTE(review): reads identically to the preceding overload in this text — confirm types.
Stream &Stream::ThenBlasTbmv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, uint64 k, const DeviceMemory> &a, int lda, DeviceMemory> *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(k), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx));
  ThenBlasImpl> &, int, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTbmv, uplo, trans, diag, n, k, a, lda, x, incx);
}

// ThenBlasTbsv: logs the arguments with VLOG_CALL, then calls a ThenBlasImpl
// instance with blas::BlasSupport::DoBlasTbsv and returns its result.
Stream &Stream::ThenBlasTbsv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64
n, uint64 k, const DeviceMemory &a, int lda, DeviceMemory *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(k), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx));
  ThenBlasImpl &, int, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTbsv, uplo, trans, diag, n, k, a, lda, x, incx);
}

// Further ThenBlasTbsv overload: logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasTbsv and returns its result.
// NOTE(review): reads identically to the preceding overload in this text;
// presumably the DeviceMemory element type differs — confirm.
Stream &Stream::ThenBlasTbsv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, uint64 k, const DeviceMemory &a, int lda, DeviceMemory *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(k), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx));
  ThenBlasImpl &, int, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTbsv, uplo, trans, diag, n, k, a, lda, x, incx);
}

// Further ThenBlasTbsv overload; same log-then-dispatch to DoBlasTbsv.
// NOTE(review): the stray '>' after DeviceMemory suggests a nested template
// element type (possibly std::complex) was lost from this text — confirm.
Stream &Stream::ThenBlasTbsv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, uint64 k, const DeviceMemory> &a, int lda, DeviceMemory> *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(k), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx));
  ThenBlasImpl> &, int, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTbsv, uplo, trans, diag, n, k, a, lda, x, incx);
}

// Further ThenBlasTbsv overload; same log-then-dispatch to DoBlasTbsv.
// NOTE(review): reads identically to the preceding overload in this text — confirm types.
Stream &Stream::ThenBlasTbsv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, uint64 k, const DeviceMemory> &a, int lda, DeviceMemory> *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(k), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx));
  ThenBlasImpl> &, int, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTbsv, uplo, trans, diag, n, k, a, lda, x, incx);
}

// ThenBlasTpmv: logs the arguments with VLOG_CALL, then calls a ThenBlasImpl
// instance with blas::BlasSupport::DoBlasTpmv and returns its result.
Stream &Stream::ThenBlasTpmv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, const DeviceMemory &ap, DeviceMemory *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(ap), PARAM(x), PARAM(incx));
  ThenBlasImpl &, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTpmv,
uplo, trans, diag, n, ap, x, incx); }

// Further ThenBlasTpmv overload: logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasTpmv and returns its result.
// NOTE(review): reads identically to the preceding overload in this text;
// presumably the DeviceMemory element type differs — confirm.
Stream &Stream::ThenBlasTpmv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, const DeviceMemory &ap, DeviceMemory *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(ap), PARAM(x), PARAM(incx));
  ThenBlasImpl &, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTpmv, uplo, trans, diag, n, ap, x, incx);
}

// Further ThenBlasTpmv overload; same log-then-dispatch to DoBlasTpmv.
// NOTE(review): the stray '>' after DeviceMemory suggests a nested template
// element type (possibly std::complex) was lost from this text — confirm.
Stream &Stream::ThenBlasTpmv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, const DeviceMemory> &ap, DeviceMemory> *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(ap), PARAM(x), PARAM(incx));
  ThenBlasImpl> &, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTpmv, uplo, trans, diag, n, ap, x, incx);
}

// Further ThenBlasTpmv overload; same log-then-dispatch to DoBlasTpmv.
// NOTE(review): reads identically to the preceding overload in this text — confirm types.
Stream &Stream::ThenBlasTpmv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, const DeviceMemory> &ap, DeviceMemory> *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(ap), PARAM(x), PARAM(incx));
  ThenBlasImpl> &, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTpmv, uplo, trans, diag, n, ap, x, incx);
}

// ThenBlasTpsv: logs the arguments with VLOG_CALL, then calls a ThenBlasImpl
// instance with blas::BlasSupport::DoBlasTpsv and returns its result.
Stream &Stream::ThenBlasTpsv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, const DeviceMemory &ap, DeviceMemory *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(ap), PARAM(x), PARAM(incx));
  ThenBlasImpl &, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTpsv, uplo, trans, diag, n, ap, x, incx);
}

// Further ThenBlasTpsv overload; same log-then-dispatch to DoBlasTpsv.
// NOTE(review): reads identically to the preceding overload in this text;
// presumably the DeviceMemory element type differs — confirm.
Stream &Stream::ThenBlasTpsv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, const DeviceMemory &ap, DeviceMemory *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(ap), PARAM(x), PARAM(incx));
  ThenBlasImpl &, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTpsv, uplo, trans, diag, n, ap, x, incx);
}

// Further ThenBlasTpsv overload; same log-then-dispatch to DoBlasTpsv.
// NOTE(review): element types look truncated in this text — confirm.
Stream
&Stream::ThenBlasTpsv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, const DeviceMemory> &ap, DeviceMemory> *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(ap), PARAM(x), PARAM(incx));
  ThenBlasImpl> &, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTpsv, uplo, trans, diag, n, ap, x, incx);
}

// Further ThenBlasTpsv overload: logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasTpsv and returns its result.
// NOTE(review): reads identically to the preceding overload in this text;
// presumably the element type differs — confirm.
Stream &Stream::ThenBlasTpsv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, const DeviceMemory> &ap, DeviceMemory> *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(ap), PARAM(x), PARAM(incx));
  ThenBlasImpl> &, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTpsv, uplo, trans, diag, n, ap, x, incx);
}

// ThenBlasTrmv: logs the arguments with VLOG_CALL, then calls a ThenBlasImpl
// instance with blas::BlasSupport::DoBlasTrmv and returns its result.
Stream &Stream::ThenBlasTrmv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, const DeviceMemory &a, int lda, DeviceMemory *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx));
  ThenBlasImpl &, int, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTrmv, uplo, trans, diag, n, a, lda, x, incx);
}

// Further ThenBlasTrmv overload; same log-then-dispatch to DoBlasTrmv.
// NOTE(review): reads identically to the preceding overload in this text;
// presumably the DeviceMemory element type differs — confirm.
Stream &Stream::ThenBlasTrmv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, const DeviceMemory &a, int lda, DeviceMemory *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx));
  ThenBlasImpl &, int, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTrmv, uplo, trans, diag, n, a, lda, x, incx);
}

// Further ThenBlasTrmv overload; same log-then-dispatch to DoBlasTrmv.
// NOTE(review): the stray '>' after DeviceMemory suggests a nested template
// element type (possibly std::complex) was lost from this text — confirm.
Stream &Stream::ThenBlasTrmv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, const DeviceMemory> &a, int lda, DeviceMemory> *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx));
  ThenBlasImpl> &, int, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTrmv, uplo, trans, diag, n, a,
lda, x, incx); }

// Further ThenBlasTrmv overload: logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasTrmv and returns its result.
// NOTE(review): reads identically to the preceding overload in this text;
// presumably the element type differs — confirm.
Stream &Stream::ThenBlasTrmv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, const DeviceMemory> &a, int lda, DeviceMemory> *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx));
  ThenBlasImpl> &, int, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTrmv, uplo, trans, diag, n, a, lda, x, incx);
}

// ThenBlasTrsv: logs the arguments with VLOG_CALL, then calls a ThenBlasImpl
// instance with blas::BlasSupport::DoBlasTrsv and returns its result.
Stream &Stream::ThenBlasTrsv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, const DeviceMemory &a, int lda, DeviceMemory *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx));
  ThenBlasImpl &, int, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTrsv, uplo, trans, diag, n, a, lda, x, incx);
}

// Further ThenBlasTrsv overload; same log-then-dispatch to DoBlasTrsv.
// NOTE(review): reads identically to the preceding overload in this text;
// presumably the DeviceMemory element type differs — confirm.
Stream &Stream::ThenBlasTrsv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, const DeviceMemory &a, int lda, DeviceMemory *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx));
  ThenBlasImpl &, int, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTrsv, uplo, trans, diag, n, a, lda, x, incx);
}

// Further ThenBlasTrsv overload; same log-then-dispatch to DoBlasTrsv.
// NOTE(review): the stray '>' after DeviceMemory suggests a nested template
// element type (possibly std::complex) was lost from this text — confirm.
Stream &Stream::ThenBlasTrsv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, const DeviceMemory> &a, int lda, DeviceMemory> *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx));
  ThenBlasImpl> &, int, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTrsv, uplo, trans, diag, n, a, lda, x, incx);
}

// Further ThenBlasTrsv overload; same log-then-dispatch to DoBlasTrsv.
// NOTE(review): reads identically to the preceding overload in this text — confirm types.
Stream &Stream::ThenBlasTrsv(blas::UpperLower uplo, blas::Transpose trans, blas::Diagonal diag, uint64 n, const DeviceMemory> &a, int lda, DeviceMemory> *x, int incx) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx));
  ThenBlasImpl> &, int, DeviceMemory> *, int>
impl;
  return impl(this, &blas::BlasSupport::DoBlasTrsv, uplo, trans, diag, n, a, lda, x, incx);
}

// ThenBlasGemm (float alpha/beta): logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasGemm and returns its result.
Stream &Stream::ThenBlasGemm(blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, uint64 k, float alpha, const DeviceMemory &a, int lda, const DeviceMemory &b, int ldb, float beta, DeviceMemory *c, int ldc) {
  VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc));
  ThenBlasImpl &, int, const DeviceMemory &, int, float, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasGemm, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}

// Further ThenBlasGemm overload with float alpha/beta; same log-then-dispatch to DoBlasGemm.
// NOTE(review): reads identically to the preceding overload in this text;
// presumably the DeviceMemory element type differs — confirm.
Stream &Stream::ThenBlasGemm(blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, uint64 k, float alpha, const DeviceMemory &a, int lda, const DeviceMemory &b, int ldb, float beta, DeviceMemory *c, int ldc) {
  VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc));
  ThenBlasImpl &, int, const DeviceMemory &, int, float, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasGemm, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}

// ThenBlasGemm (double alpha/beta): same log-then-dispatch to DoBlasGemm.
Stream &Stream::ThenBlasGemm(blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, uint64 k, double alpha, const DeviceMemory &a, int lda, const DeviceMemory &b, int ldb, double beta, DeviceMemory *c, int ldc) {
  VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc));
  ThenBlasImpl &, int, const DeviceMemory &, int, double, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasGemm, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}

// ThenBlasGemm (std::complex alpha/beta): same log-then-dispatch to DoBlasGemm.
Stream &Stream::ThenBlasGemm(blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, uint64 k, std::complex alpha, const
DeviceMemory> &a, int lda, const DeviceMemory> &b, int ldb, std::complex beta, DeviceMemory> *c, int ldc) {
  VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc));
  ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, std::complex, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasGemm, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}

// Further ThenBlasGemm overload with std::complex alpha/beta: logs the arguments with
// VLOG_CALL, then calls a ThenBlasImpl instance with blas::BlasSupport::DoBlasGemm.
// NOTE(review): reads identically to the preceding complex overload in this text;
// presumably the complex element type differs — confirm.
Stream &Stream::ThenBlasGemm(blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, uint64 k, std::complex alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &b, int ldb, std::complex beta, DeviceMemory> *c, int ldc) {
  VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc));
  ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, std::complex, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasGemm, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}

// ThenBlasHemm (std::complex alpha/beta): logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasHemm and returns its result.
Stream &Stream::ThenBlasHemm(blas::Side side, blas::UpperLower uplo, uint64 m, uint64 n, std::complex alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &b, int ldb, std::complex beta, DeviceMemory> *c, int ldc) {
  VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc));
  ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, std::complex, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasHemm, side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
}

// Further ThenBlasHemm overload; same log-then-dispatch to DoBlasHemm.
// NOTE(review): reads identically to the preceding overload in this text;
// presumably the complex element type differs — confirm.
Stream &Stream::ThenBlasHemm(blas::Side side, blas::UpperLower uplo, uint64 m, uint64 n, std::complex alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &b, int ldb, std::complex beta, DeviceMemory> *c, int ldc) {
  VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(m),
PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc));
  ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, std::complex, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasHemm, side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
}

// ThenBlasHerk (float alpha/beta): logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasHerk and returns its result.
Stream &Stream::ThenBlasHerk(blas::UpperLower uplo, blas::Transpose trans, uint64 n, uint64 k, float alpha, const DeviceMemory> &a, int lda, float beta, DeviceMemory> *c, int ldc) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(beta), PARAM(c), PARAM(ldc));
  ThenBlasImpl> &, int, float, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasHerk, uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
}

// ThenBlasHerk (double alpha/beta): same log-then-dispatch to DoBlasHerk.
Stream &Stream::ThenBlasHerk(blas::UpperLower uplo, blas::Transpose trans, uint64 n, uint64 k, double alpha, const DeviceMemory> &a, int lda, double beta, DeviceMemory> *c, int ldc) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(beta), PARAM(c), PARAM(ldc));
  ThenBlasImpl> &, int, double, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasHerk, uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
}

// ThenBlasHer2k (std::complex alpha, float beta): logs the arguments with VLOG_CALL, then
// calls a ThenBlasImpl instance with blas::BlasSupport::DoBlasHer2k and returns its result.
Stream &Stream::ThenBlasHer2k(blas::UpperLower uplo, blas::Transpose trans, uint64 n, uint64 k, std::complex alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &b, int ldb, float beta, DeviceMemory> *c, int ldc) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc));
  ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, float, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasHer2k, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}

// ThenBlasHer2k (std::complex alpha, double beta): same log-then-dispatch to DoBlasHer2k.
Stream &Stream::ThenBlasHer2k(blas::UpperLower uplo, blas::Transpose trans, uint64 n, uint64 k, std::complex alpha, const DeviceMemory> &a, int
lda, const DeviceMemory> &b, int ldb, double beta, DeviceMemory> *c, int ldc) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc));
  ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, double, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasHer2k, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}

// ThenBlasSymm (float alpha/beta): logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasSymm and returns its result.
Stream &Stream::ThenBlasSymm(blas::Side side, blas::UpperLower uplo, uint64 m, uint64 n, float alpha, const DeviceMemory &a, int lda, const DeviceMemory &b, int ldb, float beta, DeviceMemory *c, int ldc) {
  VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc));
  ThenBlasImpl &, int, const DeviceMemory &, int, float, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasSymm, side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
}

// ThenBlasSymm (double alpha/beta): same log-then-dispatch to DoBlasSymm.
Stream &Stream::ThenBlasSymm(blas::Side side, blas::UpperLower uplo, uint64 m, uint64 n, double alpha, const DeviceMemory &a, int lda, const DeviceMemory &b, int ldb, double beta, DeviceMemory *c, int ldc) {
  VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc));
  ThenBlasImpl &, int, const DeviceMemory &, int, double, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasSymm, side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
}

// ThenBlasSymm (std::complex alpha/beta): same log-then-dispatch to DoBlasSymm.
Stream &Stream::ThenBlasSymm(blas::Side side, blas::UpperLower uplo, uint64 m, uint64 n, std::complex alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &b, int ldb, std::complex beta, DeviceMemory> *c, int ldc) {
  VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc));
  ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, std::complex,
DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasSymm, side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
}

// Further ThenBlasSymm overload with std::complex alpha/beta: logs the arguments with
// VLOG_CALL, then calls a ThenBlasImpl instance with blas::BlasSupport::DoBlasSymm.
// NOTE(review): reads identically to the preceding complex overload in this text;
// presumably the complex element type differs — confirm.
Stream &Stream::ThenBlasSymm(blas::Side side, blas::UpperLower uplo, uint64 m, uint64 n, std::complex alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &b, int ldb, std::complex beta, DeviceMemory> *c, int ldc) {
  VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc));
  ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, std::complex, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasSymm, side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
}

// ThenBlasSyrk (float alpha/beta): logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasSyrk and returns its result.
Stream &Stream::ThenBlasSyrk(blas::UpperLower uplo, blas::Transpose trans, uint64 n, uint64 k, float alpha, const DeviceMemory &a, int lda, float beta, DeviceMemory *c, int ldc) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(beta), PARAM(c), PARAM(ldc));
  ThenBlasImpl &, int, float, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasSyrk, uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
}

// ThenBlasSyrk (double alpha/beta): same log-then-dispatch to DoBlasSyrk.
Stream &Stream::ThenBlasSyrk(blas::UpperLower uplo, blas::Transpose trans, uint64 n, uint64 k, double alpha, const DeviceMemory &a, int lda, double beta, DeviceMemory *c, int ldc) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(beta), PARAM(c), PARAM(ldc));
  ThenBlasImpl &, int, double, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasSyrk, uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
}

// ThenBlasSyrk (std::complex alpha/beta): same log-then-dispatch to DoBlasSyrk.
Stream &Stream::ThenBlasSyrk(blas::UpperLower uplo, blas::Transpose trans, uint64 n, uint64 k, std::complex alpha, const DeviceMemory> &a, int lda, std::complex beta, DeviceMemory> *c, int ldc) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(beta), PARAM(c), PARAM(ldc));
ThenBlasImpl, const DeviceMemory> &, int, std::complex, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasSyrk, uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
}

// Further ThenBlasSyrk overload with std::complex alpha/beta: logs the arguments with
// VLOG_CALL, then calls a ThenBlasImpl instance with blas::BlasSupport::DoBlasSyrk.
// NOTE(review): reads identically to the preceding complex overload in this text;
// presumably the complex element type differs — confirm.
Stream &Stream::ThenBlasSyrk(blas::UpperLower uplo, blas::Transpose trans, uint64 n, uint64 k, std::complex alpha, const DeviceMemory> &a, int lda, std::complex beta, DeviceMemory> *c, int ldc) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(beta), PARAM(c), PARAM(ldc));
  ThenBlasImpl, const DeviceMemory> &, int, std::complex, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasSyrk, uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
}

// ThenBlasSyr2k (float alpha/beta): logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasSyr2k and returns its result.
Stream &Stream::ThenBlasSyr2k(blas::UpperLower uplo, blas::Transpose trans, uint64 n, uint64 k, float alpha, const DeviceMemory &a, int lda, const DeviceMemory &b, int ldb, float beta, DeviceMemory *c, int ldc) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc));
  ThenBlasImpl &, int, const DeviceMemory &, int, float, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasSyr2k, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}

// ThenBlasSyr2k (double alpha/beta): same log-then-dispatch to DoBlasSyr2k.
Stream &Stream::ThenBlasSyr2k(blas::UpperLower uplo, blas::Transpose trans, uint64 n, uint64 k, double alpha, const DeviceMemory &a, int lda, const DeviceMemory &b, int ldb, double beta, DeviceMemory *c, int ldc) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc));
  ThenBlasImpl &, int, const DeviceMemory &, int, double, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasSyr2k, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}

// ThenBlasSyr2k (std::complex alpha/beta): same log-then-dispatch to DoBlasSyr2k.
Stream &Stream::ThenBlasSyr2k(blas::UpperLower uplo, blas::Transpose trans, uint64 n, uint64 k, std::complex alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &b, int ldb,
std::complex beta, DeviceMemory> *c, int ldc) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc));
  ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, std::complex, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasSyr2k, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}

// Further ThenBlasSyr2k overload with std::complex alpha/beta: logs the arguments with
// VLOG_CALL, then calls a ThenBlasImpl instance with blas::BlasSupport::DoBlasSyr2k.
// NOTE(review): reads identically to the preceding complex overload in this text;
// presumably the complex element type differs — confirm.
Stream &Stream::ThenBlasSyr2k(blas::UpperLower uplo, blas::Transpose trans, uint64 n, uint64 k, std::complex alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &b, int ldb, std::complex beta, DeviceMemory> *c, int ldc) {
  VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc));
  ThenBlasImpl, const DeviceMemory> &, int, const DeviceMemory> &, int, std::complex, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasSyr2k, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}

// ThenBlasTrmm (float alpha): logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasTrmm and returns its result.
Stream &Stream::ThenBlasTrmm(blas::Side side, blas::UpperLower uplo, blas::Transpose transa, blas::Diagonal diag, uint64 m, uint64 n, float alpha, const DeviceMemory &a, int lda, DeviceMemory *b, int ldb) {
  VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(transa), PARAM(diag), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb));
  ThenBlasImpl &, int, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTrmm, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
}

// ThenBlasTrmm (double alpha): same log-then-dispatch to DoBlasTrmm.
Stream &Stream::ThenBlasTrmm(blas::Side side, blas::UpperLower uplo, blas::Transpose transa, blas::Diagonal diag, uint64 m, uint64 n, double alpha, const DeviceMemory &a, int lda, DeviceMemory *b, int ldb) {
  VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(transa), PARAM(diag), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb));
  ThenBlasImpl &, int, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTrmm, side, uplo, transa,
diag, m, n, alpha, a, lda, b, ldb); }

// ThenBlasTrmm (std::complex alpha): logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasTrmm and returns its result.
Stream &Stream::ThenBlasTrmm(blas::Side side, blas::UpperLower uplo, blas::Transpose transa, blas::Diagonal diag, uint64 m, uint64 n, std::complex alpha, const DeviceMemory> &a, int lda, DeviceMemory> *b, int ldb) {
  VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(transa), PARAM(diag), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb));
  ThenBlasImpl, const DeviceMemory> &, int, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTrmm, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
}

// Further ThenBlasTrmm overload with std::complex alpha; same log-then-dispatch to DoBlasTrmm.
// NOTE(review): reads identically to the preceding complex overload in this text;
// presumably the complex element type differs — confirm.
Stream &Stream::ThenBlasTrmm(blas::Side side, blas::UpperLower uplo, blas::Transpose transa, blas::Diagonal diag, uint64 m, uint64 n, std::complex alpha, const DeviceMemory> &a, int lda, DeviceMemory> *b, int ldb) {
  VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(transa), PARAM(diag), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb));
  ThenBlasImpl, const DeviceMemory> &, int, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTrmm, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
}

// ThenBlasTrsm (float alpha): logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasTrsm and returns its result.
Stream &Stream::ThenBlasTrsm(blas::Side side, blas::UpperLower uplo, blas::Transpose transa, blas::Diagonal diag, uint64 m, uint64 n, float alpha, const DeviceMemory &a, int lda, DeviceMemory *b, int ldb) {
  VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(transa), PARAM(diag), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb));
  ThenBlasImpl &, int, DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTrsm, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
}

// ThenBlasTrsm (double alpha): same log-then-dispatch to DoBlasTrsm.
Stream &Stream::ThenBlasTrsm(blas::Side side, blas::UpperLower uplo, blas::Transpose transa, blas::Diagonal diag, uint64 m, uint64 n, double alpha, const DeviceMemory &a, int lda, DeviceMemory *b, int ldb) {
  VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(transa), PARAM(diag), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb));
  ThenBlasImpl &, int,
DeviceMemory *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTrsm, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
}

// ThenBlasTrsm (std::complex alpha): logs the arguments with VLOG_CALL, then calls a
// ThenBlasImpl instance with blas::BlasSupport::DoBlasTrsm and returns its result.
Stream &Stream::ThenBlasTrsm(blas::Side side, blas::UpperLower uplo, blas::Transpose transa, blas::Diagonal diag, uint64 m, uint64 n, std::complex alpha, const DeviceMemory> &a, int lda, DeviceMemory> *b, int ldb) {
  VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(transa), PARAM(diag), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb));
  ThenBlasImpl, const DeviceMemory> &, int, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTrsm, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
}

// Further ThenBlasTrsm overload with std::complex alpha; same log-then-dispatch to DoBlasTrsm.
// NOTE(review): reads identically to the preceding complex overload in this text;
// presumably the complex element type differs — confirm.
Stream &Stream::ThenBlasTrsm(blas::Side side, blas::UpperLower uplo, blas::Transpose transa, blas::Diagonal diag, uint64 m, uint64 n, std::complex alpha, const DeviceMemory> &a, int lda, DeviceMemory> *b, int ldb) {
  VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(transa), PARAM(diag), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb));
  ThenBlasImpl, const DeviceMemory> &, int, DeviceMemory> *, int> impl;
  return impl(this, &blas::BlasSupport::DoBlasTrsm, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
}

// ThenBlasGemmBatched (float alpha/beta): forwards every argument to
// ThenBlasGemmBatchedWithScratch, passing nullptr as the scratch allocator.
Stream &Stream::ThenBlasGemmBatched( blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, uint64 k, float alpha, const port::ArraySlice *> &a, int lda, const port::ArraySlice *> &b, int ldb, float beta, const port::ArraySlice *> &c, int ldc, int batch_count) {
  return ThenBlasGemmBatchedWithScratch(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, nullptr);
}

// ThenBlasGemmBatchedWithScratch (float alpha/beta): logs the arguments with VLOG_CALL
// (scratch_allocator is not among the logged PARAMs), then calls a ThenBlasImpl instance
// with blas::BlasSupport::DoBlasGemmBatched, including scratch_allocator, and returns its result.
Stream &Stream::ThenBlasGemmBatchedWithScratch( blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, uint64 k, float alpha, const port::ArraySlice *> &a, int lda, const port::ArraySlice *> &b, int ldb, float beta, const port::ArraySlice *> &c, int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
  VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n),
PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count)); ThenBlasImpl *> &, int, const port::ArraySlice *> &, int, float, const port::ArraySlice *> &, int, int, ScratchAllocator *> impl; return impl(this, &blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, scratch_allocator); } Stream &Stream::ThenBlasGemmBatched( blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, uint64 k, double alpha, const port::ArraySlice *> &a, int lda, const port::ArraySlice *> &b, int ldb, double beta, const port::ArraySlice *> &c, int ldc, int batch_count) { return ThenBlasGemmBatchedWithScratch(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, nullptr); } Stream &Stream::ThenBlasGemmBatchedWithScratch( blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, uint64 k, double alpha, const port::ArraySlice *> &a, int lda, const port::ArraySlice *> &b, int ldb, double beta, const port::ArraySlice *> &c, int ldc, int batch_count, ScratchAllocator *scratch_allocator) { VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count)); ThenBlasImpl *> &, int, const port::ArraySlice *> &, int, double, const port::ArraySlice *> &, int, int, ScratchAllocator *> impl; return impl(this, &blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, scratch_allocator); } Stream &Stream::ThenBlasGemmBatched( blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, uint64 k, std::complex alpha, const port::ArraySlice> *> &a, int lda, const port::ArraySlice> *> &b, int ldb, std::complex beta, const port::ArraySlice> *> &c, int ldc, int batch_count) { return ThenBlasGemmBatchedWithScratch(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, 
batch_count, nullptr); } Stream &Stream::ThenBlasGemmBatchedWithScratch( blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, uint64 k, std::complex alpha, const port::ArraySlice> *> &a, int lda, const port::ArraySlice> *> &b, int ldb, std::complex beta, const port::ArraySlice> *> &c, int ldc, int batch_count, ScratchAllocator *scratch_allocator) { VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count)); ThenBlasImpl, const port::ArraySlice> *> &, int, const port::ArraySlice> *> &, int, std::complex, const port::ArraySlice> *> &, int, int, ScratchAllocator *> impl; return impl(this, &blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, scratch_allocator); } Stream &Stream::ThenBlasGemmBatched( blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, uint64 k, std::complex alpha, const port::ArraySlice> *> &a, int lda, const port::ArraySlice> *> &b, int ldb, std::complex beta, const port::ArraySlice> *> &c, int ldc, int batch_count) { return ThenBlasGemmBatchedWithScratch(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, nullptr); } Stream &Stream::ThenBlasGemmBatchedWithScratch( blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, uint64 k, std::complex alpha, const port::ArraySlice> *> &a, int lda, const port::ArraySlice> *> &b, int ldb, std::complex beta, const port::ArraySlice> *> &c, int ldc, int batch_count, ScratchAllocator *scratch_allocator) { VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count)); ThenBlasImpl, const port::ArraySlice> *> &, int, const port::ArraySlice> *> &, int, std::complex, const port::ArraySlice> *> &, int, int, ScratchAllocator *> impl; return impl(this, 
&blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count, scratch_allocator); } Stream &Stream::ThenSetRngSeed(const uint8 *seed, uint64 seed_bytes) { VLOG_CALL(PARAM(seed), PARAM(seed_bytes)); if (ok()) { if (rng::RngSupport *rng = parent_->AsRng()) { CheckError(rng->SetSeed(this, seed, seed_bytes)); } else { SetError(); LOG(INFO) << "stream " << this << " unable to initialize RNG"; } } else { LOG(INFO) << "stream " << this << " did not set RNG seed: " << static_cast(seed) << "; bytes: " << seed_bytes; } return *this; } Stream &Stream::ThenPopulateRandUniform(DeviceMemory *values) { VLOG_CALL(PARAM(values)); if (ok()) { if (rng::RngSupport *rng = parent_->AsRng()) { CheckError(rng->DoPopulateRandUniform(this, values)); } else { SetError(); LOG(INFO) << "attempting to perform RNG operation using StreamExecutor " "without RNG support."; } } return *this; } Stream &Stream::ThenPopulateRandGaussian(float mean, float sd, DeviceMemory *values) { VLOG_CALL(PARAM(mean), PARAM(sd), PARAM(values)); if (ok()) { if (rng::RngSupport *rng = parent_->AsRng()) { CheckError(rng->DoPopulateRandGaussian(this, mean, sd, values)); } else { SetError(); LOG(INFO) << "attempting to perform RNG operation using StreamExecutor " "without RNG support."; } } return *this; } Stream &Stream::ThenPopulateRandGaussian(double mean, double sd, DeviceMemory *values) { VLOG_CALL(PARAM(mean), PARAM(sd), PARAM(values)); if (ok()) { if (rng::RngSupport *rng = parent_->AsRng()) { CheckError(rng->DoPopulateRandGaussian(this, mean, sd, values)); } else { SetError(); LOG(INFO) << "attempting to perform RNG operation using StreamExecutor " "without RNG support."; } } return *this; } Stream &Stream::ThenPopulateRandUniform(DeviceMemory *values) { VLOG_CALL(PARAM(values)); if (ok()) { if (rng::RngSupport *rng = parent_->AsRng()) { CheckError(rng->DoPopulateRandUniform(this, values)); } else { SetError(); LOG(INFO) << "attempting to perform RNG operation 
using StreamExecutor " "without RNG support."; } } return *this; } Stream &Stream::ThenPopulateRandUniform( DeviceMemory> *values) { VLOG_CALL(PARAM(values)); if (ok()) { if (rng::RngSupport *rng = parent_->AsRng()) { CheckError(rng->DoPopulateRandUniform(this, values)); } else { SetError(); LOG(INFO) << "attempting to perform RNG operation using StreamExecutor " "without RNG support."; } } return *this; } Stream &Stream::ThenPopulateRandUniform( DeviceMemory> *values) { VLOG_CALL(PARAM(values)); if (ok()) { if (rng::RngSupport *rng = parent_->AsRng()) { CheckError(rng->DoPopulateRandUniform(this, values)); } else { SetError(); LOG(INFO) << "stream " << this << " attempting to perform RNG operation using StreamExecutor " "without RNG support."; } } return *this; } Stream &Stream::ThenMemcpy(void *host_dst, const DeviceMemoryBase &gpu_src, uint64 size) { VLOG_CALL(PARAM(host_dst), PARAM(gpu_src), PARAM(size)); if (ok()) { CheckError(parent_->Memcpy(this, host_dst, gpu_src, size)); } else { LOG(INFO) << "stream " << this << " did not memcpy device-to-host; source: " << gpu_src.opaque(); } return *this; } Stream &Stream::ThenMemcpy(DeviceMemoryBase *gpu_dst, const void *host_src, uint64 size) { VLOG_CALL(PARAM(gpu_dst), PARAM(host_src), PARAM(size)); if (ok()) { CheckError(parent_->Memcpy(this, gpu_dst, host_src, size)); } else { LOG(INFO) << "stream " << this << " did not memcpy host-to-device; source: " << host_src; } return *this; } Stream &Stream::ThenMemcpy(DeviceMemoryBase *gpu_dst, const DeviceMemoryBase &gpu_src, uint64 size) { VLOG_CALL(PARAM(gpu_dst), PARAM(gpu_src), PARAM(size)); if (ok()) { CheckError(parent_->MemcpyDeviceToDevice(this, gpu_dst, gpu_src, size)); } else { LOG(INFO) << "stream " << this << " did not memcpy gpu-to-gpu; source: " << &gpu_src; } return *this; } Stream &Stream::ThenMemZero(DeviceMemoryBase *location, uint64 size) { VLOG_CALL(PARAM(location), PARAM(size)); if (ok()) { CheckError(parent_->MemZero(this, location, size)); } else { 
LOG(INFO) << "stream " << this << " did not memzero GPU location; source: " << location; } return *this; } Stream &Stream::ThenMemset32(DeviceMemoryBase *location, const uint32 &pattern, uint64 size) { VLOG_CALL(PARAM(location), PARAM(pattern), PARAM(size)); if (ok()) { CheckError(parent_->Memset32(this, location, pattern, size)); } else { LOG(INFO) << "stream " << this << " did not memset GPU location; source: " << location << "; size: " << size << "; pattern: " << std::hex << pattern; } return *this; } Stream &Stream::ThenDoHostCallbackForTest(std::function callback) { VLOG_CALL(PARAM(callback)); return ThenDoHostCallback(callback); } Stream &Stream::ThenDoHostCallback(std::function callback) { VLOG_CALL(PARAM(callback)); if (ok()) { CheckError(parent_->HostCallback(this, callback)); } else { LOG(INFO) << "stream " << this << " was in error state before adding host callback"; } return *this; } Stream &Stream::ThenFft(fft::Plan *plan, const DeviceMemory> &input, DeviceMemory> *output) { VLOG_CALL(PARAM(plan), PARAM(input), PARAM(output)); if (ok()) { if (fft::FftSupport *fft = parent_->AsFft()) { CheckError(fft->DoFft(this, plan, input, output)); } else { SetError(); LOG(INFO) << "attempting to perform FFT operation using StreamExecutor " "without FFT support"; } } return *this; } Stream &Stream::ThenFft(fft::Plan *plan, const DeviceMemory> &input, DeviceMemory> *output) { VLOG_CALL(PARAM(plan), PARAM(input), PARAM(output)); if (ok()) { if (fft::FftSupport *fft = parent_->AsFft()) { CheckError(fft->DoFft(this, plan, input, output)); } else { SetError(); LOG(INFO) << "attempting to perform FFT operation using StreamExecutor " "without FFT support"; } } return *this; } Stream &Stream::ThenFft(fft::Plan *plan, const DeviceMemory &input, DeviceMemory> *output) { VLOG_CALL(PARAM(plan), PARAM(input), PARAM(output)); if (ok()) { if (fft::FftSupport *fft = parent_->AsFft()) { CheckError(fft->DoFft(this, plan, input, output)); } else { SetError(); LOG(INFO) << "attempting 
to perform FFT operation using StreamExecutor " "without FFT support"; } } return *this; } Stream &Stream::ThenFft(fft::Plan *plan, const DeviceMemory &input, DeviceMemory> *output) { VLOG_CALL(PARAM(plan), PARAM(input), PARAM(output)); if (ok()) { if (fft::FftSupport *fft = parent_->AsFft()) { CheckError(fft->DoFft(this, plan, input, output)); } else { SetError(); LOG(INFO) << "attempting to perform FFT operation using StreamExecutor " "without FFT support"; } } return *this; } Stream &Stream::ThenFft(fft::Plan *plan, const DeviceMemory> &input, DeviceMemory *output) { VLOG_CALL(PARAM(plan), PARAM(input), PARAM(output)); if (ok()) { if (fft::FftSupport *fft = parent_->AsFft()) { CheckError(fft->DoFft(this, plan, input, output)); } else { SetError(); LOG(INFO) << "attempting to perform FFT operation using StreamExecutor " "without FFT support"; } } return *this; } Stream &Stream::ThenFft(fft::Plan *plan, const DeviceMemory> &input, DeviceMemory *output) { VLOG_CALL(PARAM(plan), PARAM(input), PARAM(output)); if (ok()) { if (fft::FftSupport *fft = parent_->AsFft()) { CheckError(fft->DoFft(this, plan, input, output)); } else { SetError(); LOG(INFO) << "attempting to perform FFT operation using StreamExecutor " "without FFT support"; } } return *this; } // It looks confusing, but all this is doing is inserting a callback at the // present point in the stream to then enqueue a task on the host executor. Stream &Stream::ThenEnqueueOnBackgroundThread( std::function task) { VLOG_CALL(PARAM(task)); StreamExecutor *stream_executor = this->parent_; std::function bound_task = std::bind(task, stream_executor); return ThenDoHostCallback([stream_executor, bound_task]() { stream_executor->EnqueueOnBackgroundThread(bound_task); }); } bool Stream::BlockHostUntilDone() { VLOG_CALL(); if (!ok()) { LOG(INFO) << "stream " << this << " did not block host until done; was already in an error state"; return false; } { // Wait until all active sub-streams have done their tasks. 
mutex_lock lock{mu_}; for (auto &stream : sub_streams_) { if (!stream.second) { CheckError(stream.first->BlockHostUntilDone()); // Set this sub-stream as available. stream.second = true; } } } temporary_memory_manager_.DeallocateFinalizedTemporaries(); CheckError(parent_->BlockHostUntilDone(this)); return ok(); } } // namespace gputools } // namespace perftools