/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"

#include <algorithm>

#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"

namespace tflite {
namespace kernel_utils {

// Float RNN step for cells that have no auxiliary input.
//
// Forwards every argument unchanged to the RnnBatchStep overload that accepts
// an auxiliary input, passing null auxiliary pointers and an auxiliary input
// size of zero.
void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
                  const float* recurrent_weights_ptr, const float* bias_ptr,
                  int input_size, int num_units, int batch_size,
                  TfLiteFusedActivation activation,
                  float* hidden_state_ptr_batch, float* output_ptr_batch) {
  // Placeholder values describing an absent auxiliary input.
  const float* const no_aux_input_ptr_batch = nullptr;
  const float* const no_aux_input_weights_ptr = nullptr;
  const int no_aux_input_size = 0;

  RnnBatchStep(input_ptr_batch, input_weights_ptr, no_aux_input_ptr_batch,
               no_aux_input_weights_ptr, recurrent_weights_ptr, bias_ptr,
               input_size, no_aux_input_size, num_units, batch_size,
               activation, hidden_state_ptr_batch, output_ptr_batch);
}

// Runs one float RNN step over a whole batch, with an optional auxiliary
// input:
//
//   output = activation(bias + input_weights * input
//                            + aux_input_weights * aux_input
//                            + recurrent_weights * hidden_state)
//
// and then stores the activated output as the new hidden state.
//
// Arguments:
//   input_ptr_batch / input_weights_ptr: main input and its weights; the
//     weights are passed to the matmul as a num_units x input_size matrix.
//   aux_input_ptr_batch / aux_input_weights_ptr: auxiliary input and weights.
//     They are only used when aux_input_size > 0 and may be null otherwise.
//   recurrent_weights_ptr: passed to the matmul as a num_units x num_units
//     matrix applied to the previous hidden state.
//   bias_ptr: one bias vector of num_units values shared by all batches.
//   hidden_state_ptr_batch: read as the previous state and overwritten with
//     the new state (num_units * batch_size values).
//   output_ptr_batch: receives num_units * batch_size activated outputs.
void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
                  const float* aux_input_ptr_batch,
                  const float* aux_input_weights_ptr,
                  const float* recurrent_weights_ptr, const float* bias_ptr,
                  int input_size, int aux_input_size, int num_units,
                  int batch_size, TfLiteFusedActivation activation,
                  float* hidden_state_ptr_batch, float* output_ptr_batch) {
  // Output = bias (the single bias vector is replicated for every batch).
  tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size,
                                        output_ptr_batch);
  // Output += input * input_weights
  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
      input_weights_ptr, num_units, input_size, input_ptr_batch, batch_size,
      output_ptr_batch, /*result_stride=*/1);
  // Output += aux_input * aux_input_weights (if they are not empty).
  if (aux_input_size > 0) {
    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
        aux_input_weights_ptr, num_units, aux_input_size, aux_input_ptr_batch,
        batch_size, output_ptr_batch, /*result_stride=*/1);
  }
  // Output += recurrent_weights * hidden_state
  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
      recurrent_weights_ptr, num_units, num_units, hidden_state_ptr_batch,
      batch_size, output_ptr_batch, /*result_stride=*/1);
  // Output = activation(Output)
  tensor_utils::ApplyActivationToVector(
      output_ptr_batch, num_units * batch_size, activation, output_ptr_batch);
  // hidden_state = Output, batch by batch. A plain element-wise copy is
  // required here: VectorBatchVectorAssign replicates one num_units-long
  // vector into every batch (which is what the bias step above relies on), so
  // using it for this update would give every batch the first batch's state.
  std::copy_n(output_ptr_batch, num_units * batch_size,
              hidden_state_ptr_batch);
}

void RnnBatchStep(const float* input_ptr_batch, const int8_t* input_weights_ptr,
                  float input_weights_scale,
                  const int8_t* recurrent_weights_ptr,
                  float recurrent_weights_scale, const float* bias_ptr,
                  int input_size, int num_units, int batch_size,
                  TfLiteFusedActivation activation,
                  int8_t* quantized_input_ptr_batch,
                  int8_t* quantized_hidden_state_ptr_batch,
                  float* scaling_factors, float* hidden_state_ptr_batch,
                  float* output_ptr_batch) {
  RnnBatchStep(input_ptr_batch, input_weights_ptr, input_weights_scale,
               /*aux_input_ptr_batch=*/nullptr,
               /*aux_input_weights_ptr=*/nullptr,
               /*aux_input_weights_scale=*/0.0f, recurrent_weights_ptr,
               recurrent_weights_scale, bias_ptr, input_size,
               /*aux_input_size=*/0, num_units, batch_size, activation,
               quantized_input_ptr_batch,
               /*aux_quantized_input_ptr_batch=*/nullptr,
               quantized_hidden_state_ptr_batch, scaling_factors,
               hidden_state_ptr_batch, output_ptr_batch);
}

namespace {

// Adds `weights * values` to `output_ptr_batch` for one hybrid operand.
//
// The float `values_ptr_batch` (batch_size rows of values_size floats) is
// quantized row by row into `quantized_values_ptr_batch` (int8) with
// tensor_utils::SymmetricQuantizeFloats. Each row's scaling factor is
// multiplied by `weights_scale` and stored in `scaling_factors[b]`, which the
// int8 matmul then uses. `scaling_factors` is overwritten on every call, so it
// only has to stay valid until the matmul inside this function returns.
//
// Does nothing when all batch_size * values_size input values are zero.
void QuantizeAndAccumulateBatch(const float* values_ptr_batch, int values_size,
                                const int8_t* weights_ptr, float weights_scale,
                                int num_units, int batch_size,
                                int8_t* quantized_values_ptr_batch,
                                float* scaling_factors,
                                float* output_ptr_batch) {
  // Save quantization and matmul computation for all zero input.
  if (tensor_utils::IsZeroVector(values_ptr_batch, batch_size * values_size)) {
    return;
  }

  // Quantize from float to int8 + quantization params (scaling factor).
  // The min/max outputs of the quantizer are not needed here.
  float unused_min, unused_max;
  // TODO(mirkov,raziel): replace this for-loop with a MACRO (or function)
  // whichever is faster.
  for (int b = 0; b < batch_size; ++b) {
    const int offset = b * values_size;
    tensor_utils::SymmetricQuantizeFloats(
        values_ptr_batch + offset, values_size,
        quantized_values_ptr_batch + offset, &unused_min, &unused_max,
        &scaling_factors[b]);
    // Fold the weight scale in so the matmul needs a single factor per batch.
    scaling_factors[b] *= weights_scale;
  }

  // Output += weights * values
  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
      weights_ptr, num_units, values_size, quantized_values_ptr_batch,
      scaling_factors, batch_size, output_ptr_batch, /*result_stride=*/1);
}

}  // namespace

// Runs one hybrid RNN step over a whole batch, with an optional auxiliary
// input. Activations, bias, hidden state and output are float; the three
// weight matrices are int8 and each comes with a float scale:
//
//   output = activation(bias + input_weights * input
//                            + aux_input_weights * aux_input
//                            + recurrent_weights * hidden_state)
//
// and then stores the activated output as the new hidden state.
//
// Arguments:
//   input_ptr_batch, aux_input_ptr_batch, hidden_state_ptr_batch: float
//     operands, indexed here as batch_size consecutive rows of input_size,
//     aux_input_size and num_units values respectively.
//   aux_input_ptr_batch: may be null, in which case the auxiliary term (and
//     every other aux_* argument) is ignored.
//   quantized_input_ptr_batch, aux_quantized_input_ptr_batch,
//   quantized_hidden_state_ptr_batch: int8 scratch buffers with the same
//     element counts as the corresponding float operands.
//   scaling_factors: float scratch buffer indexed up to batch_size - 1.
//   bias_ptr: one bias vector of num_units values shared by all batches.
//   hidden_state_ptr_batch: read as the previous state and overwritten with
//     the new state (num_units * batch_size values).
//   output_ptr_batch: receives num_units * batch_size activated outputs.
//
// Any operand whose values are all zero is skipped without being quantized.
void RnnBatchStep(
    const float* input_ptr_batch, const int8_t* input_weights_ptr,
    float input_weights_scale, const float* aux_input_ptr_batch,
    const int8_t* aux_input_weights_ptr, float aux_input_weights_scale,
    const int8_t* recurrent_weights_ptr, float recurrent_weights_scale,
    const float* bias_ptr, int input_size, int aux_input_size, int num_units,
    int batch_size, TfLiteFusedActivation activation,
    int8_t* quantized_input_ptr_batch, int8_t* aux_quantized_input_ptr_batch,
    int8_t* quantized_hidden_state_ptr_batch, float* scaling_factors,
    float* hidden_state_ptr_batch, float* output_ptr_batch) {
  // Output = bias (the single bias vector is replicated for every batch).
  tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size,
                                        output_ptr_batch);

  // Output += input * input_weights
  QuantizeAndAccumulateBatch(input_ptr_batch, input_size, input_weights_ptr,
                             input_weights_scale, num_units, batch_size,
                             quantized_input_ptr_batch, scaling_factors,
                             output_ptr_batch);

  // Output += aux_input * aux_input_weights (only when an aux input exists).
  if (aux_input_ptr_batch != nullptr) {
    QuantizeAndAccumulateBatch(aux_input_ptr_batch, aux_input_size,
                               aux_input_weights_ptr, aux_input_weights_scale,
                               num_units, batch_size,
                               aux_quantized_input_ptr_batch, scaling_factors,
                               output_ptr_batch);
  }

  // Output += recurrent_weights * hidden_state
  QuantizeAndAccumulateBatch(hidden_state_ptr_batch, num_units,
                             recurrent_weights_ptr, recurrent_weights_scale,
                             num_units, batch_size,
                             quantized_hidden_state_ptr_batch, scaling_factors,
                             output_ptr_batch);

  // Output = activation(Output)
  tensor_utils::ApplyActivationToVector(
      output_ptr_batch, num_units * batch_size, activation, output_ptr_batch);
  // hidden_state = Output, batch by batch. A plain element-wise copy is
  // required here: VectorBatchVectorAssign replicates one num_units-long
  // vector into every batch (which is what the bias step above relies on), so
  // using it for this update would give every batch the first batch's state.
  std::copy_n(output_ptr_batch, num_units * batch_size,
              hidden_state_ptr_batch);
}

}  // namespace kernel_utils
}  // namespace tflite