author      A. Unique TensorFlower <gardener@tensorflow.org>    2016-09-28 00:15:58 -0800
committer   TensorFlower Gardener <gardener@tensorflow.org>     2016-09-28 01:35:32 -0700
commit      419d5d072375ee0044fecb94e4bfe21a7b3b0b9e (patch)
tree        cb66e6e7238bf2e7938b58f3638bd31f65d542c2 /tensorflow/contrib/quantization
parent      c1e4f0f6a1078fd6715e8145fbef874e4d447ab8 (diff)
Automated rollback of change 134501895
Change: 134506649
Diffstat (limited to 'tensorflow/contrib/quantization')
-rw-r--r--  tensorflow/contrib/quantization/BUILD | 116
-rw-r--r--  tensorflow/contrib/quantization/Makefile.in | 69
-rw-r--r--  tensorflow/contrib/quantization/__init__.py | 8
-rw-r--r--  tensorflow/contrib/quantization/kernels/BUILD | 311
-rw-r--r--  tensorflow/contrib/quantization/kernels/dequantize_op.cc | 106
-rw-r--r--  tensorflow/contrib/quantization/kernels/hexagon/BUILD | 45
-rw-r--r--  tensorflow/contrib/quantization/kernels/hexagon/quantized_matmul_op_for_hexagon_test.cc | 136
-rw-r--r--  tensorflow/contrib/quantization/kernels/load_quantized_kernels_so.py | 48
-rw-r--r--  tensorflow/contrib/quantization/kernels/quantization_utils.cc | 42
-rw-r--r--  tensorflow/contrib/quantization/kernels/quantization_utils.h | 555
-rw-r--r--  tensorflow/contrib/quantization/kernels/quantization_utils_test.cc | 550
-rw-r--r--  tensorflow/contrib/quantization/kernels/quantize_down_and_shrink_range.cc | 97
-rw-r--r--  tensorflow/contrib/quantization/kernels/quantize_down_and_shrink_range_op_test.cc | 71
-rw-r--r--  tensorflow/contrib/quantization/kernels/quantize_op.cc | 159
-rw-r--r--  tensorflow/contrib/quantization/kernels/quantize_op_test.cc | 113
-rw-r--r--  tensorflow/contrib/quantization/kernels/quantized_activation_ops.cc | 101
-rw-r--r--  tensorflow/contrib/quantization/kernels/quantized_activation_ops_test.cc | 99
-rw-r--r--  tensorflow/contrib/quantization/kernels/quantized_batch_norm_op.cc | 240
-rw-r--r--  tensorflow/contrib/quantization/kernels/quantized_batch_norm_op_test.cc | 242
-rw-r--r--  tensorflow/contrib/quantization/kernels/quantized_bias_add_op.cc | 89
-rw-r--r--  tensorflow/contrib/quantization/kernels/quantized_bias_add_op_test.cc | 171
-rw-r--r--  tensorflow/contrib/quantization/kernels/quantized_concat_op.cc | 246
-rw-r--r--  tensorflow/contrib/quantization/kernels/quantized_concat_op_test.cc | 337
-rw-r--r--  tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc | 526
-rw-r--r--  tensorflow/contrib/quantization/kernels/quantized_conv_ops_test.cc | 324
-rw-r--r--  tensorflow/contrib/quantization/kernels/quantized_matmul_op.cc | 186
-rw-r--r--  tensorflow/contrib/quantization/kernels/quantized_matmul_op_test.cc | 336
-rw-r--r--  tensorflow/contrib/quantization/kernels/quantized_pooling_ops.cc | 135
-rw-r--r--  tensorflow/contrib/quantization/kernels/quantized_pooling_ops_test.cc | 127
-rw-r--r--  tensorflow/contrib/quantization/kernels/reference_gemm.h | 90
-rw-r--r--  tensorflow/contrib/quantization/load_quantized_ops_so.py | 48
-rw-r--r--  tensorflow/contrib/quantization/ops/array_ops.cc | 195
-rw-r--r--  tensorflow/contrib/quantization/ops/math_ops.cc | 126
-rw-r--r--  tensorflow/contrib/quantization/ops/nn_ops.cc | 348
-rw-r--r--  tensorflow/contrib/quantization/python/array_ops.py | 8
-rw-r--r--  tensorflow/contrib/quantization/python/dequantize_op_test.py | 85
-rw-r--r--  tensorflow/contrib/quantization/python/math_ops.py | 7
-rw-r--r--  tensorflow/contrib/quantization/python/nn_ops.py | 14
-rw-r--r--  tensorflow/contrib/quantization/python/quantized_conv_ops_test.py | 198
-rw-r--r--  tensorflow/contrib/quantization/tools/BUILD | 72
-rw-r--r--  tensorflow/contrib/quantization/tools/graph_to_dot.py | 69
-rw-r--r--  tensorflow/contrib/quantization/tools/quantize_graph.py | 1003
-rw-r--r--  tensorflow/contrib/quantization/tools/quantize_graph_test.py | 698
43 files changed, 8531 insertions, 15 deletions
diff --git a/tensorflow/contrib/quantization/BUILD b/tensorflow/contrib/quantization/BUILD
index 5347b32bdb..881349fda7 100644
--- a/tensorflow/contrib/quantization/BUILD
+++ b/tensorflow/contrib/quantization/BUILD
@@ -13,6 +13,53 @@ load(
"tf_custom_op_library",
)
+cc_library(
+ name = "cc_array_ops",
+ srcs = ["ops/array_ops.cc"],
+ linkstatic = 1,
+ deps = [
+ "//tensorflow/core:framework",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
+ name = "cc_math_ops",
+ srcs = ["ops/math_ops.cc"],
+ linkstatic = 1,
+ deps = [
+ "//tensorflow/core:framework",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
+ name = "cc_nn_ops",
+ srcs = ["ops/nn_ops.cc"],
+ linkstatic = 1,
+ deps = [
+ "//tensorflow/core:framework",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
+ name = "cc_ops",
+ linkstatic = 1,
+ deps = [
+ ":cc_array_ops",
+ ":cc_math_ops",
+ ":cc_nn_ops",
+ ],
+ alwayslink = 1,
+)
+
+filegroup(
+ name = "android_ops",
+ srcs = glob(["ops/*.cc"]),
+ visibility = ["//visibility:public"],
+)
+
py_library(
name = "quantization_py",
srcs = [
@@ -22,6 +69,8 @@ py_library(
srcs_version = "PY2AND3",
deps = [
":ops",
+ "//tensorflow/contrib/quantization:quantized_ops_py",
+ "//tensorflow/contrib/quantization/kernels:quantized_kernels_py",
],
)
@@ -34,9 +83,52 @@ py_library(
],
srcs_version = "PY2AND3",
deps = [
- "//tensorflow/python:array_ops",
- "//tensorflow/python:math_ops",
- "//tensorflow/python:nn_ops",
+ ":array_ops",
+ ":math_ops",
+ ":nn_ops",
+ ],
+)
+
+tf_gen_op_wrapper_py(
+ name = "array_ops",
+ deps = ["//tensorflow/contrib/quantization:cc_array_ops"],
+)
+
+tf_gen_op_wrapper_py(
+ name = "math_ops",
+ deps = ["//tensorflow/contrib/quantization:cc_math_ops"],
+)
+
+tf_gen_op_wrapper_py(
+ name = "nn_ops",
+ deps = ["//tensorflow/contrib/quantization:cc_nn_ops"],
+)
+
+py_test(
+ name = "dequantize_op_test",
+ size = "small",
+ srcs = ["python/dequantize_op_test.py"],
+ srcs_version = "PY2AND3",
+ deps = [
+ ":ops",
+ "//tensorflow:tensorflow_py",
+ "//tensorflow/contrib/quantization:quantized_ops_py",
+ "//tensorflow/contrib/quantization/kernels:quantized_kernels_py",
+ "//tensorflow/python:framework_test_lib",
+ ],
+)
+
+py_test(
+ name = "quantized_conv_ops_test",
+ size = "small",
+ srcs = ["python/quantized_conv_ops_test.py"],
+ srcs_version = "PY2AND3",
+ deps = [
+ ":ops",
+ "//tensorflow:tensorflow_py",
+ "//tensorflow/contrib/quantization:quantized_ops_py",
+ "//tensorflow/contrib/quantization/kernels:quantized_kernels_py",
+ "//tensorflow/python:framework_test_lib",
],
)
@@ -47,6 +139,24 @@ filegroup(
]),
)
+tf_custom_op_library(
+ name = "_quantized_ops.so",
+ srcs = [
+ "ops/array_ops.cc",
+ "ops/math_ops.cc",
+ "ops/nn_ops.cc",
+ ],
+ deps = [
+ ],
+)
+
+py_library(
+ name = "quantized_ops_py",
+ srcs = ["load_quantized_ops_so.py"],
+ data = ["_quantized_ops.so"],
+ srcs_version = "PY2AND3",
+)
+
filegroup(
name = "all_files",
srcs = glob(
diff --git a/tensorflow/contrib/quantization/Makefile.in b/tensorflow/contrib/quantization/Makefile.in
new file mode 100644
index 0000000000..563639e5d7
--- /dev/null
+++ b/tensorflow/contrib/quantization/Makefile.in
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# This sub Makefile compiles libraries under this directory. This is designed to
+# be used as a sub Makefile with tensorflow/contrib/makefile/Makefile.
+# You can build targets in this file by including this sub makefile like:
+# $ make -f tensorflow/contrib/makefile/Makefile TARGET=<target> \
+# SUB_MAKEFILES=$(pwd)/tensorflow/contrib/quantization/Makefile.in \
+# (optional: NDK_ROOT=<ndk_root>) contrib_quantization_tests
+# TODO(satok): Support more targets
+
+GTEST_DIR := \
+$(MAKEFILE_DIR)/downloads/googletest/googletest
+
+GTEST_HEADERS = \
+$(wildcard $(GTEST_DIR)/include/gtest/*.h) \
+$(wildcard $(GTEST_DIR)/include/gtest/internal/*.h)
+
+GTEST_SRCS := \
+$(wildcard $(GTEST_DIR)/src/*.cc) \
+$(wildcard $(GTEST_DIR)/src/*.h) \
+$(GTEST_HEADERS)
+
+QUANTIZATION_TEST_SRCS := \
+tensorflow/contrib/quantization/ops/math_ops.cc \
+tensorflow/contrib/quantization/kernels/quantize_op.cc \
+tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc \
+tensorflow/contrib/quantization/kernels/quantized_matmul_op.cc \
+tensorflow/contrib/quantization/kernels/quantized_matmul_op_test.cc \
+tensorflow/contrib/quantization/kernels/hexagon/quantized_matmul_op_for_hexagon_test.cc \
+tensorflow/contrib/makefile/test/test_main.cc
+
+QUANTIZATION_TEST_OBJS := $(addprefix $(OBJDIR), $(QUANTIZATION_TEST_SRCS:.cc=.o))
+
+QUANTIZATION_TEST_NAME := contrib_quantization_tests
+QUANTIZATION_TEST_BIN_PATH := $(BINDIR)$(QUANTIZATION_TEST_NAME)
+
+INCLUDES += \
+-I$(MAKEFILE_DIR)/downloads/gemmlowp \
+-I$(MAKEFILE_DIR)/downloads/googletest/googletest/include
+
+QUANTIZATION_TEST_INCLUDES := $(INCLUDES)
+
+$(OBJDIR)gtest-all.o : $(GTEST_SRCS)
+ $(CXX) $(CXXFLAGS) $(QUANTIZATION_TEST_INCLUDES) -I $(GTEST_DIR) -c \
+ $(GTEST_DIR)/src/gtest-all.cc -o $@
+
+$(LIBDIR)gtest.a : $(OBJDIR)gtest-all.o
+ $(AR) $(ARFLAGS) $@ $^
+
+$(QUANTIZATION_TEST_BIN_PATH): $(LIB_PATH) $(LIBDIR)gtest.a $(QUANTIZATION_TEST_OBJS)
+ @mkdir -p $(dir $@)
+ $(CXX) $(CXXFLAGS) $(QUANTIZATION_TEST_INCLUDES) \
+ -o $(QUANTIZATION_TEST_BIN_PATH) $(QUANTIZATION_TEST_OBJS) \
+ $(LIBFLAGS) $(LIB_PATH) $(LIBDIR)gtest.a $(LDFLAGS) $(LIBS)
+
+$(QUANTIZATION_TEST_NAME): $(QUANTIZATION_TEST_BIN_PATH)
diff --git a/tensorflow/contrib/quantization/__init__.py b/tensorflow/contrib/quantization/__init__.py
index dcb73399b3..833dd20b5a 100644
--- a/tensorflow/contrib/quantization/__init__.py
+++ b/tensorflow/contrib/quantization/__init__.py
@@ -24,7 +24,7 @@ from tensorflow.contrib.quantization.python import array_ops as quantized_array_
from tensorflow.contrib.quantization.python.math_ops import *
from tensorflow.contrib.quantization.python.nn_ops import *
-from tensorflow.python.ops import gen_array_ops as quantized_gen_array_ops
-from tensorflow.python.ops.gen_array_ops import dequantize
-from tensorflow.python.ops.gen_array_ops import quantize_v2
-from tensorflow.python.ops.gen_array_ops import quantized_concat
+from tensorflow.contrib.quantization.ops import gen_array_ops as quantized_gen_array_ops
+from tensorflow.contrib.quantization.ops.gen_array_ops import dequantize
+from tensorflow.contrib.quantization.ops.gen_array_ops import quantize_v2
+from tensorflow.contrib.quantization.ops.gen_array_ops import quantized_concat
diff --git a/tensorflow/contrib/quantization/kernels/BUILD b/tensorflow/contrib/quantization/kernels/BUILD
new file mode 100644
index 0000000000..6be2ccaa07
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/BUILD
@@ -0,0 +1,311 @@
+# Description:
+# quantization-specific OpKernels
+
+package(
+ default_visibility = ["//visibility:public"],
+ features = ["-parse_headers"],
+)
+
+licenses(["notice"]) # Apache 2.0
+
+load(
+ "//tensorflow:tensorflow.bzl",
+ "tf_cc_test",
+ "tf_custom_op_library",
+ "tf_kernel_library",
+)
+
+filegroup(
+ name = "android_ops",
+ srcs = [
+ "dequantize_op.cc",
+ "quantization_utils.cc",
+ "quantization_utils.h",
+ "quantize_down_and_shrink_range.cc",
+ "quantize_op.cc",
+ "quantized_activation_ops.cc",
+ "quantized_batch_norm_op.cc",
+ "quantized_bias_add_op.cc",
+ "quantized_concat_op.cc",
+ "quantized_conv_ops.cc",
+ "quantized_matmul_op.cc",
+ "quantized_pooling_ops.cc",
+ "reference_gemm.h",
+ ],
+ visibility = ["//visibility:public"],
+)
+
+filegroup(
+ name = "all_files",
+ srcs = glob(
+ ["**/*"],
+ exclude = [
+ "**/METADATA",
+ "**/OWNERS",
+ ],
+ ),
+ visibility = ["//tensorflow:__subpackages__"],
+)
+
+tf_kernel_library(
+ name = "quantized_ops",
+ srcs = [
+ "dequantize_op.cc",
+ "quantization_utils.cc",
+ "quantize_down_and_shrink_range.cc",
+ "quantize_op.cc",
+ "quantized_activation_ops.cc",
+ "quantized_batch_norm_op.cc",
+ "quantized_bias_add_op.cc",
+ "quantized_concat_op.cc",
+ "quantized_conv_ops.cc",
+ "quantized_matmul_op.cc",
+ "quantized_pooling_ops.cc",
+ ],
+ hdrs = [
+ "quantization_utils.h",
+ "reference_gemm.h",
+ ],
+ deps = [
+ "//tensorflow/contrib/quantization:cc_array_ops",
+ "//tensorflow/contrib/quantization:cc_math_ops",
+ "//tensorflow/contrib/quantization:cc_nn_ops",
+ "//tensorflow/core",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib",
+ "//tensorflow/core/kernels:concat_lib_hdrs",
+ "//tensorflow/core/kernels:conv_ops",
+ "//tensorflow/core/kernels:eigen_helpers",
+ "//tensorflow/core/kernels:ops_util",
+ "//tensorflow/core/kernels:pooling_ops",
+ "//third_party/eigen3",
+ "@gemmlowp//:eight_bit_int_gemm",
+ ],
+)
+
+tf_custom_op_library(
+ name = "_quantized_kernels.so",
+ srcs = [
+ "dequantize_op.cc",
+ "quantization_utils.cc",
+ "quantization_utils.h",
+ "quantize_down_and_shrink_range.cc",
+ "quantize_op.cc",
+ "quantized_activation_ops.cc",
+ "quantized_batch_norm_op.cc",
+ "quantized_bias_add_op.cc",
+ "quantized_concat_op.cc",
+ "quantized_conv_ops.cc",
+ "quantized_matmul_op.cc",
+ "quantized_pooling_ops.cc",
+ "reference_gemm.h",
+ ],
+ deps = [
+ "//tensorflow/core/kernels:concat_lib_hdrs",
+ "//tensorflow/core/kernels:ops_util_hdrs",
+ "//tensorflow/core/kernels:pooling_ops_hdrs",
+ "@gemmlowp//:eight_bit_int_gemm",
+ ],
+)
+
+py_library(
+ name = "quantized_kernels_py",
+ srcs = ["load_quantized_kernels_so.py"],
+ data = ["_quantized_kernels.so"],
+ srcs_version = "PY2AND3",
+)
+
+tf_cc_test(
+ name = "quantize_down_and_shrink_range_op_test",
+ size = "small",
+ srcs = ["quantize_down_and_shrink_range_op_test.cc"],
+ deps = [
+ ":quantized_ops",
+ "//tensorflow/contrib/quantization:cc_array_ops",
+ "//tensorflow/contrib/quantization:cc_math_ops",
+ "//tensorflow/contrib/quantization:cc_nn_ops",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:protos_all_cc",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ "//tensorflow/core:testlib",
+ "//tensorflow/core/kernels:ops_testutil",
+ "//tensorflow/core/kernels:ops_util",
+ ],
+)
+
+tf_cc_test(
+ name = "quantization_utils_test",
+ srcs = ["quantization_utils_test.cc"],
+ deps = [
+ ":quantized_ops",
+ "//tensorflow/contrib/quantization:cc_array_ops",
+ "//tensorflow/contrib/quantization:cc_math_ops",
+ "//tensorflow/contrib/quantization:cc_nn_ops",
+ "//tensorflow/core:core_cpu",
+ "//tensorflow/core:core_cpu_internal",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:protos_all_cc",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ "//tensorflow/core:testlib",
+ "//third_party/eigen3",
+ ],
+)
+
+tf_cc_test(
+ name = "quantized_activation_ops_test",
+ srcs = ["quantized_activation_ops_test.cc"],
+ deps = [
+ ":quantized_ops",
+ "//tensorflow/contrib/quantization:cc_array_ops",
+ "//tensorflow/contrib/quantization:cc_math_ops",
+ "//tensorflow/contrib/quantization:cc_nn_ops",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:protos_all_cc",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ "//tensorflow/core:testlib",
+ "//tensorflow/core/kernels:ops_testutil",
+ "//tensorflow/core/kernels:ops_util",
+ ],
+)
+
+tf_cc_test(
+ name = "quantized_bias_add_op_test",
+ size = "small",
+ srcs = ["quantized_bias_add_op_test.cc"],
+ deps = [
+ ":quantized_ops",
+ "//tensorflow/contrib/quantization:cc_array_ops",
+ "//tensorflow/contrib/quantization:cc_math_ops",
+ "//tensorflow/contrib/quantization:cc_nn_ops",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:protos_all_cc",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ "//tensorflow/core:testlib",
+ "//tensorflow/core/kernels:ops_testutil",
+ "//tensorflow/core/kernels:ops_util",
+ ],
+)
+
+tf_cc_test(
+ name = "quantized_conv_ops_test",
+ size = "small",
+ srcs = ["quantized_conv_ops_test.cc"],
+ deps = [
+ ":quantized_ops",
+ "//tensorflow/contrib/quantization:cc_array_ops",
+ "//tensorflow/contrib/quantization:cc_math_ops",
+ "//tensorflow/contrib/quantization:cc_nn_ops",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:protos_all_cc",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ "//tensorflow/core:testlib",
+ "//tensorflow/core/kernels:ops_testutil",
+ "//tensorflow/core/kernels:ops_util",
+ ],
+)
+
+tf_cc_test(
+ name = "quantize_op_test",
+ size = "small",
+ srcs = ["quantize_op_test.cc"],
+ deps = [
+ ":quantized_ops",
+ "//tensorflow/contrib/quantization:cc_array_ops",
+ "//tensorflow/contrib/quantization:cc_math_ops",
+ "//tensorflow/contrib/quantization:cc_nn_ops",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:protos_all_cc",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ "//tensorflow/core:testlib",
+ "//tensorflow/core/kernels:ops_testutil",
+ "//tensorflow/core/kernels:ops_util",
+ ],
+)
+
+tf_cc_test(
+ name = "quantized_matmul_op_test",
+ size = "small",
+ srcs = ["quantized_matmul_op_test.cc"],
+ deps = [
+ ":quantized_ops",
+ "//tensorflow/contrib/quantization:cc_array_ops",
+ "//tensorflow/contrib/quantization:cc_math_ops",
+ "//tensorflow/contrib/quantization:cc_nn_ops",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:protos_all_cc",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ "//tensorflow/core:testlib",
+ "//tensorflow/core/kernels:ops_testutil",
+ "//tensorflow/core/kernels:ops_util",
+ ],
+)
+
+tf_cc_test(
+ name = "quantized_pooling_ops_test",
+ size = "small",
+ srcs = ["quantized_pooling_ops_test.cc"],
+ deps = [
+ ":quantized_ops",
+ "//tensorflow/contrib/quantization:cc_array_ops",
+ "//tensorflow/contrib/quantization:cc_math_ops",
+ "//tensorflow/contrib/quantization:cc_nn_ops",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:protos_all_cc",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ "//tensorflow/core:testlib",
+ "//tensorflow/core/kernels:ops_testutil",
+ "//tensorflow/core/kernels:ops_util",
+ ],
+)
+
+tf_cc_test(
+ name = "quantized_concat_op_test",
+ size = "small",
+ srcs = ["quantized_concat_op_test.cc"],
+ deps = [
+ ":quantized_ops",
+ "//tensorflow/contrib/quantization:cc_array_ops",
+ "//tensorflow/contrib/quantization:cc_math_ops",
+ "//tensorflow/contrib/quantization:cc_nn_ops",
+ "//tensorflow/core:core_cpu",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:protos_all_cc",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ "//tensorflow/core:testlib",
+ "//tensorflow/core/kernels:ops_testutil",
+ "//tensorflow/core/kernels:ops_util",
+ ],
+)
+
+tf_cc_test(
+ name = "quantized_batch_norm_op_test",
+ size = "small",
+ srcs = ["quantized_batch_norm_op_test.cc"],
+ deps = [
+ ":quantized_ops",
+ "//tensorflow/contrib/quantization:cc_array_ops",
+ "//tensorflow/contrib/quantization:cc_math_ops",
+ "//tensorflow/contrib/quantization:cc_nn_ops",
+ "//tensorflow/core:core_cpu_internal",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:protos_all_cc",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ "//tensorflow/core:testlib",
+ "//tensorflow/core/kernels:batch_norm_op",
+ "//tensorflow/core/kernels:ops_testutil",
+ "//third_party/eigen3",
+ ],
+)
diff --git a/tensorflow/contrib/quantization/kernels/dequantize_op.cc b/tensorflow/contrib/quantization/kernels/dequantize_op.cc
new file mode 100644
index 0000000000..a088954fc2
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/dequantize_op.cc
@@ -0,0 +1,106 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/math_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/contrib/quantization/kernels/quantization_utils.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/type_traits.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace {
+enum { QUANTIZE_MODE_MIN_COMBINED, QUANTIZE_MODE_MIN_FIRST };
+} // namespace
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, typename T>
+class DequantizeOp : public OpKernel {
+ public:
+ explicit DequantizeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ half_range_ = !std::is_signed<T>::value
+ ? 0.0f
+ : (static_cast<float>(std::numeric_limits<T>::max()) -
+ std::numeric_limits<T>::min() + 1) /
+ 2.0f;
+ string mode_string;
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("mode", &mode_string));
+ OP_REQUIRES(ctx,
+ (mode_string == "MIN_COMBINED" || mode_string == "MIN_FIRST"),
+ errors::InvalidArgument("Mode string must be 'MIN_COMBINED' or"
+ " 'MIN_FIRST', is '" +
+ mode_string + "'"));
+ if (mode_string == "MIN_COMBINED") {
+ mode_ = QUANTIZE_MODE_MIN_COMBINED;
+ } else if (mode_string == "MIN_FIRST") {
+ mode_ = QUANTIZE_MODE_MIN_FIRST;
+ }
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& input = ctx->input(0);
+ const float min_range = ctx->input(1).flat<float>()(0);
+ const float max_range = ctx->input(2).flat<float>()(0);
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &output));
+ if (mode_ == QUANTIZE_MODE_MIN_COMBINED) {
+ const float scale_factor =
+ (max_range - min_range) /
+ (static_cast<float>(std::numeric_limits<T>::max()) -
+ std::numeric_limits<T>::min());
+
+ // Multiply by scale factor and add min_range.
+ output->flat<float>() =
+ ((input.flat<T>().template cast<int>().template cast<float>() +
+ half_range_) *
+ scale_factor) +
+ min_range;
+ } else if (mode_ == QUANTIZE_MODE_MIN_FIRST) {
+ QuantizedTensorToFloatInPlaceUsingEigen<T>(
+ ctx->template eigen_device<Device>(), input, min_range, max_range,
+ output);
+ }
+ }
+
+ private:
+ float half_range_;
+ int mode_;
+};
+
+REGISTER_KERNEL_BUILDER(
+ Name("Dequantize").Device(DEVICE_CPU).TypeConstraint<quint8>("T"),
+ DequantizeOp<CPUDevice, quint8>);
+REGISTER_KERNEL_BUILDER(
+ Name("Dequantize").Device(DEVICE_CPU).TypeConstraint<qint8>("T"),
+ DequantizeOp<CPUDevice, qint8>);
+REGISTER_KERNEL_BUILDER(
+ Name("Dequantize").Device(DEVICE_CPU).TypeConstraint<quint16>("T"),
+ DequantizeOp<CPUDevice, quint16>);
+REGISTER_KERNEL_BUILDER(
+ Name("Dequantize").Device(DEVICE_CPU).TypeConstraint<qint16>("T"),
+ DequantizeOp<CPUDevice, qint16>);
+
+REGISTER_KERNEL_BUILDER(
+ Name("Dequantize").Device(DEVICE_CPU).TypeConstraint<qint32>("T"),
+ DequantizeOp<CPUDevice, qint32>);
+
+} // namespace tensorflow
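
The MIN_COMBINED branch of DequantizeOp above reduces to an affine map: the quantized value is shifted by half_range_ (zero for unsigned types, (T_max - T_min + 1)/2 for signed ones), scaled by (max_range - min_range)/(T_max - T_min), and offset by min_range. The following is a minimal Python sketch of that arithmetic; the function name and the explicit type-range parameters are illustrative only, not part of the op's API.

    # Minimal sketch of the MIN_COMBINED dequantization arithmetic above.
    # The name and the explicit (t_min, t_max, signed) parameters are
    # illustrative; the kernel derives them from the template type T.
    def dequantize_min_combined(q, min_range, max_range, t_min, t_max, signed):
        half_range = (t_max - t_min + 1) / 2.0 if signed else 0.0
        scale = (max_range - min_range) / float(t_max - t_min)
        return (q + half_range) * scale + min_range

    # quint8 value 128 over a float range of [0.0, 6.0]:
    print(dequantize_min_combined(128, 0.0, 6.0, 0, 255, signed=False))   # ~3.01
    # qint8 value 0 over a symmetric range of [-1.0, 1.0]:
    print(dequantize_min_combined(0, -1.0, 1.0, -128, 127, signed=True))  # ~0.0039
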
diff --git a/tensorflow/contrib/quantization/kernels/hexagon/BUILD b/tensorflow/contrib/quantization/kernels/hexagon/BUILD
new file mode 100644
index 0000000000..b57a2ac1b5
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/hexagon/BUILD
@@ -0,0 +1,45 @@
+# Description:
+# quantization-specific OpKernels for hexagon
+
+package(
+ default_visibility = ["//visibility:public"],
+ features = ["-parse_headers"],
+)
+
+licenses(["notice"]) # Apache 2.0
+
+load(
+ "//tensorflow:tensorflow.bzl",
+ "tf_cc_test",
+)
+
+filegroup(
+ name = "all_files",
+ srcs = glob(
+ ["**/*"],
+ exclude = [
+ "**/METADATA",
+ "**/OWNERS",
+ ],
+ ),
+ visibility = ["//tensorflow:__subpackages__"],
+)
+
+tf_cc_test(
+ name = "quantized_matmul_op_for_hexagon_test",
+ size = "small",
+ srcs = ["quantized_matmul_op_for_hexagon_test.cc"],
+ deps = [
+ "//tensorflow/contrib/quantization:cc_array_ops",
+ "//tensorflow/contrib/quantization:cc_math_ops",
+ "//tensorflow/contrib/quantization:cc_nn_ops",
+ "//tensorflow/contrib/quantization/kernels:quantized_ops",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:protos_all_cc",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ "//tensorflow/core:testlib",
+ "//tensorflow/core/kernels:ops_testutil",
+ "//tensorflow/core/kernels:ops_util",
+ ],
+)
diff --git a/tensorflow/contrib/quantization/kernels/hexagon/quantized_matmul_op_for_hexagon_test.cc b/tensorflow/contrib/quantization/kernels/hexagon/quantized_matmul_op_for_hexagon_test.cc
new file mode 100644
index 0000000000..3d139fbe0a
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/hexagon/quantized_matmul_op_for_hexagon_test.cc
@@ -0,0 +1,136 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Tests in this file are designed to evaluate hexagon DSP operations.
+
+#include "tensorflow/contrib/quantization/kernels/quantization_utils.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+#ifdef USE_HEXAGON_LIBS
+#include "tensorflow/core/platform/hexagon/gemm_wrapper.h"
+#include "tensorflow/core/platform/profile_utils/cpu_utils.h"
+#endif
+
+namespace tensorflow {
+
+class QuantizedMatMulOpForHexagonTest : public OpsTestBase {
+ protected:
+ void SetUp() final {
+#ifdef USE_HEXAGON_LIBS
+ profile_utils::CpuUtils::EnableClockCycleProfiling(true);
+ LOG(INFO) << "Hexagon libs are linked (wrapper version = "
+ << hexagon_gemm_wrapper_GetWrapperVersion()
+ << ", hexagon binary version = "
+ << hexagon_gemm_wrapper_GetHexagonBinaryVersion() << ")";
+ LOG(INFO) << "Cpu frequency = "
+ << profile_utils::CpuUtils::GetCycleCounterFrequency();
+#else
+ LOG(WARNING) << "Hexagon libs are not linked.";
+#endif
+ }
+};
+
+// Shows some statistics of hexagon dsp using hexagon specific APIs
+#ifdef USE_HEXAGON_LIBS
+TEST_F(QuantizedMatMulOpForHexagonTest, EvaluateSharedLibOverhead) {
+ const uint64 overhead_shared_lib_start =
+ profile_utils::CpuUtils::GetCurrentClockCycle();
+ const int wrapper_version = hexagon_gemm_wrapper_GetWrapperVersion();
+ const uint64 overhead_shared_lib_end =
+ profile_utils::CpuUtils::GetCurrentClockCycle();
+ const uint64 overhead_shared_lib_diff =
+ (overhead_shared_lib_end - overhead_shared_lib_start);
+ const uint64 overhead_hexagon_rpc_start =
+ profile_utils::CpuUtils::GetCurrentClockCycle();
+ const int hexagon_binary_version =
+ hexagon_gemm_wrapper_GetHexagonBinaryVersion();
+ const uint64 overhead_hexagon_rpc_end =
+ profile_utils::CpuUtils::GetCurrentClockCycle();
+ const uint64 overhead_hexagon_rpc_diff =
+ (overhead_hexagon_rpc_end - overhead_hexagon_rpc_start);
+ LOG(INFO) << "Shared lib (ver = " << wrapper_version << ") overhead is "
+ << overhead_shared_lib_diff << " cycles, time = "
+ << std::chrono::duration_cast<std::chrono::microseconds>(
+ profile_utils::CpuUtils::ConvertClockCycleToTime(
+ overhead_shared_lib_diff))
+ .count()
+ << " usec";
+ LOG(INFO) << "hexagon rpc (ver = " << hexagon_binary_version
+ << ") overhead is " << overhead_hexagon_rpc_diff
+ << " cycles, time = "
+ << std::chrono::duration_cast<std::chrono::microseconds>(
+ profile_utils::CpuUtils::ConvertClockCycleToTime(
+ overhead_hexagon_rpc_diff))
+ .count()
+ << " usec";
+}
+#endif
+
+// Runs two small matrices through the operator, and leaves all the parameters
+// at their default values.
+// This test is a sample to execute matmul on hexagon.
+TEST_F(QuantizedMatMulOpForHexagonTest, Small_NoParams) {
+ TF_ASSERT_OK(NodeDefBuilder("quantized_mat_mul_op", "QuantizedMatMul")
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("Toutput", DataTypeToEnum<qint32>::v())
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+ // A matrix is:
+ // | 1 | 2 | 3 |
+ // | 4 | 5 | 6 |
+ AddInputFromArray<quint8>(TensorShape({2, 3}), {1, 2, 3, 4, 5, 6});
+ // B matrix is:
+ // | 7 | 8 | 9 | 10 |
+ // | 11 | 12 | 13 | 14 |
+ // | 15 | 16 | 17 | 18 |
+ AddInputFromArray<quint8>(TensorShape({3, 4}),
+ {7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18});
+ AddInputFromArray<float>(TensorShape({1}), {0});
+ AddInputFromArray<float>(TensorShape({1}), {255.0f});
+ AddInputFromArray<float>(TensorShape({1}), {0});
+ AddInputFromArray<float>(TensorShape({1}), {255.0f});
+
+ TF_ASSERT_OK(RunOpKernel());
+ // Here are the results we expect, from hand calculations:
+ // (1 * 7) + (2 * 11) + (3 * 15) = 74
+ // (1 * 8) + (2 * 12) + (3 * 16) = 80
+ // (1 * 9) + (2 * 13) + (3 * 17) = 86
+ // (1 * 10) + (2 * 14) + (3 * 18) = 92
+ // (4 * 7) + (5 * 11) + (6 * 15) = 173
+ // (4 * 8) + (5 * 12) + (6 * 16) = 188
+ // (4 * 9) + (5 * 13) + (6 * 17) = 203
+ // (4 * 10) + (5 * 14) + (6 * 18) = 218
+ Tensor expected(allocator(), DT_QINT32, TensorShape({2, 4}));
+ test::FillValues<qint32>(&expected, {74, 80, 86, 92, 173, 188, 203, 218});
+ test::ExpectTensorEqual<qint32>(expected, *GetOutput(0));
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/contrib/quantization/kernels/load_quantized_kernels_so.py b/tensorflow/contrib/quantization/kernels/load_quantized_kernels_so.py
new file mode 100644
index 0000000000..3b7fd57a93
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/load_quantized_kernels_so.py
@@ -0,0 +1,48 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops for quantized evaluation."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import threading
+
+import tensorflow as tf
+
+QUANTIZED_KERNELS_FILE = '_quantized_kernels.so'
+
+_quantized_kernels = None
+_kernels_lock = threading.Lock()
+
+
+# Workaround for the fact that importing tensorflow imports contrib
+# (even if a user isn't using this or any other contrib op), but
+# there's not yet any guarantee that the shared object exists.
+# In which case, "import tensorflow" will always crash, even for users that
+# never use contrib.
+def Load(library_base_dir=''):
+ """Load the quantized ops library and return the loaded module."""
+ with _kernels_lock:
+ global _quantized_kernels
+ if not _quantized_kernels:
+ data_files_path = os.path.join(library_base_dir,
+ tf.resource_loader.get_data_files_path())
+ tf.logging.info('data path: %s', data_files_path)
+ _quantized_kernels = tf.load_op_library(os.path.join(
+ data_files_path, QUANTIZED_KERNELS_FILE))
+
+ assert _quantized_kernels, 'Could not load _quantized_kernels.so'
+ return _quantized_kernels
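
A minimal usage sketch for the loader above, assuming the _quantized_kernels.so shared object has been built and is present on the package's data path; the import path below simply mirrors this directory layout and is an assumption, not a documented entry point. Because the module handle is cached under _kernels_lock, repeated calls return the same object.

    # Usage sketch (assumes _quantized_kernels.so is built and the kernels
    # package is importable).
    from tensorflow.contrib.quantization.kernels import load_quantized_kernels_so

    kernels = load_quantized_kernels_so.Load()
    # Subsequent calls reuse the cached module handle.
    assert kernels is load_quantized_kernels_so.Load()
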
diff --git a/tensorflow/contrib/quantization/kernels/quantization_utils.cc b/tensorflow/contrib/quantization/kernels/quantization_utils.cc
new file mode 100644
index 0000000000..72651f96b0
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/quantization_utils.cc
@@ -0,0 +1,42 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/quantization/kernels/quantization_utils.h"
+
+namespace tensorflow {
+
+void GetOutputMinAndMaxForQuantizedAdd(float input_min, float input_max,
+ float smaller_input_min,
+ float smaller_input_max,
+ float* output_min, float* output_max) {
+ // We need to have a good range to add our two arguments together in. This
+ // is surprisingly tricky, since it has to satisfy a few different needs:
+ // - Must be symmetrical around zero, so that 0 + 0 = 0.
+ // - Must hold the largest of the argument ranges.
+ // - Should have enough range that the bits of the lowest and highest
+ // arguments overlap if possible without the lower getting truncated.
+ // - Should have some headroom so that there's no overflow.
+ // - Needs to be signed.
+ // This leads us to use a scheme where we (assuming the inputs are eight bit
+ // and the output is 32-bit) use the bottom 32 - 17 = 15 bits to store the
+ // accumulated results. This gives us all the properties we need.
+ *output_max =
+ std::max(input_max, std::max(-input_min, std::max(smaller_input_max,
+ -smaller_input_min))) *
+ (1 << 17);
+ *output_min = -(*output_max);
+}
+
+} // namespace tensorflow
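
The range rule above keeps the output symmetric around zero and scales the largest absolute endpoint of either input by 2^17, leaving the bottom 15 bits of the 32-bit accumulator for the summed values. A small Python sketch of the same arithmetic (the function name is illustrative):

    # Sketch of the symmetric output range chosen by
    # GetOutputMinAndMaxForQuantizedAdd above.
    def output_range_for_quantized_add(input_min, input_max,
                                       smaller_input_min, smaller_input_max):
        output_max = max(input_max, -input_min,
                         smaller_input_max, -smaller_input_min) * (1 << 17)
        return -output_max, output_max

    # A tensor quantized over [-1.0, 2.0] added to a bias quantized over
    # [-0.5, 0.5] gets an output range of +/- 2.0 * 2**17:
    print(output_range_for_quantized_add(-1.0, 2.0, -0.5, 0.5))
    # (-262144.0, 262144.0)
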
diff --git a/tensorflow/contrib/quantization/kernels/quantization_utils.h b/tensorflow/contrib/quantization/kernels/quantization_utils.h
new file mode 100644
index 0000000000..3b6a4901ba
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/quantization_utils.h
@@ -0,0 +1,555 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_QUANTIZATION_KERNELS_QUANTIZATION_UTILS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_QUANTIZATION_KERNELS_QUANTIZATION_UTILS_H_
+
+#define EIGEN_USE_THREADS
+
+// This is a set of functions that standardizes how quantized values are
+// interpreted as float numbers.
+// All of the current implementations are for reference and have not been
+// optimized. They should be implementable using fixed point representations
+// to avoid a dependency on floating-point hardware.
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "public/gemmlowp.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+
+namespace tensorflow {
+
+// We have to be able to detect and handle overflows in int32, so this function
+// uses doubles and int64's to make sure we have enough room.
+template <class T>
+int64 FloatToQuantizedUnclamped(float input, float range_min, float range_max) {
+ const int64 lowest_quantized =
+ static_cast<double>(Eigen::NumTraits<T>::lowest());
+ if (range_min == range_max) {
+ return lowest_quantized;
+ }
+ const int number_of_bits = sizeof(T) * 8;
+ const int64 number_of_steps = static_cast<int64>(1) << number_of_bits;
+ const double range_adjust = (number_of_steps / (number_of_steps - 1.0));
+ const double range = ((range_max - range_min) * range_adjust);
+ const double range_scale = (number_of_steps / range);
+ int64 quantized =
+ (round(input * range_scale) - round(range_min * range_scale));
+ quantized += lowest_quantized;
+ return quantized;
+}
+
+// This converts the float into the final quantized type, clamping/saturating
+// any over or underflows.
+template <class T>
+T FloatToQuantized(float input, float range_min, float range_max) {
+ int64 quantized = FloatToQuantizedUnclamped<T>(input, range_min, range_max);
+ const int64 lowest_quantized =
+ static_cast<int64>(Eigen::NumTraits<T>::lowest());
+ const int64 highest_quantized =
+ static_cast<int64>(Eigen::NumTraits<T>::highest());
+ quantized = std::max(quantized, lowest_quantized);
+ quantized = std::min(quantized, highest_quantized);
+ return static_cast<T>(static_cast<int32>(quantized));
+}
+
+template <class T>
+float QuantizedToFloat(T input, float range_min, float range_max) {
+ if (range_min == range_max) {
+ return range_min;
+ }
+ const int number_of_bits = sizeof(T) * 8;
+ const int64 number_of_steps = static_cast<int64>(1) << number_of_bits;
+ const double range_adjust = (number_of_steps / (number_of_steps - 1.0));
+ const double range = ((range_max - range_min) * range_adjust);
+ const double range_scale = (range / number_of_steps);
+ const int64 lowest_quantized =
+ static_cast<int64>(Eigen::NumTraits<T>::lowest());
+ const double offset_input = static_cast<double>(input) - lowest_quantized;
+ const double result = range_min + (offset_input * range_scale);
+ return static_cast<float>(result);
+}
+
+template <class T>
+float FloatForOneQuantizedLevel(float range_min, float range_max) {
+ const int64 highest = static_cast<int64>(Eigen::NumTraits<T>::highest());
+ const int64 lowest = static_cast<int64>(Eigen::NumTraits<T>::lowest());
+ const float float_for_one_quantized_level =
+ (range_max - range_min) / (highest - lowest);
+ return float_for_one_quantized_level;
+}
+
+template <class T1, class T2, class T3>
+void QuantizationRangeForMultiplication(float min_a, float max_a, float min_b,
+ float max_b, float* min_c,
+ float* max_c) {
+ const float a_float_for_one_quant_level =
+ FloatForOneQuantizedLevel<T1>(min_a, max_a);
+ const float b_float_for_one_quant_level =
+ FloatForOneQuantizedLevel<T2>(min_b, max_b);
+
+ const int64 c_highest = static_cast<int64>(Eigen::NumTraits<T3>::highest());
+ const int64 c_lowest = static_cast<int64>(Eigen::NumTraits<T3>::lowest());
+ const float c_float_for_one_quant_level =
+ a_float_for_one_quant_level * b_float_for_one_quant_level;
+
+ *min_c = c_float_for_one_quant_level * c_lowest;
+ *max_c = c_float_for_one_quant_level * c_highest;
+}
+
+// input_array is an eigen Tensor. q2f is a QuantizedToFloatStruct.
+// This evaluates to an eigen tensor expression, to be used like:
+// auto tensor = DEQUANTIZE_WITH_EIGEN(input_tensor, q2f);
+#define DEQUANTIZE_WITH_EIGEN(input_array, q2f) \
+ (q2f.range_min + \
+ (((input_array.template cast<float>() - q2f.lowest_quantized())) * \
+ q2f.range_scale));
+
+// input_array is an eigen Tensor. f2q is a FloatToQuantizedStruct.
+// OutputType is the type of output (e.g. quint8).
+// This evaluates to an eigen tensor expression, to be used like:
+// auto tensor = QUANTIZE_WITH_EIGEN(input_tensor, f2q, T);
+#define QUANTIZE_WITH_EIGEN(input_array, f2q, OutputType) \
+ ((input_array * f2q.range_scale).round() - \
+ (f2q.range_min_scaled - f2q.lowest_quantized())) \
+ .cwiseMax(f2q.lower_bound_float()) \
+ .cwiseMin(f2q.upper_bound_float()) \
+ .template cast<int32>() \
+ .template cast<OutputType>()
+
+// For use with DEQUANTIZE_WITH_EIGEN.
+template <typename T>
+struct QuantizedToFloatStruct {
+ static constexpr int number_of_bits = sizeof(T) * 8;
+ static constexpr int64 number_of_steps = static_cast<int64>(1)
+ << number_of_bits;
+
+ static float lowest_quantized() {
+ return static_cast<float>(Eigen::NumTraits<T>::lowest());
+ }
+
+ QuantizedToFloatStruct(float range_min, float range_max)
+ : range_min(range_min),
+ range_scale((range_max - range_min) / (number_of_steps - 1.0)) {}
+
+ const float range_min;
+ const float range_scale;
+};
+
+// For use with QUANTIZE_WITH_EIGEN.
+template <typename T>
+struct FloatToQuantizedStruct {
+ static constexpr int number_of_bits = sizeof(T) * 8;
+ static constexpr int64 number_of_steps = static_cast<int64>(1)
+ << number_of_bits;
+ static constexpr double range_adjust =
+ (number_of_steps / (number_of_steps - 1.0));
+
+ // Casting QInt32's lowest or highest to a float gives a float that can't be
+ // cast back to int32 or QInt32. Instead, use bounds that can be converted
+ // back to int32 without going outside the range of an int32.
+ static float lower_bound_float() {
+ return Eigen::numext::maxi(
+ static_cast<float>(Eigen::NumTraits<T>::lowest()), -2.147483648e+09f);
+ }
+ static float upper_bound_float() {
+ return Eigen::numext::mini(
+ static_cast<float>(Eigen::NumTraits<T>::highest()), +2.147483520e+09f);
+ }
+
+ static float lowest_quantized() {
+ return static_cast<float>(Eigen::NumTraits<T>::lowest());
+ }
+
+ FloatToQuantizedStruct(float range_min, float range_max)
+ : range_min(range_min),
+ range_scale(range_max == range_min
+ ? 0.0
+ : (number_of_steps - 1.0) / (range_max - range_min)),
+ range_min_scaled(round(range_min * range_scale)) {}
+
+ const float range_min;
+ const float range_scale;
+ const float range_min_scaled;
+};
+
+template <class T1, class T2>
+inline T2 RequantizeInNewRange(T1 input, float min_input, float max_input,
+ float min_new, float max_new) {
+ const float input_float = QuantizedToFloat<T1>(input, min_input, max_input);
+ return FloatToQuantized<T2>(input_float, min_new, max_new);
+}
+
+template <class T1, class T2>
+inline void RequantizeManyInNewRange(const T1* input, size_t count,
+ float min_input, float max_input,
+ float min_output, float max_output,
+ T2* output) {
+ for (size_t index = 0; index < count; ++index) {
+ const float input_float =
+ QuantizedToFloat<T1>(input[index], min_input, max_input);
+ output[index] = FloatToQuantized<T2>(input_float, min_output, max_output);
+ }
+}
+
+// Because converting 32-bit accumulated results down to eight bit is a common
+// case, we have a specialized code path to handle it as efficiently as
+// possible using only fixed-point math for the inner loop.
+template <>
+inline void RequantizeManyInNewRange<qint32, quint8>(
+ const qint32* input, size_t count, float min_input, float max_input,
+ float min_output, float max_output, quint8* output) {
+ // Initially we calculate all the constants we need once, before we go into
+ // the inner loop. If this is updated, also update the Eigen version.
+ const int fp_shift = 16;
+ const float input_range = max_input - min_input;
+ const float output_range = max_output - min_output;
+ const float recip_output_range =
+ output_range == 0.0 ? 0.0 : (255.0 / output_range);
+ const float input_rezero = (min_input + max_input) / 2.0;
+ const int64 range_scale_fp =
+ output_range == 0.0 ? 0.0
+ : static_cast<int64>(255.0 * (1 << fp_shift) *
+ input_range / output_range);
+ const int64 input_offset_fp =
+ static_cast<int64>(input_rezero * recip_output_range * (1 << fp_shift));
+ const int64 output_offset_fp =
+ output_range == 0.0 ? 0 : static_cast<int64>((1 << fp_shift) *
+ (min_output * 255.0) /
+ output_range);
+ const int64 rounding_delta = 1 << (fp_shift - 1);
+
+ // Inside this loop we just do minimal adds, multiplies, and shifts, in a way
+ // that could be easily adapted for a SIMD implementation. It should also be
+ // possible to perform all the calculations in 32-bit rather than 64, but
+ // that's not been implemented yet.
+ for (size_t index = 0; index < count; ++index) {
+ const int64 input_value = static_cast<int64>(input[index]);
+ const int64 fp_value =
+ ((input_value * range_scale_fp) >> 32) + input_offset_fp;
+ const int64 offset_intermediate = fp_value - output_offset_fp;
+ const int64 round_intermediate = offset_intermediate + rounding_delta;
+ int64 quantized_int64 = round_intermediate >> fp_shift;
+ quantized_int64 = std::max(quantized_int64, 0LL);
+ quantized_int64 = std::min(quantized_int64, 255LL);
+ output[index] = static_cast<quint8>(static_cast<int32>(quantized_int64));
+ }
+}
+
+template <int shift>
+struct int64_right_shift_op {
+ EIGEN_EMPTY_STRUCT_CTOR(int64_right_shift_op)
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const int64 operator()(const int64& a) const {
+ return a >> shift;
+ }
+};
+
+// See RequantizeManyInNewRange() for a non-eigen reference implementation.
+template <class T1, class T2>
+inline void RequantizeManyInNewRangeUsingEigen(
+ const Eigen::ThreadPoolDevice& device, const Tensor& input, float min_input,
+ float max_input, float min_output, float max_output, Tensor* output) {
+ auto input_array = input.flat<T1>();
+ QuantizedToFloatStruct<T1> q2f(min_input, max_input);
+ auto input_float = DEQUANTIZE_WITH_EIGEN(input_array, q2f);
+ FloatToQuantizedStruct<T2> f2q(min_output, max_output);
+ auto input_requantized = QUANTIZE_WITH_EIGEN(input_float, f2q, T2);
+
+ output->flat<T2>().device(device) = input_requantized;
+}
+
+// See RequantizeManyInNewRange() for a non-eigen reference implementation.
+//
+// Because converting 32-bit accumulated results down to eight bit is a common
+// case, we have a specialized code path to handle it as efficiently as
+// possible using only fixed-point math for the inner loop.
+template <>
+inline void RequantizeManyInNewRangeUsingEigen<qint32, quint8>(
+ const Eigen::ThreadPoolDevice& device, const Tensor& input, float min_input,
+ float max_input, float min_output, float max_output, Tensor* output) {
+ // Initially we calculate all the constants we need once, before we go into
+ // the inner loop. If this is updated, also update the non-Eigen version.
+ const int fp_shift = 16;
+ const float input_range = max_input - min_input;
+ const float output_range = max_output - min_output;
+ const float recip_output_range =
+ output_range == 0.0 ? 0.0 : (255.0 / output_range);
+ const float input_rezero = (min_input + max_input) / 2.0;
+ const int64 range_scale_fp =
+ output_range == 0.0 ? 0.0
+ : static_cast<int64>(255.0 * (1 << fp_shift) *
+ input_range / output_range);
+ const int64 input_offset_fp =
+ static_cast<int64>(input_rezero * recip_output_range * (1 << fp_shift));
+ const int64 output_offset_fp =
+ output_range == 0.0 ? 0 : static_cast<int64>((1 << fp_shift) *
+ (min_output * 255.0) /
+ output_range);
+ const int64 rounding_delta = 1 << (fp_shift - 1);
+
+ // Inside this eigen expression we just do minimal adds, multiplies, and
+ // shifts. It should be possible to perform all the calculations in 32-bit
+ // rather than 64, but that's not been implemented yet.
+ auto input_array = input.flat<qint32>();
+ auto fp_value = ((input_array.template cast<int64>() * range_scale_fp)
+ .unaryExpr(int64_right_shift_op<32>())) +
+ (input_offset_fp - output_offset_fp + rounding_delta);
+ auto intermediate = fp_value.unaryExpr(int64_right_shift_op<fp_shift>());
+ auto input_requantized = intermediate.cwiseMax(0LL)
+ .cwiseMin(255LL)
+ .template cast<int32>()
+ .template cast<quint8>();
+ output->flat<quint8>().device(device) = input_requantized;
+}
+
+// REQUIRES: 'result->NumElements() == input.NumElements()'
+template <class T>
+void FloatTensorToQuantizedInPlaceUsingEigen(
+ const Eigen::ThreadPoolDevice& device, const Tensor& input, float min,
+ float max, Tensor* result) {
+ DCHECK_EQ(DataTypeToEnum<T>::v(), result->dtype());
+ auto flat_input = input.flat<float>();
+ auto flat_result = result->flat<T>();
+ DCHECK_EQ(flat_input.size(), flat_result.size());
+
+ FloatToQuantizedStruct<T> f2q(min, max);
+ flat_result.device(device) = QUANTIZE_WITH_EIGEN(flat_input, f2q, T);
+}
+
+template <class T>
+void FloatTensorToQuantizedInPlace(const Tensor& input, float min, float max,
+ Tensor* result) {
+ DCHECK_EQ(DataTypeToEnum<T>::v(), result->dtype());
+ auto flat_input = input.flat<float>();
+ auto flat_result = result->flat<T>();
+ const int data_size = flat_input.size();
+ DCHECK(data_size == flat_result.size());
+ for (int i = 0; i < data_size; ++i) {
+ flat_result(i) = FloatToQuantized<T>(flat_input(i), min, max);
+ }
+}
+
+template <class T>
+Tensor FloatTensorToQuantized(const Tensor& input, float min, float max) {
+ Tensor result(DataTypeToEnum<T>::v(), input.shape());
+ FloatTensorToQuantizedInPlace<T>(input, min, max, &result);
+ return result;
+}
+
+// REQUIRES: 'result->NumElements() == input.NumElements()'
+template <class T>
+void QuantizedTensorToFloatInPlaceUsingEigen(
+ const Eigen::ThreadPoolDevice& device, const Tensor& input, float min,
+ float max, Tensor* result) {
+ DCHECK_EQ(DataTypeToEnum<T>::v(), input.dtype());
+ auto flat_input = input.flat<T>();
+ auto flat_result = result->flat<float>();
+ const int data_size = flat_input.size();
+ DCHECK(data_size == flat_result.size());
+
+ QuantizedToFloatStruct<T> q2f(min, max);
+ flat_result.device(device) = DEQUANTIZE_WITH_EIGEN(flat_input, q2f);
+}
+
+// REQUIRES: 'result->NumElements() == input.NumElements()'
+template <class T>
+void QuantizedTensorToFloatInPlace(const Tensor& input, float min, float max,
+ Tensor* result) {
+ DCHECK_EQ(DataTypeToEnum<T>::v(), input.dtype());
+ auto flat_input = input.flat<T>();
+ auto flat_result = result->flat<float>();
+ const int data_size = flat_input.size();
+ DCHECK(data_size == flat_result.size());
+ for (int i = 0; i < data_size; ++i) {
+ flat_result(i) = QuantizedToFloat<T>(flat_input(i), min, max);
+ }
+}
+
+template <class T>
+Tensor QuantizedTensorToFloat(const Tensor& input, float min, float max) {
+ Tensor result(DT_FLOAT, input.shape());
+ QuantizedTensorToFloatInPlace<T>(input, min, max, &result);
+ return result;
+}
+
+void GetOutputMinAndMaxForQuantizedAdd(float input_min, float input_max,
+ float smaller_input_min,
+ float smaller_input_max,
+ float* output_min, float* output_max);
+
+// Add <input> and <smaller_input>. If <smaller_input> has fewer elements than
+// <input>, then it is broadcast onto <input>.
+template <typename T1, typename T2, typename T3>
+void QuantizedAddUsingEigen(const Eigen::ThreadPoolDevice& device,
+ const Tensor& input, float input_min,
+ float input_max, const Tensor& smaller_input,
+ float smaller_input_min, float smaller_input_max,
+ Tensor* output, float* output_min,
+ float* output_max) {
+ const auto& input_flat = input.flat<T1>();
+ const auto& smaller_input_flat = smaller_input.flat<T2>();
+ auto output_flat = output->flat<T3>();
+
+ GetOutputMinAndMaxForQuantizedAdd(input_min, input_max, smaller_input_min,
+ smaller_input_max, output_min, output_max);
+ // To do addition properly, we need to compensate for a possibly unbalanced
+ // zero point in the total representation. The quantized value that
+ // represents the real number zero needs to be subtracted before addition to
+ // make sure that the identity of zero + zero = zero holds.
+ const T3 zero_in_total_space =
+ FloatToQuantized<T3>(0.0f, *output_min, *output_max);
+
+ const int64 input_element_count = input.NumElements();
+ const int64 smaller_input_element_count = smaller_input.NumElements();
+
+ QuantizedToFloatStruct<T1> smaller_input_q2f(smaller_input_min,
+ smaller_input_max);
+ QuantizedToFloatStruct<T2> input_q2f(input_min, input_max);
+ FloatToQuantizedStruct<T3> f2q(*output_min, *output_max);
+
+ auto smaller_input_float =
+ DEQUANTIZE_WITH_EIGEN(smaller_input_flat, smaller_input_q2f);
+ auto smaller_input_in_total_space =
+ QUANTIZE_WITH_EIGEN(smaller_input_float, f2q, T3);
+
+ auto input_float = DEQUANTIZE_WITH_EIGEN(input_flat, input_q2f);
+ auto input_in_total_space = QUANTIZE_WITH_EIGEN(input_float, f2q, T3);
+
+ Eigen::array<Eigen::DenseIndex, 1> bcast;
+ bcast[0] = input_element_count / smaller_input_element_count;
+ output_flat.device(device) =
+ input_in_total_space +
+ (smaller_input_in_total_space.broadcast(bcast) + zero_in_total_space);
+}
+
+// This is a reference implementation of the bias addition for quantized
+// buffers, designed to provide a clear specification for the result we
+// want. We'll want to specialize this for particular hardware, and
+// probably even fuse it with matrix multiplications in a lot of cases. It's
+// important to show the clamping behavior we want in particular.
+template <typename T1, typename T2, typename T3>
+void QuantizedAdd(const Eigen::ThreadPoolDevice& device, const Tensor& input,
+ float input_min, float input_max, const Tensor& smaller_input,
+ float smaller_input_min, float smaller_input_max,
+ Tensor* output, float* output_min, float* output_max) {
+ const auto& input_flat = input.flat<T1>();
+ const auto& smaller_input_flat = smaller_input.flat<T2>();
+ auto output_flat = output->flat<T3>();
+
+ GetOutputMinAndMaxForQuantizedAdd(input_min, input_max, smaller_input_min,
+ smaller_input_max, output_min, output_max);
+ // To do addition properly, we need to compensate for a possibly unbalanced
+ // zero point in the total representation. The quantized value that
+ // represents the real number zero needs to be subtracted before addition to
+ // make sure that the identity of zero + zero = zero holds.
+ const T3 zero_in_total_space =
+ FloatToQuantized<T3>(0.0f, *output_min, *output_max);
+
+ const int64 input_element_count = input.NumElements();
+ const int64 smaller_input_element_count = smaller_input.NumElements();
+
+ float total_min = *output_min;
+ float total_max = *output_max;
+ const size_t how_many_iterations =
+ (input_element_count / smaller_input_element_count);
+ for (size_t iteration = 0; iteration < how_many_iterations; ++iteration) {
+ const size_t offset = iteration * smaller_input_element_count;
+ for (int c = 0; c < smaller_input_element_count; ++c) {
+ const int index = (offset + c);
+ // The two numbers we're going to add can each be in very different
+ // ranges (e.g. the quantized value '127' may represent very different
+ // real numbers in both) so we need to convert them to a common range
+ // before we sum them.
+ const T1 input_value = input_flat(index);
+ const T3 input_in_total_space = RequantizeInNewRange<T1, T3>(
+ input_value, input_min, input_max, total_min, total_max);
+ const T2 smaller_input_value = smaller_input_flat(c);
+ const T3 smaller_input_in_total_space =
+ RequantizeInNewRange<T2, T3>(smaller_input_value, smaller_input_min,
+ smaller_input_max, total_min, total_max);
+ const T3 total_pre = input_in_total_space + smaller_input_in_total_space;
+ // As noted above, we need to compensate for the offset of the actual
+ // zero point in the space we're operating in.
+ const T3 total = total_pre + zero_in_total_space;
+ output_flat(index) = total;
+ }
+ }
+}
+
+// See gemmlowp/internal/multi_thread_gemm.h for definitions of
+// Prepare, Wait, StartWorker, and CreateWorkers.
+class TensorflowGemmlowpWorkersPool {
+ public:
+ TensorflowGemmlowpWorkersPool(thread::ThreadPool* workers)
+ : workers_(workers) {}
+
+ ~TensorflowGemmlowpWorkersPool() {
+ // This workaround ensures that all worker tasks have exited methods in the
+ // BlockingCounter. Without this, there is a race where the context is torn
+ // down while the counter is in use.
+ counter_to_decrement_when_ready_.Reset(0);
+ }
+
+ void Prepare(int workers_count) {
+ counter_to_decrement_when_ready_.Reset(workers_count);
+ }
+
+ void Wait() { counter_to_decrement_when_ready_.Wait(); }
+
+ void StartWorker(int index, gemmlowp::Task* task) {
+ CHECK(workers_ != nullptr);
+ // <index> is ignored - the tensorflow threadpool does not support assigning
+ // to a specific thread.
+ workers_->Schedule([this, task]() {
+ // TODO(cwhipkey): get a local_allocator from a thread local.
+ gemmlowp::Allocator local_allocator;
+ CHECK(task != nullptr);
+ task->local_allocator = &local_allocator;
+ task->Run();
+ delete task;
+ counter_to_decrement_when_ready_.DecrementCount();
+ });
+ }
+
+ void CreateWorkers(std::size_t workers_count) {}
+
+ private:
+ thread::ThreadPool* const workers_;
+
+ // The BlockingCounter used to wait for the workers.
+ gemmlowp::BlockingCounter counter_to_decrement_when_ready_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(TensorflowGemmlowpWorkersPool);
+};
+
+class TensorflowGemmContext : public gemmlowp::MultiThreadGemmContextBase {
+ public:
+ TensorflowGemmContext(int num_threads, thread::ThreadPool* workers)
+ : workers_pool_(workers) {
+ set_max_num_threads(num_threads);
+ }
+
+ TensorflowGemmlowpWorkersPool* workers_pool() { return &workers_pool_; }
+
+ private:
+ TensorflowGemmlowpWorkersPool workers_pool_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(TensorflowGemmContext);
+};
+
+} // namespace tensorflow
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_QUANTIZATION_KERNELS_QUANTIZATION_UTILS_H_
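For context, the Prepare/StartWorker/Wait protocol that the adapter above maps onto the TensorFlow thread pool is essentially a counted-completion pattern: Prepare arms a counter with the number of tasks, every task decrements it when it finishes, and Wait blocks until the counter reaches zero. A standalone sketch of that pattern using only standard-library primitives (illustrative only; gemmlowp's BlockingCounter and tensorflow::thread::ThreadPool have their own interfaces):

// Counted-completion sketch: arm a counter, run tasks that decrement it,
// and block until every scheduled task has reported completion.
#include <condition_variable>
#include <mutex>
#include <thread>
#include <vector>

class SimpleBlockingCounter {
 public:
  void Reset(int count) {
    std::lock_guard<std::mutex> lock(mu_);
    count_ = count;
  }
  void DecrementCount() {
    std::lock_guard<std::mutex> lock(mu_);
    if (--count_ == 0) cv_.notify_all();
  }
  void Wait() {
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return count_ <= 0; });
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  int count_ = 0;
};

int main() {
  const int workers_count = 4;
  SimpleBlockingCounter counter;
  counter.Reset(workers_count);  // Prepare()
  std::vector<std::thread> threads;
  for (int i = 0; i < workers_count; ++i) {
    // StartWorker(): run the task, then signal completion.
    threads.emplace_back([&counter] { counter.DecrementCount(); });
  }
  counter.Wait();  // Wait(): returns once every task has decremented.
  for (auto& t : threads) t.join();
  return 0;
}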
diff --git a/tensorflow/contrib/quantization/kernels/quantization_utils_test.cc b/tensorflow/contrib/quantization/kernels/quantization_utils_test.cc
new file mode 100644
index 0000000000..d62610b2ca
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/quantization_utils_test.cc
@@ -0,0 +1,550 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include <limits>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/contrib/quantization/kernels/quantization_utils.h"
+#include "tensorflow/core/common_runtime/eigen_thread_pool.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+class QuantizationUtilsTest : public ::testing::Test {
+ protected:
+ void TestRequantizeMany(Eigen::ThreadPoolDevice* eigen_device,
+ float input_min, float input_max, float output_min,
+ float output_max,
+ const std::vector<qint32>& values_quantized,
+ int tolerance = 1) {
+ const int values_count = values_quantized.size();
+ std::vector<quint8> expected_values;
+ for (int value_index = 0; value_index < values_count; ++value_index) {
+ expected_values.push_back(FloatToQuantized<quint8>(
+ QuantizedToFloat(values_quantized[value_index], input_min, input_max),
+ output_min, output_max));
+ }
+
+ Tensor i_tensor =
+ tensorflow::test::AsTensor(gtl::ArraySlice<qint32>(values_quantized));
+ Tensor o_tensor(DT_QUINT8, TensorShape{values_count});
+ auto output_values = o_tensor.flat<quint8>();
+
+ if (eigen_device == nullptr) {
+ auto input_array = i_tensor.flat<qint32>();
+ RequantizeManyInNewRange(input_array.data(), input_array.size(),
+ input_min, input_max, output_min, output_max,
+ output_values.data());
+ } else {
+ RequantizeManyInNewRangeUsingEigen<qint32, quint8>(
+ *eigen_device, i_tensor, input_min, input_max, output_min, output_max,
+ &o_tensor);
+ }
+
+ const string tolerance_str = strings::StrCat("+-", tolerance);
+ for (size_t value_index = 0; value_index < values_count; ++value_index) {
+ int e = expected_values[value_index];
+ int v = output_values(value_index);
+ ASSERT_TRUE(std::abs(e - v) <= tolerance)
+ << "actual=" << v << ", expected=" << e << tolerance_str
+ << ", values_quantized[" << value_index
+ << "]=" << values_quantized[value_index]
+ << ", input_min=" << input_min << ", input_max=" << input_max
+ << ", output_min=" << output_min << ", output_max=" << output_max
+ << ", value_index=" << value_index;
+ }
+ }
+
+ // If eigen_device is NULL, then the reference implementation is tested.
+ void TestRequantizeManyInNewRange32To8Bit(
+ Eigen::ThreadPoolDevice* eigen_device) {
+ // These are the float values we're going to test the conversions on.
+ const size_t values_count = 6;
+ const float values[values_count] = {0.0f, 0.45f, 1.0f,
+ -1.0f, 127.0f, 255.0f};
+ // These are the input and output ranges we'll test.
+ const size_t ranges_count = 6;
+ const float ranges[ranges_count][4] = {
+ {0.0f, 255.0f, 0.0f, 255.0f}, //
+ {0.0f, 1.0f, 0.0f, 1.0f}, //
+ {-1.0f, 1.0f, -1.0f, 1.0f}, //
+ {-1.0f, 1.0f, -255.0f, 255.0f}, //
+ {3.0f, 3.0f, 0.0f, 255.0f}, // input min == max
+ {0.0f, 255.0f, 5.0f, 5.0f}, // output min == max
+ };
+ for (int i = 0; i < ranges_count; ++i) {
+ const auto& r = ranges[i];
+ std::vector<qint32> values_quantized;
+ for (int value_index = 0; value_index < values_count; ++value_index) {
+ const float v = values[value_index];
+ values_quantized.push_back(FloatToQuantized<qint32>(v, r[0], r[1]));
+ }
+ TestRequantizeMany(eigen_device, r[0], r[1], r[2], r[3],
+ values_quantized);
+ }
+
+ // Test with many different values in the input quantized range.
+ qint32 low = Eigen::NumTraits<qint32>::lowest();
+ qint32 high = Eigen::NumTraits<qint32>::highest();
+ std::vector<qint32> vals{low, high};
+ int num_steps = 14419;
+ qint32 step = static_cast<int32>((1L << 32) / num_steps);
+ qint32 v = low + static_cast<qint32>(1);
+ for (int i = 0; i < num_steps; ++i) {
+ vals.push_back(v);
+ v += step;
+ }
+ TestRequantizeMany(eigen_device, -1.0f, 1.0f, -1.0f, 1.0f, vals);
+ TestRequantizeMany(eigen_device, -255.0f, 255.0f, -255.0f, 255.0f, vals);
+ TestRequantizeMany(eigen_device, -1.0f, 1.0f, -12345678.0f, 12345678.0f,
+ vals);
+ TestRequantizeMany(eigen_device, -1.0f, 12345678.0f, -12345678.0f,
+ 12345678.0f, vals);
+
+ // Test when the input range is large and output range is small.
+ // Use all quantized values where the float is in the output range.
+ const float out_min = -29.1234;
+ const float out_max = 23.1234;
+ const float in_min = -1e6;
+ const float in_max = 1e6;
+
+ low = FloatToQuantized<qint32>(out_min, in_min, in_max);
+ high = FloatToQuantized<qint32>(out_max, in_min, in_max);
+ vals.clear();
+ for (int32 i = low; i <= high; ++i) vals.push_back(i);
+ TestRequantizeMany(eigen_device, in_min, in_max, out_min, out_max, vals);
+ }
+
+ template <typename InputType, typename OutputType>
+ void TestRequantizeManyInNewRangeEigenVsNonEigen() {
+ thread::ThreadPool threadpool(Env::Default(), "test", 2 /* num_threads */);
+ EigenThreadPoolWrapper wrapper(&threadpool);
+ Eigen::ThreadPoolDevice eigen_device(&wrapper, 2 /* num_threads */);
+
+ const size_t ranges_count = 6;
+ const float ranges[ranges_count][4] = {
+ {0.0f, 255.0f, 0.0f, 255.0f}, //
+ {0.0f, 1.0f, 0.0f, 1.0f}, //
+ {-1.0f, 1.0f, -1.0f, 1.0f}, //
+ {-1.0f, 1.0f, -255.0f, 255.0f}, //
+ {3.0f, 3.0f, 0.0f, 255.0f}, // input min == max
+ {0.0f, 255.0f, 5.0f, 5.0f}, // output min == max
+ };
+
+ // Random values.
+ for (size_t range_index = 0; range_index < ranges_count; ++range_index) {
+ const float input_min = ranges[range_index][0];
+ const float input_max = ranges[range_index][1];
+ const float output_min = ranges[range_index][2];
+ const float output_max = ranges[range_index][3];
+ const int values_count = 10000;
+ random::PhiloxRandom philox(testing::RandomSeed(), 17);
+ random::SimplePhilox rnd(&philox);
+ std::vector<InputType> values_quantized;
+ for (int i = 0; i < values_count; ++i) {
+ float v = (rnd.RandFloat() * (input_max - input_min)) + input_min;
+ values_quantized.push_back(
+ FloatToQuantized<InputType>(v, input_min, input_max));
+ }
+
+ Tensor i_tensor = tensorflow::test::AsTensor(
+ gtl::ArraySlice<InputType>(values_quantized));
+ const auto i_array = i_tensor.flat<InputType>();
+ Tensor o_tensor_eigen(DataTypeToEnum<OutputType>::v(),
+ TensorShape{values_count});
+ auto output_values_eigen = o_tensor_eigen.flat<OutputType>();
+ Tensor o_tensor_ref(DataTypeToEnum<OutputType>::v(),
+ TensorShape{values_count});
+ auto output_values_ref = o_tensor_ref.flat<OutputType>();
+
+ RequantizeManyInNewRange(i_array.data(), i_array.size(), input_min,
+ input_max, output_min, output_max,
+ output_values_ref.data());
+ RequantizeManyInNewRangeUsingEigen<InputType, OutputType>(
+ eigen_device, i_tensor, input_min, input_max, output_min, output_max,
+ &o_tensor_eigen);
+
+ const int tolerance = 1;
+ for (int i = 0; i < values_quantized.size(); ++i) {
+ auto expected = output_values_ref(i);
+ auto actual = output_values_eigen(i);
+ // The eigen computation uses float for constants and computation
+ // instead of doubles, so can be different by 1 or 2 in some cases
+ // (e.g., input value 144.062744140625, min -1, max 255, type quint8).
+ ASSERT_TRUE(std::abs(expected - actual) <= tolerance)
+ << "expected=" << expected << " actual=" << actual
+ << " tolerance=" << tolerance << " v=" << values_quantized[i]
+ << " i=" << i << " input_min=" << input_min
+ << " input_max=" << input_max
+ << " input_type=" << DataTypeString(DataTypeToEnum<InputType>::v())
+ << " output_type="
+ << DataTypeString(DataTypeToEnum<OutputType>::v());
+ }
+ }
+ }
+
+ template <typename T>
+ void TestFloatToQuantizedInPlaceUsingEigen(
+ Eigen::ThreadPoolDevice* eigen_device) {
+ // These are the float ranges we're going to test the conversions over.
+ typedef std::pair<float, float> FPair;
+ for (FPair min_and_max : std::vector<FPair>{FPair(-255.0f, 255.0f), //
+ FPair(-1.0f, 1.0f), //
+ FPair(-1.0f, 255.0f), //
+ FPair(0.0f, 1e6), //
+ FPair(0.0f, 1.0f), //
+ FPair(-31.0f, 13.0f)}) {
+ const float f_min = min_and_max.first;
+ const float f_max = min_and_max.second;
+ const float f_range = f_max - f_min;
+ const int values_count = 50000;
+ Tensor input(DT_FLOAT, TensorShape{values_count});
+ auto input_array = input.flat<float>();
+ for (int i = 0; i < values_count; ++i) {
+ input_array(i) = f_min + f_range * i / (values_count - 1);
+ }
+
+ Tensor output(DataTypeToEnum<T>::v(), TensorShape{values_count});
+ FloatTensorToQuantizedInPlaceUsingEigen<T>(*eigen_device, input, f_min,
+ f_max, &output);
+ auto output_array = output.flat<T>();
+
+ const int tolerance = 1;
+ for (int i = 0; i < values_count; ++i) {
+ int32 expected = FloatToQuantized<T>(input_array(i), f_min, f_max);
+ int32 actual = output_array(i);
+
+ // The eigen computation uses float for constants and computation instead
+ // of doubles, so can be different by 1 or 2 in some cases (e.g., input
+ // value 144.062744140625, min -1, max 255, type quint8).
+ ASSERT_TRUE(std::abs(expected - actual) <= tolerance)
+ << "expected=" << expected << " actual=" << actual
+ << " tolerance=" << tolerance << " v=" << input_array(i)
+ << " i=" << i << " f_min=" << f_min << " f_max=" << f_max
+ << " type=" << DataTypeString(DataTypeToEnum<T>::v());
+ }
+ }
+ }
+
+ template <typename T>
+ void TestQuantizedToFloatInPlaceUsingEigen(
+ Eigen::ThreadPoolDevice* eigen_device) {
+ // These are the float ranges we're going to test the conversions over.
+ typedef std::pair<float, float> FPair;
+ for (FPair min_and_max : std::vector<FPair>{FPair(-255.0f, 255.0f), //
+ FPair(-1.0f, 1.0f), //
+ FPair(-1.0f, 255.0f), //
+ FPair(0.0f, 1e6), //
+ FPair(0.0f, 1.0f), //
+ FPair(-31.0f, 13.0f)}) {
+ const float f_min = min_and_max.first;
+ const float f_max = min_and_max.second;
+ const int values_count = sizeof(T) == 1 ? 256 : 50000;
+ Tensor input(DataTypeToEnum<T>::v(), TensorShape{values_count});
+ auto input_array = input.flat<T>();
+ const double q_range =
+ static_cast<double>(Eigen::NumTraits<T>::highest()) -
+ Eigen::NumTraits<T>::lowest();
+ for (int i = 0; i < values_count; ++i) {
+ if (sizeof(T) == 1) {
+ input_array(i) = Eigen::NumTraits<T>::lowest() + i;
+ } else {
+ int64 offset = static_cast<int64>(q_range / values_count * i);
+ input_array(i) = static_cast<int32>(
+ Eigen::NumTraits<T>::lowest() +
+ std::min<int64>(Eigen::NumTraits<T>::highest(), offset));
+ }
+ }
+
+ Tensor output(DT_FLOAT, TensorShape{values_count});
+ QuantizedTensorToFloatInPlaceUsingEigen<T>(*eigen_device, input, f_min,
+ f_max, &output);
+ auto output_array = output.flat<float>();
+ const double range = static_cast<double>(f_max) - f_min;
+ for (int i = 0; i < values_count; ++i) {
+ float expected = QuantizedToFloat<T>(input_array(i), f_min, f_max);
+ float actual = output_array(i);
+ ASSERT_NEAR(expected, actual, range * 1e-6)
+ << "expected=" << expected << " actual=" << actual
+ << " v=" << input_array(i) << " i=" << i << " f_min=" << f_min
+ << " f_max=" << f_max
+ << " type=" << DataTypeString(DataTypeToEnum<T>::v());
+ }
+ }
+ }
+};
+
+TEST_F(QuantizationUtilsTest, FloatToQuantized) {
+ EXPECT_EQ(quint8(0), FloatToQuantized<quint8>(0.0f, 0.0f, 1.0f));
+ EXPECT_EQ(quint8(0), FloatToQuantized<quint8>(0.0f, 0.0f, 2.0f));
+ EXPECT_EQ(quint8(128), FloatToQuantized<quint8>(0.5f, 0.0f, 1.0f));
+ EXPECT_EQ(quint8(128), FloatToQuantized<quint8>(1.0f, 0.0f, 2.0f));
+ EXPECT_EQ(quint8(255), FloatToQuantized<quint8>(1.0f, 0.0f, 1.0f));
+ EXPECT_EQ(quint8(255), FloatToQuantized<quint8>(2.0f, 0.0f, 2.0f));
+ EXPECT_EQ(quint8(0), FloatToQuantized<quint8>(-128.0f, -128.0f, 127.0f));
+ EXPECT_EQ(quint8(128), FloatToQuantized<quint8>(0.0f, -128.0f, 127.0f));
+ EXPECT_EQ(quint8(255), FloatToQuantized<quint8>(127.0f, -128.0f, 127.0f));
+ EXPECT_EQ(quint8(0), FloatToQuantized<quint8>(1.0f, 1.0f, 256.0f));
+ EXPECT_EQ(quint8(127), FloatToQuantized<quint8>(128.0f, 1.0f, 256.0f));
+ EXPECT_EQ(quint8(255), FloatToQuantized<quint8>(256.0f, 1.0f, 256.0f));
+
+ const int int32_min = std::numeric_limits<int>::min();
+ const int int32_max = std::numeric_limits<int>::max();
+
+ EXPECT_EQ(qint32(int32_min),
+ FloatToQuantized<qint32>(-128.0f, -128.0f, 128.0f));
+ EXPECT_EQ(qint32(0), FloatToQuantized<qint32>(0.0f, -128.0f, 128.0f));
+ EXPECT_EQ(qint32(int32_max),
+ FloatToQuantized<qint32>(128.0f, -128.0f, 128.0f));
+}
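The expectations above follow from spreading the float range [min, max] evenly over the 256 levels of quint8. A standalone approximation of that mapping (not the exact FloatToQuantized implementation, which rounds the value and the range minimum separately and can therefore differ by one count in edge cases):

// Simplified stand-in for FloatToQuantized<quint8>: map [min, max] onto
// [0, 255] and round to the nearest integer.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

uint8_t FloatToUint8(float v, float min, float max) {
  const float scaled = (v - min) / (max - min) * 255.0f;
  const float clamped = std::min(255.0f, std::max(0.0f, scaled));
  return static_cast<uint8_t>(std::lround(clamped));
}

int main() {
  assert(FloatToUint8(0.5f, 0.0f, 1.0f) == 128);       // 0.5 * 255 = 127.5 -> 128
  assert(FloatToUint8(0.0f, -128.0f, 127.0f) == 128);  // (0 + 128) / 255 * 255 = 128
  assert(FloatToUint8(128.0f, 1.0f, 256.0f) == 127);   // (128 - 1) / 255 * 255 = 127
  assert(FloatToUint8(256.0f, 1.0f, 256.0f) == 255);
  return 0;
}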
+
+TEST_F(QuantizationUtilsTest, QuantizedToFloat) {
+ EXPECT_LT(fabsf(0.0f - QuantizedToFloat<quint8>(0, 0.0f, 1.0f)), 1 / 255.0f);
+ EXPECT_LT(fabsf(0.0f - QuantizedToFloat<quint8>(0, 0.0f, 2.0f)), 1 / 255.0f);
+ EXPECT_LT(fabsf(0.5f - QuantizedToFloat<quint8>(127, 0.0f, 1.0f)),
+ 1 / 255.0f);
+ EXPECT_LT(fabsf(1.0f - QuantizedToFloat<quint8>(127, 0.0f, 2.0f)),
+ 1 / 255.0f);
+ EXPECT_LT(fabsf(1.0f - QuantizedToFloat<quint8>(255, 0.0f, 1.0f)),
+ 1 / 255.0f);
+ EXPECT_LT(fabsf(2.0f - QuantizedToFloat<quint8>(255, 0.0f, 2.0f)),
+ 1 / 255.0f);
+ EXPECT_LT(fabsf(1.0f - QuantizedToFloat<quint8>(0, 1.0f, 256.0f)),
+ 1 / 255.0f);
+ EXPECT_LT(fabsf(128.0f - QuantizedToFloat<quint8>(127, 1.0f, 256.0f)),
+ 1 / 255.0f);
+ EXPECT_LT(fabsf(256.0f - QuantizedToFloat<quint8>(255, 1.0f, 256.0f)),
+ 1 / 255.0f);
+
+ const int int32_min = std::numeric_limits<int>::min();
+ const int int32_max = std::numeric_limits<int>::max();
+
+ EXPECT_LT(
+ fabsf(-1.0f - QuantizedToFloat<qint32>(qint32(int32_min), -1.0f, 1.0f)),
+ 1e-5f);
+ EXPECT_LT(fabsf(0.0f - QuantizedToFloat<qint32>(qint32(0), -1.0f, 1.0f)),
+ 1e-5f);
+ EXPECT_LT(
+ fabsf(1.0f - QuantizedToFloat<qint32>(qint32(int32_max), -1.0f, 1.0f)),
+ 1e-5f);
+}
+
+TEST_F(QuantizationUtilsTest, AvoidBias) {
+ for (int i = 0; i < 256; ++i) {
+ const float as_float = QuantizedToFloat<quint8>(i, 0.0f, 2.0f);
+ const int back_to_int = FloatToQuantized<quint8>(as_float, 0.0f, 2.0f);
+ EXPECT_EQ(i, back_to_int);
+ }
+}
+
+TEST_F(QuantizationUtilsTest, RequantizeInNewRange) {
+ // These are the float values we're going to test the conversions on.
+ const size_t values_count = 6;
+ const float values[values_count] = {0.0f, 0.5f, 1.0f, -1.0f, 127.0f, 255.0f};
+ // These are the input and output ranges we'll test.
+ const size_t ranges_count = 4;
+ const float ranges[ranges_count][4] = {
+ {0.0f, 255.0f, 0.0f, 255.0f},
+ {0.0f, 1.0f, 0.0f, 1.0f},
+ {-1.0f, 1.0f, -1.0f, 1.0f},
+ {-1.0f, 1.0f, -255.0f, 255.0f},
+ };
+ for (size_t value_index = 0; value_index < values_count; ++value_index) {
+ const float value_float = values[value_index];
+ for (size_t range_index = 0; range_index < ranges_count; ++range_index) {
+ const float input_min = ranges[range_index][0];
+ const float input_max = ranges[range_index][1];
+ const float output_min = ranges[range_index][2];
+ const float output_max = ranges[range_index][3];
+ const quint8 input_value =
+ FloatToQuantized<quint8>(value_float, input_min, input_max);
+ // Here we convert the quantized input value to what we expect
+ // to get in the output range.
+ const qint32 expected_value = FloatToQuantized<qint32>(
+ QuantizedToFloat(input_value, input_min, input_max), output_min,
+ output_max);
+ EXPECT_EQ(expected_value,
+ (RequantizeInNewRange<quint8, qint32>(
+ input_value, input_min, input_max, output_min, output_max)))
+ << "value_float=" << value_float << ", input_min=" << input_min
+ << ", input_max=" << input_max << ", output_min=" << output_min
+ << ", output_max=" << output_max;
+ }
+ }
+}
+
+TEST_F(QuantizationUtilsTest, RequantizeInNewRangeRealData) {
+ const float value_as_float = -0.290169f;
+ const float input_min = -0.739539f;
+ const float input_max = 0.641057f;
+ const float output_min = -2381.49f;
+ const float output_max = 2207.6f;
+ const quint8 value_as_quint8 =
+ FloatToQuantized<quint8>(value_as_float, input_min, input_max);
+ EXPECT_EQ(quint8(83), value_as_quint8);
+ const qint32 actual_output = RequantizeInNewRange<quint8, qint32>(
+ value_as_quint8, input_min, input_max, output_min, output_max);
+ const qint32 value_as_qint32 =
+ FloatToQuantized<qint32>(value_as_float, output_min, output_max);
+ EXPECT_LT(std::abs(value_as_qint32 - actual_output), 10);
+}
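The constant quint8(83) can be reproduced with the same simplified affine arithmetic: (-0.290169 - (-0.739539)) / (0.641057 - (-0.739539)) is roughly 0.3255, and 0.3255 * 255 is roughly 83. A quick standalone check of that (approximate helper only; the exact kernels round slightly differently):

// Sanity check of the hand-picked constants above with a simplified
// affine quantizer (not the real FloatToQuantized).
#include <cassert>
#include <cmath>
#include <cstdint>

int64_t QuantizeApprox(float v, float min, float max, double levels_minus_1,
                       int64_t lowest) {
  return lowest + std::llround((v - min) / (max - min) * levels_minus_1);
}

int main() {
  // quint8 input: -0.290169 in [-0.739539, 0.641057] lands on code 83.
  assert(QuantizeApprox(-0.290169f, -0.739539f, 0.641057f, 255.0, 0) == 83);
  return 0;
}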
+
+TEST_F(QuantizationUtilsTest, RequantizeInNewRange32To8Bit) {
+ // These are the float values we're going to test the conversions on.
+ const size_t values_count = 6;
+ const float values[values_count] = {0.0f, 0.45f, 1.0f, -1.0f, 127.0f, 255.0f};
+ // These are the input and output ranges we'll test.
+ const size_t ranges_count = 4;
+ const float ranges[ranges_count][4] = {
+ {0.0f, 255.0f, 0.0f, 255.0f},
+ {0.0f, 1.0f, 0.0f, 1.0f},
+ {-1.0f, 1.0f, -1.0f, 1.0f},
+ {-1.0f, 1.0f, -255.0f, 255.0f},
+ };
+ for (size_t value_index = 0; value_index < values_count; ++value_index) {
+ const float value_float = values[value_index];
+ for (size_t range_index = 0; range_index < ranges_count; ++range_index) {
+ const float input_min = ranges[range_index][0];
+ const float input_max = ranges[range_index][1];
+ const float output_min = ranges[range_index][2];
+ const float output_max = ranges[range_index][3];
+ const qint32 input_value =
+ FloatToQuantized<qint32>(value_float, input_min, input_max);
+ // Here we convert the quantized input value to what we expect
+ // to get in the output range.
+ const quint8 expected_value = FloatToQuantized<quint8>(
+ QuantizedToFloat(input_value, input_min, input_max), output_min,
+ output_max);
+ EXPECT_EQ(expected_value,
+ (RequantizeInNewRange<qint32, quint8>(
+ input_value, input_min, input_max, output_min, output_max)))
+ << "input_value=" << input_value << ", value_float=" << value_float
+ << ", input_min=" << input_min << ", input_max=" << input_max
+ << ", output_min=" << output_min << ", output_max=" << output_max;
+ }
+ }
+}
+
+TEST_F(QuantizationUtilsTest, RequantizeManyInNewRange32To8Bit) {
+ TestRequantizeManyInNewRange32To8Bit(nullptr /* eigen_device */);
+}
+
+TEST_F(QuantizationUtilsTest, RequantizeManyInNewRange32To8BitUsingEigen) {
+ thread::ThreadPool threadpool(Env::Default(), "test", 2 /* num_threads */);
+ EigenThreadPoolWrapper wrapper(&threadpool);
+ Eigen::ThreadPoolDevice eigen_device(&wrapper, 2 /* num_threads */);
+ TestRequantizeManyInNewRange32To8Bit(&eigen_device);
+}
+
+TEST_F(QuantizationUtilsTest, RequantizeManyInNewRange32To8BitEigenVsNonEigen) {
+ TestRequantizeManyInNewRangeEigenVsNonEigen<qint32, quint8>();
+}
+
+TEST_F(QuantizationUtilsTest,
+ RequantizeManyInNewRange32To8BitSignedEigenVsNonEigen) {
+ TestRequantizeManyInNewRangeEigenVsNonEigen<qint32, qint8>();
+}
+
+TEST_F(QuantizationUtilsTest, FloatTensorToQuantized) {
+ const int input_width = 3;
+ const int input_height = 3;
+ const float input_min = 0.0f;
+ const float input_max = 255.0f;
+ Tensor input(DT_FLOAT, TensorShape({input_height, input_width}));
+ test::FillValues<float>(&input, {1.0f, -1.0f, 10.0f, 10.25f, 127.0f, 255.0f,
+ 512.0f, 0.0f, 23.0f});
+ Tensor expected(DT_QUINT8, TensorShape({input_height, input_width}));
+ test::FillValues<quint8>(&expected, {1, 0, 10, 10, 127, 255, 255, 0, 23});
+ Tensor output = FloatTensorToQuantized<quint8>(input, input_min, input_max);
+ test::ExpectTensorEqual<quint8>(expected, output);
+}
+
+// Verify that FloatToQuantizedInPlaceUsingEigen is same result as
+// FloatToQuantized.
+TEST_F(QuantizationUtilsTest, FloatToQuantizedInPlaceUsingEigen) {
+ thread::ThreadPool threadpool(Env::Default(), "test", 2 /* num_threads */);
+ EigenThreadPoolWrapper wrapper(&threadpool);
+ Eigen::ThreadPoolDevice eigen_device(&wrapper, 2 /* num_threads */);
+
+ TestFloatToQuantizedInPlaceUsingEigen<quint8>(&eigen_device);
+ TestFloatToQuantizedInPlaceUsingEigen<qint8>(&eigen_device);
+ TestFloatToQuantizedInPlaceUsingEigen<quint16>(&eigen_device);
+ TestFloatToQuantizedInPlaceUsingEigen<qint16>(&eigen_device);
+}
+
+TEST_F(QuantizationUtilsTest, OverflowWithEigen) {
+ thread::ThreadPool threadpool(Env::Default(), "test", 2 /* num_threads */);
+ EigenThreadPoolWrapper wrapper(&threadpool);
+ Eigen::ThreadPoolDevice eigen_device(&wrapper, 2 /* num_threads */);
+
+ const int num_vals = 4;
+ const float input_min = 0.0f;
+ const float input_max = 2400.0f;
+ TensorShape shape({num_vals});
+ Tensor input(DT_FLOAT, shape);
+ test::FillValues<float>(&input, {-100.f, 0.f, 2400.0f, 2400.0f});
+ Tensor expected(DT_QINT32, shape);
+ // Note that the positive expected values are not the highest int32 value,
+ // because the implementation does a bounds check using float, not int32.
+ test::FillValues<qint32>(
+ &expected,
+ {static_cast<int32>(-2147483648), static_cast<int32>(-2147483648),
+ static_cast<int32>(2147483520), static_cast<int32>(2147483520)});
+
+ FloatToQuantizedStruct<qint32> f2q(input_min, input_max);
+ Tensor output(DT_QINT32, shape);
+ auto input_array = input.flat<float>();
+ output.flat<qint32>() = QUANTIZE_WITH_EIGEN(input_array, f2q, qint32);
+ test::ExpectTensorEqual<qint32>(expected, output);
+}
+
+TEST_F(QuantizationUtilsTest, QuantizedTensorToFloat) {
+ const int input_width = 3;
+ const int input_height = 3;
+ const float input_min = -128.0f;
+ const float input_max = 127.0f;
+ Tensor input(DT_QUINT8, TensorShape({input_height, input_width}));
+ test::FillValues<quint8>(&input, {0, 128, 255, 23, 24, 25, 243, 244, 245});
+ Tensor expected(DT_FLOAT, TensorShape({input_height, input_width}));
+ test::FillValues<float>(&expected, {-128.0f, 0.0f, 127.0f, -105.0f, -104.0f,
+ -103.0f, 115.0f, 116.0f, 117.0f});
+ Tensor output = QuantizedTensorToFloat<quint8>(input, input_min, input_max);
+ test::ExpectTensorEqual<float>(expected, output);
+}
+
+// Verify that QuantizedToFloatInPlaceUsingEigen is same result as
+// QuantizedToFloat.
+TEST_F(QuantizationUtilsTest, QuantizedToFloatInPlaceUsingEigen) {
+ thread::ThreadPool threadpool(Env::Default(), "test", 2 /* num_threads */);
+ EigenThreadPoolWrapper wrapper(&threadpool);
+ Eigen::ThreadPoolDevice eigen_device(&wrapper, 2 /* num_threads */);
+
+ TestQuantizedToFloatInPlaceUsingEigen<quint8>(&eigen_device);
+ TestQuantizedToFloatInPlaceUsingEigen<qint8>(&eigen_device);
+ TestQuantizedToFloatInPlaceUsingEigen<quint16>(&eigen_device);
+ TestQuantizedToFloatInPlaceUsingEigen<qint16>(&eigen_device);
+ TestQuantizedToFloatInPlaceUsingEigen<qint32>(&eigen_device);
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/contrib/quantization/kernels/quantize_down_and_shrink_range.cc b/tensorflow/contrib/quantization/kernels/quantize_down_and_shrink_range.cc
new file mode 100644
index 0000000000..18dffd1dc6
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/quantize_down_and_shrink_range.cc
@@ -0,0 +1,97 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/array_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include <math.h>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/contrib/quantization/kernels/quantization_utils.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/type_traits.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <class T1, class T2>
+class QuantizeDownAndShrinkRangeOp : public OpKernel {
+ public:
+ explicit QuantizeDownAndShrinkRangeOp(OpKernelConstruction* ctx)
+ : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& input = ctx->input(0);
+ const float input_min_float = ctx->input(1).flat<float>()(0);
+ const float input_max_float = ctx->input(2).flat<float>()(0);
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &output));
+ Tensor* output_min = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(1, TensorShape({}), &output_min));
+ Tensor* output_max = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(2, TensorShape({}), &output_max));
+
+ auto input_array = input.flat<T1>();
+ const int32 input_lowest_quantized =
+ static_cast<int32>(Eigen::NumTraits<T1>::lowest());
+ const int32 input_highest_quantized =
+ static_cast<int32>(Eigen::NumTraits<T1>::highest());
+ T1 actual_min_quantized = input_highest_quantized;
+ T1 actual_max_quantized = input_lowest_quantized;
+ for (int i = 0; i < input_array.size(); ++i) {
+ const T1 value = input_array(i);
+ actual_min_quantized = std::min(actual_min_quantized, value);
+ actual_max_quantized = std::max(actual_max_quantized, value);
+ }
+ // We want to make sure that the minimum is no larger than zero, so that the
+ // convolution operation can run efficiently.
+ const float actual_min_float =
+ std::min(0.0f, QuantizedToFloat(actual_min_quantized, input_min_float,
+ input_max_float));
+ const float actual_max_float = QuantizedToFloat(
+ actual_max_quantized, input_min_float, input_max_float);
+
+#if 0
+ // This is the reference, non-eigen implementation:
+ auto output_array = output->flat<T2>();
+ RequantizeManyInNewRange<T1, T2>(input_array.data(), input_array.size(),
+ input_min_float, input_max_float,
+ actual_min_float, actual_max_float,
+ output_array.data());
+#endif
+
+ if (input_array.size() > 0) {
+ RequantizeManyInNewRangeUsingEigen<T1, T2>(
+ ctx->eigen_device<CPUDevice>(), input, input_min_float,
+ input_max_float, actual_min_float, actual_max_float, output);
+ }
+
+ output_min->flat<float>().setConstant(actual_min_float);
+ output_max->flat<float>().setConstant(actual_max_float);
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("QuantizeDownAndShrinkRange")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<qint32>("Tinput")
+ .TypeConstraint<quint8>("out_type"),
+ QuantizeDownAndShrinkRangeOp<qint32, quint8>);
+
+} // namespace tensorflow
diff --git a/tensorflow/contrib/quantization/kernels/quantize_down_and_shrink_range_op_test.cc b/tensorflow/contrib/quantization/kernels/quantize_down_and_shrink_range_op_test.cc
new file mode 100644
index 0000000000..73a50aad26
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/quantize_down_and_shrink_range_op_test.cc
@@ -0,0 +1,71 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+class QuantizeDownAndShrinkRangeTest : public OpsTestBase {
+ protected:
+};
+
+// Runs a manually generated array through the operator, and makes sure that the
+// results match the expected hand-calculated values.
+TEST_F(QuantizeDownAndShrinkRangeTest, HandCrafted) {
+ TF_ASSERT_OK(NodeDefBuilder("quantize_down_and_shrink_range_op",
+ "QuantizeDownAndShrinkRange")
+ .Input(FakeInput(DT_QINT32))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("Tinput", DataTypeToEnum<qint32>::v())
+ .Attr("out_type", DataTypeToEnum<quint8>::v())
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+
+ // For this test we have an input that has the theoretical range of -256.0f to
+ // +256.0f, but the actual values present only span -1.0f to 1.0f. We expect
+ // the operator to take advantage of this, and rescale the output to fill up
+ // the available range in the lower bit depth, and update to the true min and
+ // max ranges.
+ const int value_count = 3;
+ AddInputFromArray<qint32>(TensorShape({value_count}),
+ {-(1 << 23), 0, (1 << 23)});
+ AddInputFromArray<float>(TensorShape({1}), {-256.0f});
+ AddInputFromArray<float>(TensorShape({1}), {256.0f});
+ TF_ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_QUINT8, TensorShape({value_count}));
+ test::FillValues<quint8>(&expected, {0, 127, 255});
+ test::ExpectTensorEqual<quint8>(expected, *GetOutput(0));
+ Tensor expected_min(allocator(), DT_FLOAT, TensorShape({}));
+ test::FillValues<float>(&expected_min, {-1.0f});
+ test::ExpectTensorEqual<float>(expected_min, *GetOutput(1));
+ Tensor expected_max(allocator(), DT_FLOAT, TensorShape({}));
+ test::FillValues<float>(&expected_max, {1.0f});
+ test::ExpectTensorEqual<float>(expected_max, *GetOutput(2));
+}
+
+} // end namespace tensorflow
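The hand-calculated values in the test above are easier to follow once you note that the qint32 range of roughly ±2^31 is mapped onto ±256.0f, so one float unit corresponds to about 2^31 / 256 = 2^23 quantized counts. The inputs ±(1 << 23) therefore decode to roughly ±1.0f, which is why the op shrinks the reported range to [-1, 1] and rescales the payload to {0, 127, 255}. A standalone check of that scale arithmetic (approximate; it ignores the 2^32 - 1 step count the real helpers use):

// Approximate decode of the qint32 test inputs: [-256, 256] spread over the
// full int32 range gives about 2^23 counts per float unit.
#include <cassert>
#include <cmath>

int main() {
  const double min = -256.0, max = 256.0;
  const double counts_per_unit = 4294967296.0 / (max - min);  // 2^32 / 512
  assert(counts_per_unit == 8388608.0);                       // == 1 << 23
  const double decoded = (1 << 23) / counts_per_unit;         // input 2^23
  assert(std::fabs(decoded - 1.0) < 1e-6);                    // ~= +1.0f
  return 0;
}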
diff --git a/tensorflow/contrib/quantization/kernels/quantize_op.cc b/tensorflow/contrib/quantization/kernels/quantize_op.cc
new file mode 100644
index 0000000000..2bab8ad447
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/quantize_op.cc
@@ -0,0 +1,159 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/math_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/contrib/quantization/kernels/quantization_utils.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/type_traits.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace {
+enum { QUANTIZE_MODE_MIN_COMBINED, QUANTIZE_MODE_MIN_FIRST };
+} // namespace
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+// Quantize a tensor from float to T, with user-specified min_range and
+// max_range.
+// TODO(xbing): Add a new QuantizeOp just taking scale,
+// rather than min_range and max_range.
+template <typename Device, typename T>
+class QuantizeV2Op : public OpKernel {
+ public:
+ explicit QuantizeV2Op(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ half_range_ = !std::is_signed<T>::value
+ ? 0.0f
+ : (std::numeric_limits<T>::max() -
+ std::numeric_limits<T>::min() + 1) /
+ 2.0f;
+ string mode_string;
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("mode", &mode_string));
+ OP_REQUIRES(ctx,
+ (mode_string == "MIN_COMBINED" || mode_string == "MIN_FIRST"),
+ errors::InvalidArgument("Mode string must be 'MIN_COMBINED' or"
+ " 'MIN_FIRST', is '" +
+ mode_string + "'"));
+ if (mode_string == "MIN_COMBINED") {
+ mode_ = QUANTIZE_MODE_MIN_COMBINED;
+ } else if (mode_string == "MIN_FIRST") {
+ mode_ = QUANTIZE_MODE_MIN_FIRST;
+ }
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& input = ctx->input(0);
+ const float input_min_range = ctx->input(1).flat<float>()(0);
+ const float input_max_range = ctx->input(2).flat<float>()(0);
+
+ float min_range;
+ float max_range;
+ OP_REQUIRES(ctx, !(input_max_range < input_min_range),
+ errors::InvalidArgument(
+ "input_max_range must be at least as large as input_min_range."));
+
+ // When the minimum and maximum ranges are too close together, nudge them
+ // apart by a small value so that they are slightly different. This helps
+ // us avoid creating ill-formed buffers where all quantized values map to
+ // the same float number. These kinds of buffers cause problems for
+ // downstream ops when they need to do calculations on them.
+ // We pick the value by making sure that zero is not more than 100x the
+ // overall range from the maximum, so that the value can be easily
+ // represented when we promote the quantized value to a higher
+ // intermediate bit depth, since that's a common requirement.
+ min_range = input_min_range;
+ const float epsilon = std::max(1.0f, std::max(fabsf(input_min_range),
+ fabsf(input_max_range))) /
+ 100.0f;
+ max_range = std::max(input_max_range, input_min_range + epsilon);
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &output));
+ if (mode_ == QUANTIZE_MODE_MIN_COMBINED) {
+ const float scale_factor =
+ (std::numeric_limits<T>::max() - std::numeric_limits<T>::min()) /
+ (max_range - min_range);
+
+ // Quantize:
+ // Make input in range of [min_range, max_range], then
+ // subtract min_range to be in range of [0, max_range - min_range]
+ // Divide by (max_range - min_range) to get to [0, 1.0]
+ // Multiply by range of T, after that shift left 1/2 range of T if
+ // T is signed.
+ // Note that std::round is used to round the number before the cast.
+ // std::round implements "round-half-away-from-zero",
+ // e.g., -5.5 gets rounded to -6, -5.4 goes to -5, 5.4 goes to 5,
+ // and 5.5 goes to 6.
+ auto o = output->template flat<T>();
+ bool is_signed = std::is_signed<T>::value;
+ if (is_signed) {
+ // The slow path.
+ // TODO(xbing,yonghui): Speedup this path as well.
+ o.device(ctx->template eigen_device<Device>()) =
+ ((input.flat<float>().cwiseMin(max_range).cwiseMax(min_range) -
+ min_range) *
+ scale_factor -
+ half_range_)
+ .unaryExpr(std::function<float(float)>(round))
+ .template cast<T>();
+ } else {
+ // The fast path that avoids unaryExpr
+ // According to the micro-benchmark, adding device here doesn't help.
+ o = ((input.flat<float>().cwiseMin(max_range).cwiseMax(min_range) -
+ min_range) *
+ scale_factor +
+ 0.5f)
+ .template cast<T>();
+ }
+ } else if (mode_ == QUANTIZE_MODE_MIN_FIRST) {
+ FloatTensorToQuantizedInPlaceUsingEigen<T>(
+ ctx->template eigen_device<Device>(), input, min_range, max_range,
+ output);
+ }
+
+ Tensor* output_min_tensor = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(1, {}, &output_min_tensor));
+ output_min_tensor->flat<float>()(0) = min_range;
+
+ Tensor* output_max_tensor = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(2, {}, &output_max_tensor));
+ output_max_tensor->flat<float>()(0) = max_range;
+ }
+
+ private:
+ float half_range_;
+ int mode_;
+};
+
+REGISTER_KERNEL_BUILDER(
+ Name("QuantizeV2").Device(DEVICE_CPU).TypeConstraint<quint8>("T"),
+ QuantizeV2Op<CPUDevice, quint8>);
+REGISTER_KERNEL_BUILDER(
+ Name("QuantizeV2").Device(DEVICE_CPU).TypeConstraint<qint8>("T"),
+ QuantizeV2Op<CPUDevice, qint8>);
+REGISTER_KERNEL_BUILDER(
+ Name("QuantizeV2").Device(DEVICE_CPU).TypeConstraint<quint16>("T"),
+ QuantizeV2Op<CPUDevice, quint16>);
+REGISTER_KERNEL_BUILDER(
+ Name("QuantizeV2").Device(DEVICE_CPU).TypeConstraint<qint16>("T"),
+ QuantizeV2Op<CPUDevice, qint16>);
+
+} // namespace tensorflow
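A standalone sketch of the MIN_COMBINED arithmetic for a signed 8-bit target, mirroring the clamp, scale, and half-range shift in the signed branch above (simplified stand-in, not the Eigen expression; half_range_ is 128 for an 8-bit signed type and the scale uses the 255 representable steps):

// Simplified MIN_COMBINED quantization to int8: clamp to [min_range,
// max_range], rescale to 255 steps, then shift down by half the range so the
// result is centered on zero, rounding half away from zero like std::round.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

int8_t QuantizeMinCombinedInt8(float v, float min_range, float max_range) {
  const float clamped = std::min(max_range, std::max(min_range, v));
  const float scale = 255.0f / (max_range - min_range);
  const float half_range = 128.0f;  // (127 - (-128) + 1) / 2
  return static_cast<int8_t>(
      std::lround((clamped - min_range) * scale - half_range));
}

int main() {
  // With the range [-1, 1]: -1 maps to -128 and +1 maps to 127.
  assert(QuantizeMinCombinedInt8(-1.0f, -1.0f, 1.0f) == -128);
  assert(QuantizeMinCombinedInt8(1.0f, -1.0f, 1.0f) == 127);
  // 0.0 lands on (0 - (-1)) * 127.5 - 128 = -0.5, which rounds away from
  // zero to -1: the representable grid does not hit 0.0 exactly here.
  assert(QuantizeMinCombinedInt8(0.0f, -1.0f, 1.0f) == -1);
  return 0;
}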
diff --git a/tensorflow/contrib/quantization/kernels/quantize_op_test.cc b/tensorflow/contrib/quantization/kernels/quantize_op_test.cc
new file mode 100644
index 0000000000..d3ac7d3f7c
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/quantize_op_test.cc
@@ -0,0 +1,113 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+class QuantizedOpTest : public OpsTestBase {
+ protected:
+};
+
+TEST_F(QuantizedOpTest, QuantizeV2) {
+ TF_ASSERT_OK(NodeDefBuilder("quantize_op", "QuantizeV2")
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("T", DataTypeToEnum<quint8>::v())
+ .Attr("mode", "MIN_FIRST")
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+ AddInputFromArray<float>(TensorShape({6}),
+ {1.0, 1.25, 1.75, 127.0, 255.0, 500.0});
+ AddInputFromArray<float>(TensorShape({1}), {0});
+ AddInputFromArray<float>(TensorShape({1}), {255.0f});
+ TF_ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_QUINT8, TensorShape({6}));
+ test::FillValues<quint8>(&expected, {1, 1, 2, 127, 255, 255});
+ test::ExpectTensorEqual<quint8>(expected, *GetOutput(0));
+}
+
+TEST_F(QuantizedOpTest, QuantizeV2Ports) {
+ TF_ASSERT_OK(NodeDefBuilder("quantize_op", "QuantizeV2")
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("T", DataTypeToEnum<quint8>::v())
+ .Attr("mode", "MIN_FIRST")
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+ AddInputFromArray<float>(TensorShape({6}),
+ {1.0, 1.25, 1.75, 127.0, 255.0, 500.0});
+ AddInputFromArray<float>(TensorShape({1}), {0});
+ AddInputFromArray<float>(TensorShape({1}), {255.0f});
+ TF_ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_QUINT8, TensorShape({6}));
+ test::FillValues<quint8>(&expected, {1, 1, 2, 127, 255, 255});
+ test::ExpectTensorEqual<quint8>(expected, *GetOutput(0));
+ const float output_min = GetOutput(1)->flat<float>()(0);
+ const float output_max = GetOutput(2)->flat<float>()(0);
+ EXPECT_NEAR(0.0f, output_min, 1e-5f);
+ EXPECT_NEAR(255.0f, output_max, 1e-5f);
+}
+
+TEST_F(QuantizedOpTest, QuantizeV2EqualRange) {
+ TF_ASSERT_OK(NodeDefBuilder("quantize_op", "QuantizeV2")
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("T", DataTypeToEnum<quint8>::v())
+ .Attr("mode", "MIN_FIRST")
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+ AddInputFromArray<float>(TensorShape({6}), {1.0, 1.0, 1.0, 1.0, 1.0, 1.0});
+ AddInputFromArray<float>(TensorShape({1}), {1.0f});
+ AddInputFromArray<float>(TensorShape({1}), {1.0f});
+ TF_ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_QUINT8, TensorShape({6}));
+ test::FillValues<quint8>(&expected, {0, 0, 0, 0, 0, 0});
+ test::ExpectTensorEqual<quint8>(expected, *GetOutput(0));
+ const float output_min = GetOutput(1)->flat<float>()(0);
+ const float output_max = GetOutput(2)->flat<float>()(0);
+ EXPECT_NEAR(1.0f, output_min, 1e-5f);
+ EXPECT_LT(1.0f, output_max);
+}
+
+TEST_F(QuantizedOpTest, Dequantize) {
+ TF_ASSERT_OK(NodeDefBuilder("dequantize_op", "Dequantize")
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("T", DataTypeToEnum<quint8>::v())
+ .Attr("mode", "MIN_FIRST")
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+ AddInputFromArray<quint8>(TensorShape({6}), {1, 2, 4, 8, 16, 255});
+ AddInputFromArray<float>(TensorShape({1}), {0});
+ AddInputFromArray<float>(TensorShape({1}), {255.0f});
+ TF_ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
+ test::FillValues<float>(&expected, {1.0, 2.0, 4.0, 8.0, 16.0, 255.0});
+ test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.5);
+}
+
+} // end namespace tensorflow
diff --git a/tensorflow/contrib/quantization/kernels/quantized_activation_ops.cc b/tensorflow/contrib/quantization/kernels/quantized_activation_ops.cc
new file mode 100644
index 0000000000..a86b611ad6
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/quantized_activation_ops.cc
@@ -0,0 +1,101 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Implements quantized versions of the Relu and Relu6 operations.
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/contrib/quantization/kernels/quantization_utils.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+template <typename T>
+class QuantizedReluOp : public OpKernel {
+ public:
+ explicit QuantizedReluOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const float min_input = context->input(1).flat<float>()(0);
+ const float max_input = context->input(2).flat<float>()(0);
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input.shape(), &output));
+ const T min_as_quantized = FloatToQuantized<T>(0.0f, min_input, max_input);
+ output->flat<T>().device(context->eigen_cpu_device()) =
+ input.flat<T>().cwiseMax(min_as_quantized).template cast<T>();
+ Tensor* output_min = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(1, {}, &output_min));
+ output_min->flat<float>()(0) = min_input;
+ Tensor* output_max = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(2, {}, &output_max));
+ output_max->flat<float>()(0) = max_input;
+ }
+};
+
+template <typename T>
+class QuantizedRelu6Op : public OpKernel {
+ public:
+ explicit QuantizedRelu6Op(OpKernelConstruction* context)
+ : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const float min_input = context->input(1).flat<float>()(0);
+ const float max_input = context->input(2).flat<float>()(0);
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input.shape(), &output));
+ const T min_as_quantized = FloatToQuantized<T>(0.0f, min_input, max_input);
+ const T max_as_quantized = FloatToQuantized<T>(6.0f, min_input, max_input);
+ output->flat<T>().device(context->eigen_cpu_device()) =
+ input.flat<T>()
+ .cwiseMax(min_as_quantized)
+ .cwiseMin(max_as_quantized)
+ .template cast<T>();
+ Tensor* output_min = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(1, {}, &output_min));
+ output_min->flat<float>()(0) = min_input;
+ Tensor* output_max = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(2, {}, &output_max));
+ output_max->flat<float>()(0) = max_input;
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("QuantizedRelu")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<qint32>("Tinput")
+ .TypeConstraint<qint32>("out_type"),
+ QuantizedReluOp<qint32>);
+REGISTER_KERNEL_BUILDER(Name("QuantizedRelu")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<quint8>("Tinput")
+ .TypeConstraint<quint8>("out_type"),
+ QuantizedReluOp<quint8>);
+
+REGISTER_KERNEL_BUILDER(Name("QuantizedRelu6")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<qint32>("Tinput")
+ .TypeConstraint<qint32>("out_type"),
+ QuantizedRelu6Op<qint32>);
+REGISTER_KERNEL_BUILDER(Name("QuantizedRelu6")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<quint8>("Tinput")
+ .TypeConstraint<quint8>("out_type"),
+ QuantizedRelu6Op<quint8>);
+} // namespace tensorflow
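The kernels above never leave quantized space: QuantizedRelu is a cwiseMax against the quantized encoding of 0.0f, QuantizedRelu6 additionally clamps against the encoding of 6.0f, and the input min/max are passed through unchanged as the output range. A standalone sketch of that idea with a simplified uint8 encoding (approximate helper, not FloatToQuantized):

// Simplified quantized Relu6: clamp the raw uint8 codes against the codes
// for 0.0f and 6.0f instead of dequantizing first.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

uint8_t EncodeUint8(float v, float min, float max) {
  return static_cast<uint8_t>(std::lround((v - min) / (max - min) * 255.0f));
}

uint8_t QuantizedRelu6Code(uint8_t q, float min, float max) {
  const uint8_t q_zero = EncodeUint8(0.0f, min, max);
  const uint8_t q_six = EncodeUint8(6.0f, min, max);
  return std::min(q_six, std::max(q_zero, q));
}

int main() {
  const float min = -128.0f, max = 127.0f;  // same range as the tests above
  // -100 clamps up to the code for 0, +100 clamps down to the code for 6,
  // and 3 passes through unchanged.
  assert(QuantizedRelu6Code(EncodeUint8(-100.0f, min, max), min, max) ==
         EncodeUint8(0.0f, min, max));
  assert(QuantizedRelu6Code(EncodeUint8(100.0f, min, max), min, max) ==
         EncodeUint8(6.0f, min, max));
  assert(QuantizedRelu6Code(EncodeUint8(3.0f, min, max), min, max) ==
         EncodeUint8(3.0f, min, max));
  return 0;
}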
diff --git a/tensorflow/contrib/quantization/kernels/quantized_activation_ops_test.cc b/tensorflow/contrib/quantization/kernels/quantized_activation_ops_test.cc
new file mode 100644
index 0000000000..19efe6093e
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/quantized_activation_ops_test.cc
@@ -0,0 +1,99 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/quantization/kernels/quantization_utils.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+class QuantizedActivationsTest : public OpsTestBase {
+ protected:
+};
+
+TEST_F(QuantizedActivationsTest, TestRelu) {
+ TF_ASSERT_OK(NodeDefBuilder("quantized_relu_op", "QuantizedRelu")
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+ const float input_min = -128.0f;
+ const float input_max = 127.0f;
+ const int input_width = 2;
+ const int input_height = 4;
+ Tensor input_float(DT_FLOAT, {input_height, input_width});
+ test::FillValues<float>(&input_float, {-100, -1, 0, 1, 3, 6, 7, 100});
+ Tensor input_quantized =
+ FloatTensorToQuantized<quint8>(input_float, input_min, input_max);
+ Tensor expected_float(DT_FLOAT, {input_height, input_width});
+ test::FillValues<float>(&expected_float, {0, 0, 0, 1, 3, 6, 7, 100});
+
+ AddInputFromArray<quint8>(input_quantized.shape(),
+ input_quantized.flat<quint8>());
+ AddInputFromArray<float>(TensorShape({1}), {input_min});
+ AddInputFromArray<float>(TensorShape({1}), {input_max});
+ TF_ASSERT_OK(RunOpKernel());
+ const Tensor& output_quantized = *GetOutput(0);
+ const float output_min = GetOutput(1)->flat<float>()(0);
+ const float output_max = GetOutput(2)->flat<float>()(0);
+ Tensor output_float =
+ QuantizedTensorToFloat<quint8>(output_quantized, output_min, output_max);
+ test::ExpectTensorNear<float>(expected_float, output_float, 0.2);
+}
+
+TEST_F(QuantizedActivationsTest, TestRelu6) {
+ TF_ASSERT_OK(NodeDefBuilder("quantized_relu6_op", "QuantizedRelu6")
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+ const float input_min = -128.0f;
+ const float input_max = 127.0f;
+ const int input_width = 2;
+ const int input_height = 4;
+ Tensor input_float(DT_FLOAT, {input_height, input_width});
+ test::FillValues<float>(&input_float, {-100, -1, 0, 1, 3, 6, 7, 100});
+ Tensor input_quantized =
+ FloatTensorToQuantized<quint8>(input_float, input_min, input_max);
+ Tensor expected_float(DT_FLOAT, {input_height, input_width});
+ test::FillValues<float>(&expected_float, {0, 0, 0, 1, 3, 6, 6, 6});
+
+ AddInputFromArray<quint8>(input_quantized.shape(),
+ input_quantized.flat<quint8>());
+ AddInputFromArray<float>(TensorShape({1}), {input_min});
+ AddInputFromArray<float>(TensorShape({1}), {input_max});
+ TF_ASSERT_OK(RunOpKernel());
+ const Tensor& output_quantized = *GetOutput(0);
+ const float output_min = GetOutput(1)->flat<float>()(0);
+ const float output_max = GetOutput(2)->flat<float>()(0);
+ Tensor output_float =
+ QuantizedTensorToFloat<quint8>(output_quantized, output_min, output_max);
+ test::ExpectTensorNear<float>(expected_float, output_float, 0.2);
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/contrib/quantization/kernels/quantized_batch_norm_op.cc b/tensorflow/contrib/quantization/kernels/quantized_batch_norm_op.cc
new file mode 100644
index 0000000000..2a684824d3
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/quantized_batch_norm_op.cc
@@ -0,0 +1,240 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/contrib/quantization/kernels/quantization_utils.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+namespace {
+
+// A slow but straightforward implementation of batch normalization.
+template <typename T1, typename T2>
+void ReferenceBatchNorm(const Tensor& input, const float input_min,
+ const float input_max, const Tensor& mean,
+ float mean_min, float mean_max, const Tensor& var,
+ float var_min, float var_max, const Tensor& beta,
+ float beta_min, float beta_max, const Tensor& gamma,
+ float gamma_min, float gamma_max,
+ float variance_epsilon, bool scale_after_normalization,
+ Tensor* output, float* output_min, float* output_max) {
+ auto input_flat = input.flat<T1>();
+ auto mean_flat = mean.flat<T1>();
+ auto var_flat = var.flat<T1>();
+ auto beta_flat = beta.flat<T1>();
+ auto gamma_flat = gamma.flat<T1>();
+ auto output_flat = output->flat<T2>();
+
+ const int depth = mean.dim_size(0);
+ const int row_count = input_flat.size() / depth;
+
+ *output_min = std::numeric_limits<float>::max();
+ *output_max = std::numeric_limits<float>::lowest();
+ for (int pass = 0; pass < 2; ++pass) {
+ const bool is_range_pass = (pass == 0);
+ for (int row_index = 0; row_index < row_count; ++row_index) {
+ for (int channel = 0; channel < depth; ++channel) {
+ const int input_index = (row_index * depth) + channel;
+ const float input_value =
+ QuantizedToFloat(input_flat(input_index), input_min, input_max);
+ const float mean_value =
+ QuantizedToFloat(mean_flat(channel), mean_min, mean_max);
+ const float var_value =
+ QuantizedToFloat(var_flat(channel), var_min, var_max);
+ const float beta_value =
+ QuantizedToFloat(beta_flat(channel), beta_min, beta_max);
+ const float gamma_value =
+ QuantizedToFloat(gamma_flat(channel), gamma_min, gamma_max);
+ float output_value;
+ if (scale_after_normalization) {
+ output_value = (((input_value - mean_value) /
+ sqrtf(var_value + variance_epsilon)) *
+ gamma_value) +
+ beta_value;
+ } else {
+ output_value = ((input_value - mean_value) /
+ sqrtf(var_value + variance_epsilon)) +
+ beta_value;
+ }
+ if (is_range_pass) {
+ *output_min = std::min(output_value, *output_min);
+ *output_max = std::max(output_value, *output_max);
+ } else {
+ output_flat(input_index) =
+ FloatToQuantized<T2>(output_value, *output_min, *output_max);
+ }
+ }
+ }
+ }
+}
+
+// An implementation of batch normalization that does the main calculations
+// using only fixed-point arithmetic. There's a prologue with some
+// floating-point calculations, but assuming the weights are constant these
+// could be hoisted to an offline process, or baked into the weights.
+template <typename T1, typename T2>
+void FixedPointBatchNorm(const Tensor& input, const float input_min,
+ const float input_max, const Tensor& mean,
+ float mean_min, float mean_max, const Tensor& var,
+ float var_min, float var_max, const Tensor& beta,
+ float beta_min, float beta_max, const Tensor& gamma,
+ float gamma_min, float gamma_max,
+ float variance_epsilon, bool scale_after_normalization,
+ Tensor* output, float* output_min, float* output_max) {
+ auto input_flat = input.flat<T1>();
+ auto mean_flat = mean.flat<T1>();
+ auto var_flat = var.flat<T1>();
+ auto beta_flat = beta.flat<T1>();
+ auto gamma_flat = gamma.flat<T1>();
+ auto output_flat = output->flat<T2>();
+
+ const int depth = mean.dim_size(0);
+ const int row_count = input_flat.size() / depth;
+
+ // The range here is chosen so that typical input values fit in without any
+ // overflow or loss of precision, going from -1M to +1M with 10 bits of
+ // fixed-point precision.
+ *output_min = -(1 << 20);
+ *output_max = (1 << 20);
+
+ Tensor scale_tensor(DataTypeToEnum<T2>::v(), {depth});
+ auto scale_flat = scale_tensor.flat<T2>();
+ Tensor offset_tensor(DataTypeToEnum<T2>::v(), {depth});
+ auto offset_flat = offset_tensor.flat<T2>();
+ for (int channel = 0; channel < depth; ++channel) {
+ const float mean_value =
+ QuantizedToFloat(mean_flat(channel), mean_min, mean_max);
+ const float var_value =
+ QuantizedToFloat(var_flat(channel), var_min, var_max);
+ const float beta_value =
+ QuantizedToFloat(beta_flat(channel), beta_min, beta_max);
+ const float gamma_value =
+ QuantizedToFloat(gamma_flat(channel), gamma_min, gamma_max);
+ float scale_value;
+ if (scale_after_normalization) {
+ scale_value = (1.0f / sqrtf(var_value + variance_epsilon)) * gamma_value;
+ } else {
+ scale_value = (1.0f / sqrtf(var_value + variance_epsilon));
+ }
+ const float offset_value = (-mean_value * scale_value) + beta_value;
+ scale_flat(channel) =
+ FloatToQuantized<T2>(scale_value, *output_min, *output_max);
+ offset_flat(channel) =
+ FloatToQuantized<T2>(offset_value, *output_min, *output_max);
+ }
+
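+ // one_in_output_space is the quantized encoding of 1.0f in the output range;
+ // dividing by it rescales the (input * scale) product back into that range,
+ // keeping the whole inner loop in integer arithmetic.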
+ const T2 one_in_output_space =
+ FloatToQuantized<T2>(1.0f, *output_min, *output_max);
+ for (int row_index = 0; row_index < row_count; ++row_index) {
+ for (int channel = 0; channel < depth; ++channel) {
+ const int input_index = (row_index * depth) + channel;
+ const T2 input_value =
+ RequantizeInNewRange<T1, T2>(input_flat(input_index), input_min,
+ input_max, *output_min, *output_max);
+ const T2 scale_value = scale_flat(channel);
+ const T2 offset_value = offset_flat(channel);
+ const T2 output_value =
+ ((input_value * scale_value) / one_in_output_space) + offset_value;
+ output_flat(input_index) = output_value;
+ }
+ }
+}
+
+} // namespace
+
+template <typename T1, typename T2>
+class QuantizedBatchNormOp : public OpKernel {
+ public:
+ explicit QuantizedBatchNormOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context,
+ context->GetAttr("variance_epsilon", &variance_epsilon_));
+ OP_REQUIRES_OK(context, context->GetAttr("scale_after_normalization",
+ &scale_after_normalization_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const float input_min = context->input(1).flat<float>()(0);
+ const float input_max = context->input(2).flat<float>()(0);
+ const Tensor& mean = context->input(3);
+ const float mean_min = context->input(4).flat<float>()(0);
+ const float mean_max = context->input(5).flat<float>()(0);
+ const Tensor& var = context->input(6);
+ const float var_min = context->input(7).flat<float>()(0);
+ const float var_max = context->input(8).flat<float>()(0);
+ const Tensor& beta = context->input(9);
+ const float beta_min = context->input(10).flat<float>()(0);
+ const float beta_max = context->input(11).flat<float>()(0);
+ const Tensor& gamma = context->input(12);
+ const float gamma_min = context->input(13).flat<float>()(0);
+ const float gamma_max = context->input(14).flat<float>()(0);
+
+ OP_REQUIRES(context, input.dims() == 4,
+ errors::InvalidArgument("input must be 4-dimensional",
+ input.shape().DebugString()));
+ OP_REQUIRES(context, mean.dims() == 1,
+ errors::InvalidArgument("mean must be 1-dimensional",
+ mean.shape().DebugString()));
+ OP_REQUIRES(context, var.dims() == 1,
+ errors::InvalidArgument("var must be 1-dimensional",
+ var.shape().DebugString()));
+ OP_REQUIRES(context, beta.dims() == 1,
+ errors::InvalidArgument("beta must be 1-dimensional",
+ beta.shape().DebugString()));
+ OP_REQUIRES(context, gamma.dims() == 1,
+ errors::InvalidArgument("gamma must be 1-dimensional",
+ gamma.shape().DebugString()));
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input.shape(), &output));
+ float output_min;
+ float output_max;
+ FixedPointBatchNorm<T1, T2>(input, input_min, input_max, mean, mean_min,
+ mean_max, var, var_min, var_max, beta, beta_min,
+ beta_max, gamma, gamma_min, gamma_max,
+ variance_epsilon_, scale_after_normalization_,
+ output, &output_min, &output_max);
+
+ Tensor* output_min_tensor = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(1, {}, &output_min_tensor));
+ output_min_tensor->flat<float>()(0) = output_min;
+
+ Tensor* output_max_tensor = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(2, {}, &output_max_tensor));
+ output_max_tensor->flat<float>()(0) = output_max;
+ }
+
+ private:
+ float variance_epsilon_;
+ bool scale_after_normalization_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("QuantizedBatchNormWithGlobalNormalization")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<quint8>("Tinput")
+ .TypeConstraint<qint32>("out_type"),
+ QuantizedBatchNormOp<quint8, qint32>);
+
+} // namespace tensorflow
diff --git a/tensorflow/contrib/quantization/kernels/quantized_batch_norm_op_test.cc b/tensorflow/contrib/quantization/kernels/quantized_batch_norm_op_test.cc
new file mode 100644
index 0000000000..ccb6a59ecf
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/quantized_batch_norm_op_test.cc
@@ -0,0 +1,242 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/contrib/quantization/kernels/quantization_utils.h"
+#include "tensorflow/core/common_runtime/eigen_thread_pool.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/batch_norm_op.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+class QuantizedBatchNormOpTest : public OpsTestBase {};
+
+TEST_F(QuantizedBatchNormOpTest, Simple) {
+ TF_EXPECT_OK(NodeDefBuilder("quantized_batch_norm_op",
+ "QuantizedBatchNormWithGlobalNormalization")
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("scale_after_normalization", false)
+ .Attr("variance_epsilon", 0.001)
+ .Attr("Tinput", DT_QUINT8)
+ .Attr("out_type", DT_QINT32)
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+ const float input_min = -128.0f;
+ const float input_max = 127.0f;
+ const int input_batch = 1;
+ const int input_height = 1;
+ const int input_width = 6;
+ const int input_depth = 2;
+ Tensor input_float(DT_FLOAT,
+ {input_batch, input_height, input_width, input_depth});
+ test::FillValues<float>(&input_float,
+ {1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6});
+ Tensor input_quantized =
+ FloatTensorToQuantized<quint8>(input_float, input_min, input_max);
+ const float mean_min = 0.0f;
+ const float mean_max = 20.0f;
+ Tensor mean_float(DT_FLOAT, {input_depth});
+ test::FillValues<float>(&mean_float, {10, 20});
+ Tensor mean_quantized =
+ FloatTensorToQuantized<quint8>(mean_float, mean_min, mean_max);
+ const float variance_min = 0.0f;
+ const float variance_max = 1.0f;
+ Tensor variance_float(DT_FLOAT, {input_depth});
+ test::FillValues<float>(&variance_float, {0.25, 0.5});
+ Tensor variance_quantized = FloatTensorToQuantized<quint8>(
+ variance_float, variance_min, variance_max);
+ const float beta_min = 0.0f;
+ const float beta_max = 1.0f;
+ Tensor beta_float(DT_FLOAT, {input_depth});
+ test::FillValues<float>(&beta_float, {0.1, 0.6});
+ Tensor beta_quantized =
+ FloatTensorToQuantized<quint8>(beta_float, beta_min, beta_max);
+ const float gamma_min = 0.0f;
+ const float gamma_max = 1.0f;
+ Tensor gamma_float(DT_FLOAT, {input_depth});
+ test::FillValues<float>(&gamma_float, {0.0, 0.0});
+ Tensor gamma_quantized =
+ FloatTensorToQuantized<quint8>(gamma_float, gamma_min, gamma_max);
+
+ AddInputFromArray<quint8>(input_quantized.shape(),
+ input_quantized.flat<quint8>());
+ AddInputFromArray<float>(TensorShape({1}), {input_min});
+ AddInputFromArray<float>(TensorShape({1}), {input_max});
+ AddInputFromArray<quint8>(mean_quantized.shape(),
+ mean_quantized.flat<quint8>());
+ AddInputFromArray<float>(TensorShape({1}), {mean_min});
+ AddInputFromArray<float>(TensorShape({1}), {mean_max});
+ AddInputFromArray<quint8>(variance_quantized.shape(),
+ variance_quantized.flat<quint8>());
+ AddInputFromArray<float>(TensorShape({1}), {variance_min});
+ AddInputFromArray<float>(TensorShape({1}), {variance_max});
+ AddInputFromArray<quint8>(beta_quantized.shape(),
+ beta_quantized.flat<quint8>());
+ AddInputFromArray<float>(TensorShape({1}), {beta_min});
+ AddInputFromArray<float>(TensorShape({1}), {beta_max});
+ AddInputFromArray<quint8>(gamma_quantized.shape(),
+ gamma_quantized.flat<quint8>());
+ AddInputFromArray<float>(TensorShape({1}), {gamma_min});
+ AddInputFromArray<float>(TensorShape({1}), {gamma_max});
+ TF_ASSERT_OK(RunOpKernel());
+
+ Tensor expected_float(
+ allocator(), DT_FLOAT,
+ TensorShape({input_batch, input_height, input_width, input_depth}));
+ test::FillValues<float>(
+ &expected_float, {-17.86, -22.00, -15.87, -20.59, -13.87, -19.18, -21.86,
+ -33.31, -23.85, -34.72, -25.85, -36.13});
+ const Tensor& output_quantized = *GetOutput(0);
+ const float output_min = GetOutput(1)->flat<float>()(0);
+ const float output_max = GetOutput(2)->flat<float>()(0);
+ Tensor output_float =
+ QuantizedTensorToFloat<qint32>(output_quantized, output_min, output_max);
+ test::ExpectTensorNear<float>(expected_float, output_float, 0.1);
+}
+
+TEST_F(QuantizedBatchNormOpTest, SameAsFloat) {
+ TF_EXPECT_OK(NodeDefBuilder("quantized_batch_norm_op",
+ "QuantizedBatchNormWithGlobalNormalization")
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("scale_after_normalization", false)
+ .Attr("variance_epsilon", 0.001)
+ .Attr("Tinput", DT_QUINT8)
+ .Attr("out_type", DT_QINT32)
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+ const float input_min = -128.0f;
+ const float input_max = 127.0f;
+ const int input_batch = 1;
+ const int input_height = 1;
+ const int input_width = 6;
+ const int input_depth = 2;
+ Tensor input_float(DT_FLOAT,
+ {input_batch, input_height, input_width, input_depth});
+ test::FillValues<float>(&input_float,
+ {1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6});
+ Tensor input_quantized =
+ FloatTensorToQuantized<quint8>(input_float, input_min, input_max);
+ const float mean_min = 0.0f;
+ const float mean_max = 20.0f;
+ Tensor mean_float(DT_FLOAT, {input_depth});
+ test::FillValues<float>(&mean_float, {10, 20});
+ Tensor mean_quantized =
+ FloatTensorToQuantized<quint8>(mean_float, mean_min, mean_max);
+ const float variance_min = 0.0f;
+ const float variance_max = 1.0f;
+ Tensor variance_float(DT_FLOAT, {input_depth});
+ test::FillValues<float>(&variance_float, {0.25, 0.5});
+ Tensor variance_quantized = FloatTensorToQuantized<quint8>(
+ variance_float, variance_min, variance_max);
+ const float beta_min = 0.0f;
+ const float beta_max = 1.0f;
+ Tensor beta_float(DT_FLOAT, {input_depth});
+ test::FillValues<float>(&beta_float, {0.1, 0.6});
+ Tensor beta_quantized =
+ FloatTensorToQuantized<quint8>(beta_float, beta_min, beta_max);
+ const float gamma_min = 0.0f;
+ const float gamma_max = 1.0f;
+ Tensor gamma_float(DT_FLOAT, {input_depth});
+ test::FillValues<float>(&gamma_float, {0.0, 0.0});
+ Tensor gamma_quantized =
+ FloatTensorToQuantized<quint8>(gamma_float, gamma_min, gamma_max);
+
+ AddInputFromArray<quint8>(input_quantized.shape(),
+ input_quantized.flat<quint8>());
+ AddInputFromArray<float>(TensorShape({1}), {input_min});
+ AddInputFromArray<float>(TensorShape({1}), {input_max});
+ AddInputFromArray<quint8>(mean_quantized.shape(),
+ mean_quantized.flat<quint8>());
+ AddInputFromArray<float>(TensorShape({1}), {mean_min});
+ AddInputFromArray<float>(TensorShape({1}), {mean_max});
+ AddInputFromArray<quint8>(variance_quantized.shape(),
+ variance_quantized.flat<quint8>());
+ AddInputFromArray<float>(TensorShape({1}), {variance_min});
+ AddInputFromArray<float>(TensorShape({1}), {variance_max});
+ AddInputFromArray<quint8>(beta_quantized.shape(),
+ beta_quantized.flat<quint8>());
+ AddInputFromArray<float>(TensorShape({1}), {beta_min});
+ AddInputFromArray<float>(TensorShape({1}), {beta_max});
+ AddInputFromArray<quint8>(gamma_quantized.shape(),
+ gamma_quantized.flat<quint8>());
+ AddInputFromArray<float>(TensorShape({1}), {gamma_min});
+ AddInputFromArray<float>(TensorShape({1}), {gamma_max});
+ TF_ASSERT_OK(RunOpKernel());
+
+ Tensor expected_float(
+ allocator(), DT_FLOAT,
+ TensorShape({input_batch, input_height, input_width, input_depth}));
+ thread::ThreadPool threadpool(Env::Default(), "test", 1);
+ EigenThreadPoolWrapper wrapper(&threadpool);
+ Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, 1);
+ const Tensor& const_input_float = input_float;
+ const Tensor& const_mean_float = mean_float;
+ const Tensor& const_variance_float = variance_float;
+ const Tensor& const_beta_float = beta_float;
+ const Tensor& const_gamma_float = gamma_float;
+ functor::BatchNorm<Eigen::ThreadPoolDevice, float>()(
+ eigen_cpu_device, const_input_float.tensor<float, 4>(),
+ const_mean_float.vec<float>(), const_variance_float.vec<float>(),
+ const_beta_float.vec<float>(), const_gamma_float.vec<float>(), 0.001,
+ false, expected_float.tensor<float, 4>());
+
+ const Tensor& output_quantized = *GetOutput(0);
+ const float output_min = GetOutput(1)->flat<float>()(0);
+ const float output_max = GetOutput(2)->flat<float>()(0);
+ Tensor output_float =
+ QuantizedTensorToFloat<qint32>(output_quantized, output_min, output_max);
+ test::ExpectTensorNear<float>(expected_float, output_float, 0.1);
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/contrib/quantization/kernels/quantized_bias_add_op.cc b/tensorflow/contrib/quantization/kernels/quantized_bias_add_op.cc
new file mode 100644
index 0000000000..c319eb97da
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/quantized_bias_add_op.cc
@@ -0,0 +1,89 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Implements a quantized eight-bit version of the bias addition operation.
+
+#include "tensorflow/contrib/quantization/kernels/quantization_utils.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <class T1, class T2, class T3>
+class QuantizedBiasAddOp : public OpKernel {
+ public:
+ explicit QuantizedBiasAddOp(OpKernelConstruction* context)
+ : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& bias = context->input(1);
+ const float input_min = context->input(2).flat<float>()(0);
+ const float input_max = context->input(3).flat<float>()(0);
+ const float bias_min = context->input(4).flat<float>()(0);
+ const float bias_max = context->input(5).flat<float>()(0);
+
+ OP_REQUIRES(context, TensorShapeUtils::IsMatrixOrHigher(input.shape()),
+ errors::InvalidArgument("Input tensor must be at least 2D: ",
+ input.shape().DebugString()));
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(bias.shape()),
+ errors::InvalidArgument("Biases must be 1D: ",
+ bias.shape().DebugString()));
+ const auto last_dim = input.shape().dims() - 1;
+ OP_REQUIRES(
+ context, bias.shape().dim_size(0) == input.shape().dim_size(last_dim),
+ errors::InvalidArgument(
+ "Must provide as many biases as the last dimension "
+ "of the input tensor: ",
+ bias.shape().DebugString(), " vs. ", input.shape().DebugString()));
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input.shape(), &output));
+
+ float total_min;
+ float total_max;
+ QuantizedAddUsingEigen<T1, T2, T3>(
+ context->template eigen_device<CPUDevice>(), input, input_min,
+ input_max, bias, bias_min, bias_max, output, &total_min, &total_max);
+
+ Tensor* output_min = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(1, {}, &output_min));
+ output_min->flat<float>()(0) = total_min;
+
+ Tensor* output_max = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(2, {}, &output_max));
+ output_max->flat<float>()(0) = total_max;
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("QuantizedBiasAdd")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<quint8>("T1")
+ .TypeConstraint<quint8>("T2")
+ .TypeConstraint<qint32>("out_type"),
+ QuantizedBiasAddOp<quint8, quint8, qint32>);
+REGISTER_KERNEL_BUILDER(Name("QuantizedBiasAdd")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<qint8>("T1")
+ .TypeConstraint<qint8>("T2")
+ .TypeConstraint<qint32>("out_type"),
+ QuantizedBiasAddOp<qint8, qint8, qint32>);
+} // namespace tensorflow
diff --git a/tensorflow/contrib/quantization/kernels/quantized_bias_add_op_test.cc b/tensorflow/contrib/quantization/kernels/quantized_bias_add_op_test.cc
new file mode 100644
index 0000000000..56535029b5
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/quantized_bias_add_op_test.cc
@@ -0,0 +1,171 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <functional>
+
+#include "tensorflow/contrib/quantization/kernels/quantization_utils.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+class QuantizedBiasAddTest : public OpsTestBase {
+ protected:
+};
+
+TEST_F(QuantizedBiasAddTest, Small) {
+ TF_ASSERT_OK(NodeDefBuilder("quantized_bias_add_op", "QuantizedBiasAdd")
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("out_type", DataTypeToEnum<qint32>::v())
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+ const float input_min = 0.0f;
+ const float input_max = 60.0f;
+ const int input_height = 2;
+ const int input_width = 3;
+ Tensor input_float(DT_FLOAT, {input_height, input_width});
+ test::FillValues<float>(&input_float,
+ {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f});
+ Tensor input_quantized =
+ FloatTensorToQuantized<quint8>(input_float, input_min, input_max);
+
+ const float bias_min = 0.0f;
+ const float bias_max = 3.0f;
+ const int bias_width = 3;
+ Tensor bias_float(DT_FLOAT, {bias_width});
+ test::FillValues<float>(&bias_float, {1.0f, 2.0f, 3.0f});
+ Tensor bias_quantized =
+ FloatTensorToQuantized<quint8>(bias_float, bias_min, bias_max);
+
+ Tensor expected_float(DT_FLOAT, {input_height, input_width});
+ test::FillValues<float>(&expected_float,
+ {11.0f, 22.0f, 33.0f, 41.0f, 52.0f, 63.0f});
+
+ AddInputFromArray<quint8>(input_quantized.shape(),
+ input_quantized.flat<quint8>());
+ AddInputFromArray<quint8>(bias_quantized.shape(),
+ bias_quantized.flat<quint8>());
+ AddInputFromArray<float>(TensorShape({1}), {input_min});
+ AddInputFromArray<float>(TensorShape({1}), {input_max});
+ AddInputFromArray<float>(TensorShape({1}), {bias_min});
+ AddInputFromArray<float>(TensorShape({1}), {bias_max});
+ TF_ASSERT_OK(RunOpKernel());
+ const Tensor& output_quantized = *GetOutput(0);
+ const float output_min = GetOutput(1)->flat<float>()(0);
+ const float output_max = GetOutput(2)->flat<float>()(0);
+ Tensor output_float =
+ QuantizedTensorToFloat<qint32>(output_quantized, output_min, output_max);
+ test::ExpectTensorNear<float>(expected_float, output_float, 0.2);
+}
+
+TEST_F(QuantizedBiasAddTest, RealData) {
+ TF_ASSERT_OK(NodeDefBuilder("quantized_bias_add_op", "QuantizedBiasAdd")
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("out_type", DataTypeToEnum<qint32>::v())
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+ const float input_min = -2164.25f;
+ const float input_max = 2006.27f;
+ const int input_height = 1;
+ const int input_width = 64;
+ Tensor input_float(DT_FLOAT, {input_height, input_width});
+ test::FillValues<float>(
+ &input_float,
+ {-1014.12, -157.382, -810.17, 1435.28, 1016.37, 219.684, -316.054,
+ -2164.25, 2006.27, -547.444, 857.376, 404.376, 9.72115, 332.588,
+ 194.385, -286.57, 26.062, 23.1125, 110.436, 247.055, -127.683,
+ -376.275, -124.81, -846.826, -77.1507, 305.581, -202.747, 12.9528,
+ 9.64886, 872.686, 40.9069, 197.816, 44.16, -306.768, -1457.52,
+ -368.939, -1049.42, -486.353, 1745.87, 95.7695, 395.773, -254.333,
+ -404.27, 787.16, -2.44114, 199.37, -1024.08, 784.901, 235.055,
+ -42.7295, 241.498, -245.365, 470.763, 186.159, 186.579, -220.163,
+ 1304.58, 386.272, -358.853, -755.996, 360.109, -866.007, 55.2828,
+ -508.801});
+ Tensor input_quantized =
+ FloatTensorToQuantized<quint8>(input_float, input_min, input_max);
+
+ const float bias_min = -0.739539f;
+ const float bias_max = 0.641057f;
+ const int bias_width = 64;
+ Tensor bias_float(DT_FLOAT, {bias_width});
+ test::FillValues<float>(
+ &bias_float,
+ {-0.294619, -0.0670519, 0.261507, -0.126274, 0.127229, -0.176945,
+ -0.251223, 0.231086, 0.453694, 0.415666, -0.288733, 0.508717,
+ 0.211551, 0.0435907, -0.582383, -0.308779, 0.0696883, -0.438122,
+ 0.114, 0.433964, 0.109883, 0.284931, -0.149661, 0.108657,
+ 0.458333, -0.130231, -0.35805, -0.123206, -0.437968, 0.0282411,
+ 0.628818, -0.0522173, -0.0233403, 0.124863, 0.217165, 0.262294,
+ -0.171005, -0.254693, -0.200433, -0.287354, 0.488166, -0.0354688,
+ -0.118091, -0.590444, 0.491537, -0.739539, 0.083117, 0.282482,
+ 0.275269, -0.36574, 0.107476, 0.0511428, -0.136887, -0.0149852,
+ -0.259694, 0.641057, 0.264054, -0.295126, -0.0218791, 0.361211,
+ 0.012448, 0.0709718, -0.392394, -0.434215});
+ Tensor bias_quantized =
+ FloatTensorToQuantized<quint8>(bias_float, bias_min, bias_max);
+
+ Tensor expected_float(DT_FLOAT, {input_height, input_width});
+ test::FillValues<float>(
+ &expected_float,
+ {-1014.42, -157.449, -809.908, 1435.16, 1016.5, 219.507, -316.305,
+ -2164.02, 2006.73, -547.028, 857.088, 404.885, 9.9327, 332.632,
+ 193.803, -286.878, 26.1317, 22.6744, 110.55, 247.489, -127.573,
+ -375.99, -124.959, -846.717, -76.6923, 305.451, -203.105, 12.8296,
+ 9.21089, 872.714, 41.5357, 197.764, 44.1367, -306.643, -1457.3,
+ -368.677, -1049.6, -486.608, 1745.67, 95.4821, 396.261, -254.368,
+ -404.388, 786.57, -1.94961, 198.63, -1024.0, 785.183, 235.33,
+ -43.0953, 241.605, -245.314, 470.627, 186.144, 186.319, -219.522,
+ 1304.84, 385.977, -358.874, -755.635, 360.122, -865.936, 54.8904,
+ -509.235});
+
+ AddInputFromArray<quint8>(input_quantized.shape(),
+ input_quantized.flat<quint8>());
+ AddInputFromArray<quint8>(bias_quantized.shape(),
+ bias_quantized.flat<quint8>());
+ AddInputFromArray<float>(TensorShape({1}), {input_min});
+ AddInputFromArray<float>(TensorShape({1}), {input_max});
+ AddInputFromArray<float>(TensorShape({1}), {bias_min});
+ AddInputFromArray<float>(TensorShape({1}), {bias_max});
+ TF_ASSERT_OK(RunOpKernel());
+ const Tensor& output_quantized = *GetOutput(0);
+ const float output_min = GetOutput(1)->flat<float>()(0);
+ const float output_max = GetOutput(2)->flat<float>()(0);
+ Tensor output_float =
+ QuantizedTensorToFloat<qint32>(output_quantized, output_min, output_max);
+ test::ExpectTensorNear<float>(expected_float, output_float, 20.0);
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/contrib/quantization/kernels/quantized_concat_op.cc b/tensorflow/contrib/quantization/kernels/quantized_concat_op.cc
new file mode 100644
index 0000000000..abe8c9138d
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/quantized_concat_op.cc
@@ -0,0 +1,246 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include <vector>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/contrib/quantization/kernels/quantization_utils.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/concat_lib_cpu.h"
+
+namespace tensorflow {
+
+namespace {
+template <typename T>
+struct RequantizeCopier {
+ RequantizeCopier(
+ const std::vector<std::pair<float, float>>* input_min_and_max,
+ float output_min, float output_max)
+ : output_min(output_min),
+ output_max(output_max),
+ input_min_and_max(input_min_and_max) {}
+
+ inline void Copy(T* dst, const T* src, int input_index, size_t n) {
+ const float input_min = (*input_min_and_max)[input_index].first;
+ const float input_max = (*input_min_and_max)[input_index].second;
+ if (input_min == output_min && input_max == output_max) {
+ DCHECK(DataTypeCanUseMemcpy(DataTypeToEnum<T>::v()));
+ memcpy(dst, src, n * sizeof(T));
+ } else {
+ Eigen::array<Eigen::DenseIndex, 1> dims;
+ dims[0] = n;
+ typename TTypes<T, 1>::UnalignedConstTensor input_array(src, dims);
+ typename TTypes<T, 1>::UnalignedTensor output_array(dst, dims);
+
+ QuantizedToFloatStruct<T> q2f(input_min, input_max);
+ auto input_float = DEQUANTIZE_WITH_EIGEN(input_array, q2f);
+ FloatToQuantizedStruct<T> f2q(output_min, output_max);
+ auto input_requantized = QUANTIZE_WITH_EIGEN(input_float, f2q, T);
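+
+ // Both macros build lazy Eigen expressions; the assignment below evaluates
+ // the dequantize-then-requantize chain in a single pass over the n values.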
+
+ // RequantizeCopier::Copy is called from within a shard of computation, so
+ // don't use the threadpool device here, simply assign with default CPU
+ // device.
+ output_array = input_requantized;
+ }
+ }
+
+ float output_min;
+ float output_max;
+ const std::vector<std::pair<float, float>>* input_min_and_max;
+};
+} // namespace
+
+template <typename T>
+class QuantizedConcatOp : public OpKernel {
+ public:
+ typedef std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>
+ ConstMatrixVector;
+
+ explicit QuantizedConcatOp(OpKernelConstruction* c) : OpKernel(c) {}
+
+ void CalculateInputAndOutputRange(
+ const OpInputList& input_mins, const OpInputList& input_maxes,
+ const size_t N,
+ std::vector<std::pair<float, float>>* input_mins_and_maxes,
+ float* output_min, float* output_max) {
+ input_mins_and_maxes->reserve(N);
+ float overall_min = std::numeric_limits<float>::max();
+ float overall_max = std::numeric_limits<float>::lowest();
+ for (int i = 0; i < N; ++i) {
+ const float input_min = input_mins[i].flat<float>()(0);
+ const float input_max = input_maxes[i].flat<float>()(0);
+ input_mins_and_maxes->emplace_back(input_min, input_max);
+ overall_min = std::min(overall_min, input_min);
+ overall_max = std::max(overall_max, input_max);
+ }
+ if (std::is_signed<T>::value) {
+ // For signed, we want a symmetrical distribution including zero for the
+ // output, so pick a range that meets that need.
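+ // For example, inputs spanning [-3.0f, 10.0f] produce an output range of
+ // [-10.0f, 10.0f].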
+ const float largest_value =
+ std::max(std::abs(overall_min), std::abs(overall_max));
+ *output_min = -largest_value;
+ *output_max = largest_value;
+ } else {
+ *output_min = overall_min;
+ *output_max = overall_max;
+ }
+ }
+
+ int64 CalculateInputsDim(const TensorShape& input_shape,
+ const int32 concat_dim) {
+ int64 inputs_flat_dim0 = 1;
+ for (int d = 0; d < concat_dim; ++d) {
+ inputs_flat_dim0 *= input_shape.dim_size(d);
+ }
+ return inputs_flat_dim0;
+ }
+
+ void CalculateConcatDims(const size_t N, const TensorShape& input_shape,
+ int input_dims, const OpInputList& values,
+ OpKernelContext* context, const int32 concat_dim,
+ const int64 inputs_flat_dim0,
+ ConstMatrixVector* inputs_flat,
+ int* output_concat_dim) {
+ // Note that we reduce the concat of n-dimensional tensors into a two
+ // dimensional concat. Assuming the dimensions of any input/output
+ // tensor are {x0, x1,...,xn-1, y0, y1,...,ym-1}, where the concat is along
+ // the dimension indicated with size y0, we flatten it to {x, y}, where y =
+ // Prod_i(yi) and x = ((n > 0) ? Prod_i(xi) : 1).
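+ // For example, concatenating two {2, 2, 3} tensors along dimension 1 treats
+ // each input as a 2 x 6 matrix and the output as a 2 x 12 matrix.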
+ inputs_flat->reserve(N);
+ *output_concat_dim = 0;
+ const bool input_is_scalar = IsLegacyScalar(input_shape);
+ for (int i = 0; i < N; ++i) {
+ const auto in = values[i];
+ const bool in_is_scalar = IsLegacyScalar(in.shape());
+ OP_REQUIRES(
+ context, in.dims() == input_dims || (input_is_scalar && in_is_scalar),
+ errors::InvalidArgument(
+ "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
+ input_shape.DebugString(), " vs. shape[", i, "] = ",
+ in.shape().DebugString()));
+ for (int j = 0; j < input_dims; ++j) {
+ if (j == concat_dim) {
+ continue;
+ }
+ OP_REQUIRES(
+ context, in.dim_size(j) == input_shape.dim_size(j),
+ errors::InvalidArgument(
+ "ConcatOp : Dimensions of inputs should match: shape[0] = ",
+ input_shape.DebugString(), " vs. shape[", i, "] = ",
+ in.shape().DebugString()));
+ }
+ if (in.NumElements() > 0) {
+ int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0;
+ inputs_flat->emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+ in.shaped<T, 2>({inputs_flat_dim0, inputs_flat_dim1})));
+ }
+ *output_concat_dim += in.dims() > 0 ? in.dim_size(concat_dim) : 1;
+ }
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor* concat_dim_tensor = nullptr;
+ OP_REQUIRES_OK(context, context->input("concat_dim", &concat_dim_tensor));
+ OP_REQUIRES(
+ context, IsLegacyScalar(concat_dim_tensor->shape()),
+ errors::InvalidArgument(
+ "Concat dim tensor should be a scalar integer, but got shape ",
+ concat_dim_tensor->shape().DebugString()));
+ const int32 concat_dim = concat_dim_tensor->scalar<int32>()();
+ OpInputList values;
+ OP_REQUIRES_OK(context, context->input_list("values", &values));
+ const size_t N = values.size();
+ OpInputList input_mins;
+ OP_REQUIRES_OK(context, context->input_list("input_mins", &input_mins));
+ OP_REQUIRES(context, (input_mins.size() == N),
+ errors::InvalidArgument(
+ "QuantizedConcatOp : Expected mins input list length ",
+ input_mins.size(), " to equal values length ", N));
+ OpInputList input_maxes;
+ OP_REQUIRES_OK(context, context->input_list("input_maxes", &input_maxes));
+ OP_REQUIRES(context, (input_maxes.size() == N),
+ errors::InvalidArgument(
+ "QuantizedConcatOp : Expected maxes input list length ",
+ input_maxes.size(), " to equal values length ", N));
+ const int input_dims = values[0].dims();
+ const TensorShape& input_shape = values[0].shape();
+ OP_REQUIRES(
+ context, (0 <= concat_dim && concat_dim < input_dims) ||
+ (allow_legacy_scalars() && concat_dim == 0),
+ errors::InvalidArgument(
+ "ConcatOp : Expected concatenating dimensions in the range [", 0,
+ ", ", input_dims, "), but got ", concat_dim));
+
+ float output_min = std::numeric_limits<float>::max();
+ float output_max = std::numeric_limits<float>::lowest();
+ std::vector<std::pair<float, float>> input_mins_and_maxes;
+ CalculateInputAndOutputRange(input_mins, input_maxes, N,
+ &input_mins_and_maxes, &output_min,
+ &output_max);
+ const int64 inputs_flat_dim0 = CalculateInputsDim(input_shape, concat_dim);
+ ConstMatrixVector inputs_flat;
+ int output_concat_dim;
+ CalculateConcatDims(N, input_shape, input_dims, values, context, concat_dim,
+ inputs_flat_dim0, &inputs_flat, &output_concat_dim);
+
+ TensorShape output_shape(input_shape);
+ // TODO(irving): Remove rank 0 case once !kAllowLegacyScalars
+ if (output_shape.dims() == 0) {
+ output_shape.AddDim(output_concat_dim);
+ } else {
+ output_shape.set_dim(concat_dim, output_concat_dim);
+ }
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+
+ if (output->NumElements() > 0) {
+ int64 output_dim1 = output->NumElements() / inputs_flat_dim0;
+ auto output_flat = output->shaped<T, 2>({inputs_flat_dim0, output_dim1});
+ ConcatCPUImpl<T>(
+ context->device(), inputs_flat, sizeof(T) /* cost_per_unit */,
+ RequantizeCopier<T>(&input_mins_and_maxes, output_min, output_max),
+ &output_flat);
+ }
+
+ Tensor* output_min_tensor = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(1, {}, &output_min_tensor));
+ output_min_tensor->flat<float>()(0) = output_min;
+
+ Tensor* output_max_tensor = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(2, {}, &output_max_tensor));
+ output_max_tensor->flat<float>()(0) = output_max;
+ }
+};
+
+#define REGISTER_QUANTIZED_CONCAT(type) \
+ REGISTER_KERNEL_BUILDER(Name("QuantizedConcat") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("concat_dim"), \
+ QuantizedConcatOp<type>)
+
+REGISTER_QUANTIZED_CONCAT(quint8);
+REGISTER_QUANTIZED_CONCAT(qint32);
+
+#undef REGISTER_QUANTIZED_CONCAT
+
+} // namespace tensorflow
diff --git a/tensorflow/contrib/quantization/kernels/quantized_concat_op_test.cc b/tensorflow/contrib/quantization/kernels/quantized_concat_op_test.cc
new file mode 100644
index 0000000000..1301259fdd
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/quantized_concat_op_test.cc
@@ -0,0 +1,337 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/contrib/quantization/kernels/quantization_utils.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+using test::graph::Constant;
+
+class QuantizedConcatTest : public OpsTestBase {
+ protected:
+ QuantizedConcatTest() {}
+
+ void TestSmall8Bit(float first_min, float first_max, float second_min,
+ float second_max);
+ void TestSmall32Bit(float first_min, float first_max, float second_min,
+ float second_max);
+ void TestSecondDim8Bit(float first_min, float first_max, float second_min,
+ float second_max);
+};
+
+TEST_F(QuantizedConcatTest, Small8Bit) {
+ TestSmall8Bit(0.0f, 255.0f, 0.0f, 25.0f);
+}
+
+TEST_F(QuantizedConcatTest, Small8BitSameRange) {
+ // Range for both is the same, so impl can use memcpy.
+ TestSmall8Bit(0.0f, 255.0f, 0.0f, 255.0f);
+}
+
+void QuantizedConcatTest::TestSmall8Bit(float first_min, float first_max,
+ float second_min, float second_max) {
+ TF_ASSERT_OK(NodeDefBuilder("quantized_concat_op", "QuantizedConcat")
+ .Input(FakeInput(DT_INT32))
+ .Input(FakeInput(2, DT_QUINT8))
+ .Input(FakeInput(2, DT_FLOAT))
+ .Input(FakeInput(2, DT_FLOAT))
+ .Attr("N", 2)
+ .Attr("T", DataTypeToEnum<quint8>::v())
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+ const int first_batch = 2;
+ const int first_height = 2;
+ const int first_width = 3;
+ Tensor first_float(DT_FLOAT, {first_batch, first_height, first_width});
+ test::FillValues<float>(&first_float,
+ {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+ Tensor first_quantized =
+ FloatTensorToQuantized<quint8>(first_float, first_min, first_max);
+
+ const int second_batch = 2;
+ const int second_height = 2;
+ const int second_width = 3;
+ Tensor second_float(DT_FLOAT, {second_batch, second_height, second_width});
+ test::FillValues<float>(&second_float,
+ {13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+ Tensor second_quantized =
+ FloatTensorToQuantized<quint8>(second_float, second_min, second_max);
+
+ const int expected_batch = first_batch + second_batch;
+ Tensor expected_float(DT_FLOAT, {expected_batch, first_height, first_width});
+ test::FillValues<float>(&expected_float,
+ {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+
+ AddInputFromArray<int32>(TensorShape({}), {0});
+ AddInputFromArray<quint8>(first_quantized.shape(),
+ first_quantized.flat<quint8>());
+ AddInputFromArray<quint8>(second_quantized.shape(),
+ second_quantized.flat<quint8>());
+ AddInputFromArray<float>(TensorShape({}), {first_min});
+ AddInputFromArray<float>(TensorShape({}), {second_min});
+ AddInputFromArray<float>(TensorShape({}), {first_max});
+ AddInputFromArray<float>(TensorShape({}), {second_max});
+ TF_ASSERT_OK(RunOpKernel());
+ const Tensor& output_quantized = *GetOutput(0);
+ const float output_min = GetOutput(1)->flat<float>()(0);
+ const float output_max = GetOutput(2)->flat<float>()(0);
+ Tensor output_float =
+ QuantizedTensorToFloat<quint8>(output_quantized, output_min, output_max);
+ test::ExpectTensorNear<float>(expected_float, output_float, 0.2);
+}
+
+TEST_F(QuantizedConcatTest, Small32Bit) {
+ TestSmall32Bit(0.0f, 1200.0f, 0.0f, 2400.0f);
+}
+
+TEST_F(QuantizedConcatTest, Small32BitSameRange) {
+ TestSmall32Bit(-2400.0f, 2400.0f, -2400.0f, 2400.0f);
+}
+
+TEST_F(QuantizedConcatTest, Small32BitOneDimSameRangeAsOutput) {
+ TestSmall32Bit(-2400.0f, 2400.0f, -1200.0f, 2400.0f);
+}
+
+void QuantizedConcatTest::TestSmall32Bit(float first_min, float first_max,
+ float second_min, float second_max) {
+ TF_ASSERT_OK(NodeDefBuilder("quantized_concat_op", "QuantizedConcat")
+ .Input(FakeInput(DT_INT32))
+ .Input(FakeInput(2, DT_QINT32))
+ .Input(FakeInput(2, DT_FLOAT))
+ .Input(FakeInput(2, DT_FLOAT))
+ .Attr("N", 2)
+ .Attr("T", DataTypeToEnum<qint32>::v())
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+ const int first_batch = 2;
+ const int first_height = 2;
+ const int first_width = 3;
+ Tensor first_float(DT_FLOAT, {first_batch, first_height, first_width});
+ test::FillValues<float>(&first_float, {100, 200, 300, 400, 500, 600, 700, 800,
+ 900, 1000, 1100, 1200});
+ Tensor first_quantized =
+ FloatTensorToQuantized<qint32>(first_float, first_min, first_max);
+
+ const int second_batch = 2;
+ const int second_height = 2;
+ const int second_width = 3;
+ Tensor second_float(DT_FLOAT, {second_batch, second_height, second_width});
+ test::FillValues<float>(&second_float, {1300, 1400, 1500, 1600, 1700, 1800,
+ 1900, 2000, 2100, 2200, 2300, 2400});
+ Tensor second_quantized =
+ FloatTensorToQuantized<qint32>(second_float, second_min, second_max);
+
+ const int expected_batch = first_batch + second_batch;
+ Tensor expected_float(DT_FLOAT, {expected_batch, first_height, first_width});
+ test::FillValues<float>(
+ &expected_float,
+ {100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200,
+ 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400});
+
+ AddInputFromArray<int32>(TensorShape({}), {0});
+ AddInputFromArray<qint32>(first_quantized.shape(),
+ first_quantized.flat<qint32>());
+ AddInputFromArray<qint32>(second_quantized.shape(),
+ second_quantized.flat<qint32>());
+ AddInputFromArray<float>(TensorShape({}), {first_min});
+ AddInputFromArray<float>(TensorShape({}), {second_min});
+ AddInputFromArray<float>(TensorShape({}), {first_max});
+ AddInputFromArray<float>(TensorShape({}), {second_max});
+ TF_ASSERT_OK(RunOpKernel());
+ const Tensor& output_quantized = *GetOutput(0);
+ const float output_min = GetOutput(1)->flat<float>()(0);
+ const float output_max = GetOutput(2)->flat<float>()(0);
+ Tensor output_float =
+ QuantizedTensorToFloat<qint32>(output_quantized, output_min, output_max);
+ test::ExpectTensorNear<float>(expected_float, output_float, 0.2);
+}
+
+TEST_F(QuantizedConcatTest, SecondDim8Bit) {
+ TestSecondDim8Bit(-10.0f, 150.0f, 0.0f, 200.0f);
+}
+
+TEST_F(QuantizedConcatTest, SecondDim8BitSameRange) {
+ TestSecondDim8Bit(-10.0f, 150.0f, -10.0f, 150.0f);
+}
+
+void QuantizedConcatTest::TestSecondDim8Bit(float first_min, float first_max,
+ float second_min,
+ float second_max) {
+ TF_ASSERT_OK(NodeDefBuilder("quantized_concat_op", "QuantizedConcat")
+ .Input(FakeInput(DT_INT32))
+ .Input(FakeInput(2, DT_QUINT8))
+ .Input(FakeInput(2, DT_FLOAT))
+ .Input(FakeInput(2, DT_FLOAT))
+ .Attr("N", 2)
+ .Attr("T", DataTypeToEnum<quint8>::v())
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+ const int first_batch = 2;
+ const int first_height = 2;
+ const int first_width = 3;
+ Tensor first_float(DT_FLOAT, {first_batch, first_height, first_width});
+ test::FillValues<float>(&first_float,
+ {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+ Tensor first_quantized =
+ FloatTensorToQuantized<quint8>(first_float, first_min, first_max);
+
+ const int second_batch = 2;
+ const int second_height = 2;
+ const int second_width = 3;
+ Tensor second_float(DT_FLOAT, {second_batch, second_height, second_width});
+ test::FillValues<float>(&second_float,
+ {13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+ Tensor second_quantized =
+ FloatTensorToQuantized<quint8>(second_float, second_min, second_max);
+
+ const int expected_height = first_height + second_height;
+ Tensor expected_float(DT_FLOAT, {first_batch, expected_height, first_width});
+ test::FillValues<float>(&expected_float,
+ {1, 2, 3, 4, 5, 6, 13, 14, 15, 16, 17, 18,
+ 7, 8, 9, 10, 11, 12, 19, 20, 21, 22, 23, 24});
+
+ AddInputFromArray<int32>(TensorShape({}), {1});
+ AddInputFromArray<quint8>(first_quantized.shape(),
+ first_quantized.flat<quint8>());
+ AddInputFromArray<quint8>(second_quantized.shape(),
+ second_quantized.flat<quint8>());
+ AddInputFromArray<float>(TensorShape({}), {first_min});
+ AddInputFromArray<float>(TensorShape({}), {second_min});
+ AddInputFromArray<float>(TensorShape({}), {first_max});
+ AddInputFromArray<float>(TensorShape({}), {second_max});
+ TF_ASSERT_OK(RunOpKernel());
+ const Tensor& output_quantized = *GetOutput(0);
+ const float output_min = GetOutput(1)->flat<float>()(0);
+ const float output_max = GetOutput(2)->flat<float>()(0);
+ Tensor output_float =
+ QuantizedTensorToFloat<quint8>(output_quantized, output_min, output_max);
+ test::ExpectTensorNear<float>(expected_float, output_float, 1.0);
+}
+
+// For the benchmark, we set up two 2-dimensional tensors, each kDim1 x 'dim'
+// in size, and concat them together along "concat_dimension".
+// If <same_limits> is true, then both concatenated dimensions have the same
+// quantized range; otherwise, they are set to different values.
+template <typename T>
+static void ConcatHelper(int iters, int concat_dimension, bool same_limits,
+ int dim2) {
+ testing::StopTiming();
+ Graph* g = new Graph(OpRegistry::Global());
+
+ DataType dt = DataTypeToEnum<T>::v();
+ const int kDim1 = 100;
+ TensorShape shape({kDim1, dim2});
+
+ Tensor concat_dim = test::AsScalar<int32>(concat_dimension);
+ Tensor in0(dt, shape);
+ in0.flat<T>().setRandom();
+ Tensor in1(dt, shape);
+ in1.flat<T>().setRandom();
+
+ Tensor mins0 = test::AsScalar<float>(-1.0);
+ Tensor maxes0 = test::AsScalar<float>(1.0);
+ Tensor mins1 = test::AsScalar<float>(same_limits ? -1.0 : -255.0);
+ Tensor maxes1 = test::AsScalar<float>(same_limits ? 1.0 : 255.0);
+
+ Node* node;
+ TF_CHECK_OK(NodeBuilder(g->NewName("n"), "QuantizedConcat")
+ .Input(Constant(g, concat_dim))
+ .Input({Constant(g, in0), Constant(g, in1)})
+ .Input({Constant(g, mins0), Constant(g, mins1)})
+ .Input({Constant(g, maxes0), Constant(g, maxes1)})
+ .Attr("N", 2)
+ .Attr("T", dt)
+ .Finalize(g, &node));
+
+ testing::BytesProcessed(static_cast<int64>(iters) *
+ ((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(T));
+ testing::StartTiming();
+ test::Benchmark("cpu", g).Run(iters);
+ testing::UseRealTime();
+}
+
+static void BM_QConcatDim0SameLimitQInt32(int iters, int dim2) {
+ ConcatHelper<qint32>(iters, 0 /* concat_dimension */, true /* same_limits */,
+ dim2);
+}
+
+static void BM_QConcatDim1SameLimitQInt32(int iters, int dim2) {
+ ConcatHelper<qint32>(iters, 1 /* concat_dimension */, true /* same_limits */,
+ dim2);
+}
+
+static void BM_QConcatDim0DifferLimitQInt32(int iters, int dim2) {
+ ConcatHelper<qint32>(iters, 0 /* concat_dimension */, false /* same_limits */,
+ dim2);
+}
+
+static void BM_QConcatDim1DifferLimitQInt32(int iters, int dim2) {
+ ConcatHelper<qint32>(iters, 1 /* concat_dimension */, false /* same_limits */,
+ dim2);
+}
+
+BENCHMARK(BM_QConcatDim0SameLimitQInt32)->Arg(1000)->Arg(20000)->Arg(100000);
+BENCHMARK(BM_QConcatDim1SameLimitQInt32)->Arg(1000)->Arg(20000)->Arg(100000);
+BENCHMARK(BM_QConcatDim0DifferLimitQInt32)->Arg(1000)->Arg(20000)->Arg(100000);
+BENCHMARK(BM_QConcatDim1DifferLimitQInt32)->Arg(1000)->Arg(20000)->Arg(100000);
+
+static void BM_QConcatDim0SameLimitQUint8(int iters, int dim2) {
+ ConcatHelper<quint8>(iters, 0 /* concat_dimension */, true /* same_limits */,
+ dim2);
+}
+
+static void BM_QConcatDim1SameLimitQUint8(int iters, int dim2) {
+ ConcatHelper<quint8>(iters, 1 /* concat_dimension */, true /* same_limits */,
+ dim2);
+}
+
+static void BM_QConcatDim0DifferLimitQUint8(int iters, int dim2) {
+ ConcatHelper<quint8>(iters, 0 /* concat_dimension */, false /* same_limits */,
+ dim2);
+}
+
+static void BM_QConcatDim1DifferLimitQUint8(int iters, int dim2) {
+ ConcatHelper<quint8>(iters, 1 /* concat_dimension */, false /* same_limits */,
+ dim2);
+}
+
+BENCHMARK(BM_QConcatDim0SameLimitQUint8)->Arg(1000)->Arg(20000)->Arg(100000);
+BENCHMARK(BM_QConcatDim1SameLimitQUint8)->Arg(1000)->Arg(20000)->Arg(100000);
+BENCHMARK(BM_QConcatDim0DifferLimitQUint8)->Arg(1000)->Arg(20000)->Arg(100000);
+BENCHMARK(BM_QConcatDim1DifferLimitQUint8)->Arg(1000)->Arg(20000)->Arg(100000);
+
+} // namespace tensorflow
diff --git a/tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc b/tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc
new file mode 100644
index 0000000000..b25bff45a1
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc
@@ -0,0 +1,526 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Implements quantized eight-bit versions of the convolution operations.
+
+#include <algorithm>
+#include <vector>
+
+#include "public/gemmlowp.h"
+#include "tensorflow/contrib/quantization/kernels/quantization_utils.h"
+#include "tensorflow/contrib/quantization/kernels/reference_gemm.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/padding.h"
+
+namespace tensorflow {
+
+// This functor implements the convolution operation in as simple a form as
+// possible. It won't give great performance, but it is very useful for
+// stepping through and instrumenting for debugging, creating minimal benchmarks
+// to prototype with, and sharing with teams that want to run this outside of
+// our environment.
+// With that in mind, I've avoided using anything except pretty standard C++
+// types. This is especially noticeable in the data access through raw array
+// indexing. It's deliberate in this case though, since it makes the underlying
+// memory order very explicit, which is important for both inspecting memory
+// contents during debugging and for specifying what we expect to others.
+// The memory layout of the data is, from biggest stride to smallest:
+// input_data = [input_batches, input_height, input_width, input_depth]
+// filter_data = [filter_height, filter_width, input_depth, filter_count]
+// output_data = [input_batches, output_height, output_width, filter_count]
+template <class T1, class T2, class T3>
+class ReferenceConvFunctor {
+ public:
+ void operator()(OpKernelContext* op_context, const T1* input_data,
+ int input_batches, int input_height, int input_width,
+ int input_depth, int input_offset, const T2* filter_data,
+ int filter_height, int filter_width, int filter_count,
+ int filter_offset, int stride, Padding padding,
+ T3* output_data, int output_height, int output_width,
+ int output_shift, int output_offset, int output_mult) {
+ // Set up some constants we need for the output down-shifting and
+ // saturation.
+ const int32 highest = static_cast<int32>(Eigen::NumTraits<T3>::highest());
+ const int32 lowest = static_cast<int32>(Eigen::NumTraits<T3>::lowest());
+
+ // When we're converting the 32 bit accumulator to a lower bit depth, we
+ // need to add on 0.5 in fixed-point terms to make the operation round half
+ // up towards positive infinity, rather than a floor.
+ // We also need to watch out for the case when there's no down shift,
+ // because a left shift by a negative number gives undefined results.
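+ // For example, with output_shift == 2 the rounding term is 2, so a scaled
+ // total of 6 becomes (6 + 2) >> 2 == 2 (1.5 rounded up) instead of
+ // truncating down to 1.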
+ const int32 rounding = (output_shift < 1) ? 0 : (1 << (output_shift - 1));
+
+ // The two different padding modes we support can be a bit confusing. SAME
+ // means we're trying to produce an output image that's the same size as the
+ // input. It's complicated by stride, which shrinks the output image by a
+ // factor, but it means we end up sampling from outside the borders of the
+ // input. These out-of-bounds values are read as zeroes. VALID means only
+ // produce output values where the filters can read all their values from
+ // within the input image. It effectively removes the margins of the output
+ // image compared to the one produced by SAME. Stride complicates this
+ // definition though, because it can result in the right and bottom filter
+ // patches sampling from outside the borders if it's greater than 1.
+ // Most of the logic for sorting this all out is done before this function,
+ // when we calculate the output size, but the positioning of the origin of
+ // the filters is different between the two modes, since SAME positions the
+ // first filter off the edge of the input.
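+ // For example, with a 5-wide input, a 3-wide filter and a stride of 1, SAME
+ // gives a 5-wide output and a filter_left_offset of 1 (the first patch hangs
+ // one pixel off the left edge), while VALID gives a 3-wide output and an
+ // offset of 0.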
+ int filter_left_offset;
+ int filter_top_offset;
+ if (padding == VALID) {
+ filter_left_offset =
+ ((output_width - 1) * stride + filter_width - input_width) / 2;
+ filter_top_offset =
+ ((output_height - 1) * stride + filter_height - input_height) / 2;
+ } else {
+ filter_left_offset =
+ ((output_width - 1) * stride + filter_width - input_width) / 2;
+ filter_top_offset =
+ ((output_height - 1) * stride + filter_height - input_height) / 2;
+ }
+
+ // If we've got multiple images in our input, work through each of them.
+ for (int batch = 0; batch < input_batches; ++batch) {
+ // Walk through all the output image values, sliding the filter to
+ // different positions in the input.
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ // Each filter kernel produces one output channel.
+ for (int out_channel = 0; out_channel < filter_count; ++out_channel) {
+ // We're going to calculate a single output value, which means we
+ // need to multiply a three dimensional kernel of weights against
+ // the current location within the input image.
+ /*
+ *-------------------------------...
+ |\ ^
+ | \in_depth
+ | \ v
+ | *-------------------------------...
+ | | ^
+ | | in_y_origin
+ | | v \
+ | |<in_x_origin>*---*^
+ | | \| |filter_height
+ . | *---*v
+ . | <--->
+ . filter_width
+ .
+ */
+ const int in_x_origin = (out_x * stride) - filter_left_offset;
+ const int in_y_origin = (out_y * stride) - filter_top_offset;
+ int32 total = 0;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+ for (int in_channel = 0; in_channel < input_depth;
+ ++in_channel) {
+ const int in_x = in_x_origin + filter_x;
+ const int in_y = in_y_origin + filter_y;
+ int32 input_value;
+ // If the location is outside the bounds of the input image,
+ // use zero as a default value.
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+ (in_y < input_height)) {
+ const T1 input_source_value =
+ input_data[(batch * input_height * input_width *
+ input_depth) +
+ (in_y * input_width * input_depth) +
+ (in_x * input_depth) + in_channel];
+ // We're promoting the T1 type to a higher bit depth here as
+ // we do the subtraction.
+ input_value =
+ static_cast<int32>(input_source_value) - input_offset;
+ } else {
+ input_value = 0;
+ }
+ const T2 filter_source_value =
+ filter_data[(filter_y * filter_width * input_depth *
+ filter_count) +
+ (filter_x * input_depth * filter_count) +
+ (in_channel * filter_count) + out_channel];
+ // Another promotion to 32 bit, as above.
+ const int32 filter_value =
+ static_cast<int32>(filter_source_value) - filter_offset;
+ total += (input_value * filter_value);
+ }
+ }
+ }
+ // Here we're applying scale factors to compress the 32 bit
+ // accumulated total to a potentially lower bit depth.
+ const int32 output =
+ ((((total + output_offset) * output_mult) + rounding) >>
+ output_shift);
+ // We need to saturate the results against the largest and smallest
+ // values that can be represented in this type.
+ const int32 top_clamped_output = std::min(output, highest);
+ const int32 clamped_output = std::max(top_clamped_output, lowest);
+ output_data[(batch * output_height * output_width * filter_count) +
+ (out_y * output_width * filter_count) +
+ (out_x * filter_count) + out_channel] = clamped_output;
+ }
+ }
+ }
+ }
+ }
+};
+
+// Implements convolution as a two-stage process: first packing the patches of
+// the input image into columns (im2col), and then running GEMM to produce the
+// final result.
+// TODO(petewarden) - We need to update gemmlowp to support 32-bit outputs
+// before we can re-enable this path.
+template <class T1, class T2, class T3>
+class Im2ColConvFunctor {
+ public:
+ void operator()(OpKernelContext* op_context, const T1* input_data,
+ int input_batches, int input_height, int input_width,
+ int input_depth, int input_offset, const T2* filter_data,
+ int filter_height, int filter_width, int filter_count,
+ int filter_offset, int stride, Padding padding,
+ T3* output_data, int output_height, int output_width,
+ int output_shift, int output_offset, int output_mult) {
+ if (input_offset < 0) {
+ // Only log the first few occurrences of this warning.
+ static int warning_count = 0;
+ if (warning_count < 10) {
+ ++warning_count;
+ LOG(WARNING)
+ << "Zero is not representable in the quantized range used by the"
+ << " input. This means QuantizedConv2d has to fall back to a slow"
+ << " implementation, since the border of zero values can't be"
+ << " represented easily. You should try to construct graphs that"
+ << " avoid this situation.";
+ }
+ ReferenceConvFunctor<T1, T2, T3> conv_functor;
+ conv_functor(op_context, input_data, input_batches, input_height,
+ input_width, input_depth, input_offset, filter_data,
+ filter_height, filter_width, filter_count, filter_offset,
+ stride, padding, output_data, output_height, output_width,
+ output_shift, output_offset, output_mult);
+ return;
+ }
+
+ CHECK_GT(output_width, 0);
+ CHECK_GT(output_height, 0);
+ int filter_left_offset;
+ int filter_top_offset;
+ if (padding == VALID) {
+ filter_left_offset =
+ ((output_width - 1) * stride + filter_width - input_width) / 2;
+ filter_top_offset =
+ ((output_height - 1) * stride + filter_height - input_height) / 2;
+ } else {
+ filter_left_offset =
+ ((output_width - 1) * stride + filter_width - input_width) / 2;
+ filter_top_offset =
+ ((output_height - 1) * stride + filter_height - input_height) / 2;
+ }
+
+ // The im2col buffer has # of patches rows, and # of filter values cols.
+ // It's laid out like this, in row major order in memory:
+ //          < filter value count >
+ //   ^     +---------------------+
+ // patch   |                     |
+ // count   |                     |
+ //   v     +---------------------+
+ // Each patch row contains a filter_width x filter_height patch of the
+ // input, with the depth channel as the most contiguous in memory, followed
+ // by the width, then the height. This is the standard memory order in the
+ // image world if it helps to visualize it.
+ const int filter_value_count = filter_width * filter_height * input_depth;
+ const int patch_count = input_batches * output_width * output_height;
+ const int im2col_size = patch_count * filter_value_count;
+ // TODO(petewarden) - Memory allocation can be very slow on Android. Can we
+ // optimize this by keeping the scratch buffer around?
+ std::unique_ptr<T1[]> im2col_buffer(new T1[im2col_size]);
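+ // For example, a single 3-high, 4-wide, 1-channel input with a 3x3 filter
+ // at stride 1 under SAME padding produces 3 * 4 = 12 patches of
+ // 3 * 3 * 1 = 9 values each, so the im2col buffer holds 108 elements.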
+
+ for (int batch = 0; batch < input_batches; ++batch) {
+ const T1* input_batch_start =
+ input_data + (batch * input_height * input_width * input_depth);
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ const int in_y_origin = (out_y * stride) - filter_top_offset;
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ const int in_x_origin = (out_x * stride) - filter_left_offset;
+ const int patch_index = (batch * output_width * output_height) +
+ (out_y * output_width) + out_x;
+ T1* im2col_patch_start =
+ im2col_buffer.get() + (patch_index * filter_value_count);
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+ const int in_y = in_y_origin + filter_y;
+ T1* im2col_row_start =
+ im2col_patch_start + (filter_y * filter_width * input_depth);
+ // If we're off the top or the bottom of the input, fill the whole
+ // row with zeroes.
+ if ((in_y < 0) || (in_y >= input_height)) {
+ T1* im2col_row_end =
+ im2col_row_start + (filter_width * input_depth);
+ // We'll be subtracting input_offset from these values during the
+ // calculations, so to end up with an actual zero after that subtraction
+ // we need to fill the row with input_offset here.
+ std::fill(im2col_row_start, im2col_row_end, input_offset);
+ } else {
+ // What we're doing here is trying to copy and fill the im2col
+ // buffer as efficiently as possible, using functions to set or
+ // duplicate values en masse. We know we don't have to worry about
+ // vertical edges because we dealt with that case above, so we
+ // just need to handle filters that overlap the left or right
+ // edges. Here's what that looks like:
+ //
+ //  < left_zero_count > < center_copy_count > < right_zero_count >
+ // +------------------+---------------------+--------------------+
+ // |     (filter)     |       (image)       |      (filter)      |
+ // +------------------+---------------------+--------------------+
+ // in_x_origin        0                     input_width   in_x_end
+ //
+ // In reality it's unlikely that a filter patch will be wider
+ // than an input, but this shows all the edge cases.
+ // We use std::fill() to set the left and right sections to zeroes
+ // and std::copy() to copy over the input data for the center.
+ const int in_x_end = in_x_origin + filter_width;
+ const int left_zero_count = std::max(0, 0 - in_x_origin);
+ const int right_zero_count = std::max(0, in_x_end - input_width);
+ const int center_copy_count =
+ filter_width - (left_zero_count + right_zero_count);
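+ // For example, if in_x_origin is -1 with a 3-wide filter over a 4-wide
+ // input, then left_zero_count is 1, right_zero_count is 0 and
+ // center_copy_count is 2.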
+ if (left_zero_count > 0) {
+ T1* im2col_left_start = im2col_row_start;
+ T1* im2col_left_end =
+ im2col_left_start + (left_zero_count * input_depth);
+ std::fill(im2col_left_start, im2col_left_end, input_offset);
+ }
+ if (center_copy_count > 0) {
+ const T1* input_row_start =
+ input_batch_start + (in_y * input_width * input_depth) +
+ (std::max(0, in_x_origin) * input_depth);
+ const T1* input_row_end =
+ input_row_start + (center_copy_count * input_depth);
+ T1* im2col_center_start =
+ im2col_row_start + (left_zero_count * input_depth);
+ std::copy(input_row_start, input_row_end, im2col_center_start);
+ }
+ if (right_zero_count > 0) {
+ T1* im2col_right_start =
+ im2col_row_start +
+ ((left_zero_count + center_copy_count) * input_depth);
+ T1* im2col_right_end =
+ im2col_right_start + (right_zero_count * input_depth);
+ std::fill(im2col_right_start, im2col_right_end, input_offset);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ CHECK_GT(patch_count, 0);
+ CHECK_GT(filter_count, 0);
+ CHECK_GT(filter_value_count, 0);
+
+ const bool transpose_a = false;
+ const bool transpose_b = false;
+ const bool transpose_c = false;
+ const int m = patch_count;
+ const int n = filter_count;
+ const int k = filter_value_count;
+ const int lda = filter_value_count;
+ const int ldb = filter_count;
+ const int ldc = filter_count;
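+ // This sets up a [patch_count x filter_value_count] *
+ // [filter_value_count x filter_count] multiply, with the im2col buffer as
+ // the left-hand side and the filter weights as the right-hand side, so
+ // each result row is one patch and each result column is one filter.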
+ // The gemmlowp optimized library only works for a particular set of data
+ // types, so check if we meet those requirements and fall back to a slower
+ // reference implementation if not.
+ if (std::is_same<T1, quint8>() && std::is_same<T2, quint8>() &&
+ std::is_same<T3, qint32>() && (output_offset == 0) &&
+ (output_mult == 1) && (output_shift == 0)) {
+ const uint8* im2col_data_as_uint8 = &(im2col_buffer.get()->value);
+ const uint8* filter_data_as_uint8 = &(filter_data->value);
+ int32* output_data_as_int32 = &(output_data->value);
+ // All of the transpose_* variables are currently compile-time consts, so
+ // we could just hard-code these values too, but that would break if
+ // anybody changed those values in the future (e.g. to match the ability
+ // of MatMul to specify them as attributes). We're using a verbose
+ // approach of deriving the order values from the transpose variables to
+ // be able to catch any changes like that.
+ static const gemmlowp::MapOrder ResultOrder =
+ !transpose_c ? gemmlowp::MapOrder::RowMajor
+ : gemmlowp::MapOrder::ColMajor;
+ static const gemmlowp::MapOrder LhsOrder =
+ !transpose_a ? gemmlowp::MapOrder::RowMajor
+ : gemmlowp::MapOrder::ColMajor;
+ static const gemmlowp::MapOrder RhsOrder =
+ !transpose_b ? gemmlowp::MapOrder::RowMajor
+ : gemmlowp::MapOrder::ColMajor;
+ gemmlowp::MatrixMap<const std::uint8_t, LhsOrder> lhs(
+ im2col_data_as_uint8, m, k, lda);
+ gemmlowp::MatrixMap<const std::uint8_t, RhsOrder> rhs(
+ filter_data_as_uint8, k, n, ldb);
+ gemmlowp::MatrixMap<std::int32_t, ResultOrder> result(
+ output_data_as_int32, m, n, ldc);
+ const std::tuple<> empty_pipeline = {};
+
+ auto& worker_threads =
+ *(op_context->device()->tensorflow_cpu_worker_threads());
+ TensorflowGemmContext context(worker_threads.num_threads,
+ worker_threads.workers);
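+ // gemmlowp adds the offsets it is given to every value before multiplying,
+ // so passing the negated zero points here reproduces the (value - offset)
+ // promotion used in the reference implementation above.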
+ gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t,
+ gemmlowp::DefaultL8R8BitDepthParams>(
+ &context, lhs, rhs, &result, -input_offset, -filter_offset,
+ empty_pipeline);
+ } else {
+ ReferenceGemm<T1, T2, T3>(transpose_a, transpose_b, transpose_c, m, n, k,
+ im2col_buffer.get(), input_offset, lda,
+ filter_data, filter_offset, ldb, output_data,
+ output_shift, output_offset, output_mult, ldc);
+ }
+ }
+};
+
+template <class T1, class T2, class T3,
+ template <class TF1, class TF2, class TF3> class ConvFunctor>
+class QuantizedConv2DOp : public OpKernel {
+ public:
+ explicit QuantizedConv2DOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+ OP_REQUIRES(context, strides_.size() == 4,
+ errors::InvalidArgument("Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES(context, strides_[1] == strides_[2],
+ errors::InvalidArgument(
+ "Current implementation only supports equal length "
+ "strides in the row and column dimensions."));
+ OP_REQUIRES(
+ context, (strides_[0] == 1 && strides_[3] == 1),
+ errors::InvalidArgument("Current implementation does not yet support "
+ "strides in the batch and depth dimensions."));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ // Input tensor is of the following dimensions:
+ // [ batch, in_rows, in_cols, in_depth ]
+ const Tensor& input = context->input(0);
+
+ // Input filter is of the following dimensions:
+ // [ filter_rows, filter_cols, in_depth, out_depth]
+ const Tensor& filter = context->input(1);
+
+ // For 2D convolution, there should be 4 dimensions.
+ OP_REQUIRES(context, input.dims() == 4,
+ errors::InvalidArgument("input must be 4-dimensional",
+ input.shape().DebugString()));
+ OP_REQUIRES(context, filter.dims() == 4,
+ errors::InvalidArgument("filter must be 4-dimensional: ",
+ filter.shape().DebugString()));
+
+ const float min_input = context->input(2).flat<float>()(0);
+ const float max_input = context->input(3).flat<float>()(0);
+ const float min_filter = context->input(4).flat<float>()(0);
+ const float max_filter = context->input(5).flat<float>()(0);
+ const int32 offset_input =
+ FloatToQuantizedUnclamped<T1>(0.0f, min_input, max_input);
+ const int32 offset_filter =
+ FloatToQuantizedUnclamped<T2>(0.0f, min_filter, max_filter);
+ const int32 offset_output = 0;
+ const int32 mult_output = 1;
+ const int32 shift_output = 0;
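+ // offset_input and offset_filter are the quantized values that represent
+ // 0.0f in each input's range, while offset_output, mult_output and
+ // shift_output are left at 0, 1 and 0 so the functor emits raw 32-bit
+ // accumulators; the float range of the result is reported through the
+ // min/max outputs allocated below.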
+
+ // The last dimension for input is in_depth. It must be the same as the
+ // filter's in_depth.
+ const int64 in_depth = input.dim_size(3);
+ OP_REQUIRES(
+ context, in_depth == filter.dim_size(2),
+ errors::InvalidArgument("input and filter must have the same depth: ",
+ in_depth, " vs ", filter.dim_size(2)));
+
+ // The last dimension for filter is out_depth.
+ const int64 out_depth = filter.dim_size(3);
+
+ // The second dimension for input is rows/height.
+ // The first dimension for filter is rows/height.
+ const int64 input_rows = input.dim_size(1);
+ const int64 filter_rows = filter.dim_size(0);
+
+ // The third dimension for input is columns/width.
+ // The second dimension for filter is columns/width.
+ const int64 input_cols = input.dim_size(2);
+ const int64 filter_cols = filter.dim_size(1);
+
+ // The first dimension for input is batch.
+ const int64 batch = input.dim_size(0);
+
+ // For now we take the stride from the second dimension only (we
+ // assume row = col stride, and do not support striding on the
+ // batch or depth dimension).
+ const int stride = strides_[1];
+
+ int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
+ OP_REQUIRES_OK(context,
+ GetWindowedOutputSize(input_rows, filter_rows, stride,
+ padding_, &out_rows, &pad_rows));
+ OP_REQUIRES_OK(context,
+ GetWindowedOutputSize(input_cols, filter_cols, stride,
+ padding_, &out_cols, &pad_cols));
+ CHECK_GT(batch, 0);
+ CHECK_GT(out_rows, 0);
+ CHECK_GT(out_cols, 0);
+ CHECK_GT(out_depth, 0);
+ TensorShape out_shape({batch, out_rows, out_cols, out_depth});
+
+ // Output tensor is of the following dimensions:
+ // [ in_batch, out_rows, out_cols, out_depth ]
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
+
+ // This will call different implementations (e.g. reference or optimized)
+ // depending on the template parameter.
+ ConvFunctor<T1, T2, T3> conv_functor;
+ conv_functor(context, input.flat<T1>().data(), batch, input_rows,
+ input_cols, in_depth, offset_input, filter.flat<T2>().data(),
+ filter_rows, filter_cols, out_depth, offset_filter, stride,
+ padding_, output->flat<T3>().data(), out_rows, out_cols,
+ shift_output, offset_output, mult_output);
+
+ float min_output_value;
+ float max_output_value;
+ QuantizationRangeForMultiplication<T1, T2, T3>(
+ min_input, max_input, min_filter, max_filter, &min_output_value,
+ &max_output_value);
+
+ Tensor* output_min = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(1, {}, &output_min));
+ output_min->flat<float>()(0) = min_output_value;
+
+ Tensor* output_max = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(2, {}, &output_max));
+ output_max->flat<float>()(0) = max_output_value;
+ }
+
+ private:
+ std::vector<int32> strides_;
+ Padding padding_;
+};
+
+// Right now we only support taking two eight-bit inputs and returning the
+// results as signed 32-bit integers.
+REGISTER_KERNEL_BUILDER(
+ Name("QuantizedConv2D")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<quint8>("Tinput")
+ .TypeConstraint<quint8>("Tfilter")
+ .TypeConstraint<qint32>("out_type"),
+ QuantizedConv2DOp<quint8, quint8, qint32, Im2ColConvFunctor>);
+
+} // namespace tensorflow
diff --git a/tensorflow/contrib/quantization/kernels/quantized_conv_ops_test.cc b/tensorflow/contrib/quantization/kernels/quantized_conv_ops_test.cc
new file mode 100644
index 0000000000..6a07004a92
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/quantized_conv_ops_test.cc
@@ -0,0 +1,324 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/contrib/quantization/kernels/quantization_utils.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+class QuantizedConv2DTest : public OpsTestBase {
+ protected:
+};
+
+TEST_F(QuantizedConv2DTest, Small) {
+ const int stride = 1;
+ TF_ASSERT_OK(NodeDefBuilder("quantized_conv_op", "QuantizedConv2D")
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("out_type", DataTypeToEnum<qint32>::v())
+ .Attr("strides", {1, stride, stride, 1})
+ .Attr("padding", "SAME")
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+
+ const int depth = 1;
+ const int image_width = 4;
+ const int image_height = 3;
+ const int image_batch_count = 1;
+ // The image data should always be able to represent zero, to allow a fast
+ // implementation of border padding, so we set the min value to 0.
+ const float image_min = 0.0f;
+ const float image_max = 12.0f;
+ // The image matrix is:
+ // | 1 | 2 | 3 | 4 |
+ // | 5 | 6 | 7 | 8 |
+ // | 9 | 10 | 11 | 12 |
+ Tensor image_float(DT_FLOAT,
+ {image_batch_count, image_height, image_width, depth});
+ test::FillValues<float>(&image_float,
+ {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+ Tensor image_quantized =
+ FloatTensorToQuantized<quint8>(image_float, image_min, image_max);
+
+ // The filter matrix is:
+ // | 1 | 4 | 7 |
+ // | 2 | 5 | 8 |
+ // | 3 | 6 | 9 |
+ const int filter_size = 3;
+ const int filter_count = 1;
+ const float filter_min = 1.0f;
+ const float filter_max = 9.0f;
+ Tensor filter_float(DT_FLOAT,
+ {filter_size, filter_size, depth, filter_count});
+ test::FillValues<float>(&filter_float, {1, 4, 7, 2, 5, 8, 3, 6, 9});
+ Tensor filter_quantized =
+ FloatTensorToQuantized<quint8>(filter_float, filter_min, filter_max);
+
+ AddInputFromArray<quint8>(image_quantized.shape(),
+ image_quantized.flat<quint8>());
+ AddInputFromArray<quint8>(filter_quantized.shape(),
+ filter_quantized.flat<quint8>());
+ AddInputFromArray<float>(TensorShape({1}), {image_min});
+ AddInputFromArray<float>(TensorShape({1}), {image_max});
+ AddInputFromArray<float>(TensorShape({1}), {filter_min});
+ AddInputFromArray<float>(TensorShape({1}), {filter_max});
+ TF_ASSERT_OK(RunOpKernel());
+
+ // We're sliding the 3x3 filter across the 3x4 image, with accesses outside
+ // the input set to zero because we're using the 'SAME' padding mode.
+ // The calculations behind the expected output are:
+ // (1*0)+(4*0)+(7*0)+(2*0)+(5*1)+(8*2)+(3*0)+(6*5)+(9*6)=105
+ // (1*0)+(4*0)+(7*0)+(2*1)+(5*2)+(8*3)+(3*5)+(6*6)+(9*7)=150
+ // (1*0)+(4*0)+(7*0)+(2*2)+(5*3)+(8*4)+(3*6)+(6*7)+(9*8)=183
+ // (1*0)+(4*0)+(7*0)+(2*3)+(5*4)+(8*0)+(3*7)+(6*8)+(9*0)=95
+ // (1*0)+(4*1)+(7*2)+(2*0)+(5*5)+(8*6)+(3*0)+(6*9)+(9*10)=235
+ // (1*1)+(4*2)+(7*3)+(2*5)+(5*6)+(8*7)+(3*9)+(6*10)+(9*11)=312
+ // (1*2)+(4*3)+(7*4)+(2*6)+(5*7)+(8*8)+(3*10)+(6*11)+(9*12)=357
+ // (1*3)+(4*4)+(7*0)+(2*7)+(5*8)+(8*0)+(3*11)+(6*12)+(9*0)=178
+ // (1*0)+(4*5)+(7*6)+(2*0)+(5*9)+(8*10)+(3*0)+(6*0)+(9*0)=187
+ // (1*5)+(4*6)+(7*7)+(2*9)+(5*10)+(8*11)+(3*0)+(6*0)+(9*0)=234
+ // (1*6)+(4*7)+(7*8)+(2*10)+(5*11)+(8*12)+(3*0)+(6*0)+(9*0)=261
+ // (1*7)+(4*8)+(7*0)+(2*11)+(5*12)+(8*0)+(3*0)+(6*0)+(9*0)=121
+ // This means we should end up with this matrix:
+ // | 105 | 150 | 183 | 95 |
+ // | 235 | 312 | 357 | 178 |
+ // | 187 | 234 | 261 | 121 |
+ const int expected_width = image_width;
+ const int expected_height = image_height * filter_count;
+ Tensor expected_float(
+ DT_FLOAT, TensorShape({image_batch_count, expected_height, expected_width,
+ filter_count}));
+ test::FillValues<float>(&expected_float, {105, 150, 183, 95, 235, 312, 357,
+ 178, 187, 234, 261, 121});
+ const Tensor& output_quantized = *GetOutput(0);
+ const float output_min = GetOutput(1)->flat<float>()(0);
+ const float output_max = GetOutput(2)->flat<float>()(0);
+ Tensor output_float =
+ QuantizedTensorToFloat<qint32>(output_quantized, output_min, output_max);
+ test::ExpectTensorNear<float>(expected_float, output_float, 1.0);
+}
+
+TEST_F(QuantizedConv2DTest, Small32Bit) {
+ const int stride = 1;
+ TF_ASSERT_OK(NodeDefBuilder("quantized_conv_op", "QuantizedConv2D")
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("out_type", DataTypeToEnum<qint32>::v())
+ .Attr("strides", {1, stride, stride, 1})
+ .Attr("padding", "SAME")
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+
+ const int depth = 1;
+ const int image_width = 4;
+ const int image_height = 3;
+ const int image_batch_count = 1;
+ AddInputFromArray<quint8>(
+ TensorShape({image_batch_count, image_height, image_width, depth}),
+ {10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120});
+ const int filter_size = 3;
+ const int filter_count = 1;
+ AddInputFromArray<quint8>(
+ TensorShape({filter_size, filter_size, depth, filter_count}),
+ {10, 40, 70, 20, 50, 80, 30, 60, 90});
+ AddInputFromArray<float>(TensorShape({1}), {0});
+ AddInputFromArray<float>(TensorShape({1}), {255.0f});
+ AddInputFromArray<float>(TensorShape({1}), {0});
+ AddInputFromArray<float>(TensorShape({1}), {255.0f});
+
+ TF_ASSERT_OK(RunOpKernel());
+ const int expected_width = image_width;
+ const int expected_height = image_height * filter_count;
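+ // These are the same results as the Small test, scaled up by 100 because
+ // both the image and the filter values were multiplied by 10.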
+ Tensor expected(DT_QINT32, TensorShape({image_batch_count, expected_height,
+ expected_width, filter_count}));
+ test::FillValues<qint32>(
+ &expected, {10500, 15000, 18300, 9500, 23500, 31200, 35700, 17800, 18700,
+ 23400, 26100, 12100});
+ test::ExpectTensorEqual<qint32>(expected, *GetOutput(0));
+}
+
+TEST_F(QuantizedConv2DTest, OddPadding) {
+ const int stride = 2;
+ TF_ASSERT_OK(NodeDefBuilder("quantized_conv_op", "QuantizedConv2D")
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("out_type", DataTypeToEnum<qint32>::v())
+ .Attr("strides", {1, stride, stride, 1})
+ .Attr("padding", "SAME")
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+
+ const int depth = 1;
+ const int image_width = 4;
+ const int image_height = 4;
+ const int image_batch_count = 1;
+ AddInputFromArray<quint8>(
+ TensorShape({image_batch_count, image_height, image_width, depth}),
+ {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+ const int filter_size = 3;
+ const int filter_count = 1;
+ AddInputFromArray<quint8>(
+ TensorShape({filter_size, filter_size, depth, filter_count}),
+ {1, 2, 3, 4, 5, 6, 7, 8, 9});
+ AddInputFromArray<float>(TensorShape({1}), {0});
+ AddInputFromArray<float>(TensorShape({1}), {255.0f});
+ AddInputFromArray<float>(TensorShape({1}), {0});
+ AddInputFromArray<float>(TensorShape({1}), {255.0f});
+
+ TF_ASSERT_OK(RunOpKernel());
+ const int expected_width = image_width / stride;
+ const int expected_height = (image_height * filter_count) / stride;
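+ // With a stride of 2 over a 4x4 input the filter origin offset works out to
+ // ((2 - 1) * 2 + 3 - 4) / 2 = 0, so for instance the first output value is
+ // 1*1 + 2*2 + 3*3 + 4*5 + 5*6 + 6*7 + 7*9 + 8*10 + 9*11 = 348.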
+ Tensor expected(DT_QINT32, TensorShape({image_batch_count, expected_height,
+ expected_width, filter_count}));
+ test::FillValues<qint32>(&expected, {348, 252, 274, 175});
+ test::ExpectTensorEqual<qint32>(expected, *GetOutput(0));
+}
+
+TEST_F(QuantizedConv2DTest, OddPaddingBatch) {
+ const int stride = 2;
+ TF_ASSERT_OK(NodeDefBuilder("quantized_conv_op", "QuantizedConv2D")
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("out_type", DataTypeToEnum<qint32>::v())
+ .Attr("strides", {1, stride, stride, 1})
+ .Attr("padding", "SAME")
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+
+ const int depth = 1;
+ const int image_width = 4;
+ const int image_height = 4;
+ const int image_batch_count = 3;
+ AddInputFromArray<quint8>(
+ TensorShape({image_batch_count, image_height, image_width, depth}),
+ {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+ const int filter_size = 3;
+ const int filter_count = 1;
+ AddInputFromArray<quint8>(
+ TensorShape({filter_size, filter_size, depth, filter_count}),
+ {1, 2, 3, 4, 5, 6, 7, 8, 9});
+ AddInputFromArray<float>(TensorShape({1}), {0});
+ AddInputFromArray<float>(TensorShape({1}), {255.0f});
+ AddInputFromArray<float>(TensorShape({1}), {0});
+ AddInputFromArray<float>(TensorShape({1}), {255.0f});
+
+ TF_ASSERT_OK(RunOpKernel());
+ const int expected_width = image_width / stride;
+ const int expected_height = (image_height * filter_count) / stride;
+ Tensor expected(DT_QINT32, TensorShape({image_batch_count, expected_height,
+ expected_width, filter_count}));
+ test::FillValues<qint32>(&expected, {348, 252, 274, 175, //
+ 348, 252, 274, 175, //
+ 348, 252, 274, 175});
+ test::ExpectTensorEqual<qint32>(expected, *GetOutput(0));
+}
+
+TEST_F(QuantizedConv2DTest, SmallWithNoZero) {
+ const int stride = 1;
+ TF_ASSERT_OK(NodeDefBuilder("quantized_conv_op", "QuantizedConv2D")
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("out_type", DataTypeToEnum<qint32>::v())
+ .Attr("strides", {1, stride, stride, 1})
+ .Attr("padding", "SAME")
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+ const int depth = 1;
+ const int image_width = 4;
+ const int image_height = 3;
+ const int image_batch_count = 1;
+ // Here we're testing the slow implementation path, where zero is not
+ // representable in the image data, so simple border padding isn't possible.
+ // To trigger it we use a min value greater than 0.
+ const float image_min = 1.0f;
+ const float image_max = 12.0f;
+ Tensor image_float(DT_FLOAT,
+ {image_batch_count, image_height, image_width, depth});
+ test::FillValues<float>(&image_float,
+ {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+ Tensor image_quantized =
+ FloatTensorToQuantized<quint8>(image_float, image_min, image_max);
+ const int filter_size = 3;
+ const int filter_count = 1;
+ const float filter_min = 1.0f;
+ const float filter_max = 9.0f;
+ Tensor filter_float(DT_FLOAT,
+ {filter_size, filter_size, depth, filter_count});
+ test::FillValues<float>(&filter_float, {1, 4, 7, 2, 5, 8, 3, 6, 9});
+ Tensor filter_quantized =
+ FloatTensorToQuantized<quint8>(filter_float, filter_min, filter_max);
+ AddInputFromArray<quint8>(image_quantized.shape(),
+ image_quantized.flat<quint8>());
+ AddInputFromArray<quint8>(filter_quantized.shape(),
+ filter_quantized.flat<quint8>());
+ AddInputFromArray<float>(TensorShape({1}), {image_min});
+ AddInputFromArray<float>(TensorShape({1}), {image_max});
+ AddInputFromArray<float>(TensorShape({1}), {filter_min});
+ AddInputFromArray<float>(TensorShape({1}), {filter_max});
+ TF_ASSERT_OK(RunOpKernel());
+ const int expected_width = image_width;
+ const int expected_height = image_height * filter_count;
+ Tensor expected_float(
+ DT_FLOAT, TensorShape({image_batch_count, expected_height, expected_width,
+ filter_count}));
+ test::FillValues<float>(&expected_float, {105, 150, 183, 95, 235, 312, 357,
+ 178, 187, 234, 261, 121});
+ const Tensor& output_quantized = *GetOutput(0);
+ const float output_min = GetOutput(1)->flat<float>()(0);
+ const float output_max = GetOutput(2)->flat<float>()(0);
+ Tensor output_float =
+ QuantizedTensorToFloat<qint32>(output_quantized, output_min, output_max);
+ test::ExpectTensorNear<float>(expected_float, output_float, 1.0);
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/contrib/quantization/kernels/quantized_matmul_op.cc b/tensorflow/contrib/quantization/kernels/quantized_matmul_op.cc
new file mode 100644
index 0000000000..18de2d1d97
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/quantized_matmul_op.cc
@@ -0,0 +1,186 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Implements a quantized eight-bit version of the matmul operation.
+
+#include "public/gemmlowp.h"
+#include "tensorflow/contrib/quantization/kernels/quantization_utils.h"
+#include "tensorflow/contrib/quantization/kernels/reference_gemm.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+// We have to break this out as a separate function because there are multiple
+// combinations of transpose attributes we need to support, and they have to be
+// compile-time constants to work with the templates used internally.
+template <bool TransposeA, bool TransposeB, bool TransposeC>
+void GemmlowpMultiply(OpKernelContext* op_context, const quint8* a_data,
+ const quint8* b_data, qint32* c_data, int m, int n, int k,
+ int offset_a, int offset_b, int lda, int ldb, int ldc) {
+ const uint8* a_data_as_uint8 = &(a_data->value);
+ const uint8* b_data_as_uint8 = &(b_data->value);
+ int32* c_data_as_int32 = &(c_data->value);
+ static const gemmlowp::MapOrder ResultOrder =
+ !TransposeC ? gemmlowp::MapOrder::RowMajor : gemmlowp::MapOrder::ColMajor;
+ static const gemmlowp::MapOrder LhsOrder =
+ !TransposeA ? gemmlowp::MapOrder::RowMajor : gemmlowp::MapOrder::ColMajor;
+ static const gemmlowp::MapOrder RhsOrder =
+ !TransposeB ? gemmlowp::MapOrder::RowMajor : gemmlowp::MapOrder::ColMajor;
+ gemmlowp::MatrixMap<const std::uint8_t, LhsOrder> lhs(a_data_as_uint8, m, k,
+ lda);
+ gemmlowp::MatrixMap<const std::uint8_t, RhsOrder> rhs(b_data_as_uint8, k, n,
+ ldb);
+ gemmlowp::MatrixMap<std::int32_t, ResultOrder> result(c_data_as_int32, m, n,
+ ldc);
+ const std::tuple<> empty_pipeline = {};
+ auto& worker_threads =
+ *(op_context->device()->tensorflow_cpu_worker_threads());
+ TensorflowGemmContext context(worker_threads.num_threads,
+ worker_threads.workers);
+ gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t,
+ gemmlowp::DefaultL8R8BitDepthParams>(
+ &context, lhs, rhs, &result, -offset_a, -offset_b, empty_pipeline);
+}
+
+template <class T1, class T2, class Toutput>
+class QuantizedMatMulOp : public OpKernel {
+ public:
+ explicit QuantizedMatMulOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("transpose_a", &transpose_a_));
+ OP_REQUIRES_OK(context, context->GetAttr("transpose_b", &transpose_b_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& a = context->input(0);
+ const Tensor& b = context->input(1);
+ const float min_a = context->input(2).flat<float>()(0);
+ const float max_a = context->input(3).flat<float>()(0);
+ const float min_b = context->input(4).flat<float>()(0);
+ const float max_b = context->input(5).flat<float>()(0);
+
+ // Make sure that we have valid quantization ranges for the input buffers.
+ // If the difference between the min and max is negative or zero, it makes
+ // it hard to do meaningful intermediate operations on the values.
+ OP_REQUIRES(context, (max_a > min_a),
+ errors::InvalidArgument("max_a must be larger than min_a."));
+ OP_REQUIRES(context, (max_b > min_b),
+ errors::InvalidArgument("max_b must be larger than min_b."));
+ const int32 offset_a = FloatToQuantizedUnclamped<T1>(0.0f, min_a, max_a);
+ const int32 offset_b = FloatToQuantizedUnclamped<T2>(0.0f, min_b, max_b);
+ const int32 offset_c = 0;
+ const int32 mult_c = 1;
+ const int32 shift_c = 0;
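+ // offset_a and offset_b are the quantized values that represent 0.0f in
+ // each input's range, while offset_c, mult_c and shift_c are left at 0, 1
+ // and 0 so the multiply produces raw 32-bit accumulators; the float range
+ // of the result is reported through the min/max outputs at the end.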
+
+ // Check that the dimensions of the two matrices are valid.
+ OP_REQUIRES(context, TensorShapeUtils::IsMatrix(a.shape()),
+ errors::InvalidArgument("In[0] is not a matrix"));
+ OP_REQUIRES(context, TensorShapeUtils::IsMatrix(b.shape()),
+ errors::InvalidArgument("In[1] is not a matrix"));
+ Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
+ dim_pair[0].first = transpose_a_ ? 0 : 1;
+ dim_pair[0].second = transpose_b_ ? 1 : 0;
+
+ OP_REQUIRES(context,
+ a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second),
+ errors::InvalidArgument("Matrix size-compatible: In[0]: ",
+ a.shape().DebugString(), ", In[1]: ",
+ b.shape().DebugString()));
+
+ OP_REQUIRES(context, ((shift_c >= 0) && (shift_c <= 31)),
+ errors::InvalidArgument("shift_c must be between 0 and 31, "
+ "inclusive."));
+
+ int a_dim_remaining = 1 - dim_pair[0].first;
+ int b_dim_remaining = 1 - dim_pair[0].second;
+ TensorShape out_shape(
+ {a.dim_size(a_dim_remaining), b.dim_size(b_dim_remaining)});
+ Tensor* c = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &c));
+ CHECK(c);
+
+ const T1* a_data = a.flat<T1>().data();
+ const T2* b_data = b.flat<T2>().data();
+ Toutput* c_data = c->flat<Toutput>().data();
+
+ const bool transpose_c = false;
+ const size_t m = a.dim_size(a_dim_remaining);
+ const size_t n = b.dim_size(b_dim_remaining);
+ const size_t k = a.dim_size(dim_pair[0].first);
+ const size_t lda = a.dim_size(1);
+ const size_t ldb = b.dim_size(1);
+ const size_t ldc = n;
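+ // For example, multiplying a 2x3 a by a 3x4 b with no transposes (as in the
+ // Small_NoParams unit test) gives m = 2, n = 4, k = 3, lda = 3, ldb = 4 and
+ // ldc = 4.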
+
+ // The gemmlowp optimized library only works for a particular set of data
+ // types, so check if we meet those requirements and fall back to a slower
+ // reference implementation if not.
+ if (std::is_same<T1, quint8>() && std::is_same<T2, quint8>() &&
+ std::is_same<Toutput, qint32>() && (offset_c == 0) && (mult_c == 1) &&
+ (shift_c == 0) && (transpose_c == false)) {
+ if (transpose_a_) {
+ if (transpose_b_) {
+ GemmlowpMultiply<true, true, false>(context, a_data, b_data, c_data,
+ m, n, k, offset_a, offset_b, lda,
+ ldb, ldc);
+ } else {
+ GemmlowpMultiply<true, false, false>(context, a_data, b_data, c_data,
+ m, n, k, offset_a, offset_b, lda,
+ ldb, ldc);
+ }
+ } else {
+ if (transpose_b_) {
+ GemmlowpMultiply<false, true, false>(context, a_data, b_data, c_data,
+ m, n, k, offset_a, offset_b, lda,
+ ldb, ldc);
+ } else {
+ GemmlowpMultiply<false, false, false>(context, a_data, b_data, c_data,
+ m, n, k, offset_a, offset_b,
+ lda, ldb, ldc);
+ }
+ }
+ } else {
+ ReferenceGemm<T1, T2, Toutput>(
+ transpose_a_, transpose_b_, transpose_c, m, n, k, a_data, offset_a,
+ lda, b_data, offset_b, ldb, c_data, shift_c, offset_c, mult_c, ldc);
+ }
+
+ float min_c_value;
+ float max_c_value;
+ QuantizationRangeForMultiplication<T1, T2, Toutput>(
+ min_a, max_a, min_b, max_b, &min_c_value, &max_c_value);
+ Tensor* c_min = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(1, {}, &c_min));
+ c_min->flat<float>()(0) = min_c_value;
+
+ Tensor* c_max = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(2, {}, &c_max));
+ c_max->flat<float>()(0) = max_c_value;
+ }
+
+ private:
+ bool transpose_a_;
+ bool transpose_b_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("QuantizedMatMul")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<quint8>("T1")
+ .TypeConstraint<quint8>("T2")
+ .TypeConstraint<qint32>("Toutput"),
+ QuantizedMatMulOp<quint8, quint8, qint32>);
+
+} // namespace tensorflow
diff --git a/tensorflow/contrib/quantization/kernels/quantized_matmul_op_test.cc b/tensorflow/contrib/quantization/kernels/quantized_matmul_op_test.cc
new file mode 100644
index 0000000000..3eea751818
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/quantized_matmul_op_test.cc
@@ -0,0 +1,336 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/contrib/quantization/kernels/quantization_utils.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+class QuantizedMatMulTest : public OpsTestBase {
+ protected:
+};
+
+// Runs two small matrices through the operator, and leaves all the parameters
+// at their default values.
+TEST_F(QuantizedMatMulTest, Small_NoParams) {
+ TF_ASSERT_OK(NodeDefBuilder("quantized_mat_mul_op", "QuantizedMatMul")
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("Toutput", DataTypeToEnum<qint32>::v())
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+ // A matrix is:
+ // | 1 | 2 | 3 |
+ // | 4 | 5 | 6 |
+ AddInputFromArray<quint8>(TensorShape({2, 3}), {1, 2, 3, 4, 5, 6});
+ // B matrix is:
+ // | 7 | 8 | 9 | 10 |
+ // | 11 | 12 | 13 | 14 |
+ // | 15 | 16 | 17 | 18 |
+ AddInputFromArray<quint8>(TensorShape({3, 4}),
+ {7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18});
+ AddInputFromArray<float>(TensorShape({1}), {0});
+ AddInputFromArray<float>(TensorShape({1}), {255.0f});
+ AddInputFromArray<float>(TensorShape({1}), {0});
+ AddInputFromArray<float>(TensorShape({1}), {255.0f});
+
+ TF_ASSERT_OK(RunOpKernel());
+ // Here are the results we expect, from hand calculations:
+ // (1 * 7) + (2 * 11) + (3 * 15) = 74
+ // (1 * 8) + (2 * 12) + (3 * 16) = 80
+ // (1 * 9) + (2 * 13) + (3 * 17) = 86
+ // (1 * 10) + (2 * 14) + (3 * 18) = 92
+ // (4 * 7) + (5 * 11) + (6 * 15) = 173
+ // (4 * 8) + (5 * 12) + (6 * 16) = 188
+ // (4 * 9) + (5 * 13) + (6 * 17) = 203
+ // (4 * 10) + (5 * 14) + (6 * 18) = 218
+ Tensor expected(allocator(), DT_QINT32, TensorShape({2, 4}));
+ test::FillValues<qint32>(&expected, {74, 80, 86, 92, 173, 188, 203, 218});
+ test::ExpectTensorEqual<qint32>(expected, *GetOutput(0));
+}
+
+// This test multiplies two 1x1 eight-bit matrices, and compares the
+// results with hand-calculated expectations.
+TEST_F(QuantizedMatMulTest, VerySmall_WithParams) {
+ // These parameters reflect a typical production usage of eight-bit matmuls
+ // in an Inception-style network.
+ const bool transpose_a = true;
+ const int a_rows = 1;
+ const int a_cols = 1;
+ const int b_rows = 1;
+ const int b_cols = 1;
+ const bool transpose_b = false;
+ TF_ASSERT_OK(NodeDefBuilder("quantized_mat_mul_op", "QuantizedMatMul")
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("Toutput", DataTypeToEnum<qint32>::v())
+ .Attr("transpose_a", transpose_a)
+ .Attr("transpose_b", transpose_b)
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+ // The A matrix is:
+ // | -1 |
+ // The input array only contains unsigned bytes, so we specify the actual
+ // values as n + a_offset, where a_offset is 12 (the quantized value that
+ // represents 0.0f given the min of -12.0f and max of 243.0f used below).
+ // For example, that means -1 is represented as -1 + 12, or 11.
+ // We have set the transpose_a flag to true, so the matrix is transposed, and
+ // when filling in the values the in-memory storage order is effectively
+ // column major, rather than the default row-major.
+ AddInputFromArray<quint8>(TensorShape({a_rows, a_cols}), {11});
+
+ // The B matrix is:
+ // | 1 |
+ AddInputFromArray<quint8>(TensorShape({b_rows, b_cols}), {0});
+ AddInputFromArray<float>(TensorShape({1}), {-12.0f});
+ AddInputFromArray<float>(TensorShape({1}), {243.0f});
+ AddInputFromArray<float>(TensorShape({1}), {1.0f});
+ AddInputFromArray<float>(TensorShape({1}), {256.0f});
+ TF_ASSERT_OK(RunOpKernel());
+ // We're requesting C = A.transposed() * B,
+ // so we expect to get these results:
+ // 1*-1 = -1
+ // | -1 |
+ Tensor expected(allocator(), DT_QINT32, TensorShape({a_cols, b_cols}));
+ test::FillValues<qint32>(&expected, {-1});
+ test::ExpectTensorEqual<qint32>(expected, *GetOutput(0));
+}
+
+// This test multiplies two 1x1 eight-bit matrices, but sets an invalid
+// quantization range, so we expect to get an error when we run it.
+TEST_F(QuantizedMatMulTest, VerySmall_BadRange) {
+ // These parameters reflect a typical production usage of eight-bit matmuls
+ // in an Inception-style network.
+ const bool transpose_a = true;
+ const int a_rows = 1;
+ const int a_cols = 1;
+ const int b_rows = 1;
+ const int b_cols = 1;
+ const bool transpose_b = false;
+ TF_ASSERT_OK(NodeDefBuilder("quantized_mat_mul_op", "QuantizedMatMul")
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("Toutput", DataTypeToEnum<qint32>::v())
+ .Attr("transpose_a", transpose_a)
+ .Attr("transpose_b", transpose_b)
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+ // The A matrix is:
+ // | -1 |
+ AddInputFromArray<quint8>(TensorShape({a_rows, a_cols}), {11});
+
+ // The B matrix is:
+ // | 1 |
+ AddInputFromArray<quint8>(TensorShape({b_rows, b_cols}), {0});
+ AddInputFromArray<float>(TensorShape({1}), {-12.0f});
+ AddInputFromArray<float>(TensorShape({1}), {243.0f});
+ // Here we set the range so that the min and max are equal, so we expect to
+ // see an error when we run.
+ AddInputFromArray<float>(TensorShape({1}), {1.0f});
+ AddInputFromArray<float>(TensorShape({1}), {1.0f});
+ EXPECT_EQ(::tensorflow::error::INVALID_ARGUMENT, RunOpKernel().code());
+}
+
+// This test multiplies a couple of small 8-bit matrices, and compares the
+// results with hand-calculated expectations. It uses shifts and offsets to
+// control the range of the outputs.
+TEST_F(QuantizedMatMulTest, Small_WithParams) {
+ // These parameters reflect a typical production usage of eight-bit matmuls
+ // in an Inception-style network.
+ const bool transpose_a = true;
+ const int a_rows = 3;
+ const int a_cols = 4;
+ const int b_rows = 3;
+ const int b_cols = 2;
+ const bool transpose_b = false;
+ TF_ASSERT_OK(NodeDefBuilder("quantized_mat_mul_op", "QuantizedMatMul")
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("Toutput", DataTypeToEnum<qint32>::v())
+ .Attr("transpose_a", transpose_a)
+ .Attr("transpose_b", transpose_b)
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+ // The A matrix is:
+ // | -1 | -5 | -9 |
+ // | -2 | -6 | -10 |
+ // | -3 | -7 | -11 |
+ // | -4 | -8 | -12 |
+ // The input array only contains unsigned bytes, so we specify the actual
+ // values as n + a_offset, where a_offset is 12 (the quantized value that
+ // represents 0.0f given the min of -12.0f and max of 243.0f used below).
+ // For example, that means -1 is represented as -1 + 12, or 11.
+ // We have set the transpose_a flag to true, so the matrix is transposed, and
+ // when filling in the values the in-memory storage order is effectively
+ // column major, rather than the default row-major.
+ AddInputFromArray<quint8>(TensorShape({a_rows, a_cols}),
+ {
+ 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ });
+
+ // The B matrix is:
+ // | 1 | 4 |
+ // | 2 | 5 |
+ // | 3 | 6 |
+ AddInputFromArray<quint8>(TensorShape({b_rows, b_cols}), {
+ 1, 4, 2, 5, 3, 6,
+ });
+ AddInputFromArray<float>(TensorShape({1}), {-12.0f});
+ AddInputFromArray<float>(TensorShape({1}), {243.0f});
+ AddInputFromArray<float>(TensorShape({1}), {0});
+ AddInputFromArray<float>(TensorShape({1}), {255.0f});
+ TF_ASSERT_OK(RunOpKernel());
+ // We're requesting C = A.transposed() * B,
+ // so we expect to get these results:
+ // 1*-1 + 2*-5 + 3*-9 = -38
+ // 4*-1 + 5*-5 + 6*-9 = -83
+ // 1*-2 + 2*-6 + 3*-10 = -44
+ // 4*-2 + 5*-6 + 6*-10 = -98
+ // 1*-3 + 2*-7 + 3*-11 = -50
+ // 4*-3 + 5*-7 + 6*-11 = -113
+ // 1*-4 + 2*-8 + 3*-12 = -56
+ // 4*-4 + 5*-8 + 6*-12 = -128
+ // | -38 | -83 |
+ // | -44 | -98 |
+ // | -50 | -113 |
+ // | -56 | -128 |
+ Tensor expected(allocator(), DT_QINT32, TensorShape({a_cols, b_cols}));
+ test::FillValues<qint32>(&expected,
+ {
+ -38, -83, -44, -98, -50, -113, -56, -128,
+ });
+ test::ExpectTensorEqual<qint32>(expected, *GetOutput(0));
+}
+
+// This test multiplies a couple of medium-sized 8-bit matrices, and tests the
+// results against what we saw from running a float MatMul with equivalent
+// inputs.
+TEST_F(QuantizedMatMulTest, Medium_WithParams) {
+ const bool transpose_a = true;
+ const bool transpose_b = false;
+ TF_ASSERT_OK(NodeDefBuilder("quantized_mat_mul_op", "QuantizedMatMul")
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("Toutput", DataTypeToEnum<qint32>::v())
+ .Attr("transpose_a", transpose_a)
+ .Attr("transpose_b", transpose_b)
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+
+ const int a_rows = 8;
+ const int a_cols = 8;
+ const float a_min = -2164.25f;
+ const float a_max = 2006.27f;
+ Tensor a_float(DT_FLOAT, {a_rows, a_cols});
+ test::FillValues<float>(
+ &a_float,
+ {-1014.12, -157.382, -810.17, 1435.28, 1016.37, 219.684, -316.054,
+ -2164.25, 2006.27, -547.444, 857.376, 404.376, 9.72115, 332.588,
+ 194.385, -286.57, 26.062, 23.1125, 110.436, 247.055, -127.683,
+ -376.275, -124.81, -846.826, -77.1507, 305.581, -202.747, 12.9528,
+ 9.64886, 872.686, 40.9069, 197.816, 44.16, -306.768, -1457.52,
+ -368.939, -1049.42, -486.353, 1745.87, 95.7695, 395.773, -254.333,
+ -404.27, 787.16, -2.44114, 199.37, -1024.08, 784.901, 235.055,
+ -42.7295, 241.498, -245.365, 470.763, 186.159, 186.579, -220.163,
+ 1304.58, 386.272, -358.853, -755.996, 360.109, -866.007, 55.2828,
+ -508.801});
+ Tensor a_quantized = FloatTensorToQuantized<quint8>(a_float, a_min, a_max);
+
+ const int b_rows = 8;
+ const int b_cols = 8;
+ const float b_min = -0.739539f;
+ const float b_max = 0.641057f;
+ Tensor b_float(DT_FLOAT, {b_rows, b_cols});
+ test::FillValues<float>(
+ &b_float,
+ {-0.294619, -0.0670519, 0.261507, -0.126274, 0.127229, -0.176945,
+ -0.251223, 0.231086, 0.453694, 0.415666, -0.288733, 0.508717,
+ 0.211551, 0.0435907, -0.582383, -0.308779, 0.0696883, -0.438122,
+ 0.114, 0.433964, 0.109883, 0.284931, -0.149661, 0.108657,
+ 0.458333, -0.130231, -0.35805, -0.123206, -0.437968, 0.0282411,
+ 0.628818, -0.0522173, -0.0233403, 0.124863, 0.217165, 0.262294,
+ -0.171005, -0.254693, -0.200433, -0.287354, 0.488166, -0.0354688,
+ -0.118091, -0.590444, 0.491537, -0.739539, 0.083117, 0.282482,
+ 0.275269, -0.36574, 0.107476, 0.0511428, -0.136887, -0.0149852,
+ -0.259694, 0.641057, 0.264054, -0.295126, -0.0218791, 0.361211,
+ 0.012448, 0.0709718, -0.392394, -0.434215});
+ Tensor b_quantized = FloatTensorToQuantized<quint8>(b_float, b_min, b_max);
+
+ AddInputFromArray<quint8>(a_quantized.shape(), a_quantized.flat<quint8>());
+ AddInputFromArray<quint8>(b_quantized.shape(), b_quantized.flat<quint8>());
+ AddInputFromArray<float>(TensorShape({1}), {a_min});
+ AddInputFromArray<float>(TensorShape({1}), {a_max});
+ AddInputFromArray<float>(TensorShape({1}), {b_min});
+ AddInputFromArray<float>(TensorShape({1}), {b_max});
+ TF_ASSERT_OK(RunOpKernel());
+
+ Tensor expected_float(DT_FLOAT, {a_cols, b_cols});
+ test::FillValues<float>(
+ &expected_float,
+ {1776.82f, 421.058f, -854.308f, 1430.65f, 503.105f, 57.2744f,
+ -1514.97f, -1163.66f, -87.0979f, -394.577f, -39.4983f, -79.1938f,
+ -329.029f, 313.475f, 446.929f, -59.5855f, 350.837f, 238.655f,
+ -609.21f, 350.499f, 192.238f, 847.576f, -103.177f, 185.886f,
+ -90.5335f, 200.787f, 99.1981f, -717.076f, 763.815f, -703.726f,
+ -125.164f, 732.325f, -51.5303f, -418.826f, 60.0783f, -299.658f,
+ 231.41f, 72.0622f, -289.244f, 663.776f, 391.177f, 294.415f,
+ -484.148f, -677.932f, -180.342f, -194.764f, 761.715f, 553.061f,
+ -283.355f, 321.109f, 351.269f, 1171.7f, -857.497f, 343.804f,
+ -494.599f, -844.119f, 725.237f, 586.052f, -735.013f, -897.723f,
+ -122.434f, -502.907f, 1264.6f, -239.991f});
+
+ const Tensor& output_quantized = *GetOutput(0);
+ const float output_min = GetOutput(1)->flat<float>()(0);
+ const float output_max = GetOutput(2)->flat<float>()(0);
+ Tensor output_float =
+ QuantizedTensorToFloat<qint32>(output_quantized, output_min, output_max);
+ test::ExpectTensorNear<float>(expected_float, output_float, 15.0);
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/contrib/quantization/kernels/quantized_pooling_ops.cc b/tensorflow/contrib/quantization/kernels/quantized_pooling_ops.cc
new file mode 100644
index 0000000000..33a12c4746
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/quantized_pooling_ops.cc
@@ -0,0 +1,135 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/pooling_ops_common.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, typename T>
+class QuantizedAvgPoolingOp : public OpKernel {
+ public:
+ explicit QuantizedAvgPoolingOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument("Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument("Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in = context->input(0);
+ PoolParameters params{context, ksize_, stride_,
+ padding_, FORMAT_NHWC, tensor_in.shape()};
+ if (!context->status().ok()) {
+ return;
+ }
+
+ const float min_input = context->input(1).flat<float>()(0);
+ const float max_input = context->input(2).flat<float>()(0);
+
+ OP_REQUIRES(context, params.depth_window == 1,
+ errors::Unimplemented("Non-spatial pooling is not "
+ "yet supported. Volunteers? :)"));
+
+ OP_REQUIRES(context, tensor_in.dims() == 4,
+ errors::InvalidArgument("tensor_in must be 4-dimensional"));
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 0, params.forward_output_shape(), &output));
+ const int32 highest = static_cast<int32>(Eigen::NumTraits<T>::highest());
+ const int32 lowest = static_cast<int32>(Eigen::NumTraits<T>::lowest());
+
+ // TODO(vrv): Switch this to the Eigen::Tensor version of
+ // SpatialAvgPooling once that version is running quickly.
+ Tensor int32_output(DT_INT32, params.forward_output_shape());
+ // Cast input to int32 tensor and call SpatialAvgPool.
+ Tensor int32_input(DT_INT32, tensor_in.shape());
+ int32_input.flat<int32>() = tensor_in.flat<T>().template cast<int32>();
+ SpatialAvgPool<Device, int32>(context, &int32_output, int32_input, params,
+ padding_);
+
+ // Clamp the int32 output back into quantized space.
+ output->flat<T>() = int32_output.flat<int32>()
+ .cwiseMax(lowest)
+ .cwiseMin(highest)
+ .template cast<T>();
+
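+ // Averaging is linear in the quantized values and can't produce a result
+ // outside the input's range, so the input min/max can be passed through to
+ // the outputs unchanged.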
+ Tensor* output_min = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(1, {}, &output_min));
+ output_min->flat<float>()(0) = min_input;
+ Tensor* output_max = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(2, {}, &output_max));
+ output_max->flat<float>()(0) = max_input;
+ }
+
+ private:
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+};
+
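+// Max pooling only ever selects values that already exist in the input, so
+// the standard MaxPoolingOp can be reused directly on the quantized data and
+// the input min/max range passed straight through to the outputs.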
+template <typename Device, typename T>
+class QuantizedMaxPoolingOp : public MaxPoolingOp<Device, T> {
+ public:
+ explicit QuantizedMaxPoolingOp(OpKernelConstruction* context)
+ : MaxPoolingOp<Device, T>(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const float min_input = context->input(1).flat<float>()(0);
+ const float max_input = context->input(2).flat<float>()(0);
+ MaxPoolingOp<Device, T>::Compute(context);
+ Tensor* output_min = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(1, {}, &output_min));
+ output_min->flat<float>()(0) = min_input;
+ Tensor* output_max = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(2, {}, &output_max));
+ output_max->flat<float>()(0) = max_input;
+ }
+};
+
+REGISTER_KERNEL_BUILDER(
+ Name("QuantizedAvgPool").Device(DEVICE_CPU).TypeConstraint<quint8>("T"),
+ QuantizedAvgPoolingOp<CPUDevice, quint8>);
+
+REGISTER_KERNEL_BUILDER(
+ Name("QuantizedMaxPool").Device(DEVICE_CPU).TypeConstraint<quint8>("T"),
+ QuantizedMaxPoolingOp<CPUDevice, quint8>);
+
+} // namespace tensorflow
diff --git a/tensorflow/contrib/quantization/kernels/quantized_pooling_ops_test.cc b/tensorflow/contrib/quantization/kernels/quantized_pooling_ops_test.cc
new file mode 100644
index 0000000000..3bc05ed455
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/quantized_pooling_ops_test.cc
@@ -0,0 +1,127 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/quantization/kernels/quantization_utils.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+class QuantizedPoolingTest : public OpsTestBase {
+ protected:
+};
+
+TEST_F(QuantizedPoolingTest, SmallAveragePooling) {
+ const int ksize = 2;
+ const int stride = 2;
+ TF_ASSERT_OK(NodeDefBuilder("quantized_avg_pool_op", "QuantizedAvgPool")
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("T", DataTypeToEnum<quint8>::v())
+ .Attr("ksize", {1, ksize, ksize, 1})
+ .Attr("strides", {1, stride, stride, 1})
+ .Attr("padding", "SAME")
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+ const float input_min = 0.0f;
+ const float input_max = 255.0f;
+ const int input_height = 4;
+ const int input_width = 4;
+ const int input_channels = 2;
+ Tensor input_float(DT_FLOAT, {1, input_height, input_width, input_channels});
+ test::FillValues<float>(
+ &input_float,
+ {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32});
+ Tensor input_quantized =
+ FloatTensorToQuantized<quint8>(input_float, input_min, input_max);
+
+ const int expected_width = input_width / stride;
+ const int expected_height = input_height / stride;
+ Tensor expected_float(DT_FLOAT,
+ {1, expected_height, expected_width, input_channels});
+ test::FillValues<float>(&expected_float, {6, 7, 10, 11, 22, 23, 26, 27});
+
+ AddInputFromArray<quint8>(input_quantized.shape(),
+ input_quantized.flat<quint8>());
+ AddInputFromArray<float>(TensorShape({1}), {input_min});
+ AddInputFromArray<float>(TensorShape({1}), {input_max});
+ TF_ASSERT_OK(RunOpKernel());
+ const Tensor& output_quantized = *GetOutput(0);
+ const float output_min = GetOutput(1)->flat<float>()(0);
+ const float output_max = GetOutput(2)->flat<float>()(0);
+ Tensor output_float =
+ QuantizedTensorToFloat<quint8>(output_quantized, output_min, output_max);
+ test::ExpectTensorNear<float>(expected_float, output_float, 0.2);
+}
+
+TEST_F(QuantizedPoolingTest, SmallMaxPooling) {
+ const int ksize = 2;
+ const int stride = 2;
+ TF_ASSERT_OK(NodeDefBuilder("quantized_max_pool_op", "QuantizedMaxPool")
+ .Input(FakeInput(DT_QUINT8))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("T", DataTypeToEnum<quint8>::v())
+ .Attr("ksize", {1, ksize, ksize, 1})
+ .Attr("strides", {1, stride, stride, 1})
+ .Attr("padding", "SAME")
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+ const float input_min = 0.0f;
+ const float input_max = 255.0f;
+ const int input_height = 4;
+ const int input_width = 4;
+ const int input_channels = 2;
+ Tensor input_float(DT_FLOAT, {1, input_height, input_width, input_channels});
+ test::FillValues<float>(
+ &input_float,
+ {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32});
+ Tensor input_quantized =
+ FloatTensorToQuantized<quint8>(input_float, input_min, input_max);
+
+ const int expected_width = input_width / stride;
+ const int expected_height = input_height / stride;
+ Tensor expected_float(DT_FLOAT,
+ {1, expected_height, expected_width, input_channels});
+ test::FillValues<float>(&expected_float, {11, 12, 15, 16, 27, 28, 31, 32});
+
+ AddInputFromArray<quint8>(input_quantized.shape(),
+ input_quantized.flat<quint8>());
+ AddInputFromArray<float>(TensorShape({1}), {input_min});
+ AddInputFromArray<float>(TensorShape({1}), {input_max});
+ TF_ASSERT_OK(RunOpKernel());
+ const Tensor& output_quantized = *GetOutput(0);
+ const float output_min = GetOutput(1)->flat<float>()(0);
+ const float output_max = GetOutput(2)->flat<float>()(0);
+ Tensor output_float =
+ QuantizedTensorToFloat<quint8>(output_quantized, output_min, output_max);
+ test::ExpectTensorNear<float>(expected_float, output_float, 0.2);
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/contrib/quantization/kernels/reference_gemm.h b/tensorflow/contrib/quantization/kernels/reference_gemm.h
new file mode 100644
index 0000000000..5af3a77128
--- /dev/null
+++ b/tensorflow/contrib/quantization/kernels/reference_gemm.h
@@ -0,0 +1,90 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_QUANTIZATION_KERNELS_REFERENCE_GEMM_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_QUANTIZATION_KERNELS_REFERENCE_GEMM_H_
+
+// This is an unoptimized but debuggable implementation of the GEMM matrix
+// multiply function, used to compare to faster but more opaque versions, or
+// for bit depths or argument combinations that aren't supported by optimized
+// code.
+// It assumes the row-major convention used by TensorFlow, and implements
+// C = A * B, like the standard BLAS GEMM interface. If the transpose flags are
+// true, then the relevant matrix is treated as stored in column-major order.
+
+namespace tensorflow {
+template <class T1, class T2, class T3>
+void ReferenceGemm(bool transpose_a, bool transpose_b, bool transpose_c,
+ size_t m, size_t n, size_t k, const T1* a, int32 offset_a,
+ size_t lda, const T2* b, int32 offset_b, size_t ldb, T3* c,
+ int32 shift_c, int32 offset_c, int32 mult_c, size_t ldc) {
+ int a_i_stride;
+ int a_l_stride;
+ if (transpose_a) {
+ a_i_stride = 1;
+ a_l_stride = lda;
+ } else {
+ a_i_stride = lda;
+ a_l_stride = 1;
+ }
+ int b_j_stride;
+ int b_l_stride;
+ if (transpose_b) {
+ b_j_stride = ldb;
+ b_l_stride = 1;
+ } else {
+ b_j_stride = 1;
+ b_l_stride = ldb;
+ }
+ int c_i_stride;
+ int c_j_stride;
+ if (transpose_c) {
+ c_i_stride = 1;
+ c_j_stride = ldc;
+ } else {
+ c_i_stride = ldc;
+ c_j_stride = 1;
+ }
+
+ const int32 highest = static_cast<int32>(Eigen::NumTraits<T3>::highest());
+ const int32 lowest = static_cast<int32>(Eigen::NumTraits<T3>::lowest());
+ const int32 rounding = (shift_c < 1) ? 0 : (1 << (shift_c - 1));
+
+ int i, j, l;
+ for (j = 0; j < n; j++) {
+ for (i = 0; i < m; i++) {
+ int32 total = 0;
+ for (l = 0; l < k; l++) {
+ const size_t a_index = ((i * a_i_stride) + (l * a_l_stride));
+ const int32 a_value = static_cast<int32>(a[a_index]) - offset_a;
+ const size_t b_index = ((j * b_j_stride) + (l * b_l_stride));
+ const int32 b_value = static_cast<int32>(b[b_index]) - offset_b;
+ total += (a_value * b_value);
+ }
+ const size_t c_index = ((i * c_i_stride) + (j * c_j_stride));
+ int32_t output = ((((total + offset_c) * mult_c) + rounding) >> shift_c);
+ if (output > highest) {
+ output = highest;
+ }
+ if (output < lowest) {
+ output = lowest;
+ }
+ c[c_index] = static_cast<T3>(output);
+ }
+ }
+}
+} // namespace tensorflow
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_QUANTIZATION_KERNELS_REFERENCE_GEMM_H_
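As a cross-check of the comment block at the top of this header, here is a NumPy transcription of the same arithmetic for the untransposed, row-major case: subtract the per-matrix offsets, accumulate, then apply the output offset, multiplier, rounding shift, and clamp. It is only an illustration of the formula; the header itself remains the reference (note it accumulates in int32, while int64 is used below purely to sidestep NumPy overflow).

```python
import numpy as np

def reference_gemm(a, offset_a, b, offset_b, offset_c, mult_c, shift_c,
                   lowest, highest):
    """Row-major, no-transpose transcription of ReferenceGemm's arithmetic."""
    # Remove the zero-point offsets and accumulate the integer products.
    total = (a.astype(np.int64) - offset_a) @ (b.astype(np.int64) - offset_b)
    rounding = 0 if shift_c < 1 else 1 << (shift_c - 1)
    output = ((total + offset_c) * mult_c + rounding) >> shift_c
    return np.clip(output, lowest, highest)

a = np.array([[1, 2], [3, 4]], dtype=np.uint8)
b = np.array([[5, 6], [7, 8]], dtype=np.uint8)
# Zero offsets, unit multiplier, no shift: reduces to a plain integer matmul.
print(reference_gemm(a, 0, b, 0, 0, 1, 0, -(1 << 31), (1 << 31) - 1))
```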
diff --git a/tensorflow/contrib/quantization/load_quantized_ops_so.py b/tensorflow/contrib/quantization/load_quantized_ops_so.py
new file mode 100644
index 0000000000..6eb424e534
--- /dev/null
+++ b/tensorflow/contrib/quantization/load_quantized_ops_so.py
@@ -0,0 +1,48 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops for quantized evaluation."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import threading
+
+import tensorflow as tf
+
+QUANTIZED_OPS_FILE = '_quantized_ops.so'
+
+_quantized_ops = None
+_ops_lock = threading.Lock()
+
+
+# Workaround for the fact that importing tensorflow imports contrib
+# (even if a user isn't using this or any other contrib op), but
+# there's not yet any guarantee that the shared object exists. In that case,
+# "import tensorflow" would always crash, even for users who never use contrib.
+def Load(library_base_dir=''):
+ """Load the quantized ops library and return the loaded module."""
+ with _ops_lock:
+ global _quantized_ops
+ if not _quantized_ops:
+ data_files_path = os.path.join(library_base_dir,
+ tf.resource_loader.get_data_files_path())
+ tf.logging.info('q:data path: %s', data_files_path)
+ _quantized_ops = tf.load_op_library(os.path.join(
+ data_files_path, QUANTIZED_OPS_FILE))
+
+ assert _quantized_ops, 'Could not load quantized_ops.so'
+ return _quantized_ops
diff --git a/tensorflow/contrib/quantization/ops/array_ops.cc b/tensorflow/contrib/quantization/ops/array_ops.cc
new file mode 100644
index 0000000000..ff636c7957
--- /dev/null
+++ b/tensorflow/contrib/quantization/ops/array_ops.cc
@@ -0,0 +1,195 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+
+REGISTER_OP("QuantizeV2")
+ .Input("input: float")
+ .Input("min_range: float")
+ .Input("max_range: float")
+ .Output("output: T")
+ .Output("output_min: float")
+ .Output("output_max: float")
+ .Attr("T: quantizedtype")
+ .Attr("mode: {'MIN_COMBINED', 'MIN_FIRST'} = 'MIN_COMBINED'")
+ .SetShapeFn([](InferenceContext* c) {
+ TF_RETURN_IF_ERROR(shape_inference::UnchangedShape(c));
+ ShapeHandle unused;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+ c->set_output(1, c->Scalar());
+ c->set_output(2, c->Scalar());
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
+
+[min_range, max_range] are scalar floats that specify the range for
+the 'input' data. The 'mode' attribute controls exactly which calculations are
+used to convert the float values to their quantized equivalents.
+
+In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
+
+```
+out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
+if T == qint8, out[i] -= (range(T) + 1) / 2.0
+```
+here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
+
+*MIN_COMBINED Mode Example*
+
+Assume the input is type float and has a possible range of [0.0, 6.0] and the
+output type is quint8 ([0, 255]). The min_range and max_range values should be
+specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
+value of the input by 255/6 and cast to quint8.
+
+If the output type were qint8 ([-128, 127]), the operation would additionally
+subtract 128 from each value prior to casting, so that the range of values
+aligns with the range of qint8.
+
+If the mode is 'MIN_FIRST', then this approach is used:
+
+```
+number_of_steps = 1 << (# of bits in T)
+range_adjust = number_of_steps / (number_of_steps - 1)
+range = (range_max - range_min) * range_adjust
+range_scale = number_of_steps / range
+quantized = round(input * range_scale) - round(range_min * range_scale) +
+ numeric_limits<T>::min()
+quantized = max(quantized, numeric_limits<T>::min())
+quantized = min(quantized, numeric_limits<T>::max())
+```
+
+The biggest difference between this and MIN_COMBINED is that the minimum range
+is rounded first, before it's subtracted from the rounded value. With
+MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
+and dequantizing will introduce a larger and larger error.
+
+One thing to watch out for is that the operator may choose to adjust the
+requested minimum and maximum values slightly during the quantization process,
+so you should always use the output ports as the range for further calculations.
+For example, if the requested minimum and maximum values are close to equal,
+they will be separated by a small epsilon value to prevent ill-formed quantized
+buffers from being created. Otherwise, you can end up with buffers where all the
+quantized values map to the same float value, which causes problems for
+operations that have to perform further calculations on them.
+
+min_range: The minimum scalar value possibly produced for the input.
+max_range: The maximum scalar value possibly produced for the input.
+output: The quantized data produced from the float input.
+output_min: The actual minimum scalar value used for the output.
+output_max: The actual maximum scalar value used for the output.
+
+)doc");
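To make the MIN_FIRST pseudo-code in the Doc above concrete, the following is a small NumPy version of the documented formula for an unsigned output type such as quint8, where `numeric_limits<T>::min()` is zero. It is a sketch of the formula only, not the kernel implementation.

```python
import numpy as np

def quantize_min_first(values, range_min, range_max, num_bits=8):
    """Executable form of the MIN_FIRST pseudo-code for an unsigned type."""
    number_of_steps = 1 << num_bits
    range_adjust = number_of_steps / (number_of_steps - 1.0)
    quantized_range = (range_max - range_min) * range_adjust
    range_scale = number_of_steps / quantized_range
    # Round the value and the minimum separately, as described above.
    quantized = (np.round(values * range_scale) -
                 np.round(range_min * range_scale))
    return np.clip(quantized, 0, number_of_steps - 1).astype(np.uint8)

# A [0.0, 6.0] float range quantized to quint8.
print(quantize_min_first(np.array([0.0, 3.0, 6.0]), 0.0, 6.0))  # [  0 128 255]
```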
+
+REGISTER_OP("Dequantize")
+ .Input("input: T")
+ .Input("min_range: float")
+ .Input("max_range: float")
+ .Output("output: float")
+ .Attr("T: quantizedtype")
+ .Attr("mode: {'MIN_COMBINED', 'MIN_FIRST'} = 'MIN_COMBINED'")
+ .SetShapeFn([](InferenceContext* c) {
+ TF_RETURN_IF_ERROR(shape_inference::UnchangedShape(c));
+ ShapeHandle unused;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Dequantize the 'input' tensor into a float Tensor.
+
+[min_range, max_range] are scalar floats that specify the range for
+the 'input' data. The 'mode' attribute controls exactly which calculations are
+used to convert the quantized values back to their float equivalents.
+
+In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
+
+```
+if T == qint8, in[i] += (range(T) + 1) / 2.0
+out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
+```
+here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
+
+*MIN_COMBINED Mode Example*
+
+If the input comes from a QuantizedRelu6, the output type is
+quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
+0-6. The min_range and max_range values are therefore 0.0 and 6.0.
+Dequantize on quint8 will take each value, cast to float, and multiply
+by 6 / 255.
+Note that if quantizedtype is qint8, the operation will additionally add
+128 to each value prior to casting.
+
+If the mode is 'MIN_FIRST', then this approach is used:
+
+```
+number_of_steps = 1 << (# of bits in T)
+range_adjust = number_of_steps / (number_of_steps - 1)
+range = (range_max - range_min) * range_adjust
+range_scale = range / number_of_steps
+result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
+```
+
+min_range: The minimum scalar value possibly produced for the input.
+max_range: The maximum scalar value possibly produced for the input.
+
+)doc");
+
+REGISTER_OP("QuantizedConcat")
+ .Input("concat_dim: int32")
+ .Input("values: N * T")
+ .Input("input_mins: N * float32")
+ .Input("input_maxes: N * float32")
+ .Output("output: T")
+ .Output("output_min: float")
+ .Output("output_max: float")
+ .Attr("N: int >= 2")
+ .Attr("T: type")
+ .SetShapeFn([](InferenceContext* c) {
+ TF_RETURN_IF_ERROR(shape_inference::ConcatShape(c));
+ ShapeHandle unused;
+ for (int i = 2; i < c->num_inputs(); ++i) {
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 0, &unused));
+ }
+ c->set_output(1, c->Scalar());
+ c->set_output(2, c->Scalar());
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Concatenates quantized tensors along one dimension.
+
+concat_dim: 0-D. The dimension along which to concatenate. Must be in the
+ range [0, rank(values)).
+values: The `N` Tensors to concatenate. Their ranks and types must match,
+ and their sizes must match in all dimensions except `concat_dim`.
+input_mins: The minimum scalar values for each of the input tensors.
+input_maxes: The maximum scalar values for each of the input tensors.
+output_min: The float value that the minimum quantized output value represents.
+output_max: The float value that the maximum quantized output value represents.
+output: A `Tensor` with the concatenation of values stacked along the
+ `concat_dim` dimension. This tensor's shape matches that of `values` except
+ in `concat_dim` where it has the sum of the sizes.
+)doc");
+
+} // namespace tensorflow
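The QuantizedConcat documentation above implies that every input must end up expressed in a single shared output range. The kernel that does this (quantized_concat_op.cc) is not part of this hunk, so the sketch below only illustrates one plausible policy, assumed here to be the union of the per-input ranges, with each quint8 input dequantized and re-quantized into that shared range before concatenation.

```python
import numpy as np

def concat_quint8(quantized_inputs, mins, maxes, axis):
    """Assumed range policy: the output range is the union of the input
    ranges; each quint8 input is re-expressed in that shared range."""
    out_min, out_max = min(mins), max(maxes)
    rescaled = []
    for q, lo, hi in zip(quantized_inputs, mins, maxes):
        floats = lo + q.astype(np.float32) * (hi - lo) / 255.0
        requant = np.round((floats - out_min) * 255.0 / (out_max - out_min))
        rescaled.append(requant.astype(np.uint8))
    return np.concatenate(rescaled, axis=axis), out_min, out_max

a = np.array([[0, 255]], dtype=np.uint8)  # represents floats in [0.0, 1.0]
b = np.array([[0, 255]], dtype=np.uint8)  # represents floats in [0.0, 2.0]
print(concat_quint8([a, b], [0.0, 0.0], [1.0, 2.0], axis=0))
```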
diff --git a/tensorflow/contrib/quantization/ops/math_ops.cc b/tensorflow/contrib/quantization/ops/math_ops.cc
new file mode 100644
index 0000000000..93bb283630
--- /dev/null
+++ b/tensorflow/contrib/quantization/ops/math_ops.cc
@@ -0,0 +1,126 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+
+REGISTER_OP("QuantizedMatMul")
+ .Input("a: T1")
+ .Input("b: T2")
+ .Input("min_a: float")
+ .Input("max_a: float")
+ .Input("min_b: float")
+ .Input("max_b: float")
+ .Output("out: Toutput")
+ .Output("min_out: float")
+ .Output("max_out: float")
+ .Attr("T1: quantizedtype")
+ .Attr("T2: quantizedtype")
+ .Attr("Toutput: quantizedtype = DT_QINT32")
+ .Attr("transpose_a: bool = false")
+ .Attr("transpose_b: bool = false")
+ .SetShapeFn([](InferenceContext* c) {
+ TF_RETURN_IF_ERROR(shape_inference::MatMulShape(c));
+ ShapeHandle unused;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+
+ c->set_output(1, c->Scalar());
+ c->set_output(2, c->Scalar());
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Perform a quantized matrix multiplication of `a` by the matrix `b`.
+
+The inputs must be two-dimensional matrices and the inner dimension of
+`a` (after being transposed if `transpose_a` is non-zero) must match the
+outer dimension of `b` (after being transposed if `transpose_b` is
+non-zero).
+
+a: Must be a two-dimensional tensor.
+b: Must be a two-dimensional tensor.
+transpose_a: If true, `a` is transposed before multiplication.
+transpose_b: If true, `b` is transposed before multiplication.
+min_a: The float value that the lowest quantized `a` value represents.
+max_a: The float value that the highest quantized `a` value represents.
+min_b: The float value that the lowest quantized `b` value represents.
+max_b: The float value that the highest quantized `b` value represents.
+min_out: The float value that the lowest quantized output value represents.
+max_out: The float value that the highest quantized output value represents.
+
+)doc");
+
+REGISTER_OP("QuantizeDownAndShrinkRange")
+ .Input("input: Tinput")
+ .Input("input_min: float")
+ .Input("input_max: float")
+ .Output("output: out_type")
+ .Output("output_min: float")
+ .Output("output_max: float")
+ .Attr("Tinput: quantizedtype")
+ .Attr("out_type: quantizedtype")
+ .SetShapeFn([](InferenceContext* c) {
+ TF_RETURN_IF_ERROR(shape_inference::UnchangedShape(c));
+ ShapeHandle unused;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+ c->set_output(1, c->Scalar());
+ c->set_output(2, c->Scalar());
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Convert the quantized 'input' tensor into a lower-precision 'output', using the
+actual distribution of the values to maximize the usage of the lower bit depth
+and adjusting the output min and max ranges accordingly.
+
+[input_min, input_max] are scalar floats that specify the range for the float
+interpretation of the 'input' data. For example, if input_min is -1.0f and
+input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+
+This operator tries to squeeze as much precision as possible into an output with
+a lower bit depth by calculating the actual min and max values found in the
+data. For example, suppose that quint16 input has no values lower than 16,384 and
+none higher than 49,152. In that case only half the range is actually needed and
+all the float interpretations fall between -0.5f and 0.5f, so if we want to
+compress the data into a quint8 output, we can use that range rather than the
+theoretical -1.0f to 1.0f suggested by the input min and max.
+
+In practice, this is most useful for taking output from operations like
+QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
+may have large potential output ranges, but in practice have a distribution of
+input values that only uses a small fraction of the possible range. By feeding
+that output into this operator, we can reduce it from 32 bits down to 8 with
+minimal loss of accuracy.
+
+input_min: The float value that the minimum quantized input value represents.
+input_max: The float value that the maximum quantized input value represents.
+Tinput: The type of the input.
+output_min: The float value that the minimum quantized output value represents.
+output_max: The float value that the maximum quantized output value represents.
+out_type: The type of the output. Should be a lower bit depth than Tinput.
+
+)doc");
+
+} // namespace tensorflow
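The Doc block for QuantizeDownAndShrinkRange describes the idea in prose; the hedged NumPy sketch below shows the same idea, using the qint32-to-float convention that _QuantizedOutputToFloat uses later in this change. The kernel's exact rounding lives in quantize_down_and_shrink_range.cc and may differ slightly; the sketch also assumes the tensor is not constant (so the observed max exceeds the observed min).

```python
import numpy as np

def quantize_down_and_shrink_range(q32, input_min, input_max):
    """Sketch only: re-express qint32 data as quint8 over the range the
    data actually occupies."""
    steps = float(1 << 32)
    range_scale = (input_max - input_min) * (steps / (steps - 1.0)) / steps
    lowest_quantized = -(1 << 31)
    floats = input_min + (q32.astype(np.float64) - lowest_quantized) * range_scale
    out_min, out_max = float(floats.min()), float(floats.max())
    q8 = np.round((floats - out_min) * 255.0 / (out_max - out_min))
    return q8.astype(np.uint8), out_min, out_max

q32 = np.array([0, 1 << 30], dtype=np.int64)  # stand-ins for qint32 values
print(quantize_down_and_shrink_range(q32, -1.0, 1.0))  # roughly ([0, 255], 0.0, 0.5)
```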
diff --git a/tensorflow/contrib/quantization/ops/nn_ops.cc b/tensorflow/contrib/quantization/ops/nn_ops.cc
new file mode 100644
index 0000000000..720377043d
--- /dev/null
+++ b/tensorflow/contrib/quantization/ops/nn_ops.cc
@@ -0,0 +1,348 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/util/padding.h"
+
+namespace tensorflow {
+
+using shape_inference::DimensionHandle;
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+
+REGISTER_OP("QuantizedAvgPool")
+ .Input("input: T")
+ .Input("min_input: float")
+ .Input("max_input: float")
+ .Output("output: T")
+ .Output("min_output: float")
+ .Output("max_output: float")
+ .Attr("T: quantizedtype")
+ .Attr("ksize: list(int)")
+ .Attr("strides: list(int)")
+ .Attr(GetPaddingAttrString())
+ .SetShapeFn([](InferenceContext* c) {
+ TF_RETURN_IF_ERROR(shape_inference::AvgPoolShape(c));
+ ShapeHandle unused;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+ c->set_output(1, c->Scalar());
+ c->set_output(2, c->Scalar());
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Produces the average pool of the input tensor for quantized types.
+
+input: 4-D with shape `[batch, height, width, channels]`.
+ksize: The size of the window for each dimension of the input tensor.
+ The length must be 4 to match the number of dimensions of the input.
+strides: The stride of the sliding window for each dimension of the input
+ tensor. The length must be 4 to match the number of dimensions of the input.
+padding: The type of padding algorithm to use.
+min_input: The float value that the lowest quantized input value represents.
+max_input: The float value that the highest quantized input value represents.
+min_output: The float value that the lowest quantized output value represents.
+max_output: The float value that the highest quantized output value represents.
+
+)doc");
+
+REGISTER_OP("QuantizedBiasAdd")
+ .Input("input: T1")
+ .Input("bias: T2")
+ .Input("min_input: float")
+ .Input("max_input: float")
+ .Input("min_bias: float")
+ .Input("max_bias: float")
+ .Output("output: out_type")
+ .Output("min_out: float")
+ .Output("max_out: float")
+ .Attr("T1: quantizedtype")
+ .Attr("T2: quantizedtype")
+ .Attr("out_type: quantizedtype")
+ .SetShapeFn([](InferenceContext* c) {
+ TF_RETURN_IF_ERROR(shape_inference::BiasAddShape(c));
+ ShapeHandle unused;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+ c->set_output(1, c->Scalar());
+ c->set_output(2, c->Scalar());
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Adds Tensor 'bias' to Tensor 'input' for Quantized types.
+
+Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
+
+bias: A 1D bias Tensor with size matching the last dimension of 'input'.
+min_input: The float value that the lowest quantized input value represents.
+max_input: The float value that the highest quantized input value represents.
+min_bias: The float value that the lowest quantized bias value represents.
+max_bias: The float value that the highest quantized bias value represents.
+min_out: The float value that the lowest quantized output value represents.
+max_out: The float value that the highest quantized output value represents.
+
+)doc");
+
+REGISTER_OP("QuantizedConv2D")
+ .Input("input: Tinput")
+ .Input("filter: Tfilter")
+ .Input("min_input: float")
+ .Input("max_input: float")
+ .Input("min_filter: float")
+ .Input("max_filter: float")
+ .Output("output: out_type")
+ .Output("min_output: float")
+ .Output("max_output: float")
+ .Attr("Tinput: quantizedtype")
+ .Attr("Tfilter: quantizedtype")
+ .Attr("out_type: quantizedtype = DT_QINT32")
+ .Attr("strides: list(int)")
+ .Attr(GetPaddingAttrString())
+ .SetShapeFn([](InferenceContext* c) {
+ TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+ ShapeHandle unused;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+ c->set_output(1, c->Scalar());
+ c->set_output(2, c->Scalar());
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Computes a 2D convolution given quantized 4D input and filter tensors.
+The inputs are quantized tensors where the lowest value maps to the float
+value of the associated minimum, and the highest to the associated maximum.
+This means that you can only interpret the quantized output in the same way, by
+taking the returned minimum and maximum values into account.
+
+filter: filter's input_depth dimension must match input's depth dimensions.
+strides: The stride of the sliding window for each dimension of the input
+ tensor.
+padding: The type of padding algorithm to use.
+min_input: The float value that the lowest quantized input value represents.
+max_input: The float value that the highest quantized input value represents.
+min_filter: The float value that the lowest quantized filter value represents.
+max_filter: The float value that the highest quantized filter value represents.
+min_output: The float value that the lowest quantized output value represents.
+max_output: The float value that the highest quantized output value represents.
+
+)doc");
+
+REGISTER_OP("QuantizedMaxPool")
+ .Input("input: T")
+ .Input("min_input: float")
+ .Input("max_input: float")
+ .Output("output: T")
+ .Output("min_output: float")
+ .Output("max_output: float")
+ .Attr("T: quantizedtype")
+ .Attr("ksize: list(int)")
+ .Attr("strides: list(int)")
+ .Attr(GetPaddingAttrString())
+ .SetShapeFn([](InferenceContext* c) {
+ TF_RETURN_IF_ERROR(shape_inference::MaxPoolShape(c));
+ ShapeHandle unused;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+ c->set_output(1, c->Scalar());
+ c->set_output(2, c->Scalar());
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Produces the max pool of the input tensor for quantized types.
+
+input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
+ksize: The size of the window for each dimension of the input tensor.
+ The length must be 4 to match the number of dimensions of the input.
+strides: The stride of the sliding window for each dimension of the input
+ tensor. The length must be 4 to match the number of dimensions of the input.
+padding: The type of padding algorithm to use.
+min_input: The float value that the lowest quantized input value represents.
+max_input: The float value that the highest quantized input value represents.
+min_output: The float value that the lowest quantized output value represents.
+max_output: The float value that the highest quantized output value represents.
+
+)doc");
+
+REGISTER_OP("QuantizedRelu")
+ .Input("features: Tinput")
+ .Input("min_features: float")
+ .Input("max_features: float")
+ .Output("activations: out_type")
+ .Output("min_activations: float")
+ .Output("max_activations: float")
+ .Attr("Tinput: quantizedtype")
+ .Attr("out_type: quantizedtype = DT_QUINT8")
+ .SetShapeFn([](InferenceContext* c) {
+ TF_RETURN_IF_ERROR(shape_inference::UnchangedShape(c));
+ ShapeHandle unused;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+ c->set_output(1, c->Scalar());
+ c->set_output(2, c->Scalar());
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Computes Quantized Rectified Linear: `max(features, 0)`
+
+activations: Has the same output shape as "features".
+min_features: The float value that the lowest quantized value represents.
+max_features: The float value that the highest quantized value represents.
+min_activations: The float value that the lowest quantized value represents.
+max_activations: The float value that the highest quantized value represents.
+
+)doc");
+
+REGISTER_OP("QuantizedRelu6")
+ .Input("features: Tinput")
+ .Input("min_features: float")
+ .Input("max_features: float")
+ .Output("activations: out_type")
+ .Output("min_activations: float")
+ .Output("max_activations: float")
+ .Attr("Tinput: quantizedtype")
+ .Attr("out_type: quantizedtype = DT_QUINT8")
+ .SetShapeFn([](InferenceContext* c) {
+ TF_RETURN_IF_ERROR(shape_inference::UnchangedShape(c));
+ ShapeHandle unused;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+ c->set_output(1, c->Scalar());
+ c->set_output(2, c->Scalar());
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
+
+activations: Has the same output shape as "features".
+min_features: The float value that the lowest quantized value represents.
+max_features: The float value that the highest quantized value represents.
+min_activations: The float value that the lowest quantized value represents.
+max_activations: The float value that the highest quantized value represents.
+
+)doc");
+
+REGISTER_OP("QuantizedReluX")
+ .Input("features: Tinput")
+ .Input("max_value: float")
+ .Input("min_features: float")
+ .Input("max_features: float")
+ .Output("activations: out_type")
+ .Output("min_activations: float")
+ .Output("max_activations: float")
+ .Attr("Tinput: quantizedtype")
+ .Attr("out_type: quantizedtype = DT_QUINT8")
+ .SetShapeFn([](InferenceContext* c) {
+ TF_RETURN_IF_ERROR(shape_inference::UnchangedShape(c));
+ ShapeHandle unused;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+ c->set_output(1, c->Scalar());
+ c->set_output(2, c->Scalar());
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
+
+activations: Has the same output shape as "features".
+min_features: The float value that the lowest quantized value represents.
+max_features: The float value that the highest quantized value represents.
+min_activations: The float value that the lowest quantized value represents.
+max_activations: The float value that the highest quantized value represents.
+
+)doc");
+
+REGISTER_OP("QuantizedBatchNormWithGlobalNormalization")
+ .Input("t: Tinput")
+ .Input("t_min: float")
+ .Input("t_max: float")
+ .Input("m: Tinput")
+ .Input("m_min: float")
+ .Input("m_max: float")
+ .Input("v: Tinput")
+ .Input("v_min: float")
+ .Input("v_max: float")
+ .Input("beta: Tinput")
+ .Input("beta_min: float")
+ .Input("beta_max: float")
+ .Input("gamma: Tinput")
+ .Input("gamma_min: float")
+ .Input("gamma_max: float")
+ .Output("result: out_type")
+ .Output("result_min: float")
+ .Output("result_max: float")
+ .Attr("Tinput: quantizedtype")
+ .Attr("out_type: quantizedtype")
+ .Attr("variance_epsilon: float")
+ .Attr("scale_after_normalization: bool")
+ .SetShapeFn([](InferenceContext* c) {
+ ShapeHandle input;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input));
+
+ DimensionHandle last_dim = c->Dim(input, 3);
+ for (int i = 1; i < 5; ++i) { // covers m, v, beta, gamma
+ ShapeHandle vec;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(i * 3), 1, &vec));
+ TF_RETURN_IF_ERROR(c->Merge(last_dim, c->Dim(vec, 0), &last_dim));
+ }
+
+ ShapeHandle out;
+ TF_RETURN_IF_ERROR(c->ReplaceDim(input, 3, last_dim, &out));
+ c->set_output(0, out);
+ c->set_output(1, c->Scalar());
+ c->set_output(2, c->Scalar());
+
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Quantized Batch normalization.
+
+This op is deprecated and will be removed in the future. Prefer
+`tf.nn.batch_normalization`.
+
+t: A 4D input Tensor.
+t_min: The value represented by the lowest quantized input.
+t_max: The value represented by the highest quantized input.
+m: A 1D mean Tensor with size matching the last dimension of t.
+ This is the first output from tf.nn.moments,
+ or a saved moving average thereof.
+m_min: The value represented by the lowest quantized mean.
+m_max: The value represented by the highest quantized mean.
+v: A 1D variance Tensor with size matching the last dimension of t.
+ This is the second output from tf.nn.moments,
+ or a saved moving average thereof.
+v_min: The value represented by the lowest quantized variance.
+v_max: The value represented by the highest quantized variance.
+beta: A 1D beta Tensor with size matching the last dimension of t.
+ An offset to be added to the normalized tensor.
+beta_min: The value represented by the lowest quantized offset.
+beta_max: The value represented by the highest quantized offset.
+gamma: A 1D gamma Tensor with size matching the last dimension of t.
+ If "scale_after_normalization" is true, this tensor will be multiplied
+ with the normalized tensor.
+gamma_min: The value represented by the lowest quantized gamma.
+gamma_max: The value represented by the highest quantized gamma.
+variance_epsilon: A small float number to avoid dividing by 0.
+scale_after_normalization: A bool indicating whether the resulting tensor
+  needs to be multiplied by gamma.
+)doc");
+
+} // namespace tensorflow
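The activation ops registered above (QuantizedRelu, QuantizedRelu6, QuantizedReluX) clamp in quantized space. Their kernels live in quantized_activation_ops.cc, which is not shown in this hunk, so the following is only a hedged sketch of the arithmetic one would expect for quint8 data, under the assumption that the input range is reported unchanged on the outputs.

```python
import numpy as np

def quantized_relu_x(q, min_features, max_features, max_value):
    # Map 0.0 and max_value into quantized space, then clamp the raw quint8
    # values between those two quantized thresholds.
    scale = 255.0 / (max_features - min_features)
    q_zero = np.clip(np.round((0.0 - min_features) * scale), 0, 255)
    q_cap = np.clip(np.round((max_value - min_features) * scale), 0, 255)
    out = np.clip(q, q_zero, q_cap).astype(np.uint8)
    # Assumption: the float range is passed through to the outputs unchanged.
    return out, min_features, max_features

q = np.array([0, 100, 200, 255], dtype=np.uint8)
print(quantized_relu_x(q, -1.0, 1.0, 6.0))  # values below q_zero clamp up to it
```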
diff --git a/tensorflow/contrib/quantization/python/array_ops.py b/tensorflow/contrib/quantization/python/array_ops.py
index b873d4df14..2ab65e903e 100644
--- a/tensorflow/contrib/quantization/python/array_ops.py
+++ b/tensorflow/contrib/quantization/python/array_ops.py
@@ -19,7 +19,7 @@ from __future__ import division
from __future__ import print_function
# pylint: disable=unused-import,wildcard-import
-from tensorflow.python.ops import gen_array_ops as quantized_gen_array_ops
-from tensorflow.python.ops.gen_array_ops import dequantize
-from tensorflow.python.ops.gen_array_ops import quantize_v2
-from tensorflow.python.ops.gen_array_ops import quantized_concat
+from tensorflow.contrib.quantization.ops import gen_array_ops as quantized_gen_array_ops
+from tensorflow.contrib.quantization.ops.gen_array_ops import dequantize
+from tensorflow.contrib.quantization.ops.gen_array_ops import quantize_v2
+from tensorflow.contrib.quantization.ops.gen_array_ops import quantized_concat
diff --git a/tensorflow/contrib/quantization/python/dequantize_op_test.py b/tensorflow/contrib/quantization/python/dequantize_op_test.py
new file mode 100644
index 0000000000..b1d47cc4a2
--- /dev/null
+++ b/tensorflow/contrib/quantization/python/dequantize_op_test.py
@@ -0,0 +1,85 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for Dequantize Operations."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import tensorflow as tf
+
+# TODO(petewarden) - Remove this ugly hack to get around Python linking problems
+# with Bazel.
+# pylint: disable=g-bad-import-order
+from tensorflow.contrib.quantization import load_quantized_ops_so
+from tensorflow.contrib.quantization.kernels import load_quantized_kernels_so
+
+
+class DequantizeOpTest(tf.test.TestCase):
+
+ def __init__(self, method_name="runTest"):
+ super(DequantizeOpTest, self).__init__(method_name)
+ load_quantized_ops_so.Load()
+ load_quantized_kernels_so.Load()
+
+ def _testDequantizeOp(self, inputs, min_range, max_range, dtype):
+ with self.test_session():
+ input_op = tf.constant(inputs, shape=[len(inputs)], dtype=dtype)
+ dequantized = tf.contrib.quantization.dequantize(
+ input_op, min_range, max_range)
+ tf_ans = dequantized.eval()
+
+ # TODO(vrv): Add support for DT_QINT32 quantization if needed.
+ type_dict = {
+ tf.quint8: np.uint8,
+ tf.qint8: np.int8,
+ tf.quint16: np.uint16,
+ tf.qint16: np.int16
+ }
+ self.assertTrue(dtype in type_dict.keys())
+ v_max = np.iinfo(type_dict[dtype]).max
+ v_min = np.iinfo(type_dict[dtype]).min
+ self.assertTrue(min_range >= v_min)
+ self.assertTrue(max_range <= v_max)
+ type_range = v_max - v_min
+ if v_min < 0:
+ half_range = (type_range + 1) / 2
+ else:
+ half_range = 0.0
+
+ np_ans = ((inputs.astype(np.float32) + half_range) *
+ (max_range - min_range) / type_range) + min_range
+ self.assertAllClose(tf_ans, np_ans)
+
+ def testBasicQuint8(self):
+ self._testDequantizeOp(np.array([0, 128, 255]),
+ 0.0, 6.0, tf.quint8)
+ self._testDequantizeOp(np.array([0, 128, 255]),
+ 0.0, 123.456, tf.quint8)
+ self._testDequantizeOp(np.array([0, 4, 42, 108, 243]),
+ 5.0, 200.2, tf.quint8)
+
+ def testBasicQint8(self):
+ self._testDequantizeOp(np.array([-128, 0, 127]),
+ -1.0, 2.0, tf.qint8)
+ self._testDequantizeOp(np.array([-2, 4, -17]),
+ -5.0, -3.0, tf.qint8)
+ self._testDequantizeOp(np.array([0, -4, 42, -108]),
+ 5.0, 40.0, tf.qint8)
+
+
+if __name__ == "__main__":
+ tf.test.main()
diff --git a/tensorflow/contrib/quantization/python/math_ops.py b/tensorflow/contrib/quantization/python/math_ops.py
index d863cdad26..d4fabbd36b 100644
--- a/tensorflow/contrib/quantization/python/math_ops.py
+++ b/tensorflow/contrib/quantization/python/math_ops.py
@@ -19,7 +19,10 @@ from __future__ import division
from __future__ import print_function
# pylint: disable=unused-import,wildcard-import
+from tensorflow.contrib.quantization.ops import gen_math_ops
+from tensorflow.contrib.quantization.ops.gen_math_ops import *
from tensorflow.python.framework import common_shapes
from tensorflow.python.framework import ops
-from tensorflow.python.ops import gen_math_ops
-from tensorflow.python.ops.gen_math_ops import *
+
+
+ops.RegisterShape("QuantizedMatMul")(common_shapes.call_cpp_shape_fn)
diff --git a/tensorflow/contrib/quantization/python/nn_ops.py b/tensorflow/contrib/quantization/python/nn_ops.py
index fd28423317..d31f1d4e68 100644
--- a/tensorflow/contrib/quantization/python/nn_ops.py
+++ b/tensorflow/contrib/quantization/python/nn_ops.py
@@ -19,7 +19,17 @@ from __future__ import division
from __future__ import print_function
# pylint: disable=unused-import,wildcard-import
+from tensorflow.contrib.quantization.ops import gen_nn_ops
+from tensorflow.contrib.quantization.ops.gen_nn_ops import *
from tensorflow.python.framework import common_shapes
from tensorflow.python.framework import ops
-from tensorflow.python.ops import gen_nn_ops
-from tensorflow.python.ops.gen_nn_ops import *
+
+
+ops.RegisterShape("QuantizedAvgPool")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("QuantizedBiasAdd")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("QuantizedConv2D")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("QuantizedMaxPool")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("QuantizedRelu")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("QuantizedRelu6")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("QuantizedReluX")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("QuantizeDownAndShrinkRange")(common_shapes.call_cpp_shape_fn)
diff --git a/tensorflow/contrib/quantization/python/quantized_conv_ops_test.py b/tensorflow/contrib/quantization/python/quantized_conv_ops_test.py
new file mode 100644
index 0000000000..9b24d4129d
--- /dev/null
+++ b/tensorflow/contrib/quantization/python/quantized_conv_ops_test.py
@@ -0,0 +1,198 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Functional tests for quantized convolutional operations."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import tensorflow as tf
+
+# TODO(petewarden) - Remove this ugly hack to get around Python linking problems
+# with Bazel.
+# pylint: disable=g-bad-import-order
+from tensorflow.contrib.quantization import load_quantized_ops_so
+from tensorflow.contrib.quantization.kernels import load_quantized_kernels_so
+
+
+class Conv2DTest(tf.test.TestCase):
+
+ def __init__(self, method_name="runTest"):
+ super(Conv2DTest, self).__init__(method_name)
+ load_quantized_ops_so.Load()
+ load_quantized_kernels_so.Load()
+
+ def _VerifyValues(self, tensor_in_sizes, filter_in_sizes, stride, padding,
+ expected):
+ """Verifies the output values of the convolution function.
+
+ Args:
+ tensor_in_sizes: Input tensor dimensions in
+ [batch, input_rows, input_cols, input_depth].
+ filter_in_sizes: Filter tensor dimensions in
+ [kernel_rows, kernel_cols, input_depth, output_depth].
+ stride: Stride.
+ padding: Padding type.
+ expected: An array containing the expected operation outputs.
+ """
+ total_size_1 = 1
+ total_size_2 = 1
+ for s in tensor_in_sizes:
+ total_size_1 *= s
+ for s in filter_in_sizes:
+ total_size_2 *= s
+ # Initializes the input tensor with array containing incrementing
+ # numbers from 1.
+ x1 = np.array([f for f in range(1, total_size_1 + 1)])
+ x1 = x1.astype(np.uint8).reshape(tensor_in_sizes)
+ x1_min = 0.0
+ x1_max = 255.0
+ x2 = np.array([f for f in range(1, total_size_2 + 1)]).astype(np.uint8)
+ x2 = x2.astype(np.uint8).reshape(filter_in_sizes)
+ x2_min = 0.0
+ x2_max = 255.0
+ with self.test_session(use_gpu=False) as sess:
+ t1 = tf.constant(x1, shape=tensor_in_sizes, dtype=tf.quint8)
+ t2 = tf.constant(x2, shape=filter_in_sizes, dtype=tf.quint8)
+ conv = tf.contrib.quantization.quantized_conv2d(t1,
+ t2,
+ out_type=tf.qint32,
+ strides=[1, stride,
+ stride, 1],
+ padding=padding,
+ min_input=x1_min,
+ max_input=x1_max,
+ min_filter=x2_min,
+ max_filter=x2_max)
+ value = sess.run(conv)
+ quantized_output = value[0]
+ output_min = value[1]
+ output_max = value[2]
+ float_output = self._QuantizedOutputToFloat(quantized_output, output_min,
+ output_max)
+ self.assertArrayNear(expected, float_output.flatten(), 1.0)
+ self.assertEqual(value[0].shape, conv[0].get_shape())
+
+ def _assertQuantizedArrayEquals(self, iarray1, iarray2):
+ for i1, i2 in zip(iarray1, iarray2):
+ self.assertTrue(i1 == i2)
+
+ def _QuantizedOutputToFloat(self, quantized, quantized_min, quantized_max):
+ number_of_bits = 32
+ number_of_steps = 1 << number_of_bits
+ range_adjust = (number_of_steps / (number_of_steps - 1.0))
+ quantized_range = ((quantized_max - quantized_min) * range_adjust)
+ range_scale = (quantized_range / number_of_steps)
+ lowest_quantized = -(1 << (number_of_bits - 1))
+ result = np.array([(quantized_min + ((x - lowest_quantized) * range_scale))
+ for x in quantized.flatten()])
+ return result
+
+ def testConv2D1x1Filter(self):
+ # Our generated input is [batch, rows, cols, depth], and looks like this:
+ # (1,2,3) (4,5,6) (7,8,9)
+ # (10,11,12) (13,14,15) (16,17,18)
+ # The filter data is:
+ # (1,4,7) (2,5,8) (3,6,9)
+ # That means the calculations are:
+ # 1*1+2*4+3*7=30
+ # 1*2+2*5+3*8=36
+ # 1*3+2*6+3*9=42
+ # 4*1+5*4+6*7=66
+ # 4*2+5*5+6*8=81
+ # 4*3+5*6+6*9=96
+    # 7*1+8*4+9*7=102
+ # 7*2+8*5+9*8=126
+ # 7*3+8*6+9*9=150
+ # 10*1+11*4+12*7=138
+ # 10*2+11*5+12*8=171
+ # 10*3+11*6+12*9=204
+ # 13*1+14*4+15*7=174
+ # 13*2+14*5+15*8=216
+    # 13*3+14*6+15*9=258
+    # 16*1+17*4+18*7=210
+    # 16*2+17*5+18*8=261
+    # 16*3+17*6+18*9=312
+ # Because the output shift is zero, we call the non-optimized reference
+ # path for the convolution.
+ expected_output = [30, 36, 42, 66, 81, 96, 102, 126, 150, 138, 171, 204,
+ 174, 216, 258, 210, 261, 312]
+ self._VerifyValues(tensor_in_sizes=[1, 2, 3, 3],
+ filter_in_sizes=[1, 1, 3, 3],
+ stride=1,
+ padding="VALID",
+ expected=expected_output)
+
+ def testConv2D2x2Filter(self):
+ # Our generated input is [batch, rows, cols, depth], and looks like this:
+ # (1,2,3) (4,5,6) (7,8,9)
+ # (10,11,12) (13,14,15) (16,17,18)
+ # The filter data is [filter_height, filter_width, depth, filter_count]:
+ # ( 1, 4, 7) (10, 13, 16)
+ # (19,22,25) (28, 31, 34)
+ # -
+ # ( 2, 5, 8) (11, 14, 17)
+ # (20,23,26) (29, 32, 35)
+ # -
+ # ( 3, 6, 9) (12, 15, 18)
+ # (21,24,27) (30, 33, 36)
+ # The raw accumulated totals are:
+ # 1*1+2*4+3*7+4*10+5*13+6*16+10*19+11*22+12*25+13*28+14*31+15*34=2271
+ # 1*2+2*5+3*8+4*11+5*14+6*17+10*20+11*23+12*26+13*29+14*32+15*35=2367
+ # 1*3+2*6+3*9+4*12+5*15+6*18+10*21+11*24+12*27+13*30+14*33+15*36=2463
+ # 4*1+5*4+6*7+7*10+8*13+9*16+13*19+14*22+15*25+16*28+17*31+18*34=2901
+ # 4*2+5*5+6*8+7*11+8*14+9*17+13*20+14*23+15*26+16*29+17*32+18*35=3033
+ # 4*3+5*6+6*9+7*12+8*15+9*18+13*21+14*24+15*27+16*30+17*33+18*36=3165
+    # The expected values are the raw accumulated totals; the qint32 output is
+    # converted back to float for comparison.
+ expected_output = [2271.0, 2367.0, 2463.0, 2901.0, 3033.0, 3165.0]
+ self._VerifyValues(tensor_in_sizes=[1, 2, 3, 3],
+ filter_in_sizes=[2, 2, 3, 3],
+ stride=1,
+ padding="VALID",
+ expected=expected_output)
+
+ def testConv2D1x2Filter(self):
+ # The outputs are computed using third_party/py/IPython/notebook.
+ # With a shift of 21, we should execute the optimized path here.
+ expected_output = [231.0, 252.0, 273.0, 384.0, 423.0, 462.0, 690.0, 765.0,
+ 840.0, 843.0, 936.0, 1029.0]
+ self._VerifyValues(tensor_in_sizes=[1, 2, 3, 3],
+ filter_in_sizes=[1, 2, 3, 3],
+ stride=1,
+ padding="VALID",
+ expected=expected_output)
+
+ def testConv2D2x2FilterStride2(self):
+ # With a shift of 21, we should execute the optimized path here.
+ expected_output = [2271.0, 2367.0, 2463.0]
+ self._VerifyValues(tensor_in_sizes=[1, 2, 3, 3],
+ filter_in_sizes=[2, 2, 3, 3],
+ stride=2,
+ padding="VALID",
+ expected=expected_output)
+
+ def testConv2D2x2FilterStride2Same(self):
+ # With a shift of 21, we should execute the optimized path here.
+ expected_output = [2271.0, 2367.0, 2463.0, 1230.0, 1305.0, 1380.0]
+ self._VerifyValues(tensor_in_sizes=[1, 2, 3, 3],
+ filter_in_sizes=[2, 2, 3, 3],
+ stride=2,
+ padding="SAME",
+ expected=expected_output)
+
+if __name__ == "__main__":
+ tf.test.main()
diff --git a/tensorflow/contrib/quantization/tools/BUILD b/tensorflow/contrib/quantization/tools/BUILD
new file mode 100644
index 0000000000..82a13e04d6
--- /dev/null
+++ b/tensorflow/contrib/quantization/tools/BUILD
@@ -0,0 +1,72 @@
+# Description:
+# Utilities for quantizing TensorFlow graphs to lower bit depths.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"]) # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+ name = "quantize_graph_lib",
+ srcs = ["quantize_graph.py"],
+ srcs_version = "PY2AND3",
+ deps = [
+ "//tensorflow:tensorflow_py",
+ "//tensorflow/contrib/quantization:ops",
+ "//tensorflow/contrib/quantization:quantized_ops_py",
+ "//tensorflow/contrib/quantization/kernels:quantized_kernels_py",
+ "//tensorflow/python:platform",
+ ],
+)
+
+py_binary(
+ name = "quantize_graph",
+ srcs = ["quantize_graph.py"],
+ srcs_version = "PY2AND3",
+ deps = [
+ "//tensorflow:tensorflow_py",
+ "//tensorflow/contrib/quantization:ops",
+ "//tensorflow/contrib/quantization:quantized_ops_py",
+ "//tensorflow/contrib/quantization/kernels:quantized_kernels_py",
+ "//tensorflow/python:platform",
+ ],
+)
+
+py_test(
+ name = "quantize_graph_test",
+ size = "small",
+ srcs = [
+ "quantize_graph_test.py",
+ ],
+ srcs_version = "PY2AND3",
+ deps = [
+ ":quantize_graph",
+ "//tensorflow/python:framework_test_lib",
+ "//tensorflow/python:platform_test",
+ ],
+)
+
+py_binary(
+ name = "graph_to_dot",
+ srcs = [
+ "graph_to_dot.py",
+ ],
+ main = "graph_to_dot.py",
+ srcs_version = "PY2AND3",
+ deps = [
+ "//tensorflow:tensorflow_py",
+ ],
+)
+
+filegroup(
+ name = "all_files",
+ srcs = glob(
+ ["**/*"],
+ exclude = [
+ "**/METADATA",
+ "**/OWNERS",
+ ],
+ ),
+ visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/quantization/tools/graph_to_dot.py b/tensorflow/contrib/quantization/tools/graph_to_dot.py
new file mode 100644
index 0000000000..c1ee4ea9d3
--- /dev/null
+++ b/tensorflow/contrib/quantization/tools/graph_to_dot.py
@@ -0,0 +1,69 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Converts a GraphDef file into a DOT format suitable for visualization.
+
+This script takes a GraphDef representing a network, and produces a DOT file
+that can then be visualized by GraphViz tools like dot and xdot.
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+
+import tensorflow as tf
+
+from google.protobuf import text_format
+
+from tensorflow.core.framework import graph_pb2
+from tensorflow.python.platform import gfile
+
+
+FLAGS = tf.flags.FLAGS
+
+tf.flags.DEFINE_string("graph", "", """TensorFlow 'GraphDef' file to load.""")
+tf.flags.DEFINE_bool("input_binary", True,
+ """Whether the input files are in binary format.""")
+tf.flags.DEFINE_string("dot_output", "", """Where to write the DOT output.""")
+
+
+def main(unused_args):
+ if not gfile.Exists(FLAGS.graph):
+ print("Input graph file '" + FLAGS.graph + "' does not exist!")
+ return -1
+
+ graph = graph_pb2.GraphDef()
+ with open(FLAGS.graph, "rb") as f:
+ if FLAGS.input_binary:
+ graph.ParseFromString(f.read())
+ else:
+ text_format.Merge(f.read(), graph)
+
+  with open(FLAGS.dot_output, "w") as f:
+ print("digraph graphname {", file=f)
+ for node in graph.node:
+ output_name = node.name
+ print(" \"" + output_name + "\" [label=\"" + node.op + "\"];", file=f)
+ for input_full_name in node.input:
+ parts = input_full_name.split(":")
+ input_name = re.sub(r"^\^", "", parts[0])
+ print(" \"" + input_name + "\" -> \"" + output_name + "\";", file=f)
+ print("}", file=f)
+ print("Created DOT file '" + FLAGS.dot_output + "'.")
+
+
+if __name__ == "__main__":
+ tf.app.run()
diff --git a/tensorflow/contrib/quantization/tools/quantize_graph.py b/tensorflow/contrib/quantization/tools/quantize_graph.py
new file mode 100644
index 0000000000..5ded556691
--- /dev/null
+++ b/tensorflow/contrib/quantization/tools/quantize_graph.py
@@ -0,0 +1,1003 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Transforms a float-trained graph into an equivalent quantized version.
+
+An example of command-line usage is:
+bazel build tensorflow/contrib/quantization/tools:quantize_graph \
+&& bazel-bin/tensorflow/contrib/quantization/tools/quantize_graph \
+--input=tensorflow_inception_graph.pb \
+--output_node_names="softmax2" --print_nodes --output=/tmp/quantized_graph.pb \
+--mode=eightbit --logtostderr
+
+"""
+
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.python.framework import graph_util
+from tensorflow.python.framework import tensor_util
+
+# TODO(petewarden) - Remove this ugly hack to get around Python linking problems
+# with Bazel.
+# pylint: disable=g-bad-import-order
+from tensorflow.contrib.quantization import load_quantized_ops_so
+from tensorflow.contrib.quantization.kernels import load_quantized_kernels_so
+
+
+flags = tf.app.flags
+FLAGS = flags.FLAGS
+
+flags.DEFINE_boolean("print_nodes", False, """Lists all nodes in the model.""")
+flags.DEFINE_string("input", "", """TensorFlow 'GraphDef' file to load.""")
+flags.DEFINE_string("output_node_names", "",
+ """Output node names, comma separated.""")
+flags.DEFINE_string("output", "", """File to save the output graph to.""")
+flags.DEFINE_integer("bitdepth", 8,
+ """How many bits to quantize the graph to.""")
+flags.DEFINE_string("mode", "round",
+ """What transformation to apply (round, quantize,"""
+ """ eightbit, weights, or weights_rounded).""")
+flags.DEFINE_string("test_input_dims", "1,224,224,3",
+ """The size of the input tensor to use when testing a"""
+ """ graph loaded from a file.""")
+flags.DEFINE_boolean("strip_redundant_quantization", True,
+ """Removes redundant dequantize/quantize pairs.""")
+flags.DEFINE_boolean("load_quantization_so", True,
+ """Explicitly load the quantization ops library""")
+
+
+def print_input_nodes(current_node, nodes_map, indent, already_visited):
+ print(" " * indent + current_node.op + ":" + current_node.name)
+ already_visited[current_node.name] = True
+ for input_node_name in current_node.input:
+ if input_node_name in already_visited:
+ continue
+ input_node = nodes_map[input_node_name]
+ print_input_nodes(input_node, nodes_map, indent + 1, already_visited)
+
+
+def create_node(op, name, inputs):
+ new_node = tf.NodeDef()
+ new_node.op = op
+ new_node.name = name
+ for input_name in inputs:
+ new_node.input.extend([input_name])
+ return new_node
+
+
+def create_constant_node(name, value, dtype, shape=None):
+ node = create_node("Const", name, [])
+ set_attr_dtype(node, "dtype", dtype)
+ set_attr_tensor(node, "value", value, dtype, shape)
+ return node
+
+
+def copy_attr(node, key, attr_value):
+ try:
+ node.attr[key].CopyFrom(attr_value)
+ except KeyError:
+ pass
+
+
+def set_attr_dtype(node, key, value):
+ try:
+ node.attr[key].CopyFrom(tf.AttrValue(type=value.as_datatype_enum))
+ except KeyError:
+ pass
+
+
+def set_attr_tensor(node, key, value, dtype, shape=None):
+ try:
+ node.attr[key].CopyFrom(tf.AttrValue(
+ tensor=tensor_util.make_tensor_proto(value,
+ dtype=dtype,
+ shape=shape)))
+ except KeyError:
+ pass
+
+
+def set_attr_string(node, key, value):
+ try:
+ node.attr[key].CopyFrom(tf.AttrValue(s=value))
+ except KeyError:
+ pass
+
+
+def set_attr_int_list(node, key, value):
+ list_value = tf.AttrValue.ListValue(i=value)
+ try:
+ node.attr[key].CopyFrom(tf.AttrValue(list=list_value))
+ except KeyError:
+ pass
+
+
+def set_attr_bool(node, key, value):
+ try:
+ node.attr[key].CopyFrom(tf.AttrValue(b=value))
+ except KeyError:
+ pass
+
+
+def set_attr_int(node, key, value):
+ try:
+ node.attr[key].CopyFrom(tf.AttrValue(i=value))
+ except KeyError:
+ pass
+
+
+def set_attr_float(node, key, value):
+ try:
+ node.attr[key].CopyFrom(tf.AttrValue(f=value))
+ except KeyError:
+ pass
+
+
+def node_name_from_input(node_name):
+ """Strips off ports and other decorations to get the underlying node name."""
+ if node_name.startswith("^"):
+ node_name = node_name[1:]
+ m = re.search(r"(.*):\d+$", node_name)
+ if m:
+ node_name = m.group(1)
+ return node_name
+
+
+def ensure_tensor_name_has_port(node_name):
+ """Makes sure that a tensor name has :0 if no explicit port exists."""
+ m = re.search(r"(.*):\d+$", node_name)
+ if m:
+ name_with_port = node_name
+ else:
+ name_with_port = node_name + ":0"
+ return name_with_port
+
+
+def unique_node_name_from_input(node_name):
+ """Replaces invalid characters in input names to get a unique node name."""
+ return node_name.replace(":", "__port__").replace("^", "__hat__")
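+
+# Illustrative examples of the name helpers above (the node name "SomeName" is
+# made up; the outputs follow directly from the functions' behavior):
+#
+#   node_name_from_input("^SomeName:2")         -> "SomeName"
+#   ensure_tensor_name_has_port("SomeName")     -> "SomeName:0"
+#   unique_node_name_from_input("^SomeName:2")  -> "__hat__SomeName__port__2"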
+
+
+def quantize_array(arr, num_buckets):
+ """Quantizes a numpy array.
+
+ This function maps each scalar in arr to the center of one of num_buckets
+ buckets. For instance,
+ quantize_array([0, 0.3, 0.6, 1], 2) => [0.25, 0.25, 0.75, 0.75]
+
+ Args:
+ arr: The numpy array to quantize.
+    num_buckets: The number of buckets to map "arr" to.
+ Returns:
+ The quantized numpy array.
+ Raises:
+ ValueError: when num_buckets < 1.
+ """
+ if num_buckets < 1:
+ raise ValueError("num_buckets must be >= 1")
+ arr_max = arr.max()
+ arr_min = arr.min()
+ if arr_max == arr_min:
+ return arr
+ bucket_width = (arr_max - arr_min) / num_buckets
+ # Map scalars to bucket indices. Take special care of max(arr).
+ bucket_indices = np.floor((arr - arr_min) / bucket_width)
+ bucket_indices[bucket_indices == num_buckets] = num_buckets - 1
+ # Map each scalar to the center of a bucket.
+ arr = arr_min + bucket_width * (bucket_indices + 0.5)
+ return arr
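+
+# A worked example (mirroring the docstring): with two buckets over [0, 1] the
+# bucket width is 0.5 and the bucket centers are 0.25 and 0.75, so
+#
+#   quantize_array(np.array([0.0, 0.3, 0.6, 1.0]), 2)
+#   # -> array([0.25, 0.25, 0.75, 0.75])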
+
+
+def quantize_weight_rounded(input_node):
+ """Returns a replacement node for input_node containing bucketed floats."""
+ input_tensor = input_node.attr["value"].tensor
+ tensor_value = tensor_util.MakeNdarray(input_tensor)
+ tensor_shape = input_tensor.tensor_shape
+ # Currently, the parameter FLAGS.bitdepth is used to compute the
+ # number of buckets as 1 << FLAGS.bitdepth, meaning the number of
+ # buckets can only be a power of 2.
+  # This could be fixed by introducing a new parameter, num_buckets,
+  # which would allow for more flexibility in choosing the right model
+ # size/accuracy tradeoff. But I didn't want to add more parameters
+ # to this script than absolutely necessary.
+ num_buckets = 1 << FLAGS.bitdepth
+ tensor_value_rounded = quantize_array(tensor_value, num_buckets)
+ tensor_shape_list = tensor_util.TensorShapeProtoToList(tensor_shape)
+ return [create_constant_node(input_node.name, tensor_value_rounded,
+ tf.float32, shape=tensor_shape_list)]
+
+
+def quantize_weight_eightbit(input_node, quantization_mode):
+ """Returns replacement nodes for input_node using the Dequantize op."""
+ base_name = input_node.name + "_"
+ quint8_const_name = base_name + "quint8_const"
+ min_name = base_name + "min"
+ max_name = base_name + "max"
+ float_tensor = tensor_util.MakeNdarray(
+ input_node.attr["value"].tensor)
+ min_value = np.min(float_tensor.flatten())
+ max_value = np.max(float_tensor.flatten())
+ # min_value == max_value is a tricky case. It can occur for general
+ # tensors, and of course for scalars. The quantized ops cannot deal
+ # with this case, so we set max_value to something else.
+  # It is not obvious what the numerically best way of handling this
+  # degeneracy is.
+ # TODO(petewarden): Better use a tolerance than a hard comparison?
+ if min_value == max_value:
+ if abs(min_value) < 0.000001:
+ max_value = min_value + 1.0
+ elif min_value > 0:
+ max_value = 2 * min_value
+ else:
+ max_value = min_value / 2.0
+
+ sess = tf.Session()
+ with sess.as_default():
+ quantize_op = tf.contrib.quantization.python.quantize_v2(
+ float_tensor,
+ min_value,
+ max_value,
+ tf.quint8,
+ mode=quantization_mode)
+ quint8_tensor = quantize_op[0].eval()
+ shape = tensor_util.TensorShapeProtoToList(input_node.attr[
+ "value"].tensor.tensor_shape)
+ quint8_const_node = create_constant_node(quint8_const_name,
+ quint8_tensor,
+ tf.quint8,
+ shape=shape)
+ min_node = create_constant_node(min_name, min_value, tf.float32)
+ max_node = create_constant_node(max_name, max_value, tf.float32)
+ dequantize_node = create_node("Dequantize", input_node.name,
+ [quint8_const_name, min_name, max_name])
+ set_attr_dtype(dequantize_node, "T", tf.quint8)
+ set_attr_string(dequantize_node, "mode", quantization_mode)
+ return [quint8_const_node, min_node, max_node, dequantize_node]
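+
+# For illustration (the Const name "weights" is hypothetical; the suffixes
+# match the names constructed above): quantize_weight_eightbit on a float
+# Const named "weights" returns four replacement nodes:
+#   "weights_quint8_const"  quint8 Const holding the quantized values
+#   "weights_min"           float Const holding min(weights)
+#   "weights_max"           float Const holding max(weights)
+#   "weights"               Dequantize(quint8_const, min, max), so consumers
+#                           of the original name still receive float tensors.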
+
+
+class GraphRewriter(object):
+ """Takes a float graph, and rewrites it in quantized form."""
+
+ def __init__(self, input_graph, mode):
+ """Sets up the class to rewrite a float graph.
+
+ Args:
+ input_graph: A float graph to transform.
+ mode: A string controlling how quantization is performed -
+ round, quantize, eightbit, or weights.
+
+ Raises:
+ ValueError: Two nodes with the same name were found in the graph.
+ """
+ self.input_graph = input_graph
+ self.nodes_map = self.create_nodes_map(input_graph)
+ self.output_graph = None
+ self.mode = mode
+ if FLAGS.load_quantization_so:
+ load_quantized_ops_so.Load()
+ load_quantized_kernels_so.Load()
+
+ def create_nodes_map(self, graph):
+ """Builds a mapping of node names to their defs from the graph."""
+ nodes_map = {}
+ for node in graph.node:
+ if node.name not in nodes_map.keys():
+ nodes_map[node.name] = node
+ else:
+ raise ValueError("Duplicate node names detected.")
+ return nodes_map
+
+ def rewrite(self, output_node_names):
+ """Triggers rewriting of the float graph.
+
+ Args:
+ output_node_names: A list of names of the nodes that produce the final
+ results.
+
+ Returns:
+ A quantized version of the float graph.
+ """
+ self.output_graph = tf.GraphDef()
+ output_nodes = [self.nodes_map[output_node_name]
+ for output_node_name in output_node_names]
+ if self.mode == "round":
+ self.already_visited = {}
+ for output_node in output_nodes:
+ self.round_nodes_recursively(output_node)
+ elif self.mode == "quantize":
+ self.already_visited = {}
+ self.already_quantized = {}
+ for output_node in output_nodes:
+ self.quantize_nodes_recursively(output_node)
+ elif self.mode == "eightbit":
+ self.set_input_graph(graph_util.remove_training_nodes(self.input_graph))
+ self.already_visited = {}
+ self.layers_eightbitized = []
+ for output_node in output_nodes:
+ self.eightbitize_nodes_recursively(output_node)
+ self.output_graph = self.quantize_weights(self.output_graph, b"MIN_FIRST")
+ if FLAGS.strip_redundant_quantization:
+ self.output_graph = self.remove_redundant_quantization(
+ self.output_graph)
+ self.remove_dead_nodes(output_node_names)
+ elif self.mode == "weights":
+ self.output_graph = self.quantize_weights(self.input_graph,
+ b"MIN_COMBINED")
+ self.remove_dead_nodes(output_node_names)
+ elif self.mode == "weights_rounded":
+ self.output_graph = self.quantize_weights(self.input_graph, self.mode)
+ self.remove_dead_nodes(output_node_names)
+ else:
+ print("Bad mode - " + self.mode + ".")
+ return self.output_graph
+
+ def round_nodes_recursively(self, current_node):
+ """The entry point for simple rounding quantization."""
+ self.already_visited[current_node.name] = True
+ for input_node_name in current_node.input:
+ input_node_name = node_name_from_input(input_node_name)
+ if input_node_name in self.already_visited:
+ continue
+ input_node = self.nodes_map[input_node_name]
+ self.round_nodes_recursively(input_node)
+ nodes_to_quantize = ["Conv2D", "BiasAdd", "MatMul"]
+    if current_node.op in nodes_to_quantize:
+ new_node = tf.NodeDef()
+ new_node.CopyFrom(current_node)
+ new_node.name = current_node.name + "_original"
+ self.add_output_graph_node(new_node)
+ levels = 1 << FLAGS.bitdepth
+ constant_name = current_node.name + "_round_depth"
+ constant_tensor = tf.constant(levels, dtype=tf.int32, name=constant_name)
+ constant_node = constant_tensor.op.node_def
+ self.add_output_graph_node(constant_node)
+ quantize_node = tf.NodeDef()
+ quantize_node.op = "RoundToSteps"
+ quantize_node.name = current_node.name
+ quantize_node.input.extend([current_node.name + "_original"])
+ quantize_node.input.extend([constant_node.name])
+ self.add_output_graph_node(quantize_node)
+ else:
+ new_node = tf.NodeDef()
+ new_node.CopyFrom(current_node)
+ self.add_output_graph_node(new_node)
+
+ def quantize_nodes_recursively(self, current_node):
+ """The entry point for quantizing nodes to eight bit and back."""
+ self.already_visited[current_node.name] = True
+ for input_node_name in current_node.input:
+ input_node_name = node_name_from_input(input_node_name)
+ if input_node_name in self.already_visited:
+ continue
+ input_node = self.nodes_map[input_node_name]
+ self.quantize_nodes_recursively(input_node)
+ nodes_to_quantize = ["Conv2D", "BiasAdd", "MatMul"]
+    if current_node.op in nodes_to_quantize:
+ for input_name in current_node.input:
+ input_name = node_name_from_input(input_name)
+ input_node = self.nodes_map[input_name]
+ self.quantize_node(input_node)
+ self.quantize_node(current_node)
+ else:
+ new_node = tf.NodeDef()
+ new_node.CopyFrom(current_node)
+ self.add_output_graph_node(new_node)
+
+ def quantize_node(self, input_node):
+ """Handles quantizing a single node."""
+ input_name = input_node.name
+ if input_name in self.already_quantized:
+ return
+ self.already_quantized[input_name] = True
+ original_input_name = input_name + "_original"
+ reshape_name = input_name + "_reshape"
+ reshape_dims_name = input_name + "_reshape_dims"
+ max_name = input_name + "_max"
+ min_name = input_name + "_min"
+ dims_name = input_name + "_dims"
+ quantize_name = input_name + "_quantize"
+ dequantize_name = input_name
+ original_input_node = tf.NodeDef()
+ original_input_node.CopyFrom(input_node)
+ original_input_node.name = original_input_name
+ self.add_output_graph_node(original_input_node)
+ reshape_dims_node = create_constant_node(reshape_dims_name, -1, tf.int32,
+ [1])
+ self.add_output_graph_node(reshape_dims_node)
+ reshape_node = create_node("Reshape", reshape_name, [original_input_name,
+ reshape_dims_name])
+ set_attr_dtype(reshape_node, "T", tf.float32)
+ self.add_output_graph_node(reshape_node)
+ dims_node = create_constant_node(dims_name, 0, tf.int32, [1])
+ self.add_output_graph_node(dims_node)
+ max_node = create_node("Max", max_name, [reshape_name, dims_name])
+ set_attr_dtype(max_node, "T", tf.float32)
+ set_attr_bool(max_node, "keep_dims", False)
+ self.add_output_graph_node(max_node)
+ min_node = create_node("Min", min_name, [reshape_name, dims_name])
+ set_attr_dtype(min_node, "T", tf.float32)
+ set_attr_bool(min_node, "keep_dims", False)
+ self.add_output_graph_node(min_node)
+ quantize_node = create_node("Quantize", quantize_name, [original_input_name,
+ min_name, max_name])
+ set_attr_dtype(quantize_node, "T", tf.quint8)
+ set_attr_string(quantize_node, "mode", b"MIN_FIRST")
+ self.add_output_graph_node(quantize_node)
+ dequantize_node = create_node("Dequantize", dequantize_name,
+ [quantize_name, min_name, max_name])
+ set_attr_dtype(dequantize_node, "T", tf.quint8)
+ set_attr_string(dequantize_node, "mode", b"MIN_FIRST")
+ self.add_output_graph_node(dequantize_node)
+
+ def eightbitize_nodes_recursively(self, current_node):
+ """The entry point for transforming a graph into full eight bit."""
+ self.already_visited[current_node.name] = True
+ for input_node_name in current_node.input:
+ input_node_name = node_name_from_input(input_node_name)
+ if input_node_name in self.already_visited:
+ continue
+ input_node = self.nodes_map[input_node_name]
+ self.eightbitize_nodes_recursively(input_node)
+ if current_node.op == "MatMul":
+ self.eightbitize_mat_mul_node(current_node)
+ elif current_node.op == "Conv2D":
+ self.eightbitize_conv_node(current_node)
+ self.layers_eightbitized.append(current_node.name)
+ elif current_node.op == "BiasAdd":
+ self.eightbitize_bias_add_node(current_node)
+ elif current_node.op == "MaxPool" or current_node.op == "AvgPool":
+ self.eightbitize_single_input_tensor_node(current_node,
+ self.add_pool_function)
+ elif current_node.op == "Relu" or current_node.op == "Relu6":
+ self.eightbitize_single_input_tensor_node(current_node,
+ self.add_relu_function)
+ elif current_node.op == "Concat":
+ self.eightbitize_concat_node(current_node)
+ elif current_node.op == "BatchNormWithGlobalNormalization":
+ self.eightbitize_batch_norm_node(current_node)
+ else:
+ new_node = tf.NodeDef()
+ new_node.CopyFrom(current_node)
+ self.add_output_graph_node(new_node)
+
+ def add_eightbit_prologue_nodes(self, original_node):
+ """Adds input conversion nodes to handle quantizing the underlying node."""
+ namespace_prefix = original_node.name + "_eightbit"
+ reshape_dims_name, reduction_dims_name = self.add_common_quantization_nodes(
+ namespace_prefix)
+ input_names = []
+ min_max_names = []
+ for original_input_name in original_node.input:
+ quantize_input_name, min_input_name, max_input_name = (
+ self.eightbitize_input_to_node(namespace_prefix, original_input_name,
+ reshape_dims_name,
+ reduction_dims_name))
+ input_names.append(quantize_input_name)
+ min_max_names.append(min_input_name)
+ min_max_names.append(max_input_name)
+ all_input_names = []
+ all_input_names.extend(input_names)
+ all_input_names.extend(min_max_names)
+ return all_input_names
+
+ def add_common_quantization_nodes(self, namespace_prefix):
+ """Builds constant nodes needed for quantization of inputs."""
+ reshape_dims_name = namespace_prefix + "_reshape_dims"
+ reduction_dims_name = namespace_prefix + "_reduction_dims"
+
+ reshape_dims_node = create_constant_node(reshape_dims_name, -1, tf.int32,
+ [1])
+ self.add_output_graph_node(reshape_dims_node)
+ reduction_dims_node = create_constant_node(reduction_dims_name, 0, tf.int32,
+ [1])
+ self.add_output_graph_node(reduction_dims_node)
+ return reshape_dims_name, reduction_dims_name
+
+ def eightbitize_input_to_node(self, namespace_prefix, original_input_name,
+ reshape_dims_name, reduction_dims_name):
+ """Takes one float input to an op, and converts it to quantized form."""
+ unique_input_name = unique_node_name_from_input(original_input_name)
+ reshape_input_name = namespace_prefix + "_reshape_" + unique_input_name
+ min_input_name = namespace_prefix + "_min_" + unique_input_name
+ max_input_name = namespace_prefix + "_max_" + unique_input_name
+ quantize_input_name = namespace_prefix + "_quantize_" + unique_input_name
+ reshape_input_node = create_node("Reshape", reshape_input_name,
+ [original_input_name, reshape_dims_name])
+ set_attr_dtype(reshape_input_node, "T", tf.float32)
+ self.add_output_graph_node(reshape_input_node)
+ min_input_node = create_node("Min", min_input_name, [reshape_input_name,
+ reduction_dims_name])
+ set_attr_dtype(min_input_node, "T", tf.float32)
+ set_attr_bool(min_input_node, "keep_dims", False)
+ self.add_output_graph_node(min_input_node)
+ max_input_node = create_node("Max", max_input_name, [reshape_input_name,
+ reduction_dims_name])
+ set_attr_dtype(max_input_node, "T", tf.float32)
+ set_attr_bool(max_input_node, "keep_dims", False)
+ self.add_output_graph_node(max_input_node)
+ quantize_input_node = create_node("QuantizeV2", quantize_input_name,
+ [original_input_name, min_input_name,
+ max_input_name])
+ set_attr_dtype(quantize_input_node, "T", tf.quint8)
+ set_attr_string(quantize_input_node, "mode", b"MIN_FIRST")
+ self.add_output_graph_node(quantize_input_node)
+ min_output_name = quantize_input_name + ":1"
+ max_output_name = quantize_input_name + ":2"
+ return quantize_input_name, min_output_name, max_output_name
+
+ def add_quantize_down_node(self, original_node, quantized_output_name):
+ quantize_down_name = original_node.name + "_eightbit_quantize_down"
+ quantize_down_node = create_node(
+ "QuantizeDownAndShrinkRange", quantize_down_name,
+ [quantized_output_name, quantized_output_name + ":1",
+ quantized_output_name + ":2"])
+ set_attr_dtype(quantize_down_node, "Tinput", tf.qint32)
+ set_attr_dtype(quantize_down_node, "out_type", tf.quint8)
+ self.add_output_graph_node(quantize_down_node)
+ return quantize_down_name
+
+ def add_dequantize_result_node(self, quantized_output_name,
+ original_node_name):
+ dequantize_name = original_node_name
+ dequantize_node = create_node("Dequantize", dequantize_name,
+ [quantized_output_name,
+ quantized_output_name + ":1",
+ quantized_output_name + ":2"])
+ set_attr_dtype(dequantize_node, "T", tf.quint8)
+ set_attr_string(dequantize_node, "mode", b"MIN_FIRST")
+ self.add_output_graph_node(dequantize_node)
+
+ def eightbitize_mat_mul_node(self, original_node):
+ """Replaces a MatMul node with the eight bit equivalent sub-graph."""
+    quantized_mat_mul_name = original_node.name + "_eightbit_quantized_mat_mul"
+ all_input_names = self.add_eightbit_prologue_nodes(original_node)
+ quantized_mat_mul_node = create_node(
+ "QuantizedMatMul", quantized_mat_mul_name,
+ all_input_names)
+ set_attr_dtype(quantized_mat_mul_node, "T1", tf.quint8)
+ set_attr_dtype(quantized_mat_mul_node, "T2", tf.quint8)
+ set_attr_dtype(quantized_mat_mul_node, "Toutput", tf.qint32)
+ copy_attr(quantized_mat_mul_node, "transpose_a",
+ original_node.attr["transpose_a"])
+ copy_attr(quantized_mat_mul_node, "transpose_b",
+ original_node.attr["transpose_b"])
+ self.add_output_graph_node(quantized_mat_mul_node)
+ quantize_down_name = self.add_quantize_down_node(original_node,
+ quantized_mat_mul_name)
+ self.add_dequantize_result_node(quantize_down_name, original_node.name)
+
+ def eightbitize_conv_node(self, original_node):
+ """Replaces a Conv2D node with the eight bit equivalent sub-graph."""
+ all_input_names = self.add_eightbit_prologue_nodes(original_node)
+ quantized_conv_name = original_node.name + "_eightbit_quantized_conv"
+ quantized_conv_node = create_node("QuantizedConv2D", quantized_conv_name,
+ all_input_names)
+ copy_attr(quantized_conv_node, "strides", original_node.attr["strides"])
+ copy_attr(quantized_conv_node, "padding", original_node.attr["padding"])
+ set_attr_dtype(quantized_conv_node, "Tinput", tf.quint8)
+ set_attr_dtype(quantized_conv_node, "Tfilter", tf.quint8)
+ set_attr_dtype(quantized_conv_node, "out_type", tf.qint32)
+ self.add_output_graph_node(quantized_conv_node)
+ quantize_down_name = self.add_quantize_down_node(original_node,
+ quantized_conv_name)
+ self.add_dequantize_result_node(quantize_down_name, original_node.name)
+
+ def eightbitize_bias_add_node(self, original_node):
+ """Replaces a BiasAdd node with the eight bit equivalent sub-graph."""
+ quantized_bias_add_name = (original_node.name +
+ "_eightbit_quantized_bias_add")
+ all_input_names = self.add_eightbit_prologue_nodes(original_node)
+ quantized_bias_add_node = create_node(
+ "QuantizedBiasAdd", quantized_bias_add_name,
+ all_input_names)
+ set_attr_dtype(quantized_bias_add_node, "T1", tf.quint8)
+ set_attr_dtype(quantized_bias_add_node, "T2", tf.quint8)
+ set_attr_dtype(quantized_bias_add_node, "out_type", tf.qint32)
+ self.add_output_graph_node(quantized_bias_add_node)
+ quantize_down_name = self.add_quantize_down_node(original_node,
+ quantized_bias_add_name)
+ self.add_dequantize_result_node(quantize_down_name, original_node.name)
+
+ def eightbitize_single_input_tensor_node(self, original_node,
+ add_op_function):
+ """Replaces a single-tensor node with the eight bit equivalent sub-graph.
+
+ Converts a node like this:
+
+ Shape(f) Input(f)
+ | |
+ +--------v v
+ Operation
+ |
+ v
+ (f)
+
+ Into a quantized equivalent:
+
+ Input(f) ReshapeDims
+ +------v v-------------+
+ | Reshape
+ | |
+ | | ReductionDims
+ | +-----+ |
+ | | +---c---------+
+ | v v v v-------+
+ | Min Max
+ | +----+ |
+ v v v--------+
+ Quantize
+ |
+ v
+ QuantizedOperation
+ | | |
+ v v v
+ Dequantize
+ |
+ v
+ (f)
+
+
+ Args:
+ original_node: Float node to be converted.
+ add_op_function: Function to create the actual node.
+
+ Returns:
+ Subgraph representing the quantized version of the original node.
+
+ """
+ quantized_op_name = original_node.name + "_eightbit_quantized"
+ quantized_op_type = "Quantized" + original_node.op
+ all_input_names = self.add_eightbit_prologue_nodes(original_node)
+ quantized_op_node = create_node(
+ quantized_op_type, quantized_op_name, all_input_names)
+ add_op_function(original_node, quantized_op_node)
+ self.add_output_graph_node(quantized_op_node)
+ self.add_dequantize_result_node(quantized_op_name, original_node.name)
+
+ def add_pool_function(self, original_node, quantized_op_node):
+ set_attr_dtype(quantized_op_node, "T", tf.quint8)
+ copy_attr(quantized_op_node, "ksize", original_node.attr["ksize"])
+ copy_attr(quantized_op_node, "strides", original_node.attr["strides"])
+ copy_attr(quantized_op_node, "padding", original_node.attr["padding"])
+
+ def add_relu_function(self, unused_arg_node, quantized_op_node):
+ set_attr_dtype(quantized_op_node, "Tinput", tf.quint8)
+
+ def eightbitize_concat_node(self, original_node):
+ """Replaces a Concat node with the eight bit equivalent sub-graph.
+
+ Converts a node like this:
+
+ Shape(f) Input0(f) Input1(f)
+ | | |
+ +--------v v v----------+
+ Concat
+ |
+ v
+ (f)
+
+ Into a quantized equivalent:
+
+ Shape(f) Input0(f) ReshapeDims Input1(f)
+ | +------v v--------------+------------------v v------+
+ | | Reshape Reshape |
+ | | | | |
+ | | | ReductionDims | |
+ | | +------+ | +--------+ |
+ | | | +---c---------+-----------c-----+ | |
+ | | +v v v v-------+---------v v v v+ |
+ | | Min Max Min Max |
+ | | +----+ | | +-----+ |
+ | v v v--------+ +----------v v v
+ | Quantize Quantize
+ | +------------------+ +----------------------+
+ +-------------------------------+ | |
+ v v v
+ QuantizedConcat
+ | | |
+ v v v
+ Dequantize
+ |
+ v
+ (f)
+ Args:
+ original_node: Float node to be converted.
+
+ Returns:
+ Subgraph representing the quantized version of the original node.
+
+ """
+ namespace_prefix = original_node.name + "_eightbit"
+ quantized_concat_name = namespace_prefix + "_quantized_concat"
+ reshape_dims_name, reduction_dims_name = self.add_common_quantization_nodes(
+ namespace_prefix)
+ shape_input_name = original_node.input[0]
+ original_inputs = original_node.input[1:]
+ input_names = []
+ min_names = []
+ max_names = []
+ for original_input_name in original_inputs:
+ quantize_input_name, min_input_name, max_input_name = (
+ self.eightbitize_input_to_node(namespace_prefix, original_input_name,
+ reshape_dims_name,
+ reduction_dims_name))
+ input_names.append(quantize_input_name)
+ min_names.append(min_input_name)
+ max_names.append(max_input_name)
+ all_input_names = [shape_input_name]
+ all_input_names.extend(input_names)
+ all_input_names.extend(min_names)
+ all_input_names.extend(max_names)
+ quantized_concat_node = create_node(
+ "QuantizedConcat", quantized_concat_name, all_input_names)
+ set_attr_int(quantized_concat_node, "N", len(original_inputs))
+ set_attr_dtype(quantized_concat_node, "T", tf.quint8)
+ self.add_output_graph_node(quantized_concat_node)
+ self.add_dequantize_result_node(quantized_concat_name, original_node.name)
+
+ def eightbitize_batch_norm_node(self, original_node):
+ """Replaces a MatMul node with the eight bit equivalent sub-graph."""
+ namespace_prefix = original_node.name + "_eightbit"
+ original_input_name = original_node.input[0]
+ original_mean_name = original_node.input[1]
+ original_variance_name = original_node.input[2]
+ original_beta_name = original_node.input[3]
+ original_gamma_name = original_node.input[4]
+ quantized_batch_norm_name = namespace_prefix + "_quantized_batch_norm"
+
+ reshape_dims_name, reduction_dims_name = self.add_common_quantization_nodes(
+ namespace_prefix)
+ quantize_input_name, min_input_name, max_input_name = (
+ self.eightbitize_input_to_node(namespace_prefix, original_input_name,
+ reshape_dims_name, reduction_dims_name))
+ quantize_mean_name, min_mean_name, max_mean_name = (
+ self.eightbitize_input_to_node(namespace_prefix, original_mean_name,
+ reshape_dims_name, reduction_dims_name))
+ quantize_variance_name, min_variance_name, max_variance_name = (
+ self.eightbitize_input_to_node(namespace_prefix, original_variance_name,
+ reshape_dims_name, reduction_dims_name))
+ quantize_beta_name, min_beta_name, max_beta_name = (
+ self.eightbitize_input_to_node(namespace_prefix, original_beta_name,
+ reshape_dims_name, reduction_dims_name))
+ quantize_gamma_name, min_gamma_name, max_gamma_name = (
+ self.eightbitize_input_to_node(namespace_prefix, original_gamma_name,
+ reshape_dims_name, reduction_dims_name))
+ quantized_batch_norm_node = create_node(
+ "QuantizedBatchNormWithGlobalNormalization", quantized_batch_norm_name,
+ [quantize_input_name, min_input_name, max_input_name,
+ quantize_mean_name, min_mean_name, max_mean_name,
+ quantize_variance_name, min_variance_name, max_variance_name,
+ quantize_beta_name, min_beta_name, max_beta_name, quantize_gamma_name,
+ min_gamma_name, max_gamma_name])
+ set_attr_dtype(quantized_batch_norm_node, "Tinput", tf.quint8)
+ set_attr_dtype(quantized_batch_norm_node, "out_type", tf.qint32)
+ copy_attr(quantized_batch_norm_node, "scale_after_normalization",
+ original_node.attr["scale_after_normalization"])
+ copy_attr(quantized_batch_norm_node, "variance_epsilon",
+ original_node.attr["variance_epsilon"])
+ self.add_output_graph_node(quantized_batch_norm_node)
+ quantize_down_name = self.add_quantize_down_node(original_node,
+ quantized_batch_norm_name)
+ self.add_dequantize_result_node(quantize_down_name, original_node.name)
+
+ def add_output_graph_node(self, output_node):
+ """Inserts one node into the new graph."""
+ self.output_graph.node.extend([output_node])
+
+ def remove_redundant_quantization(self, old_graph):
+ """Removes unneeded pairs of quantize/dequantize ops from the graph.
+
+    This is a bit of a tricky function, because it's attempting to spot the
+    pattern of dequantizing from eight bit up to float and then immediately
+    quantizing back down to eight bits again. That pattern is introduced by
+    the previous passes, which do 'keyhole' conversions of individual nodes
+    but have to convert back to float to match the original output interface,
+    since they don't know whether the next op can handle quantized tensors.
+ It works by:
+ - Looking for Quantize nodes.
+ - Checking to see if their first input is a Dequantize node.
+ - Seeing if their min/max inputs come from Min/Max nodes.
+ - Making sure those Min/Max nodes are being fed from the same Dequantize.
+ - Or that the Min is indirectly being fed from the same Dequantize as Max.
+ - Making sure the Dequantize is going through a Reshape (which we add
+ during the previous pass when we create the quantize sub-graph).
+ - Looking for the dims Const op for the Min/Max dims.
+ If all of these conditions are met, then it's a sub-graph pattern that
+ we know how to optimize out (and is likely the common one we've introduced).
+ We then rewire the graph to skip it entirely, and then rely on the dead node
+ removal pass to get rid of any nodes that are no longer needed.
+
+ Args:
+ old_graph: The model we'll be stripping redundant nodes from.
+
+ Returns:
+ A graph with the unnecessary nodes removed.
+
+ Raises:
+ ValueError: Two nodes with the same name were found in the graph.
+ """
+ old_nodes_map = self.create_nodes_map(old_graph)
+ self.output_graph = tf.GraphDef()
+ inputs_to_rename = {}
+ # We go through all the nodes, looking for any that match the patterns we
+ # know how to optimize away.
+ for node in old_graph.node:
+ # We always start with a Quantize node, and examine its inputs to see if
+ # they are in a form that can be removed.
+ if node.op not in ["Quantize", "QuantizeV2"]:
+ continue
+ dequantize_node_name = node_name_from_input(node.input[0])
+ if dequantize_node_name not in old_nodes_map:
+ raise ValueError("Input node name '" + dequantize_node_name +
+ "' not found in node '" + node.name + "'")
+ dequantize_node = old_nodes_map[dequantize_node_name]
+ # Do we have a Dequantize feeding in, with the same type as the Quantize?
+ if dequantize_node.op != "Dequantize":
+ continue
+ if node.attr["T"] != dequantize_node.attr["T"]:
+ continue
+ # Now look at the other inputs, and ensure they're Min/Max nodes.
+ min_node_name = node_name_from_input(node.input[1])
+ max_node_name = node_name_from_input(node.input[2])
+ min_node = old_nodes_map[min_node_name]
+ max_node = old_nodes_map[max_node_name]
+ is_min_right_type = (min_node.op in ["Min", "Dequantize"])
+ is_max_right_type = (max_node.op in ["Max", "Dequantize"])
+ if not is_min_right_type or not is_max_right_type:
+ print("Didn't find expected types on inputs : %s, %s." % (
+ min_node.op, max_node.op))
+ continue
+ min_node_input_name = node_name_from_input(min_node.input[0])
+ max_node_input_name = node_name_from_input(max_node.input[0])
+ # There are two different patterns for Min nodes we can recognize, one
+ # where the input comes directly from the same one as the Max, and
+ # another where we run it through another Min first, so check for both.
+ is_same_input = False
+ if min_node_input_name == max_node_input_name:
+ is_same_input = True
+ else:
+ first_min_node_input = old_nodes_map[min_node_input_name]
+ if first_min_node_input.op == "Concat":
+ second_min_node_name = node_name_from_input(
+ first_min_node_input.input[1])
+ second_min_node = old_nodes_map[second_min_node_name]
+ if second_min_node.op == "Min":
+ second_min_node_input_name = node_name_from_input(
+ second_min_node.input[0])
+ is_same_input = (second_min_node_input_name == max_node_input_name)
+ if not is_same_input:
+ print("Different min/max inputs: " + min_node_input_name)
+ continue
+ # We recognize this pattern, so mark the graph edges to be rewired to
+ # route around it entirely, since we know it's a no-op.
+ dequantize_source_name = node_name_from_input(dequantize_node.input[0])
+ node_tensor_name = ensure_tensor_name_has_port(node.name)
+ min_tensor_name = node.name + ":1"
+ max_tensor_name = node.name + ":2"
+ inputs_to_rename[node_tensor_name] = dequantize_source_name
+ inputs_to_rename[min_tensor_name] = dequantize_node.input[1]
+ inputs_to_rename[max_tensor_name] = dequantize_node.input[2]
+ # Finally we apply all the rewiring we've marked to the graph.
+ for node in old_graph.node:
+ for index, input_full_name in enumerate(node.input):
+ input_name = ensure_tensor_name_has_port(input_full_name)
+ if input_name in inputs_to_rename:
+ node.input[index] = inputs_to_rename[input_name]
+ self.add_output_graph_node(node)
+ return self.output_graph
+
+ def remove_dead_nodes(self, output_names):
+ """Removes nodes that are no longer needed for inference from the graph."""
+ old_output_graph = self.output_graph
+ self.output_graph = graph_util.extract_sub_graph(old_output_graph,
+ output_names)
+
+ def quantize_weights(self, input_graph, quantization_mode):
+ """Quantize float Const ops.
+
+    There are two modes of operation, both of which replace float Const ops
+    with quantized values.
+    1. If quantization_mode is "weights_rounded", this function replaces float
+    Const ops with quantized float Const ops - the same as the original op, but
+    with each float value mapped to the center of one of 1 << FLAGS.bitdepth
+    buckets.
+ This does not change the raw model size, but compression algorithms such as
+ zip (as used for compressing apks) or bzip2 will achieve a very good
+ compression ratio.
+ 2. For other quantization modes ("MIN_COMBINED" or "MIN_FIRST"), float
+ Const ops are quantized and replaced by a tuple of four ops to perform
+ the dequantization at runtime:
+    * an eight-bit Const holding the quantized values (same shape as the
+      original float Const op)
+ * two float Const ops (min and max value of original float Const op)
+ * Dequantize op to convert the eight-bit consts to float tensors.
+    The quantization mode matters because we have seen accuracy problems when
+    weights are quantized with a mode that doesn't suit the situation; the
+    exact underlying cause is not yet understood.
+
+ Args:
+ input_graph: A GraphDef of the model containing float Const ops.
+ quantization_mode: How to quantize and dequantize the values.
+
+ Returns:
+ A GraphDef of the converted graph.
+
+ Raises:
+ ValueError: If quantization_mode is unsupported.
+ """
+ output_graph = tf.GraphDef()
+ for input_node in input_graph.node:
+ should_quantize = False
+ if input_node.op == "Const":
+ dtype = tf.as_dtype(input_node.attr["dtype"].type)
+ if dtype == tf.float32:
+ should_quantize = True
+ if should_quantize:
+ if quantization_mode == "weights_rounded":
+ output_graph.node.extend(quantize_weight_rounded(input_node))
+ elif quantization_mode in (b"MIN_COMBINED", b"MIN_FIRST"):
+ output_graph.node.extend(quantize_weight_eightbit(input_node,
+ quantization_mode))
+ else:
+ raise ValueError("Unsupported quantization mode %s." %
+ quantization_mode)
+ else:
+ output_node = tf.NodeDef()
+ output_node.CopyFrom(input_node)
+ output_graph.node.extend([output_node])
+ return output_graph
+
+ def set_input_graph(self, new_input_graph):
+ self.input_graph = new_input_graph
+ self.nodes_map = self.create_nodes_map(self.input_graph)
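+
+# A minimal usage sketch (mirroring main() below; the input path and output
+# node name are hypothetical):
+#
+#   graph_def = tf.GraphDef()
+#   with tf.gfile.Open("/tmp/frozen_graph.pb", "rb") as f:
+#     graph_def.ParseFromString(f.read())
+#   rewriter = GraphRewriter(graph_def, "eightbit")
+#   quantized_graph_def = rewriter.rewrite(["softmax"])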
+
+
+def main(unused_args):
+ if not tf.gfile.Exists(FLAGS.input):
+ print("Input graph file '" + FLAGS.input + "' does not exist!")
+ return -1
+
+ known_modes = ["round", "quantize", "eightbit", "weights", "test",
+ "weights_rounded"]
+  if FLAGS.mode not in known_modes:
+ print("mode is '" + FLAGS.mode + "', not in " + ", ".join(known_modes) +
+ ".")
+ return -1
+
+ tf_graph = tf.GraphDef()
+ with tf.gfile.Open(FLAGS.input, "rb") as f:
+ data = f.read()
+ tf_graph.ParseFromString(data)
+
+ graph = tf.Graph()
+ with graph.as_default():
+ tf.import_graph_def(tf_graph, input_map={}, name="")
+
+ rewriter = GraphRewriter(tf_graph, FLAGS.mode)
+
+ output_graph = rewriter.rewrite(FLAGS.output_node_names.split(","))
+
+ f = tf.gfile.FastGFile(FLAGS.output, "wb")
+ f.write(output_graph.SerializeToString())
+
+ return 0
+
+
+if __name__ == "__main__":
+ tf.app.run()
diff --git a/tensorflow/contrib/quantization/tools/quantize_graph_test.py b/tensorflow/contrib/quantization/tools/quantize_graph_test.py
new file mode 100644
index 0000000000..4826ea2689
--- /dev/null
+++ b/tensorflow/contrib/quantization/tools/quantize_graph_test.py
@@ -0,0 +1,698 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests the graph quantization script.
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+import tensorflow as tf
+from tensorflow.contrib.quantization.tools import quantize_graph
+from tensorflow.python.framework import graph_util
+
+flags = tf.app.flags
+FLAGS = flags.FLAGS
+
+
+def run_graph_def(graph_def, input_map, outputs):
+ graph = tf.Graph()
+ with graph.as_default():
+ tf.import_graph_def(graph_def, input_map={}, name="")
+ with tf.Session(graph=graph) as sess:
+ results = sess.run(outputs, feed_dict=input_map)
+ return results
+
+
+def test_mat_mul(m, n, k, a, b):
+ """Tests a MatMul replacement."""
+ a_constant_name = "a_constant"
+ b_constant_name = "b_constant"
+ mat_mul_name = "mat_mul"
+
+ float_graph_def = tf.GraphDef()
+ a_constant = quantize_graph.create_constant_node(a_constant_name,
+ value=a,
+ dtype=tf.float32,
+ shape=[m, k])
+ float_graph_def.node.extend([a_constant])
+ b_constant = quantize_graph.create_constant_node(b_constant_name,
+ value=b,
+ dtype=tf.float32,
+ shape=[k, n])
+ float_graph_def.node.extend([b_constant])
+ mat_mul_node = quantize_graph.create_node("MatMul", mat_mul_name,
+ [a_constant_name, b_constant_name])
+ quantize_graph.set_attr_dtype(mat_mul_node, "T", tf.float32)
+ quantize_graph.set_attr_bool(mat_mul_node, "transpose_a", False)
+ quantize_graph.set_attr_bool(mat_mul_node, "transpose_b", False)
+ float_graph_def.node.extend([mat_mul_node])
+
+ test_graph(float_graph_def, {}, [mat_mul_name])
+
+
+def test_conv(depth, image_width, image_height, image_batch_count, filter_size,
+ filter_count, stride, padding, input_values, filter_values):
+ """Tests a Conv replacement."""
+ input_constant_name = "input_constant"
+ filter_constant_name = "filter_constant"
+ conv_name = "conv"
+
+ float_graph_def = tf.GraphDef()
+ input_constant = quantize_graph.create_constant_node(
+ input_constant_name,
+ value=input_values,
+ dtype=tf.float32,
+ shape=[
+ image_batch_count, image_height, image_width, depth
+ ])
+ float_graph_def.node.extend([input_constant])
+ filter_constant = quantize_graph.create_constant_node(
+ filter_constant_name,
+ value=filter_values,
+ dtype=tf.float32,
+ shape=[
+ filter_size, filter_size, depth, filter_count
+ ])
+ float_graph_def.node.extend([filter_constant])
+ conv_node = quantize_graph.create_node("Conv2D", conv_name,
+ [input_constant_name,
+ filter_constant_name])
+ quantize_graph.set_attr_dtype(conv_node, "T", tf.float32)
+ quantize_graph.set_attr_int_list(conv_node, "strides", [1, stride, stride, 1])
+ quantize_graph.set_attr_string(conv_node, "padding", padding)
+ float_graph_def.node.extend([conv_node])
+
+ test_graph(float_graph_def, {}, [conv_name])
+
+
+def are_tensors_near(a, b, tolerance):
+ """Tests whether two tensors are nearly identical.
+
+ This is a specialized comparison function designed to help debug problems with
+ quantization. It prints out information about the differences between tensors
+  on failure, paying special attention to possible biases by looking at the
+  mean and mean absolute errors.
+
+ Args:
+ a: First comparison tensor.
+ b: Second comparison tensor.
+ tolerance: Float value indicating how large an error between values is ok.
+
+ Returns:
+ Boolean indicating whether the two inputs were close enough.
+ """
+ flat_a = a.flatten()
+ flat_b = b.flatten()
+ if len(flat_a) != len(flat_b):
+ print("Tensors are different sizes: " + str(len(flat_a)) + " vs " +
+ str(len(flat_b)))
+ return False
+ value_count = len(flat_a)
+ how_many_different = 0
+ total_difference = 0
+ total_abs_difference = 0
+ for index in range(value_count):
+ a_value = flat_a[index]
+ b_value = flat_b[index]
+ difference = a_value - b_value
+ total_difference += difference
+ total_abs_difference += abs(difference)
+ if abs(difference) > tolerance:
+ how_many_different += 1
+ mean_difference = total_difference / value_count
+ mean_abs_difference = total_abs_difference / value_count
+ proportion_different = (how_many_different * 1.0) / value_count
+ if how_many_different == 0:
+ return True
+ else:
+ print("Tensors have {0} different values ({1}%), with mean difference"
+ " {2} and mean absolute difference {3}".format(
+ how_many_different, proportion_different * 100, mean_difference,
+ mean_abs_difference))
+ return False
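+
+# For example (values chosen purely for illustration):
+#   are_tensors_near(np.array([1.0, 2.0]), np.array([1.0, 2.05]), 0.1)  # True
+#   are_tensors_near(np.array([1.0, 2.0]), np.array([1.0, 2.5]), 0.1)
+#   # prints a difference summary and returns False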
+
+
+def get_top_value(input_values):
+ max_value = None
+ max_index = None
+ for index, value in enumerate(input_values.flatten()):
+    if max_value is None or value > max_value:
+ max_value = value
+ max_index = index
+ return max_index, max_value
+
+
+def test_graph(float_graph_def, input_map, output_names):
+ """Runs the float graph through the rewriter and tests the results."""
+ float_results = run_graph_def(float_graph_def, input_map,
+ [output_name + ":0"
+ for output_name in output_names])
+ # TODO(petewarden): round test is currently failing because there is no
+ # RoundToSteps op available.
+ # round_rewriter = quantize_graph.GraphRewriter(float_graph_def, "round")
+ # round_graph_def = round_rewriter.rewrite(output_name)
+ # round_results = run_graph_def(round_graph_def, input_map,
+ # [output_name + ":0"])
+ # assert are_tensors_near(expected, round_results[0], 1.0)
+ #
+ # TODO(petewarden): Add test for "quantize" mode.
+
+ eightbit_rewriter = quantize_graph.GraphRewriter(float_graph_def, "eightbit")
+ eightbit_graph_def = eightbit_rewriter.rewrite(output_names)
+ eightbit_results = run_graph_def(eightbit_graph_def, input_map,
+ [output_name + ":0"
+ for output_name in output_names])
+ for expected, result in zip(float_results, eightbit_results):
+ assert are_tensors_near(expected, result, 1.0)
+
+ # Test the weights_rounded mode. This uses the default bit_depth.
+ weights_rounded_rewriter = quantize_graph.GraphRewriter(
+ float_graph_def, "weights_rounded")
+ weights_rounded_graph_def = weights_rounded_rewriter.rewrite(output_names)
+ weights_rounded_results = run_graph_def(weights_rounded_graph_def, input_map,
+ [output_name + ":0"
+ for output_name in output_names])
+ for expected, result in zip(float_results, weights_rounded_results):
+ assert are_tensors_near(expected, result, 1.0)
+
+
+class QuantizeGraphTest(tf.test.TestCase):
+
+ def test_negative_const_problem(self):
+ shape_constant_name = "shape_constant"
+ shape_constant = quantize_graph.create_constant_node(
+ shape_constant_name, value=-0.8, dtype=tf.float32, shape=[1])
+ quantization_result = quantize_graph.quantize_weight_eightbit(
+ shape_constant, b"MIN_COMBINED")
+ self.assertEqual(4, len(quantization_result))
+
+ def test_odd_padding_problem(self):
+ """Tests one error case we ran into in a real graph."""
+ test_conv(1, 4, 4, 1, 3, 1, 2, b"SAME",
+ [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+ [1, 2, 3, 4, 5, 6, 7, 8, 9])
+
+ def test_mat_mul_tiny(self):
+    # These tests are added to test the degenerate case where
+ # min(matrix) == max(matrix), which used to cause problems.
+ test_mat_mul(1, 1, 1, [2], [3])
+ test_mat_mul(1, 2, 1, [1], [2, 3])
+ test_mat_mul(1, 1, 2, [1, 1], [1, 1])
+ test_mat_mul(1, 1, 2, [0, 0], [1, 1])
+ # The general case.
+ test_mat_mul(1, 1, 2, [1, 2], [1, 2])
+
+ def test_mat_mul_small(self):
+ test_mat_mul(2, 4, 3, [1, 2, 3, 4, 5, 6],
+ [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18])
+
+ def test_conv(self):
+ test_conv(1, 4, 3, 1, 3, 1, 1, b"SAME",
+ [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
+ [1, 4, 7, 2, 5, 8, 3, 6, 9])
+
+ def test_quantize_array(self):
+    # Test invalid parameters (empty array, or 0 buckets).
+ self.assertRaises(ValueError, quantize_graph.quantize_array,
+ np.array([]), 2)
+ self.assertRaises(ValueError, quantize_graph.quantize_array,
+ np.array([1, 2]), 0)
+ # Test input array of length 1.
+ arr = np.array([1])
+ qarr = quantize_graph.quantize_array(arr, 1)
+ self.assertEqual(arr, qarr)
+ qarr = quantize_graph.quantize_array(arr, 2)
+ self.assertEqual(arr, qarr)
+ # Test input array with all elements equal.
+ arr = np.array([1, 1, 1])
+ qarr = quantize_graph.quantize_array(arr, 10)
+ self.assertTrue((np.array([1, 1, 1]) == qarr).all())
+ # Test "normal" input arrays.
+ arr = np.array([0, 0.3, 0.6, 1])
+ qarr = quantize_graph.quantize_array(arr, 1)
+ self.assertTrue((np.array([0.5, 0.5, 0.5, 0.5]) == qarr).all())
+ qarr = quantize_graph.quantize_array(arr, 2)
+ self.assertTrue((np.array([0.25, 0.25, 0.75, 0.75]) == qarr).all())
+ qarr = quantize_graph.quantize_array(arr.reshape((2, 2)), 2)
+ self.assertTrue((np.array([[0.25, 0.25], [0.75, 0.75]]) == qarr).all())
+
+ def test_concat(self):
+ shape_constant_name = "shape_constant"
+ a_constant_name = "a_constant"
+ b_constant_name = "b_constant"
+ concat_name = "concat"
+
+ float_graph_def = tf.GraphDef()
+ shape_constant = quantize_graph.create_constant_node(shape_constant_name,
+ value=0,
+ dtype=tf.int32,
+ shape=[])
+ float_graph_def.node.extend([shape_constant])
+ a_constant = quantize_graph.create_constant_node(a_constant_name,
+ value=[1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12],
+ dtype=tf.float32,
+ shape=[2, 2, 3])
+ float_graph_def.node.extend([a_constant])
+ b_constant = quantize_graph.create_constant_node(b_constant_name,
+ value=[13, 14, 15, 16, 17,
+ 18, 19, 20, 21, 22,
+ 23, 24],
+ dtype=tf.float32,
+ shape=[2, 2, 3])
+ float_graph_def.node.extend([b_constant])
+ concat_node = quantize_graph.create_node("Concat", concat_name,
+ [shape_constant_name,
+ a_constant_name, b_constant_name])
+ quantize_graph.set_attr_int(concat_node, "N", 2)
+ quantize_graph.set_attr_dtype(concat_node, "T", tf.float32)
+ float_graph_def.node.extend([concat_node])
+
+ test_graph(float_graph_def, {}, [concat_name])
+
+ def test_multiple_outputs(self):
+ input_constant_name = "input_constant"
+ split_constant_name = "split_constant"
+ split_name = "split"
+ concat_constant_name = "concat_constant"
+ concat_name = "concat"
+
+ float_graph_def = tf.GraphDef()
+ input_constant = quantize_graph.create_constant_node(input_constant_name,
+ value=[1, 2, 3, 4, 5,
+ 6, 7, 8, 9, 10,
+ 11, 12],
+ dtype=tf.float32,
+ shape=[2, 6])
+ float_graph_def.node.extend([input_constant])
+ split_constant = quantize_graph.create_constant_node(split_constant_name,
+ value=1,
+ dtype=tf.int32,
+ shape=[])
+ float_graph_def.node.extend([split_constant])
+ split_node = quantize_graph.create_node("Split", split_name,
+ [split_constant_name,
+ input_constant_name])
+ quantize_graph.set_attr_int(split_node, "num_split", 2)
+ quantize_graph.set_attr_dtype(split_node, "T", tf.float32)
+ float_graph_def.node.extend([split_node])
+ concat_constant = quantize_graph.create_constant_node(concat_constant_name,
+ value=1,
+ dtype=tf.int32,
+ shape=[])
+ float_graph_def.node.extend([concat_constant])
+ concat_node = quantize_graph.create_node("Concat", concat_name,
+ [concat_constant_name,
+ split_name + ":0",
+ split_name + ":1"])
+ quantize_graph.set_attr_int(concat_node, "N", 2)
+ quantize_graph.set_attr_dtype(concat_node, "T", tf.float32)
+ float_graph_def.node.extend([concat_node])
+
+ test_graph(float_graph_def, {}, [concat_name])
+
+ def test_node_name_from_input(self):
+ self.assertEqual("SomeName",
+ quantize_graph.node_name_from_input("^SomeName:2"))
+
+ def test_unique_node_name_from_input(self):
+ self.assertEqual("__hat__SomeName__port__2",
+ quantize_graph.unique_node_name_from_input("^SomeName:2"))
+
+ def test_identity(self):
+ input_constant_name = "input_constant"
+ identity_name = "identity"
+ float_graph_def = tf.GraphDef()
+ input_constant = quantize_graph.create_constant_node(input_constant_name,
+ value=[1, 2, 3, 4, 5,
+ 6, 7, 8, 9, 10,
+ 11, 12],
+ dtype=tf.float32,
+ shape=[2, 6])
+ float_graph_def.node.extend([input_constant])
+ identity_node = quantize_graph.create_node("Identity", identity_name,
+ [input_constant_name])
+ quantize_graph.set_attr_dtype(identity_node, "T", tf.float32)
+ float_graph_def.node.extend([identity_node])
+ test_graph(float_graph_def, {}, [identity_name])
+
+ def test_keep_control_edges(self):
+ no_op_name = "no_op"
+ a_constant_name = "a_constant"
+ b_constant_name = "b_constant"
+ a_check_name = "a_check"
+ b_check_name = "b_check"
+ a_identity_name = "a_identity"
+ b_identity_name = "b_identity"
+ add_name = "add"
+ graph_def = tf.GraphDef()
+ no_op = quantize_graph.create_node("NoOp", no_op_name, [])
+ graph_def.node.extend([no_op])
+ a_constant = quantize_graph.create_constant_node(a_constant_name,
+ value=1,
+ dtype=tf.float32,
+ shape=[])
+ graph_def.node.extend([a_constant])
+ a_check_node = quantize_graph.create_node("CheckNumerics", a_check_name,
+ [a_constant_name])
+ graph_def.node.extend([a_check_node])
+ a_identity_node = quantize_graph.create_node("Identity", a_identity_name,
+ [a_constant_name,
+ "^" + a_check_name,
+ "^" + no_op_name])
+ graph_def.node.extend([a_identity_node])
+ b_constant = quantize_graph.create_constant_node(b_constant_name,
+ value=1,
+ dtype=tf.float32,
+ shape=[])
+ graph_def.node.extend([b_constant])
+ b_check_node = quantize_graph.create_node("CheckNumerics", b_check_name,
+ [b_constant_name])
+ graph_def.node.extend([b_check_node])
+ b_identity_node = quantize_graph.create_node("Identity", b_identity_name,
+ [b_constant_name,
+ "^" + b_check_name])
+ graph_def.node.extend([b_identity_node])
+ add_node = quantize_graph.create_node("Add", add_name,
+ [a_identity_name,
+ b_identity_name])
+ quantize_graph.set_attr_dtype(add_node, "T", tf.float32)
+ graph_def.node.extend([add_node])
+
+ expected_output = tf.GraphDef()
+ no_op = quantize_graph.create_node("NoOp", no_op_name, [])
+ expected_output.node.extend([no_op])
+ a_constant = quantize_graph.create_constant_node(a_constant_name,
+ value=1,
+ dtype=tf.float32,
+ shape=[])
+ expected_output.node.extend([a_constant])
+ a_identity_node = quantize_graph.create_node("Identity", a_identity_name,
+ [a_constant_name,
+ "^" + no_op_name])
+ expected_output.node.extend([a_identity_node])
+ b_constant = quantize_graph.create_constant_node(b_constant_name,
+ value=1,
+ dtype=tf.float32,
+ shape=[])
+ expected_output.node.extend([b_constant])
+ add_node = quantize_graph.create_node("Add", add_name,
+ [a_identity_name,
+ b_constant_name])
+ quantize_graph.set_attr_dtype(add_node, "T", tf.float32)
+ expected_output.node.extend([add_node])
+
+ output = graph_util.remove_training_nodes(graph_def)
+ stripped_output = graph_util.extract_sub_graph(output, [add_name])
+ self.assertProtoEquals(expected_output, stripped_output)
+
+ def test_batch_norm(self):
+ input_constant_name = "input_constant"
+ mean_constant_name = "mean_constant"
+ variance_constant_name = "variance_constant"
+ beta_constant_name = "beta_constant"
+ gamma_constant_name = "gamma_constant"
+ batch_norm_name = "batch_norm"
+ float_graph_def = tf.GraphDef()
+ input_constant = quantize_graph.create_constant_node(input_constant_name,
+ value=[1, 4, 2, 5, 3,
+ 6, -1, -4, -2,
+ -5, -3, -6],
+ dtype=tf.float32,
+ shape=[1, 1, 6, 2])
+ float_graph_def.node.extend([input_constant])
+ mean_constant = quantize_graph.create_constant_node(mean_constant_name,
+ value=[10, 20],
+ dtype=tf.float32,
+ shape=[2])
+ float_graph_def.node.extend([mean_constant])
+ variance_constant = quantize_graph.create_constant_node(
+ variance_constant_name, value=[0.25, 0.5], dtype=tf.float32, shape=[2])
+ float_graph_def.node.extend([variance_constant])
+ beta_constant = quantize_graph.create_constant_node(beta_constant_name,
+ value=[0.1, 0.6],
+ dtype=tf.float32,
+ shape=[2])
+ float_graph_def.node.extend([beta_constant])
+ gamma_constant = quantize_graph.create_constant_node(gamma_constant_name,
+ value=[0, 0],
+ dtype=tf.float32,
+ shape=[2])
+ float_graph_def.node.extend([gamma_constant])
+ batch_norm_node = quantize_graph.create_node(
+ "BatchNormWithGlobalNormalization", batch_norm_name,
+ [input_constant_name, mean_constant_name, variance_constant_name,
+ beta_constant_name, gamma_constant_name])
+ quantize_graph.set_attr_dtype(batch_norm_node, "T", tf.float32)
+ quantize_graph.set_attr_bool(batch_norm_node, "scale_after_normalization",
+ False)
+ quantize_graph.set_attr_float(batch_norm_node, "variance_epsilon", 0.001)
+ float_graph_def.node.extend([batch_norm_node])
+ test_graph(float_graph_def, {}, [batch_norm_name])
+
+ def test_max_pool(self):
+ input_constant_name = "input_constant"
+ max_pool_name = "max_pool"
+ float_graph_def = tf.GraphDef()
+ input_constant = quantize_graph.create_constant_node(input_constant_name,
+ value=[1, 2, 3, 4, 5,
+ 6, 7, 8, 9, 10,
+ 11, 12],
+ dtype=tf.float32,
+ shape=[1, 2, 6, 1])
+ float_graph_def.node.extend([input_constant])
+ max_pool_node = quantize_graph.create_node("MaxPool", max_pool_name,
+ [input_constant_name])
+ quantize_graph.set_attr_int_list(max_pool_node, "ksize", [1, 2, 2, 1])
+ quantize_graph.set_attr_int_list(max_pool_node, "strides", [1, 1, 1, 1])
+ quantize_graph.set_attr_string(max_pool_node, "padding", b"SAME")
+ float_graph_def.node.extend([max_pool_node])
+ test_graph(float_graph_def, {}, [max_pool_name])
+
+ def test_avg_pool(self):
+ input_constant_name = "input_constant"
+ avg_pool_name = "avg_pool"
+ float_graph_def = tf.GraphDef()
+ input_constant = quantize_graph.create_constant_node(input_constant_name,
+ value=[1, 2, 3, 4, 5,
+ 6, 7, 8, 9, 10,
+ 11, 12],
+ dtype=tf.float32,
+ shape=[1, 2, 6, 1])
+ float_graph_def.node.extend([input_constant])
+ avg_pool_node = quantize_graph.create_node("AvgPool", avg_pool_name,
+ [input_constant_name])
+ quantize_graph.set_attr_dtype(avg_pool_node, "T", tf.float32)
+ quantize_graph.set_attr_int_list(avg_pool_node, "ksize", [1, 2, 2, 1])
+ quantize_graph.set_attr_int_list(avg_pool_node, "strides", [1, 1, 1, 1])
+ quantize_graph.set_attr_string(avg_pool_node, "padding", b"SAME")
+ float_graph_def.node.extend([avg_pool_node])
+ test_graph(float_graph_def, {}, [avg_pool_name])
+
+ def test_relu(self):
+ input_constant_name = "input_constant"
+ relu_name = "relu"
+ float_graph_def = tf.GraphDef()
+ input_constant = quantize_graph.create_constant_node(input_constant_name,
+ value=[1, 2, 3, 4, 5,
+ 6, 7, 8, 9, 10,
+ 11, 12],
+ dtype=tf.float32,
+ shape=[1, 2, 6, 1])
+ float_graph_def.node.extend([input_constant])
+ relu_node = quantize_graph.create_node("Relu", relu_name,
+ [input_constant_name])
+ quantize_graph.set_attr_dtype(relu_node, "T", tf.float32)
+ float_graph_def.node.extend([relu_node])
+ test_graph(float_graph_def, {}, [relu_name])
+
+  def test_relu6(self):
+    input_constant_name = "input_constant"
+    relu6_name = "relu6"
+    float_graph_def = tf.GraphDef()
+    input_constant = quantize_graph.create_constant_node(
+        input_constant_name,
+        value=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
+        dtype=tf.float32,
+        shape=[1, 2, 6, 1])
+    float_graph_def.node.extend([input_constant])
+    relu6_node = quantize_graph.create_node(
+        "Relu6", relu6_name, [input_constant_name])
+    quantize_graph.set_attr_dtype(relu6_node, "T", tf.float32)
+    float_graph_def.node.extend([relu6_node])
+    test_graph(float_graph_def, {}, [relu6_name])
+
+  def test_bias_add(self):
+    input_constant_name = "input_constant"
+    offset_constant_name = "offset_constant"
+    bias_add_name = "bias_add"
+    float_graph_def = tf.GraphDef()
+    input_constant = quantize_graph.create_constant_node(
+        input_constant_name,
+        value=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
+        dtype=tf.float32,
+        shape=[1, 1, 2, 6])
+    float_graph_def.node.extend([input_constant])
+    offset_constant = quantize_graph.create_constant_node(
+        offset_constant_name,
+        value=[1, 2, 3, 4, 5, 6],
+        dtype=tf.float32,
+        shape=[6])
+    float_graph_def.node.extend([offset_constant])
+    bias_add_node = quantize_graph.create_node(
+        "BiasAdd", bias_add_name, [input_constant_name, offset_constant_name])
+    quantize_graph.set_attr_dtype(bias_add_node, "T", tf.float32)
+    float_graph_def.node.extend([bias_add_node])
+    test_graph(float_graph_def, {}, [bias_add_name])
+
+  def test_remove_redundant_quantization(self):
+    a_constant_name = "a_constant"
+    a_constant_min_name = "a_constant_min"
+    a_constant_max_name = "a_constant_max"
+    a_dequantize_name = "a_dequantize"
+    a_quantize_name = "a_quantize"
+    b_constant_name = "b_constant"
+    b_constant_min_name = "b_constant_min"
+    b_constant_max_name = "b_constant_max"
+    b_dequantize_name = "b_dequantize"
+    b_quantize_name = "b_quantize"
+    mat_mul_name = "mat_mul"
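+    # Input graph: each quint8 constant is dequantized and immediately
+    # re-quantized before feeding the QuantizedMatMul, so the
+    # Dequantize/QuantizeV2 pairs are redundant.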
+    graph_def = tf.GraphDef()
+    a_constant = quantize_graph.create_constant_node(
+        a_constant_name, value=(0,), dtype=tf.quint8, shape=[])
+    graph_def.node.extend([a_constant])
+    a_constant_min = quantize_graph.create_constant_node(
+        a_constant_min_name, value=2, dtype=tf.float32, shape=[])
+    graph_def.node.extend([a_constant_min])
+    a_constant_max = quantize_graph.create_constant_node(
+        a_constant_max_name, value=2, dtype=tf.float32, shape=[])
+    graph_def.node.extend([a_constant_max])
+    a_dequantize_node = quantize_graph.create_node(
+        "Dequantize", a_dequantize_name,
+        [a_constant_name, a_constant_min_name, a_constant_max_name])
+    quantize_graph.set_attr_dtype(a_dequantize_node, "T", tf.uint8)
+    graph_def.node.extend([a_dequantize_node])
+    a_quantize_node = quantize_graph.create_node(
+        "QuantizeV2", a_quantize_name,
+        [a_dequantize_name, a_dequantize_name + ":1",
+         a_dequantize_name + ":2"])
+    quantize_graph.set_attr_dtype(a_quantize_node, "T", tf.uint8)
+    graph_def.node.extend([a_quantize_node])
+    b_constant = quantize_graph.create_constant_node(
+        b_constant_name, value=(0,), dtype=tf.quint8, shape=[])
+    graph_def.node.extend([b_constant])
+    b_constant_min = quantize_graph.create_constant_node(
+        b_constant_min_name, value=3, dtype=tf.float32, shape=[])
+    graph_def.node.extend([b_constant_min])
+    b_constant_max = quantize_graph.create_constant_node(
+        b_constant_max_name, value=3, dtype=tf.float32, shape=[])
+    graph_def.node.extend([b_constant_max])
+    b_dequantize_node = quantize_graph.create_node(
+        "Dequantize", b_dequantize_name,
+        [b_constant_name, b_constant_min_name, b_constant_max_name])
+    quantize_graph.set_attr_dtype(b_dequantize_node, "T", tf.uint8)
+    graph_def.node.extend([b_dequantize_node])
+    b_quantize_node = quantize_graph.create_node(
+        "QuantizeV2", b_quantize_name,
+        [b_dequantize_name, b_dequantize_name + ":1",
+         b_dequantize_name + ":2"])
+    quantize_graph.set_attr_dtype(b_quantize_node, "T", tf.uint8)
+    graph_def.node.extend([b_quantize_node])
+    mat_mul_node = quantize_graph.create_node(
+        "QuantizedMatMul", mat_mul_name,
+        [a_quantize_name, b_quantize_name,
+         a_quantize_name + ":1", a_quantize_name + ":2",
+         b_quantize_name + ":1", b_quantize_name + ":2"])
+    quantize_graph.set_attr_dtype(mat_mul_node, "T1", tf.uint8)
+    quantize_graph.set_attr_dtype(mat_mul_node, "T2", tf.int32)
+    graph_def.node.extend([mat_mul_node])
+
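+    # Expected graph: the redundant Dequantize/QuantizeV2 pairs are gone and
+    # the QuantizedMatMul reads the constants and their min/max range nodes
+    # directly.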
+    expected_output = tf.GraphDef()
+    a_constant = quantize_graph.create_constant_node(
+        a_constant_name, value=(0,), dtype=tf.quint8, shape=[])
+    expected_output.node.extend([a_constant])
+    a_constant_min = quantize_graph.create_constant_node(
+        a_constant_min_name, value=2, dtype=tf.float32, shape=[])
+    expected_output.node.extend([a_constant_min])
+    a_constant_max = quantize_graph.create_constant_node(
+        a_constant_max_name, value=2, dtype=tf.float32, shape=[])
+    expected_output.node.extend([a_constant_max])
+    b_constant = quantize_graph.create_constant_node(
+        b_constant_name, value=(0,), dtype=tf.quint8, shape=[])
+    expected_output.node.extend([b_constant])
+    b_constant_min = quantize_graph.create_constant_node(
+        b_constant_min_name, value=3, dtype=tf.float32, shape=[])
+    expected_output.node.extend([b_constant_min])
+    b_constant_max = quantize_graph.create_constant_node(
+        b_constant_max_name, value=3, dtype=tf.float32, shape=[])
+    expected_output.node.extend([b_constant_max])
+    mat_mul_node = quantize_graph.create_node(
+        "QuantizedMatMul", mat_mul_name,
+        [a_constant_name, b_constant_name, a_constant_min_name,
+         a_constant_max_name, b_constant_min_name, b_constant_max_name])
+    quantize_graph.set_attr_dtype(mat_mul_node, "T1", tf.uint8)
+    quantize_graph.set_attr_dtype(mat_mul_node, "T2", tf.int32)
+    expected_output.node.extend([mat_mul_node])
+
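+    # Run the redundancy-removal pass and compare against the expected graph,
+    # restricted to the nodes reachable from the MatMul output.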
+    rewriter = quantize_graph.GraphRewriter(graph_def, [mat_mul_name])
+    output = rewriter.remove_redundant_quantization(graph_def)
+    stripped_output = graph_util.extract_sub_graph(output, [mat_mul_name])
+    self.assertProtoEquals(expected_output, stripped_output)
+
+
+if __name__ == "__main__":
+  tf.test.main()