diff options
122 files changed, 4102 insertions, 1655 deletions
@@ -48,9 +48,9 @@ GPU packages on all platforms will arrive soon! * Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/)) * Linux GPU: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/42/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 
3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/)) * Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/)) -* Windows CPU-only: [Python 3.5 64-bit](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp35-cp35m-win_amd64.whl) ([build 
history](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/)) / [Python 3.6 64-bit](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp36-cp36m-win_amd64.whl) ([build history](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/)) -* Windows GPU: Coming soon! -* Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/) +* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp36-cp36m-win_amd64.whl) ([build history](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/)) +* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly_gpu-1.head-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly_gpu-1.head-cp36-cp36m-win_amd64.whl) ([build history](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=36/)) +* Android: [demo 
APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/)) #### *Try your first TensorFlow program* diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc index 1b5dd558dd..27c5da08c1 100644 --- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc +++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc @@ -52,6 +52,11 @@ class XlaAllocator : public xla::DeviceMemoryAllocator { bool retry_on_failure) override; Status Deallocate(int device_ordinal, gpu::DeviceMemoryBase* mem) override; + // Register a Tensor (input or resource variable) with the allocator. If + // the operation returns an alias to one of its inputs, then the allocator + // needs to be able to handle it. + Status RegisterArgument(const Tensor* t); + // Makes 'tensor' a wrapper around the data buffer at 'ptr'. The buffer is + // interpreted as having data type 'dtype' and shape 'shape'. 
Status MakeTensorFromBuffer(gpu::DeviceMemoryBase buffer, DataType dtype, @@ -103,6 +108,14 @@ xla::StatusOr<gpu::DeviceMemoryBase> XlaAllocator::Allocate( return gpu::DeviceMemoryBase(data, size); } +Status XlaAllocator::RegisterArgument(const Tensor* t) { + void* data = + reinterpret_cast<void*>(const_cast<char*>(t->tensor_data().data())); + TF_RET_CHECK(data != nullptr); + tensors_[data] = *t; + return Status::OK(); +} + Status XlaAllocator::Deallocate(int device_ordinal, gpu::DeviceMemoryBase* mem) { if (mem->opaque() != nullptr) { @@ -284,6 +297,8 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { shape, client->platform(), client->default_device_ordinal(), dmem) .ConsumeValueOrDie(); arg_ptrs[i] = arg_buffers[i].get(); + + OP_REQUIRES_OK(ctx, xla_allocator.RegisterArgument(t)); } // Make the final parameter point at local_runtime_context. diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc index 89145a9038..7dd242425c 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc @@ -256,9 +256,9 @@ tensorflow::Status ConvolutionThunk::Convolve( algorithm_config.algorithm_no_scratch().algo_id()); } -std::vector<AlgorithmDesc::Index> ConvolutionThunk::GetAlgorithms( +std::vector<AlgorithmDesc> ConvolutionThunk::GetAlgorithms( se::StreamExecutor* stream_exec) const { - std::vector<AlgorithmDesc::Index> algorithms; + std::vector<AlgorithmDesc> algorithms; // TODO(yangzihao): Currently disable the use of winograd nonfused in XLA // by default. Should send in conv parameters and enable it when // ShouldIncludeWinogradNonfusedAlgo() returns true. 
@@ -297,32 +297,27 @@ tensorflow::Status ConvolutionThunk::ConvolveWithTune( se::dnn::ProfileResult best_result; se::dnn::ProfileResult best_result_without_scratch; - std::vector<AlgorithmDesc::Index> algorithms = - GetAlgorithms(stream->parent()); - for (bool use_tensor_ops : {false, true}) { - for (auto algo_index : algorithms) { - AlgorithmDesc algorithm(algo_index, use_tensor_ops); - ConvolveScratchAllocator scratch_allocator( - buffer_allocations.device_ordinal(), - buffer_allocations.memory_allocator()); - se::dnn::ProfileResult profile_result; - bool launch_ok = - Convolve(input_descriptor, input_data, filter_descriptor, - filter_data, output_descriptor, output_data, - convolution_descriptor, - se::dnn::AlgorithmConfig(algorithm, algorithm), stream, - &scratch_allocator, &profile_result) - .ok(); - if (launch_ok && profile_result.is_valid()) { - if (profile_result.elapsed_time_in_ms() < - best_result.elapsed_time_in_ms()) { - best_result = profile_result; - } - if (scratch_allocator.TotalAllocatedBytes() == 0 && - profile_result.elapsed_time_in_ms() < - best_result_without_scratch.elapsed_time_in_ms()) { - best_result_without_scratch = profile_result; - } + std::vector<AlgorithmDesc> algorithms = GetAlgorithms(stream->parent()); + for (auto algorithm : algorithms) { + ConvolveScratchAllocator scratch_allocator( + buffer_allocations.device_ordinal(), + buffer_allocations.memory_allocator()); + se::dnn::ProfileResult profile_result; + bool launch_ok = + Convolve(input_descriptor, input_data, filter_descriptor, filter_data, + output_descriptor, output_data, convolution_descriptor, + se::dnn::AlgorithmConfig(algorithm, algorithm), stream, + &scratch_allocator, &profile_result) + .ok(); + if (launch_ok && profile_result.is_valid()) { + if (profile_result.elapsed_time_in_ms() < + best_result.elapsed_time_in_ms()) { + best_result = profile_result; + } + if (scratch_allocator.TotalAllocatedBytes() == 0 && + profile_result.elapsed_time_in_ms() < + 
best_result_without_scratch.elapsed_time_in_ms()) { + best_result_without_scratch = profile_result; } } } diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h index 509719c1fe..13432301b2 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h @@ -115,9 +115,7 @@ class ConvolutionThunk : public Thunk { perftools::gputools::dnn::ProfileResult* profile_result); // Returns the convolve algorithms that can be used for this ConvolutionThunk. - // TODO(nluehr) GetAlgorithms should return AlgorithmDesc including both - // tensor-op and non-tensor-op variants. - std::vector<perftools::gputools::dnn::AlgorithmDesc::Index> GetAlgorithms( + std::vector<perftools::gputools::dnn::AlgorithmDesc> GetAlgorithms( perftools::gputools::StreamExecutor* stream_exec) const; // Fastest cuDNN convolution algorithm for this thunk learned from diff --git a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java index 395dd6c5d2..80e03f2036 100644 --- a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java +++ b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java @@ -31,12 +31,13 @@ import java.nio.IntBuffer; import java.nio.LongBuffer; import java.util.ArrayList; import java.util.List; -import org.tensorflow.DataType; import org.tensorflow.Graph; import org.tensorflow.Operation; import org.tensorflow.Session; import org.tensorflow.Tensor; import org.tensorflow.TensorFlow; +import org.tensorflow.Tensors; +import org.tensorflow.types.UInt8; /** * Wrapper over the TensorFlow API ({@link Graph}, {@link Session}) providing a smaller API surface @@ -328,7 +329,7 @@ public class TensorFlowInferenceInterface { * destination has 
capacity, the copy is truncated. */ public void feed(String inputName, byte[] src, long... dims) { - addFeed(inputName, Tensor.create(DataType.UINT8, dims, ByteBuffer.wrap(src))); + addFeed(inputName, Tensor.create(UInt8.class, dims, ByteBuffer.wrap(src))); } /** @@ -337,7 +338,7 @@ public class TensorFlowInferenceInterface { * a Java {@code String} (which is a sequence of characters). */ public void feedString(String inputName, byte[] src) { - addFeed(inputName, Tensor.create(src)); + addFeed(inputName, Tensors.create(src)); } /** @@ -346,7 +347,7 @@ public class TensorFlowInferenceInterface { * arbitrary sequence of bytes, not a Java {@code String} (which is a sequence of characters). */ public void feedString(String inputName, byte[][] src) { - addFeed(inputName, Tensor.create(src)); + addFeed(inputName, Tensors.create(src)); } // Methods for taking a native Tensor and filling it with src from Java native IO buffers. @@ -403,7 +404,7 @@ public class TensorFlowInferenceInterface { * destination has capacity, the copy is truncated. */ public void feed(String inputName, ByteBuffer src, long... dims) { - addFeed(inputName, Tensor.create(DataType.UINT8, dims, src)); + addFeed(inputName, Tensor.create(UInt8.class, dims, src)); } /** @@ -544,7 +545,7 @@ public class TensorFlowInferenceInterface { "Model load took " + (endMs - startMs) + "ms, TensorFlow version: " + TensorFlow.version()); } - private void addFeed(String inputName, Tensor t) { + private void addFeed(String inputName, Tensor<?> t) { // The string format accepted by TensorFlowInferenceInterface is node_name[:output_index]. 
TensorId tid = TensorId.parse(inputName); runner.feed(tid.name, tid.outputIndex, t); @@ -578,7 +579,7 @@ public class TensorFlowInferenceInterface { } } - private Tensor getTensor(String outputName) { + private Tensor<?> getTensor(String outputName) { int i = 0; for (String n : fetchNames) { if (n.equals(outputName)) { @@ -591,7 +592,7 @@ public class TensorFlowInferenceInterface { } private void closeFeeds() { - for (Tensor t : feedTensors) { + for (Tensor<?> t : feedTensors) { t.close(); } feedTensors.clear(); @@ -599,7 +600,7 @@ public class TensorFlowInferenceInterface { } private void closeFetches() { - for (Tensor t : fetchTensors) { + for (Tensor<?> t : fetchTensors) { t.close(); } fetchTensors.clear(); @@ -614,9 +615,9 @@ public class TensorFlowInferenceInterface { // State reset on every call to run. private Session.Runner runner; private List<String> feedNames = new ArrayList<String>(); - private List<Tensor> feedTensors = new ArrayList<Tensor>(); + private List<Tensor<?>> feedTensors = new ArrayList<Tensor<?>>(); private List<String> fetchNames = new ArrayList<String>(); - private List<Tensor> fetchTensors = new ArrayList<Tensor>(); + private List<Tensor<?>> fetchTensors = new ArrayList<Tensor<?>>(); // Mutable state. 
private RunStats runStats; diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h index dad3b4e10d..c329c6d4f7 100644 --- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h +++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h @@ -36,7 +36,7 @@ class WeightedQuantilesSummary { struct SummaryEntry { SummaryEntry(const ValueType& v, const WeightType& w, const WeightType& min, const WeightType& max) { - // Explicitely initialize all of memory (including padding from memory + // Explicitly initialize all of memory (including padding from memory // alignment) to allow the struct to be msan-resistant "plain old data". // // POD = http://en.cppreference.com/w/cpp/concept/PODType diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py index 813c64d141..91f100e0f0 100644 --- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py @@ -253,6 +253,46 @@ class BatchDatasetTest(test.TestCase): with self.assertRaises(errors.OutOfRangeError): sess.run(get_next) + def testDenseToSparseBatchDatasetWithUnknownShape(self): + components = np.random.randint(5, size=(40,)).astype(np.int32) + iterator = (dataset_ops.Dataset.from_tensor_slices(components) + .map(lambda x: array_ops.fill([x, x], x)).dense_to_sparse_batch( + 4, [5, -1]).make_initializable_iterator()) + init_op = iterator.initializer + get_next = sparse_tensor.SparseTensor(*iterator.get_next()) + + with self.test_session() as sess: + sess.run(init_op) + + for start in range(0, len(components), 4): + results = sess.run(get_next) + self.assertAllEqual( + [[i, j, z] for i, c in enumerate(components[start:start+4]) + for j in range(c) for z in range(c)], results.indices) + 
self.assertAllEqual( + [c for c in components[start:start+4] + for _ in range(c) for _ in range(c)], + results.values) + self.assertAllEqual( + [min(4, len(components) - start), + 5, + np.max(components[start:start+4])], + results.dense_shape) + + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + + def testDenseToSparseBatchDatasetWithInvalidShape(self): + input_tensor = array_ops.constant([[1]]) + iterator = (dataset_ops.Dataset.from_tensors(input_tensor) + .dense_to_sparse_batch(4, [-2]).make_initializable_iterator()) + init_op = iterator.initializer + + with self.test_session() as sess: + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Dimension -2 must be >= -1"): + sess.run(init_op) + def testDenseToSparseBatchDatasetShapeErrors(self): input_tensor = array_ops.placeholder(dtypes.int32) iterator = (dataset_ops.Dataset.from_tensors(input_tensor).apply( diff --git a/tensorflow/contrib/data/python/ops/dataset_ops.py b/tensorflow/contrib/data/python/ops/dataset_ops.py index ff89c47a2e..b74dcd3be2 100644 --- a/tensorflow/contrib/data/python/ops/dataset_ops.py +++ b/tensorflow/contrib/data/python/ops/dataset_ops.py @@ -653,7 +653,7 @@ class Dataset(dataset_ops.Dataset): ```python # Preprocess 4 files concurrently, and interleave blocks of 16 records from # each file. - filenames = ["/var/data/file1.txt", "/var/data/file2.txt", ..."] + filenames = ["/var/data/file1.txt", "/var/data/file2.txt", ...] 
dataset = (Dataset.from_tensor_slices(filenames) .interleave(lambda x: TextLineDataset(x).map(parse_fn, num_parallel_calls=1), diff --git a/tensorflow/contrib/deprecated/__init__.py b/tensorflow/contrib/deprecated/__init__.py index bfea8445a7..7aff045de3 100644 --- a/tensorflow/contrib/deprecated/__init__.py +++ b/tensorflow/contrib/deprecated/__init__.py @@ -91,7 +91,7 @@ from __future__ import division from __future__ import print_function -# pylint: disable=unused-import,line-too-long +# pylint: disable=unused-import from tensorflow.python.ops.logging_ops import audio_summary from tensorflow.python.ops.logging_ops import histogram_summary from tensorflow.python.ops.logging_ops import image_summary diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc index 888f5c38a2..b417a70b6e 100644 --- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc +++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc @@ -208,7 +208,15 @@ string GetTempFilename(const string& extension) { } struct stat statbuf; if (!stat(dir, &statbuf) && S_ISDIR(statbuf.st_mode)) { - return io::JoinPath(dir, StrCat("tmp_file_", getpid(), ".", extension)); + string tmp_filepath = + io::JoinPath(dir, StrCat("tmp_file_XXXXXX", ".", extension)); + int fd = mkstemps(&tmp_filepath[0], extension.length() + 1); + if (fd < 0) { + LOG(FATAL) << "Failed to create temp file."; + } else { + close(fd); + return tmp_filepath; + } } } LOG(FATAL) << "No temp directory found."; diff --git a/tensorflow/contrib/framework/python/framework/tensor_util.py b/tensorflow/contrib/framework/python/framework/tensor_util.py index e595e4d90b..92a2a4ff2d 100644 --- a/tensorflow/contrib/framework/python/framework/tensor_util.py +++ b/tensorflow/contrib/framework/python/framework/tensor_util.py @@ -78,9 +78,9 @@ def reduce_sum_n(tensors, name=None): return math_ops.add_n(tensors, name=name_scope) @deprecated(None, - "Please switch to tf.confusion_matrix.remove_squeezable_dimensions. 
Note " - "that order of the inputs and ouputs of labels and predictions have also " - "been switched.") + 'Please switch to tf.confusion_matrix.remove_squeezable_dimensions.' + 'Note that order of the inputs and outputs of labels and ' + 'predictions have also been switched.') def remove_squeezable_dimensions(predictions, labels, name=None): """Squeeze last dim if ranks of `predictions` and `labels` differ by 1. diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc index 9275d5a22b..256f200868 100644 --- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc +++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc @@ -493,42 +493,37 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>:: dnn::AlgorithmConfig algorithm_config; if (cudnn_use_autotune && !AutoTuneConvBiasActivation::GetInstance()->Find( fused_conv_parameters, &algorithm_config)) { - std::vector<dnn::AlgorithmDesc::Index> algorithms; + std::vector<dnn::AlgorithmDesc> algorithms; CHECK(stream->parent()->GetConvolveAlgorithms( fused_conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms)); dnn::ProfileResult best_result; dnn::ProfileResult best_result_no_scratch; - // TODO(benbarsdell): Ideally this should not attempt using tensor op math - // if it's not enabled. - for (bool use_tensor_ops : {false, true}) { - for (auto algo_index : algorithms) { - // TODO(zhengxq): profile each algorithm multiple times to better - // accuracy. 
- dnn::AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops); - CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx); - dnn::ProfileResult profile_result; - bool cudnn_launch_status = - stream - ->ThenFusedConvolveWithAlgorithm( - conv_input_desc, conv_input_ptr, conv_input_scale, - filter_desc, filter_ptr, conv_desc, side_input_ptr, - side_input_scale, bias_desc, bias_ptr, - dnn::ActivationMode::kRelu, output_desc, &output_ptr, - &scratch_allocator, dnn::AlgorithmConfig(profile_algorithm), - &profile_result) - .ok(); - if (cudnn_launch_status) { - if (profile_result.is_valid()) { - if (profile_result.elapsed_time_in_ms() < - best_result.elapsed_time_in_ms()) { - best_result = profile_result; - } - if (scratch_allocator.TotalByteSize() == 0 && - profile_result.elapsed_time_in_ms() < - best_result_no_scratch.elapsed_time_in_ms()) { - best_result_no_scratch = profile_result; - } + for (auto profile_algorithm : algorithms) { + // TODO(zhengxq): profile each algorithm multiple times to better + // accuracy. 
+ CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx); + dnn::ProfileResult profile_result; + bool cudnn_launch_status = + stream + ->ThenFusedConvolveWithAlgorithm( + conv_input_desc, conv_input_ptr, conv_input_scale, + filter_desc, filter_ptr, conv_desc, side_input_ptr, + side_input_scale, bias_desc, bias_ptr, + dnn::ActivationMode::kRelu, output_desc, &output_ptr, + &scratch_allocator, dnn::AlgorithmConfig(profile_algorithm), + &profile_result) + .ok(); + if (cudnn_launch_status) { + if (profile_result.is_valid()) { + if (profile_result.elapsed_time_in_ms() < + best_result.elapsed_time_in_ms()) { + best_result = profile_result; + } + if (scratch_allocator.TotalByteSize() == 0 && + profile_result.elapsed_time_in_ms() < + best_result_no_scratch.elapsed_time_in_ms()) { + best_result_no_scratch = profile_result; } } } diff --git a/tensorflow/contrib/memory_stats/__init__.py b/tensorflow/contrib/memory_stats/__init__.py index a2b2b65692..a32302c854 100644 --- a/tensorflow/contrib/memory_stats/__init__.py +++ b/tensorflow/contrib/memory_stats/__init__.py @@ -14,10 +14,12 @@ # ============================================================================== """Ops for memory statistics. +@@BytesInUse @@BytesLimit @@MaxBytesInUse """ +from tensorflow.contrib.memory_stats.python.ops.memory_stats_ops import BytesInUse from tensorflow.contrib.memory_stats.python.ops.memory_stats_ops import BytesLimit from tensorflow.contrib.memory_stats.python.ops.memory_stats_ops import MaxBytesInUse diff --git a/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc b/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc index 3b88535dce..7e2e96e160 100644 --- a/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc +++ b/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc @@ -40,6 +40,28 @@ class MemoryStatsOp : public OpKernel { const AllocatorStats& allocator_stats) const = 0; }; +// Op that measures current memory in bytes. 
+class BytesInUseOp : public MemoryStatsOp { + public: + explicit BytesInUseOp(OpKernelConstruction* context) + : MemoryStatsOp(context) {} + + private: + int64 ExtractAllocatorStats( + const AllocatorStats& allocator_stats) const override { + return allocator_stats.bytes_in_use; + } +}; + +// Register this op on GPU only, see comment for MaxBytesInUse for reason +REGISTER_KERNEL_BUILDER(Name("BytesInUse").Device(DEVICE_GPU).HostMemory("out"), + BytesInUseOp); + +#ifdef TENSORFLOW_USE_SYCL +REGISTER_KERNEL_BUILDER( + Name("BytesInUse").Device(DEVICE_SYCL).HostMemory("out"), BytesInUseOp); +#endif  // TENSORFLOW_USE_SYCL + // Op that measures the total memory (in bytes) of a device. class BytesLimitOp : public MemoryStatsOp { public: diff --git a/tensorflow/contrib/memory_stats/ops/memory_stats_ops.cc b/tensorflow/contrib/memory_stats/ops/memory_stats_ops.cc index 08859c8613..42020cf7f6 100644 --- a/tensorflow/contrib/memory_stats/ops/memory_stats_ops.cc +++ b/tensorflow/contrib/memory_stats/ops/memory_stats_ops.cc @@ -17,6 +17,10 @@ limitations under the License. 
namespace tensorflow { +REGISTER_OP("BytesInUse") + .Output("out: int64") + .SetIsStateful() + .SetShapeFn(shape_inference::ScalarShape); REGISTER_OP("BytesLimit") .Output("out: int64") .SetIsStateful() diff --git a/tensorflow/contrib/memory_stats/python/kernel_tests/memory_stats_ops_test.py b/tensorflow/contrib/memory_stats/python/kernel_tests/memory_stats_ops_test.py index ec25c032f0..d1b430b803 100644 --- a/tensorflow/contrib/memory_stats/python/kernel_tests/memory_stats_ops_test.py +++ b/tensorflow/contrib/memory_stats/python/kernel_tests/memory_stats_ops_test.py @@ -20,6 +20,7 @@ from __future__ import print_function from tensorflow.contrib.memory_stats.python.ops import memory_stats_ops from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import test_util from tensorflow.python.ops import math_ops @@ -64,10 +65,29 @@ class MemoryStatsOpsTest(test_util.TensorFlowTestCase): d = math_ops.matmul(c, b) sess.run(d) - max_bytes_in_use = sess.run(memory_stats_ops.MaxBytesInUse()) + max_bytes_in_use_op = memory_stats_ops.MaxBytesInUse() + max_bytes_in_use = sess.run(max_bytes_in_use_op) self.assertGreaterEqual(max_bytes_in_use, matrix_size_in_bytes * 3) self.assertLess(max_bytes_in_use, matrix_size_in_bytes * 4) + # run chain with 2 ops, make sure BytesInUse captures intermediate + # memory usage + a = random_ops.random_uniform(matrix_shape, dtype=dtype) + with ops.control_dependencies([a]): + bytes_in_use_op = memory_stats_ops.BytesInUse() + with ops.control_dependencies([bytes_in_use_op]): + b = random_ops.random_uniform(matrix_shape, dtype=dtype) + + _, bytes_in_use, max_bytes_in_use = sess.run([a, bytes_in_use_op, + max_bytes_in_use_op]) + + # intermediate result allocates 1 matrix, max usage is at least 2 + self.assertGreaterEqual(bytes_in_use, matrix_size_in_bytes * 1) + self.assertLess(bytes_in_use, matrix_size_in_bytes * 2) + + # max 
usage is still 3 because it reflects maximum from previous .run call + self.assertGreaterEqual(max_bytes_in_use, matrix_size_in_bytes * 3) + if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/memory_stats/python/ops/memory_stats_ops.py b/tensorflow/contrib/memory_stats/python/ops/memory_stats_ops.py index d35c6583ed..c0f7788c1c 100644 --- a/tensorflow/contrib/memory_stats/python/ops/memory_stats_ops.py +++ b/tensorflow/contrib/memory_stats/python/ops/memory_stats_ops.py @@ -26,6 +26,11 @@ _memory_stats_ops_so = loader.load_op_library( resource_loader.get_path_to_datafile("_memory_stats_ops.so")) +def BytesInUse(): + """Generates an op that computes the current memory of a device.""" + return gen_memory_stats_ops.bytes_in_use() + + def BytesLimit(): """Generates an op that measures the total memory (in bytes) of a device.""" return gen_memory_stats_ops.bytes_limit() diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops.cc b/tensorflow/contrib/resampler/kernels/resampler_ops.cc index afc8bcd446..7d9ef14cef 100644 --- a/tensorflow/contrib/resampler/kernels/resampler_ops.cc +++ b/tensorflow/contrib/resampler/kernels/resampler_ops.cc @@ -122,7 +122,7 @@ struct Resampler2DFunctor<CPUDevice, T>{ }; // Rough estimate of work for each batch entry. // From third_party/tensorflow/core/util/work_sharder.cc we gather that an - // estimate of the cost of each work unit is needed to correclty shard the + // estimate of the cost of each work unit is needed to correctly shard the // workload. Shard assumes each cost unit is 1ns, minimum cost per shard // being 10us. 
const int64 cost = static_cast<int64>(num_sampling_points) * diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py index 1b0327d62b..6702a89d22 100644 --- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py +++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py @@ -525,7 +525,7 @@ class GridLSTMCell(rnn_cell_impl.RNNCell): self._state_tuple_type = collections.namedtuple( "GridLSTMStateTuple", state_names.strip(",")) self._state_size = self._state_tuple_type( - *([num_units, num_units] * self._total_blocks)) + *([num_units, num_units] * self._total_blocks)) else: self._state_tuple_type = None self._state_size = num_units * self._total_blocks * 2 @@ -2082,9 +2082,11 @@ def _conv(args, shape_length = len(shapes[0]) for shape in shapes: if len(shape) not in [3,4,5]: - raise ValueError("Conv Linear expects 3D, 4D or 5D arguments: %s" % str(shapes)) + raise ValueError("Conv Linear expects 3D, 4D " + "or 5D arguments: %s" % str(shapes)) if len(shape) != len(shapes[0]): - raise ValueError("Conv Linear expects all args to be of same Dimensiton: %s" % str(shapes)) + raise ValueError("Conv Linear expects all args " + "to be of same Dimension: %s" % str(shapes)) else: total_arg_size_depth += shape[-1] dtype = [a.dtype for a in args][0] @@ -2102,7 +2104,7 @@ def _conv(args, # Now the computation. 
kernel = vs.get_variable( - "kernel", + "kernel", filter_size + [total_arg_size_depth, num_features], dtype=dtype) if len(args) == 1: diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py index 64e00c21c7..b55d90cbab 100644 --- a/tensorflow/contrib/seq2seq/python/ops/helper.py +++ b/tensorflow/contrib/seq2seq/python/ops/helper.py @@ -309,7 +309,7 @@ class ScheduledEmbeddingTrainingHelper(TrainingHelper): gen_array_ops.fill([self.batch_size], -1)) def next_inputs(self, time, outputs, state, sample_ids, name=None): - with ops.name_scope(name, "ScheduledEmbeddingTrainingHelperSample", + with ops.name_scope(name, "ScheduledEmbeddingTrainingHelperNextInputs", [time, outputs, state, sample_ids]): (finished, base_next_inputs, state) = ( super(ScheduledEmbeddingTrainingHelper, self).next_inputs( diff --git a/tensorflow/contrib/signal/BUILD b/tensorflow/contrib/signal/BUILD index 43f24474ed..2204b684ac 100644 --- a/tensorflow/contrib/signal/BUILD +++ b/tensorflow/contrib/signal/BUILD @@ -5,6 +5,7 @@ licenses(["notice"]) # Apache 2.0 exports_files(["LICENSE"]) load("//tensorflow:tensorflow.bzl", "cuda_py_tests") +load("//tensorflow:tensorflow.bzl", "py_test") # @unused py_library( name = "signal_py", diff --git a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py index f9449095be..094568389c 100644 --- a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py +++ b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py @@ -135,7 +135,10 @@ class BoundingBox(ItemHandler): """ sides = [] for key in self._full_keys: - side = array_ops.expand_dims(keys_to_tensors[key].values, 0) + side = keys_to_tensors[key] + if isinstance(side, sparse_tensor.SparseTensor): + side = side.values + side = array_ops.expand_dims(side, 0) sides.append(side) bounding_box = array_ops.concat(sides, 0) diff --git 
a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py index 96606b9c0e..60d1eba07f 100644 --- a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py +++ b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py @@ -692,7 +692,7 @@ class TFExampleDecoderTest(test.TestCase): else: self.assertAllClose(image, decoded_image, atol=0) - def testDecodeExampleWithBoundingBox(self): + def testDecodeExampleWithBoundingBoxSparse(self): num_bboxes = 10 np_ymin = np.random.rand(num_bboxes, 1) np_xmin = np.random.rand(num_bboxes, 1) @@ -731,6 +731,49 @@ class TFExampleDecoderTest(test.TestCase): self.assertAllClose(np_bboxes, bboxes) + def testDecodeExampleWithBoundingBoxDense(self): + num_bboxes = 10 + np_ymin = np.random.rand(num_bboxes, 1) + np_xmin = np.random.rand(num_bboxes, 1) + np_ymax = np.random.rand(num_bboxes, 1) + np_xmax = np.random.rand(num_bboxes, 1) + np_bboxes = np.hstack([np_ymin, np_xmin, np_ymax, np_xmax]) + + example = example_pb2.Example(features=feature_pb2.Features(feature={ + 'image/object/bbox/ymin': self._EncodedFloatFeature(np_ymin), + 'image/object/bbox/xmin': self._EncodedFloatFeature(np_xmin), + 'image/object/bbox/ymax': self._EncodedFloatFeature(np_ymax), + 'image/object/bbox/xmax': self._EncodedFloatFeature(np_xmax), + })) + serialized_example = example.SerializeToString() + + with self.test_session(): + serialized_example = array_ops.reshape(serialized_example, shape=[]) + + keys_to_features = { + 'image/object/bbox/ymin': parsing_ops.FixedLenSequenceFeature( + [], dtypes.float32, allow_missing=True), + 'image/object/bbox/xmin': parsing_ops.FixedLenSequenceFeature( + [], dtypes.float32, allow_missing=True), + 'image/object/bbox/ymax': parsing_ops.FixedLenSequenceFeature( + [], dtypes.float32, allow_missing=True), + 'image/object/bbox/xmax': parsing_ops.FixedLenSequenceFeature( + [], dtypes.float32, allow_missing=True), + } + + 
items_to_handlers = { + 'object/bbox': + tfexample_decoder.BoundingBox(['ymin', 'xmin', 'ymax', 'xmax'], + 'image/object/bbox/'), + } + + decoder = tfexample_decoder.TFExampleDecoder(keys_to_features, + items_to_handlers) + [tf_bboxes] = decoder.decode(serialized_example, ['object/bbox']) + bboxes = tf_bboxes.eval() + + self.assertAllClose(np_bboxes, bboxes) + def testDecodeExampleWithRepeatedImages(self): image_shape = (2, 3, 3) image_format = 'png' diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD index 2c4bed5db1..da583a2ba0 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/BUILD +++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD @@ -42,6 +42,7 @@ py_library( srcs_version = "PY2AND3", deps = [ ":feature_keys", + ":head", ":input_pipeline", ":model_utils", "//tensorflow/python:util", @@ -78,8 +79,8 @@ py_library( deps = [ ":ar_model", ":feature_keys", + ":head", ":math_utils", - ":model_utils", ":state_management", "//tensorflow/contrib/timeseries/python/timeseries/state_space_models:filtering_postprocessor", "//tensorflow/contrib/timeseries/python/timeseries/state_space_models:state_space_model", @@ -123,9 +124,9 @@ py_test( ) py_library( - name = "model_utils", + name = "head", srcs = [ - "model_utils.py", + "head.py", ], srcs_version = "PY2AND3", deps = [ @@ -149,9 +150,9 @@ py_library( ) py_test( - name = "model_utils_test", + name = "head_test", srcs = [ - "model_utils_test.py", + "head_test.py", ], srcs_version = "PY2AND3", tags = [ @@ -159,8 +160,8 @@ py_test( ], deps = [ ":feature_keys", + ":head", ":model", - ":model_utils", ":state_management", "//tensorflow/python:array_ops", "//tensorflow/python:client_testlib", @@ -175,6 +176,41 @@ py_test( ) py_library( + name = "model_utils", + srcs = [ + "model_utils.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":feature_keys", + "//tensorflow/contrib/framework:framework_py", + "//tensorflow/python:dtypes", + 
"//tensorflow/python:framework_ops", + "//tensorflow/python:init_ops", + "//tensorflow/python:nn_ops", + "//tensorflow/python:variable_scope", + "//third_party/py/numpy", + ], +) + +py_test( + name = "model_utils_test", + srcs = [ + "model_utils_test.py", + ], + srcs_version = "PY2AND3", + tags = [ + "no_pip_gpu", # b/63391119 + ], + deps = [ + ":model_utils", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:variables", + ], +) + +py_library( name = "state_management", srcs = [ "state_management.py", diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py index 267a5f88da..ff140efd48 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py +++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py @@ -374,7 +374,7 @@ class ARModel(model.TimeSeriesModel): original_values = values # Extra shape checking for the window size (above that in - # model_utils.make_model_fn). + # `head.create_estimator_spec`). 
expected_times_shape = [None, self.window_size] if not times.get_shape().is_compatible_with(expected_times_shape): raise ValueError( diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators.py b/tensorflow/contrib/timeseries/python/timeseries/estimators.py index 4025a8f014..3738dfa154 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/estimators.py +++ b/tensorflow/contrib/timeseries/python/timeseries/estimators.py @@ -20,8 +20,8 @@ from __future__ import print_function from tensorflow.contrib.timeseries.python.timeseries import ar_model from tensorflow.contrib.timeseries.python.timeseries import feature_keys +from tensorflow.contrib.timeseries.python.timeseries import head as ts_head_lib from tensorflow.contrib.timeseries.python.timeseries import math_utils -from tensorflow.contrib.timeseries.python.timeseries import model_utils from tensorflow.contrib.timeseries.python.timeseries import state_management from tensorflow.contrib.timeseries.python.timeseries.state_space_models import state_space_model from tensorflow.contrib.timeseries.python.timeseries.state_space_models import structural_ensemble @@ -59,9 +59,10 @@ class TimeSeriesRegressor(estimator_lib.Estimator): if optimizer is None: optimizer = train.AdamOptimizer(0.02) self._model = model - model_fn = model_utils.make_model_fn( + ts_regression_head = ts_head_lib.time_series_regression_head( model, state_manager, optimizer, input_statistics_generator=input_statistics_generator) + model_fn = ts_regression_head.create_estimator_spec super(TimeSeriesRegressor, self).__init__( model_fn=model_fn, model_dir=model_dir, @@ -132,7 +133,7 @@ class TimeSeriesRegressor(estimator_lib.Estimator): with ops.Graph().as_default(): self._model.initialize_graph() model_start_state = self._model.get_start_state() - for prefixed_state_name, state_tensor in model_utils.state_to_dictionary( + for prefixed_state_name, state_tensor in ts_head_lib.state_to_dictionary( model_start_state).items(): 
state_shape_with_batch = tensor_shape.TensorShape( (default_batch_size,)).concatenate(state_tensor.get_shape()) diff --git a/tensorflow/contrib/timeseries/python/timeseries/head.py b/tensorflow/contrib/timeseries/python/timeseries/head.py new file mode 100644 index 0000000000..5896fc2a20 --- /dev/null +++ b/tensorflow/contrib/timeseries/python/timeseries/head.py @@ -0,0 +1,375 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Timeseries head.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re + +from tensorflow.contrib.framework.python.ops import variables +from tensorflow.contrib.layers.python.layers import optimizers + +from tensorflow.contrib.timeseries.python.timeseries import feature_keys + +from tensorflow.python.estimator import estimator_lib +from tensorflow.python.estimator.canned import head as head_lib +from tensorflow.python.estimator.export import export_lib +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.ops import variable_scope +from tensorflow.python.util import nest + + +def 
time_series_regression_head(model, + state_manager, + optimizer, + input_statistics_generator=None): + """Creates a `_Head` for time series regression. + + Args: + model: A model for time series regression. + state_manager: A state manager. + optimizer: An optimizer. + input_statistics_generator: A input statistics generator. + + Returns: + An instance of `_Head` for time series regression. + """ + return _TimeSeriesRegressionHead(model, state_manager, optimizer, + input_statistics_generator) + + +class _TimeSeriesRegressionHead(head_lib._Head): # pylint:disable=protected-access + """See `time_series_regression_head`.""" + + def __init__(self, + model, + state_manager, + optimizer, + input_statistics_generator=None, + name=None): + self.model = model + self.state_manager = state_manager + self.optimizer = optimizer + self.input_statistics_generator = input_statistics_generator + self._name = name + + def _train_ops(self, features): + """Add training ops to the graph.""" + with variable_scope.variable_scope("model"): + model_outputs = self.state_manager.define_loss( + self.model, features, estimator_lib.ModeKeys.TRAIN) + + train_op = optimizers.optimize_loss( + model_outputs.loss, + global_step=variables.get_global_step(), + optimizer=self.optimizer, + # Learning rate is set in the Optimizer object + learning_rate=None) + return estimator_lib.EstimatorSpec( + loss=model_outputs.loss, + mode=estimator_lib.ModeKeys.TRAIN, + train_op=train_op) + + # TODO(terrytangyuan): suffix summary and metrics keys by `"/" + name` + @property + def name(self): + return self._name + + # TODO(terrytangyuan): unused for now. Need to decouple + # `state_manager.define_loss` to satisfy the extendable return signature of + # `_Head.create_loss`. 
+ def create_loss(self, features, mode, logits, labels): + """See `_Head`.""" + return None + + # TODO(terrytangyuan): check label dimension + @property + def logits_dimension(self): + return None + + def _evaluate_ops(self, features): + """Add ops for evaluation (aka filtering) to the graph.""" + with variable_scope.variable_scope("model"): + model_outputs = self.state_manager.define_loss( + self.model, features, estimator_lib.ModeKeys.EVAL) + metrics = {} + # Just output in-sample predictions for the last chunk seen + for prediction_key, prediction_value in model_outputs.predictions.items(): + metrics[prediction_key] = _identity_metric_single(prediction_key, + prediction_value) + metrics[feature_keys.FilteringResults.TIMES] = _identity_metric_single( + feature_keys.FilteringResults.TIMES, model_outputs.prediction_times) + metrics[feature_keys.FilteringResults.STATE_TUPLE] = ( + _identity_metric_nested(feature_keys.FilteringResults.STATE_TUPLE, + model_outputs.end_state)) + return estimator_lib.EstimatorSpec( + loss=model_outputs.loss, + mode=estimator_lib.ModeKeys.EVAL, + eval_metric_ops=metrics, + predictions={}) + + def _predict_ops(self, features): + """Add ops for prediction to the graph.""" + with variable_scope.variable_scope("model"): + prediction = self.model.predict(features=features) + prediction[feature_keys.PredictionResults.TIMES] = features[ + feature_keys.PredictionFeatures.TIMES] + return estimator_lib.EstimatorSpec( + predictions=prediction, mode=estimator_lib.ModeKeys.PREDICT) + + def _serving_ops(self, features): + """Add ops for serving to the graph.""" + with variable_scope.variable_scope("model"): + prediction_outputs = self.model.predict(features=features) + with variable_scope.variable_scope("model", reuse=True): + filtering_outputs = self.state_manager.define_loss( + self.model, features, estimator_lib.ModeKeys.EVAL) + + return estimator_lib.EstimatorSpec( + mode=estimator_lib.ModeKeys.PREDICT, + export_outputs={ + 
feature_keys.SavedModelLabels.PREDICT: + export_lib.PredictOutput(prediction_outputs), + feature_keys.SavedModelLabels.FILTER: + export_lib.PredictOutput( + state_to_dictionary(filtering_outputs.end_state)) + }, + # Likely unused, but it is necessary to return `predictions` to satisfy + # the Estimator's error checking. + predictions={}) + + def _convert_feature_to_tensor(self, name, value): + """Casts features to the correct dtype based on their name.""" + if name in [ + feature_keys.TrainEvalFeatures.TIMES, + feature_keys.PredictionFeatures.TIMES + ]: + return math_ops.cast(value, dtypes.int64) + if name == feature_keys.TrainEvalFeatures.VALUES: + return math_ops.cast(value, self.model.dtype) + if name == feature_keys.PredictionFeatures.STATE_TUPLE: + return value # Correct dtypes are model-dependent + return ops.convert_to_tensor(value) + + def _gather_state(self, features): + """Returns `features` with state packed, indicates if packing was done.""" + prefixed_state_re = re.compile(r"^" + feature_keys.State.STATE_PREFIX + + r"_(\d+)$") + numbered_state = [] + for key, tensor in features.items(): + search_result = prefixed_state_re.search(key) + if search_result: + numbered_state.append((int(search_result.group(1)), key, tensor)) + if not numbered_state: + return features, False + features = features.copy() + for _, key, _ in numbered_state: + del features[key] + numbered_state.sort(key=lambda number, *_: number) + features[feature_keys.State.STATE_TUPLE] = nest.pack_sequence_as( + structure=self.model.get_start_state(), + flat_sequence=[tensor for _, _, tensor in numbered_state]) + return features, True + + def create_estimator_spec(self, features, mode, labels=None): + """Performs basic error checking and returns an EstimatorSpec.""" + with ops.name_scope("head"): + if labels: + raise ValueError( + "The model received a `labels` dictionary, which is " + "not supported. 
Pass '{}' and '{}' as " + "features.".format(feature_keys.TrainEvalFeatures.TIMES, + feature_keys.TrainEvalFeatures.VALUES)) + del labels + features = { + name: self._convert_feature_to_tensor(name=name, value=value) + for name, value in features.items() + } + if self.input_statistics_generator is not None: + input_statistics = self.input_statistics_generator.initialize_graph( + features, update_statistics=(mode == estimator_lib.ModeKeys.TRAIN)) + else: + input_statistics = None + self.model.initialize_graph(input_statistics=input_statistics) + + # _gather_state requires the model to have its graph initialized (so it + # has access to the structure of the model's state) + features, passed_flat_state = self._gather_state(features) + if (mode == estimator_lib.ModeKeys.TRAIN or + mode == estimator_lib.ModeKeys.EVAL): + _check_train_eval_features(features, self.model) + elif mode == estimator_lib.ModeKeys.PREDICT: + _check_predict_features(features) + else: + raise ValueError("Unknown mode '{}' passed to model_fn.".format(mode)) + + self.state_manager.initialize_graph( + model=self.model, input_statistics=input_statistics) + + if mode == estimator_lib.ModeKeys.TRAIN: + return self._train_ops(features) + elif mode == estimator_lib.ModeKeys.EVAL: + return self._evaluate_ops(features) + elif mode == estimator_lib.ModeKeys.PREDICT and not passed_flat_state: + return self._predict_ops(features) + elif mode == estimator_lib.ModeKeys.PREDICT and passed_flat_state: + # The mode is PREDICT, but we're actually in export_savedmodel for + # serving. We want to return two graphs: one for filtering (state + data + # -> state) and one for predicting (state -> prediction). 
+ return self._serving_ops(features) + + +def _check_feature_shapes_compatible_with(features, + compatible_with_name, + compatible_with_value, + ignore=None): + """Checks all features are compatible with the given time-like feature.""" + if ignore is None: + ignore = set() + for name, value in features.items(): + if name in ignore: + continue + feature_shape = value.get_shape() + if feature_shape.ndims is None: + continue + if feature_shape.ndims < 2: + raise ValueError( + ("Features must have shape (batch dimension, window size, ...) " + "(got rank {} for feature '{}')").format(feature_shape.ndims, name)) + if not feature_shape[:2].is_compatible_with( + compatible_with_value.get_shape()): + raise ValueError( + ("Features must have shape (batch dimension, window size, ...) " + "where batch dimension and window size match the " + "'{times_feature}' feature (got shape {feature_shape} for " + "feature '{feature_name}' but shape {times_shape} for feature " + "'{times_feature}')").format( + times_feature=compatible_with_name, + feature_shape=feature_shape, + feature_name=name, + times_shape=compatible_with_value.get_shape())) + + +def _check_predict_features(features): + """Raises errors if features are not suitable for prediction.""" + if feature_keys.PredictionFeatures.TIMES not in features: + raise ValueError("Expected a '{}' feature for prediction.".format( + feature_keys.PredictionFeatures.TIMES)) + if feature_keys.PredictionFeatures.STATE_TUPLE not in features: + raise ValueError("Expected a '{}' feature for prediction.".format( + feature_keys.PredictionFeatures.STATE_TUPLE)) + times_feature = features[feature_keys.PredictionFeatures.TIMES] + if not times_feature.get_shape().is_compatible_with([None, None]): + raise ValueError( + ("Expected shape (batch dimension, window size) for feature '{}' " + "(got shape {})").format(feature_keys.PredictionFeatures.TIMES, + times_feature.get_shape())) + _check_feature_shapes_compatible_with( + features=features, + 
compatible_with_name=feature_keys.PredictionFeatures.TIMES, + compatible_with_value=times_feature, + ignore=set([ + feature_keys.PredictionFeatures.STATE_TUPLE # Model-dependent shapes + ])) + + +def _check_train_eval_features(features, model): + """Raise errors if features are not suitable for training/evaluation.""" + if feature_keys.TrainEvalFeatures.TIMES not in features: + raise ValueError("Expected a '{}' feature for training/evaluation.".format( + feature_keys.TrainEvalFeatures.TIMES)) + if feature_keys.TrainEvalFeatures.VALUES not in features: + raise ValueError("Expected a '{}' feature for training/evaluation.".format( + feature_keys.TrainEvalFeatures.VALUES)) + times_feature = features[feature_keys.TrainEvalFeatures.TIMES] + if not times_feature.get_shape().is_compatible_with([None, None]): + raise ValueError( + ("Expected shape (batch dimension, window size) for feature '{}' " + "(got shape {})").format(feature_keys.TrainEvalFeatures.TIMES, + times_feature.get_shape())) + values_feature = features[feature_keys.TrainEvalFeatures.VALUES] + if not values_feature.get_shape().is_compatible_with( + [None, None, model.num_features]): + raise ValueError( + ("Expected shape (batch dimension, window size, {num_features}) " + "for feature '{feature_name}', since the model was configured " + "with num_features={num_features} (got shape {got_shape})").format( + num_features=model.num_features, + feature_name=feature_keys.TrainEvalFeatures.VALUES, + got_shape=times_feature.get_shape())) + _check_feature_shapes_compatible_with( + features=features, + compatible_with_name=feature_keys.TrainEvalFeatures.TIMES, + compatible_with_value=times_feature, + ignore=set([ + feature_keys.State.STATE_TUPLE # Model-dependent shapes + ])) + + +def _identity_metric_single(name, input_tensor): + """A metric which takes on its last updated value. + + This keeps evaluation metrics in sync with one another, since update ops are + run separately from their result Tensors. 
Simply returning (input_tensor, + no_op) as a metric with a value but no update means that a metric will come + from a different batch of data than metrics which cache values in a Variable + (e.g. the default loss metric). + + Args: + name: A name for the metric. + input_tensor: Any Tensor. + Returns: + A tuple of (value, update_op). + """ + metric_variable = variable_scope.variable( + name="{}_identity_metric".format(name), + initial_value=array_ops.zeros([], dtype=input_tensor.dtype), + collections=[ops.GraphKeys.LOCAL_VARIABLES], + validate_shape=False) + update_op = state_ops.assign( + metric_variable, input_tensor, validate_shape=False) + # This shape will be correct once the first update runs (but may be + # incomplete, so is not helpful for initializing the variable). + metric_variable.set_shape(input_tensor.get_shape()) + return (metric_variable.value(), update_op) + + +def _identity_metric_nested(name, input_tensors): + """Create identity metrics for a nested tuple of Tensors.""" + update_ops = [] + value_tensors = [] + for tensor_number, tensor in enumerate(nest.flatten(input_tensors)): + value_tensor, update_op = _identity_metric_single( + name="{}_{}".format(name, tensor_number), input_tensor=tensor) + update_ops.append(update_op) + value_tensors.append(value_tensor) + return (nest.pack_sequence_as(input_tensors, value_tensors), + control_flow_ops.group(*update_ops)) + + +def state_to_dictionary(state_tuple): + """Flatten model state into a dictionary with string keys.""" + flattened = {} + for state_number, state_value in enumerate(nest.flatten(state_tuple)): + prefixed_state_name = "{}_{:02d}".format(feature_keys.State.STATE_PREFIX, + state_number) + flattened[prefixed_state_name] = state_value + return flattened diff --git a/tensorflow/contrib/timeseries/python/timeseries/head_test.py b/tensorflow/contrib/timeseries/python/timeseries/head_test.py new file mode 100644 index 0000000000..3415061cfd --- /dev/null +++ 
b/tensorflow/contrib/timeseries/python/timeseries/head_test.py @@ -0,0 +1,267 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for head.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.timeseries.python.timeseries import feature_keys +from tensorflow.contrib.timeseries.python.timeseries import head as ts_head_lib +from tensorflow.contrib.timeseries.python.timeseries import model +from tensorflow.contrib.timeseries.python.timeseries import state_management + +from tensorflow.python.estimator import estimator_lib +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import metrics +from tensorflow.python.ops import variables +from tensorflow.python.platform import test +from tensorflow.python.training import coordinator as coordinator_lib +from tensorflow.python.training import queue_runner_impl +from tensorflow.python.training import training as train + + +class HeadTest(test.TestCase): + + def test_labels_provided_error(self): + model_fn = _stub_model_fn() + for mode in [estimator_lib.ModeKeys.TRAIN, estimator_lib.ModeKeys.EVAL, + estimator_lib.ModeKeys.PREDICT]: + 
with self.assertRaisesRegexp(ValueError, "labels"): + model_fn(features={}, labels={"a": "b"}, mode=mode) + + def test_unknown_mode(self): + model_fn = _stub_model_fn() + with self.assertRaisesRegexp(ValueError, "Unknown mode 'Not a mode'"): + model_fn(features={}, labels={}, mode="Not a mode") + + +class _TickerModel(object): + num_features = 1 + dtype = dtypes.float32 + + def initialize_graph(self, input_statistics): + pass + + def define_loss(self, features, mode): + del mode # unused + return model.ModelOutputs( + loss=features["ticker"], + end_state=(features["ticker"], features["ticker"]), + prediction_times=array_ops.zeros(()), + predictions={"ticker": features["ticker"]}) + + +class EvaluationMetricsTests(test.TestCase): + + def test_metrics_consistent(self): + # Tests that the identity metrics used to report in-sample predictions match + # the behavior of standard metrics. + g = ops.Graph() + with g.as_default(): + features = { + feature_keys.TrainEvalFeatures.TIMES: + array_ops.zeros((1, 1)), + feature_keys.TrainEvalFeatures.VALUES: + array_ops.zeros((1, 1, 1)), + "ticker": + array_ops.reshape( + math_ops.cast( + variables.Variable( + name="ticker", + initial_value=0, + dtype=dtypes.int64, + collections=[ops.GraphKeys.LOCAL_VARIABLES]) + .count_up_to(10), + dtype=dtypes.float32), (1, 1, 1)) + } + model_fn = ts_head_lib.time_series_regression_head( + model=_TickerModel(), + state_manager=state_management.PassthroughStateManager(), + optimizer=train.GradientDescentOptimizer(0.001)).create_estimator_spec + outputs = model_fn( + features=features, labels=None, mode=estimator_lib.ModeKeys.EVAL) + metric_update_ops = [ + metric[1] for metric in outputs.eval_metric_ops.values()] + loss_mean, loss_update = metrics.mean(outputs.loss) + metric_update_ops.append(loss_update) + with self.test_session() as sess: + coordinator = coordinator_lib.Coordinator() + queue_runner_impl.start_queue_runners(sess, coord=coordinator) + variables.local_variables_initializer().run() 
+ sess.run(metric_update_ops) + loss_evaled, metric_evaled, nested_metric_evaled = sess.run( + (loss_mean, outputs.eval_metric_ops["ticker"][0], + outputs.eval_metric_ops[feature_keys.FilteringResults.STATE_TUPLE][ + 0][0])) + # The custom model_utils metrics for in-sample predictions should be in + # sync with the Estimator's mean metric for model loss. + self.assertAllClose(0., loss_evaled) + self.assertAllClose((((0.,),),), metric_evaled) + self.assertAllClose((((0.,),),), nested_metric_evaled) + coordinator.request_stop() + coordinator.join() + + +class _StubModel(object): + num_features = 3 + dtype = dtypes.float64 + + def initialize_graph(self, input_statistics): + del input_statistics # unused + + +def _stub_model_fn(): + return ts_head_lib.time_series_regression_head( + model=_StubModel(), + state_manager=state_management.PassthroughStateManager(), + optimizer=train.AdamOptimizer(0.001)).create_estimator_spec + + +class TrainEvalFeatureCheckingTests(test.TestCase): + + def test_no_time_feature(self): + model_fn = _stub_model_fn() + for mode in [estimator_lib.ModeKeys.TRAIN, estimator_lib.ModeKeys.EVAL]: + with self.assertRaisesRegexp(ValueError, "Expected a '{}' feature".format( + feature_keys.TrainEvalFeatures.TIMES)): + model_fn( + features={feature_keys.TrainEvalFeatures.VALUES: [[[1.]]]}, + labels=None, + mode=mode) + + def test_no_value_feature(self): + model_fn = _stub_model_fn() + for mode in [estimator_lib.ModeKeys.TRAIN, estimator_lib.ModeKeys.EVAL]: + with self.assertRaisesRegexp(ValueError, "Expected a '{}' feature".format( + feature_keys.TrainEvalFeatures.VALUES)): + model_fn( + features={feature_keys.TrainEvalFeatures.TIMES: [[1]]}, + labels=None, + mode=mode) + + def test_bad_time_rank(self): + model_fn = _stub_model_fn() + for mode in [estimator_lib.ModeKeys.TRAIN, estimator_lib.ModeKeys.EVAL]: + with self.assertRaisesRegexp(ValueError, + "Expected shape.*for feature '{}'".format( + feature_keys.TrainEvalFeatures.TIMES)): + model_fn( + 
features={ + feature_keys.TrainEvalFeatures.TIMES: [[[1]]], + feature_keys.TrainEvalFeatures.VALUES: [[[1.]]] + }, + labels=None, + mode=mode) + + def test_bad_value_rank(self): + model_fn = _stub_model_fn() + for mode in [estimator_lib.ModeKeys.TRAIN, estimator_lib.ModeKeys.EVAL]: + with self.assertRaisesRegexp(ValueError, + "Expected shape.*for feature '{}'".format( + feature_keys.TrainEvalFeatures.VALUES)): + model_fn( + features={ + feature_keys.TrainEvalFeatures.TIMES: [[1]], + feature_keys.TrainEvalFeatures.VALUES: [[1.]] + }, + labels=None, + mode=mode) + + def test_bad_value_num_features(self): + model_fn = _stub_model_fn() + for mode in [estimator_lib.ModeKeys.TRAIN, estimator_lib.ModeKeys.EVAL]: + with self.assertRaisesRegexp( + ValueError, "Expected shape.*, 3.*for feature '{}'".format( + feature_keys.TrainEvalFeatures.VALUES)): + model_fn( + features={ + feature_keys.TrainEvalFeatures.TIMES: [[1]], + feature_keys.TrainEvalFeatures.VALUES: [[[1.]]] + }, + labels=None, + mode=mode) + + def test_bad_exogenous_shape(self): + model_fn = _stub_model_fn() + for mode in [estimator_lib.ModeKeys.TRAIN, estimator_lib.ModeKeys.EVAL]: + with self.assertRaisesRegexp( + ValueError, + "Features must have shape.*for feature 'exogenous'"): + model_fn( + features={ + feature_keys.TrainEvalFeatures.TIMES: [[1]], + feature_keys.TrainEvalFeatures.VALUES: [[[1., 2., 3.]]], + "exogenous": [[1], [2]] + }, + labels=None, + mode=mode) + + +class PredictFeatureCheckingTests(test.TestCase): + + def test_no_time_feature(self): + model_fn = _stub_model_fn() + with self.assertRaisesRegexp(ValueError, "Expected a '{}' feature".format( + feature_keys.PredictionFeatures.TIMES)): + model_fn( + features={ + feature_keys.PredictionFeatures.STATE_TUPLE: ([[[1.]]], 1.) 
+ }, + labels=None, + mode=estimator_lib.ModeKeys.PREDICT) + + def test_no_start_state_feature(self): + model_fn = _stub_model_fn() + with self.assertRaisesRegexp(ValueError, "Expected a '{}' feature".format( + feature_keys.PredictionFeatures.STATE_TUPLE)): + model_fn( + features={feature_keys.PredictionFeatures.TIMES: [[1]]}, + labels=None, + mode=estimator_lib.ModeKeys.PREDICT) + + def test_bad_time_rank(self): + model_fn = _stub_model_fn() + with self.assertRaisesRegexp(ValueError, + "Expected shape.*for feature '{}'".format( + feature_keys.PredictionFeatures.TIMES)): + model_fn( + features={ + feature_keys.PredictionFeatures.TIMES: 1, + feature_keys.PredictionFeatures.STATE_TUPLE: (1, (2, 3.)) + }, + labels=None, + mode=estimator_lib.ModeKeys.PREDICT) + + def test_bad_exogenous_shape(self): + model_fn = _stub_model_fn() + with self.assertRaisesRegexp( + ValueError, + "Features must have shape.*for feature 'exogenous'"): + model_fn( + features={ + feature_keys.PredictionFeatures.TIMES: [[1]], + feature_keys.PredictionFeatures.STATE_TUPLE: (1, (2, 3.)), + "exogenous": 1. 
+ }, + labels=None, + mode=estimator_lib.ModeKeys.PREDICT) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/timeseries/python/timeseries/model_utils.py b/tensorflow/contrib/timeseries/python/timeseries/model_utils.py index addcdb0575..b5d7cb376b 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/model_utils.py +++ b/tensorflow/contrib/timeseries/python/timeseries/model_utils.py @@ -18,334 +18,15 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import re - import numpy -from tensorflow.contrib.framework.python.ops import variables -from tensorflow.contrib.layers.python.layers import optimizers - from tensorflow.contrib.timeseries.python.timeseries import feature_keys -from tensorflow.python.estimator import estimator_lib -from tensorflow.python.estimator.export import export_lib from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import init_ops -from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops -from tensorflow.python.ops import state_ops from tensorflow.python.ops import variable_scope -from tensorflow.python.util import nest - - -def _check_feature_shapes_compatible_with( - features, compatible_with_name, compatible_with_value, ignore=None): - """Checks all features are compatible with the given time-like feature.""" - if ignore is None: - ignore = set() - for name, value in features.items(): - if name in ignore: - continue - feature_shape = value.get_shape() - if feature_shape.ndims is None: - continue - if feature_shape.ndims < 2: - raise ValueError( - ("Features must have shape (batch dimension, window size, ...) 
" - "(got rank {} for feature '{}')").format( - feature_shape.ndims, name)) - if not feature_shape[:2].is_compatible_with( - compatible_with_value.get_shape()): - raise ValueError( - ("Features must have shape (batch dimension, window size, ...) " - "where batch dimension and window size match the " - "'{times_feature}' feature (got shape {feature_shape} for " - "feature '{feature_name}' but shape {times_shape} for feature " - "'{times_feature}')").format( - times_feature=compatible_with_name, - feature_shape=feature_shape, - feature_name=name, - times_shape=compatible_with_value.get_shape())) - - -def _check_predict_features(features): - """Raises errors if features are not suitable for prediction.""" - if feature_keys.PredictionFeatures.TIMES not in features: - raise ValueError("Expected a '{}' feature for prediction.".format( - feature_keys.PredictionFeatures.TIMES)) - if feature_keys.PredictionFeatures.STATE_TUPLE not in features: - raise ValueError("Expected a '{}' feature for prediction.".format( - feature_keys.PredictionFeatures.STATE_TUPLE)) - times_feature = features[feature_keys.PredictionFeatures.TIMES] - if not times_feature.get_shape().is_compatible_with([None, None]): - raise ValueError( - ("Expected shape (batch dimension, window size) for feature '{}' " - "(got shape {})").format(feature_keys.PredictionFeatures.TIMES, - times_feature.get_shape())) - _check_feature_shapes_compatible_with( - features=features, - compatible_with_name=feature_keys.PredictionFeatures.TIMES, - compatible_with_value=times_feature, - ignore=set([ - feature_keys.PredictionFeatures.STATE_TUPLE # Model-dependent shapes - ])) - - -def _check_train_eval_features(features, model): - """Raise errors if features are not suitable for training/evaluation.""" - if feature_keys.TrainEvalFeatures.TIMES not in features: - raise ValueError("Expected a '{}' feature for training/evaluation.".format( - feature_keys.TrainEvalFeatures.TIMES)) - if feature_keys.TrainEvalFeatures.VALUES not in 
features: - raise ValueError("Expected a '{}' feature for training/evaluation.".format( - feature_keys.TrainEvalFeatures.VALUES)) - times_feature = features[feature_keys.TrainEvalFeatures.TIMES] - if not times_feature.get_shape().is_compatible_with([None, None]): - raise ValueError( - ("Expected shape (batch dimension, window size) for feature '{}' " - "(got shape {})").format(feature_keys.TrainEvalFeatures.TIMES, - times_feature.get_shape())) - values_feature = features[feature_keys.TrainEvalFeatures.VALUES] - if not values_feature.get_shape().is_compatible_with( - [None, None, model.num_features]): - raise ValueError( - ("Expected shape (batch dimension, window size, {num_features}) " - "for feature '{feature_name}', since the model was configured " - "with num_features={num_features} (got shape {got_shape})").format( - num_features=model.num_features, - feature_name=feature_keys.TrainEvalFeatures.VALUES, - got_shape=times_feature.get_shape())) - _check_feature_shapes_compatible_with( - features=features, - compatible_with_name=feature_keys.TrainEvalFeatures.TIMES, - compatible_with_value=times_feature, - ignore=set([ - feature_keys.State.STATE_TUPLE # Model-dependent shapes - ])) - - -def _identity_metric_single(name, input_tensor): - """A metric which takes on its last updated value. - - This keeps evaluation metrics in sync with one another, since update ops are - run separately from their result Tensors. Simply returning (input_tensor, - no_op) as a metric with a value but no update means that a metric will come - from a different batch of data than metrics which cache values in a Variable - (e.g. the default loss metric). - - Args: - name: A name for the metric. - input_tensor: Any Tensor. - Returns: - A tuple of (value, update_op). 
- """ - metric_variable = variable_scope.variable( - name="{}_identity_metric".format(name), - initial_value=array_ops.zeros([], dtype=input_tensor.dtype), - collections=[ops.GraphKeys.LOCAL_VARIABLES], - validate_shape=False) - update_op = state_ops.assign(metric_variable, input_tensor, - validate_shape=False) - # This shape will be correct once the first update runs (but may be - # incomplete, so is not helpful for initializing the variable). - metric_variable.set_shape(input_tensor.get_shape()) - return (metric_variable.value(), update_op) - - -def _identity_metric_nested(name, input_tensors): - """Create identity metrics for a nested tuple of Tensors.""" - update_ops = [] - value_tensors = [] - for tensor_number, tensor in enumerate(nest.flatten(input_tensors)): - value_tensor, update_op = _identity_metric_single( - name="{}_{}".format(name, tensor_number), - input_tensor=tensor) - update_ops.append(update_op) - value_tensors.append(value_tensor) - return (nest.pack_sequence_as(input_tensors, value_tensors), - control_flow_ops.group(*update_ops)) - - -def state_to_dictionary(state_tuple): - """Flatten model state into a dictionary with string keys.""" - flattened = {} - for state_number, state_value in enumerate(nest.flatten(state_tuple)): - prefixed_state_name = "{}_{:02d}".format(feature_keys.State.STATE_PREFIX, - state_number) - flattened[prefixed_state_name] = state_value - return flattened - - -def make_model_fn( - model, state_manager, optimizer, input_statistics_generator=None): - """Returns a model function suitable for use with a tf.estimator. - - Args: - model: The object (inheriting from Model) to create a function for. - state_manager: A state manager to wrap the model with (or - PassthroughStateManager if no state needs to be managed). - optimizer: An instance of `tf.train.Optimizer` to use for training. 
- input_statistics_generator: An InputStatisticsFromMiniBatch object from - math_utils.py, used for collecting statistics about input data during - training. - Returns: - The model function, suitable for passing to a tf.estimator.Estimator. - """ - - def _convert_feature_to_tensor(name, value): - """Casts features to the correct dtype based on their name.""" - if name in [ - feature_keys.TrainEvalFeatures.TIMES, - feature_keys.PredictionFeatures.TIMES - ]: - return math_ops.cast(value, dtypes.int64) - if name == feature_keys.TrainEvalFeatures.VALUES: - return math_ops.cast(value, model.dtype) - if name == feature_keys.PredictionFeatures.STATE_TUPLE: - return value # Correct dtypes are model-dependent - return ops.convert_to_tensor(value) - - def _gather_state(features): - """Returns `features` with state packed, indicates if packing was done.""" - prefixed_state_re = re.compile(r"^" + feature_keys.State.STATE_PREFIX + - r"_(\d+)$") - numbered_state = [] - for key, tensor in features.items(): - search_result = prefixed_state_re.search(key) - if search_result: - numbered_state.append((int(search_result.group(1)), key, tensor)) - if not numbered_state: - return features, False - features = features.copy() - for _, key, _ in numbered_state: - del features[key] - numbered_state.sort(key=lambda number, *_: number) - features[feature_keys.State.STATE_TUPLE] = nest.pack_sequence_as( - structure=model.get_start_state(), - flat_sequence=[tensor for _, _, tensor in numbered_state]) - return features, True - - def _train(features): - """Add training ops to the graph.""" - with variable_scope.variable_scope("model"): - model_outputs = state_manager.define_loss(model, features, - estimator_lib.ModeKeys.TRAIN) - train_op = optimizers.optimize_loss( - model_outputs.loss, - global_step=variables.get_global_step(), - optimizer=optimizer, - # Learning rate is set in the Optimizer object - learning_rate=None) - return estimator_lib.EstimatorSpec( - loss=model_outputs.loss, - 
mode=estimator_lib.ModeKeys.TRAIN, - train_op=train_op) - - def _evaluate(features): - """Add ops for evaluation (aka filtering) to the graph.""" - with variable_scope.variable_scope("model"): - model_outputs = state_manager.define_loss(model, features, - estimator_lib.ModeKeys.EVAL) - metrics = {} - # Just output in-sample predictions for the last chunk seen - for prediction_key, prediction_value in model_outputs.predictions.items(): - metrics[prediction_key] = _identity_metric_single(prediction_key, - prediction_value) - metrics[feature_keys.FilteringResults.TIMES] = _identity_metric_single( - feature_keys.FilteringResults.TIMES, model_outputs.prediction_times) - metrics[feature_keys.FilteringResults.STATE_TUPLE] = ( - _identity_metric_nested(feature_keys.FilteringResults.STATE_TUPLE, - model_outputs.end_state)) - return estimator_lib.EstimatorSpec( - loss=model_outputs.loss, - mode=estimator_lib.ModeKeys.EVAL, - eval_metric_ops=metrics, - predictions={}) - - def _predict(features): - """Add ops for prediction to the graph.""" - with variable_scope.variable_scope("model"): - prediction = model.predict(features=features) - prediction[feature_keys.PredictionResults.TIMES] = features[ - feature_keys.PredictionFeatures.TIMES] - return estimator_lib.EstimatorSpec( - predictions=prediction, mode=estimator_lib.ModeKeys.PREDICT) - - def _serving(features): - with variable_scope.variable_scope("model"): - prediction_outputs = model.predict(features=features) - with variable_scope.variable_scope("model", reuse=True): - filtering_outputs = state_manager.define_loss(model, features, - estimator_lib.ModeKeys.EVAL) - return estimator_lib.EstimatorSpec( - mode=estimator_lib.ModeKeys.PREDICT, - export_outputs={ - feature_keys.SavedModelLabels.PREDICT: - export_lib.PredictOutput(prediction_outputs), - feature_keys.SavedModelLabels.FILTER: - export_lib.PredictOutput( - state_to_dictionary(filtering_outputs.end_state)) - }, - # Likely unused, but it is necessary to return 
`predictions` to satisfy - # the Estimator's error checking. - predictions={}) - - def _model_fn(features, labels, mode): - """Given a time series in `features`, define a loss for `mode`. - - Args: - features: A dictionary, the output of a chunker (typically with keys - feature_keys.TrainEvalFeatures.TIMES and - feature_keys.TrainEvalFeatures.VALUES). - labels: Not used; included for compatibility with tf.learn. - mode: The tf.estimator.ModeKeys mode to use (TRAIN, EVAL, INFER). - Returns: - A tuple of predictions, a loss Tensor, and a train op. - Raises: - ValueError: If the model makes predictions which do not have static shape - information. - """ - if labels: - raise ValueError("The model received a `labels` dictionary, which is not" - " supported. Pass '{}' and '{}' as features.".format( - feature_keys.TrainEvalFeatures.TIMES, - feature_keys.TrainEvalFeatures.VALUES)) - del labels - features = {name: _convert_feature_to_tensor(name=name, value=value) - for name, value in features.items()} - if input_statistics_generator is not None: - input_statistics = input_statistics_generator.initialize_graph( - features, update_statistics=(mode == estimator_lib.ModeKeys.TRAIN)) - else: - input_statistics = None - model.initialize_graph(input_statistics=input_statistics) - # _gather_state requires the model to have its graph initialized (so it has - # access to the structure of the model's state) - features, passed_flat_state = _gather_state(features) - if (mode == estimator_lib.ModeKeys.TRAIN - or mode == estimator_lib.ModeKeys.EVAL): - _check_train_eval_features(features, model) - elif mode == estimator_lib.ModeKeys.PREDICT: - _check_predict_features(features) - else: - raise ValueError("Unknown mode '{}' passed to model_fn.".format(mode)) - state_manager.initialize_graph( - model=model, input_statistics=input_statistics) - if mode == estimator_lib.ModeKeys.TRAIN: - return _train(features) - elif mode == estimator_lib.ModeKeys.EVAL: - return _evaluate(features) - elif 
mode == estimator_lib.ModeKeys.PREDICT and not passed_flat_state: - return _predict(features) - elif mode == estimator_lib.ModeKeys.PREDICT and passed_flat_state: - # The mode is PREDICT, but we're actually in export_savedmodel for - # serving. We want to return two graphs: one for filtering (state + data - # -> state) and one for predicting (state -> prediction). - return _serving(features) - return _model_fn # TODO(agarwal): Remove and replace with functionality from tf.slim diff --git a/tensorflow/contrib/timeseries/python/timeseries/model_utils_test.py b/tensorflow/contrib/timeseries/python/timeseries/model_utils_test.py index 2998689554..cfd31cc70d 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/model_utils_test.py +++ b/tensorflow/contrib/timeseries/python/timeseries/model_utils_test.py @@ -18,22 +18,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.timeseries.python.timeseries import feature_keys -from tensorflow.contrib.timeseries.python.timeseries import model from tensorflow.contrib.timeseries.python.timeseries import model_utils -from tensorflow.contrib.timeseries.python.timeseries import state_management -from tensorflow.python.estimator import estimator_lib -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import metrics -from tensorflow.python.ops import variables from tensorflow.python.platform import test -from tensorflow.python.training import coordinator as coordinator_lib -from tensorflow.python.training import queue_runner_impl -from tensorflow.python.training import training as train class ModelUtilsTest(test.TestCase): @@ -46,230 +34,6 @@ class ModelUtilsTest(test.TestCase): self.assertEqual(5, getter(parameter)) self.assertEqual(4, getter(overridden_parameter)) - def 
test_labels_provided_error(self): - model_fn = _stub_model_fn() - for mode in [estimator_lib.ModeKeys.TRAIN, estimator_lib.ModeKeys.EVAL, - estimator_lib.ModeKeys.PREDICT]: - with self.assertRaisesRegexp(ValueError, "labels"): - model_fn(features={}, labels={"a": "b"}, mode=mode) - - def test_unknown_mode(self): - model_fn = _stub_model_fn() - with self.assertRaisesRegexp(ValueError, "Unknown mode 'Not a mode'"): - model_fn(features={}, labels={}, mode="Not a mode") - - -class _TickerModel(object): - num_features = 1 - dtype = dtypes.float32 - - def initialize_graph(self, input_statistics): - pass - - def define_loss(self, features, mode): - del mode # unused - return model.ModelOutputs( - loss=features["ticker"], - end_state=(features["ticker"], features["ticker"]), - prediction_times=array_ops.zeros(()), - predictions={"ticker": features["ticker"]}) - - -class EvaluationMetricsTests(test.TestCase): - - def test_metrics_consistent(self): - # Tests that the identity metrics used to report in-sample predictions match - # the behavior of standard metrics. 
- g = ops.Graph() - with g.as_default(): - features = { - feature_keys.TrainEvalFeatures.TIMES: - array_ops.zeros((1, 1)), - feature_keys.TrainEvalFeatures.VALUES: - array_ops.zeros((1, 1, 1)), - "ticker": - array_ops.reshape( - math_ops.cast( - variables.Variable( - name="ticker", - initial_value=0, - dtype=dtypes.int64, - collections=[ops.GraphKeys.LOCAL_VARIABLES]) - .count_up_to(10), - dtype=dtypes.float32), (1, 1, 1)) - } - model_fn = model_utils.make_model_fn( - model=_TickerModel(), - state_manager=state_management.PassthroughStateManager(), - optimizer=train.GradientDescentOptimizer(0.001)) - outputs = model_fn( - features=features, labels=None, mode=estimator_lib.ModeKeys.EVAL) - metric_update_ops = [ - metric[1] for metric in outputs.eval_metric_ops.values()] - loss_mean, loss_update = metrics.mean(outputs.loss) - metric_update_ops.append(loss_update) - with self.test_session() as sess: - coordinator = coordinator_lib.Coordinator() - queue_runner_impl.start_queue_runners(sess, coord=coordinator) - variables.local_variables_initializer().run() - sess.run(metric_update_ops) - loss_evaled, metric_evaled, nested_metric_evaled = sess.run( - (loss_mean, outputs.eval_metric_ops["ticker"][0], - outputs.eval_metric_ops[feature_keys.FilteringResults.STATE_TUPLE][ - 0][0])) - # The custom model_utils metrics for in-sample predictions should be in - # sync with the Estimator's mean metric for model loss. 
- self.assertAllClose(0., loss_evaled) - self.assertAllClose((((0.,),),), metric_evaled) - self.assertAllClose((((0.,),),), nested_metric_evaled) - coordinator.request_stop() - coordinator.join() - - -class _StubModel(object): - num_features = 3 - dtype = dtypes.float64 - - def initialize_graph(self, input_statistics): - del input_statistics # unused - - -def _stub_model_fn(): - return model_utils.make_model_fn( - model=_StubModel(), - state_manager=state_management.PassthroughStateManager(), - optimizer=train.AdamOptimizer(0.001)) - - -class TrainEvalFeatureCheckingTests(test.TestCase): - - def test_no_time_feature(self): - model_fn = _stub_model_fn() - for mode in [estimator_lib.ModeKeys.TRAIN, estimator_lib.ModeKeys.EVAL]: - with self.assertRaisesRegexp(ValueError, "Expected a '{}' feature".format( - feature_keys.TrainEvalFeatures.TIMES)): - model_fn( - features={feature_keys.TrainEvalFeatures.VALUES: [[[1.]]]}, - labels=None, - mode=mode) - - def test_no_value_feature(self): - model_fn = _stub_model_fn() - for mode in [estimator_lib.ModeKeys.TRAIN, estimator_lib.ModeKeys.EVAL]: - with self.assertRaisesRegexp(ValueError, "Expected a '{}' feature".format( - feature_keys.TrainEvalFeatures.VALUES)): - model_fn( - features={feature_keys.TrainEvalFeatures.TIMES: [[1]]}, - labels=None, - mode=mode) - - def test_bad_time_rank(self): - model_fn = _stub_model_fn() - for mode in [estimator_lib.ModeKeys.TRAIN, estimator_lib.ModeKeys.EVAL]: - with self.assertRaisesRegexp(ValueError, - "Expected shape.*for feature '{}'".format( - feature_keys.TrainEvalFeatures.TIMES)): - model_fn( - features={ - feature_keys.TrainEvalFeatures.TIMES: [[[1]]], - feature_keys.TrainEvalFeatures.VALUES: [[[1.]]] - }, - labels=None, - mode=mode) - - def test_bad_value_rank(self): - model_fn = _stub_model_fn() - for mode in [estimator_lib.ModeKeys.TRAIN, estimator_lib.ModeKeys.EVAL]: - with self.assertRaisesRegexp(ValueError, - "Expected shape.*for feature '{}'".format( - 
feature_keys.TrainEvalFeatures.VALUES)): - model_fn( - features={ - feature_keys.TrainEvalFeatures.TIMES: [[1]], - feature_keys.TrainEvalFeatures.VALUES: [[1.]] - }, - labels=None, - mode=mode) - - def test_bad_value_num_features(self): - model_fn = _stub_model_fn() - for mode in [estimator_lib.ModeKeys.TRAIN, estimator_lib.ModeKeys.EVAL]: - with self.assertRaisesRegexp( - ValueError, "Expected shape.*, 3.*for feature '{}'".format( - feature_keys.TrainEvalFeatures.VALUES)): - model_fn( - features={ - feature_keys.TrainEvalFeatures.TIMES: [[1]], - feature_keys.TrainEvalFeatures.VALUES: [[[1.]]] - }, - labels=None, - mode=mode) - - def test_bad_exogenous_shape(self): - model_fn = _stub_model_fn() - for mode in [estimator_lib.ModeKeys.TRAIN, estimator_lib.ModeKeys.EVAL]: - with self.assertRaisesRegexp( - ValueError, - "Features must have shape.*for feature 'exogenous'"): - model_fn( - features={ - feature_keys.TrainEvalFeatures.TIMES: [[1]], - feature_keys.TrainEvalFeatures.VALUES: [[[1., 2., 3.]]], - "exogenous": [[1], [2]] - }, - labels=None, - mode=mode) - - -class PredictFeatureCheckingTests(test.TestCase): - - def test_no_time_feature(self): - model_fn = _stub_model_fn() - with self.assertRaisesRegexp(ValueError, "Expected a '{}' feature".format( - feature_keys.PredictionFeatures.TIMES)): - model_fn( - features={ - feature_keys.PredictionFeatures.STATE_TUPLE: ([[[1.]]], 1.) 
- }, - labels=None, - mode=estimator_lib.ModeKeys.PREDICT) - - def test_no_start_state_feature(self): - model_fn = _stub_model_fn() - with self.assertRaisesRegexp(ValueError, "Expected a '{}' feature".format( - feature_keys.PredictionFeatures.STATE_TUPLE)): - model_fn( - features={feature_keys.PredictionFeatures.TIMES: [[1]]}, - labels=None, - mode=estimator_lib.ModeKeys.PREDICT) - - def test_bad_time_rank(self): - model_fn = _stub_model_fn() - with self.assertRaisesRegexp(ValueError, - "Expected shape.*for feature '{}'".format( - feature_keys.PredictionFeatures.TIMES)): - model_fn( - features={ - feature_keys.PredictionFeatures.TIMES: 1, - feature_keys.PredictionFeatures.STATE_TUPLE: (1, (2, 3.)) - }, - labels=None, - mode=estimator_lib.ModeKeys.PREDICT) - - def test_bad_exogenous_shape(self): - model_fn = _stub_model_fn() - with self.assertRaisesRegexp( - ValueError, - "Features must have shape.*for feature 'exogenous'"): - model_fn( - features={ - feature_keys.PredictionFeatures.TIMES: [[1]], - feature_keys.PredictionFeatures.STATE_TUPLE: (1, (2, 3.)), - "exogenous": 1. 
- }, - labels=None, - mode=estimator_lib.ModeKeys.PREDICT) - if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/timeseries/python/timeseries/saved_model_utils.py b/tensorflow/contrib/timeseries/python/timeseries/saved_model_utils.py index 16e29f5e68..97f6d36a87 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/saved_model_utils.py +++ b/tensorflow/contrib/timeseries/python/timeseries/saved_model_utils.py @@ -23,6 +23,7 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib.timeseries.python.timeseries import feature_keys as _feature_keys +from tensorflow.contrib.timeseries.python.timeseries import head as _head from tensorflow.contrib.timeseries.python.timeseries import input_pipeline as _input_pipeline from tensorflow.contrib.timeseries.python.timeseries import model_utils as _model_utils @@ -34,7 +35,7 @@ def _colate_features_to_feeds_and_fetches(continue_from, signature, features, """Uses a saved model signature to construct feed and fetch dictionaries.""" if _feature_keys.FilteringResults.STATE_TUPLE in continue_from: # We're continuing from an evaluation, so we need to unpack/flatten state. 
- state_values = _model_utils.state_to_dictionary( + state_values = _head.state_to_dictionary( continue_from[_feature_keys.FilteringResults.STATE_TUPLE]) else: state_values = continue_from diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index eb66d8e329..f3e43dd552 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -1773,6 +1773,7 @@ tf_cuda_library( ) + if_mkl( [ "//third_party/mkl:intel_binary_blob", + "@mkl_dnn//:mkl_dnn", ], ), alwayslink = 1, @@ -1933,7 +1934,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [ "common_runtime/visitable_allocator.h", "graph/gradients.h", "graph/quantize_training.h", -] +] + if_mkl(["graph/mkl_graph_util.h"]) tf_cuda_library( name = "core_cpu_impl", @@ -2034,7 +2035,10 @@ tf_cuda_library( "//third_party/eigen3", "//tensorflow/core/kernels:required", ] + if_mkl( - ["//third_party/mkl:intel_binary_blob"], + [ + "//third_party/mkl:intel_binary_blob", + "@mkl_dnn//:mkl_dnn", + ], ) + tf_additional_core_deps() + if_static([":core_cpu_impl"]), alwayslink = 1, ) @@ -2670,7 +2674,7 @@ tf_cc_test_mkl( "graph/mkl_layout_pass_test.cc", "graph/mkl_tfconversion_pass_test.cc", ], - linkstatic = tf_kernel_tests_linkstatic(), + linkstatic = 1, deps = [ ":core", ":core_cpu", @@ -2688,18 +2692,6 @@ tf_cc_test_mkl( "//tensorflow/cc:cc_ops", "//tensorflow/cc:scope", "//tensorflow/cc:sendrecv_ops", - "//tensorflow/core/kernels:mkl_aggregate_ops", - "//tensorflow/core/kernels:mkl_concat_op", - "//tensorflow/core/kernels:mkl_conv_op", - "//tensorflow/core/kernels:mkl_cwise_ops_common", - "//tensorflow/core/kernels:mkl_fused_batch_norm_op", - "//tensorflow/core/kernels:mkl_identity_op", - "//tensorflow/core/kernels:mkl_input_conversion_op", - "//tensorflow/core/kernels:mkl_lrn_op", - "//tensorflow/core/kernels:mkl_pooling_ops", - "//tensorflow/core/kernels:mkl_relu_op", - "//tensorflow/core/kernels:mkl_reshape_op", - "//tensorflow/core/kernels:mkl_tfconv_op", "//tensorflow/core/kernels:ops_util", "//third_party/eigen3", ], diff 
--git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h new file mode 100644 index 0000000000..cb32d64334 --- /dev/null +++ b/tensorflow/core/graph/mkl_graph_util.h @@ -0,0 +1,128 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPH_MKL_GRAPH_UTIL_H_ +#define TENSORFLOW_CORE_GRAPH_MKL_GRAPH_UTIL_H_ +#ifdef INTEL_MKL + +#include <string> +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +// Since our ops are going to produce and also consume N addition tensors +// (Mkl) for N Tensorflow tensors, we can have following different +// orderings among these 2N tensors. +// +// E.g., for Tensorflow tensors A, B, and C, our ops will produce and +// consume A_m, B_m, and C_m additionally. +// +// INTERLEAVED: in this case 2N tensors are interleaved. So for above +// example, the ordering looks like: A, A_m, B, B_m, C, C_m. +// +// CONTIGUOUS: in thi case N Tensorflow tensors are contiguous followed +// by N Mkl tensors. So for above example, the ordering looks +// like: A, B, C, A_m, B_m, C_m +// +// Following APIs map index of original Tensorflow tensors to their +// appropriate position based on selected ordering. For contiguous ordering, +// we need to know the total number of tensors (parameter total). 
+// +typedef enum { TENSORS_INTERLEAVED, TENSORS_CONTIGUOUS } MklTfTensorOrdering; +// NOTE: Currently, we use contiguous ordering. If you change this, then you +// would need to change Mkl op definitions in nn_ops.cc. +static MklTfTensorOrdering kTensorOrdering = TENSORS_CONTIGUOUS; + +// Get index of MetaData tensor from index 'n' of Data tensor. +inline int DataIndexToMetaDataIndex(int n, int total_tensors) { + if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) { + // For interleaved ordering, Mkl tensor follows immediately after + // Tensorflow tensor. + return n + 1; + } else { + CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); + // For contiguous ordering, Mkl tensor is n+total_tensors / 2 away. + return n + total_tensors / 2; + } +} + +int inline GetTensorDataIndex(int n, int total_tensors) { + if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) { + return 2 * n; // index corresponding to nth input/output tensor + } else { + CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); + return n; + } +} + +int inline GetTensorMetaDataIndex(int n, int total_tensors) { + // Get index for TensorData first and then use mapping function + // to get TensorMetaData index from TensorData index. + int tidx = GetTensorDataIndex(n, total_tensors); + return DataIndexToMetaDataIndex(tidx, total_tensors); +} + +namespace mkl_op_registry { +static const char* kMklOpLabel = "MklOp"; +static const char* kMklOpLabelPattern = "label='MklOp'"; + +// Get the name of Mkl op from original TensorFlow op +// We prefix 'Mkl' to the original op to get Mkl op. +inline string GetMklOpName(const string& name) { + // Prefix that we add to Tensorflow op name to construct Mkl op name. + const char* const kMklOpPrefix = "_Mkl"; + return string(kMklOpPrefix) + name; +} + +// Check whether opname with type T is registered as MKL-compliant. 
+// +// @input: name of the op +// @input: T datatype to be used for checking op +// @return: true if opname is registered as Mkl op; false otherwise +static inline bool IsMklOp(const std::string& op_name, DataType T) { + string kernel = KernelsRegisteredForOp(op_name); + bool result = + kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT); + if (result) { + VLOG(1) << "mkl_op_registry::" << op_name << " is " << kMklOpLabel; + } + return result; +} + +// Check whether opname with type T is registered as MKL-compliant and +// is element-wise. +// +// @input: name of the op +// @input: T datatype to be used for checking op +// @return: true if opname is registered as element-wise Mkl op; +// false otherwise +static inline bool IsMklElementWiseOp(const std::string& op_name, DataType T) { + if (!IsMklOp(op_name, T)) { + return false; + } + + bool result = (0 == op_name.compare(GetMklOpName("Add")) || + 0 == op_name.compare(GetMklOpName("Sub")) || + 0 == op_name.compare(GetMklOpName("Mul")) || + 0 == op_name.compare(GetMklOpName("Maximum")) || + 0 == op_name.compare(GetMklOpName("SquaredDifference"))); + + VLOG(1) << "mkl_op_registry::" << op_name + << " is elementwise MKL op: " << result; + return result; +} +} // namespace mkl_op_registry +} // namespace tensorflow +#endif // INTEL_MKL +#endif // TENSORFLOW_CORE_GRAPH_MKL_GRAPH_UTIL_H_ diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index 90377e54c7..f87a94a76a 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -37,8 +37,8 @@ limitations under the License. 
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/util/tensor_format.h" +#include "tensorflow/core/graph/mkl_graph_util.h" #include "tensorflow/core/graph/mkl_layout_pass.h" -#include "tensorflow/core/util/mkl_util.h" namespace tensorflow { diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index 6a41e3965a..a2b2f6530d 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -16,7 +16,7 @@ limitations under the License. #ifdef INTEL_MKL #include "tensorflow/core/graph/mkl_layout_pass.h" -#include "tensorflow/core/util/mkl_util.h" +#include "tensorflow/core/graph/mkl_graph_util.h" #include <algorithm> #include <string> diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.cc b/tensorflow/core/graph/mkl_tfconversion_pass.cc index 3f8b0e86d0..fe4588389e 100644 --- a/tensorflow/core/graph/mkl_tfconversion_pass.cc +++ b/tensorflow/core/graph/mkl_tfconversion_pass.cc @@ -33,8 +33,8 @@ limitations under the License. #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/graph/mkl_graph_util.h" #include "tensorflow/core/graph/mkl_tfconversion_pass.h" -#include "tensorflow/core/util/mkl_util.h" namespace tensorflow { diff --git a/tensorflow/core/graph/mkl_tfconversion_pass_test.cc b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc index b01818f746..bbdbe78bbd 100644 --- a/tensorflow/core/graph/mkl_tfconversion_pass_test.cc +++ b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc @@ -16,7 +16,7 @@ limitations under the License. 
#ifdef INTEL_MKL #include "tensorflow/core/graph/mkl_tfconversion_pass.h" -#include "tensorflow/core/util/mkl_util.h" +#include "tensorflow/core/graph/mkl_graph_util.h" #include <algorithm> #include <string> diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 36fbf6b023..bdc6faefbc 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -820,6 +820,7 @@ tf_kernel_library( hdrs = ["transpose_op.h"], deps = ARRAY_DEPS + if_mkl([ "//third_party/mkl:intel_binary_blob", + "@mkl_dnn//:mkl_dnn", ]), ) @@ -2596,6 +2597,7 @@ tf_kernel_library( "//conditions:default": [], }) + if_mkl([ "//third_party/mkl:intel_binary_blob", + "@mkl_dnn//:mkl_dnn", ]) + if_cuda([ "//tensorflow/core/platform/default/build_config:cublas_plugin", ]), @@ -5501,8 +5503,10 @@ tf_mkl_kernel_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:nn_ops_op_lib", + ] + if_mkl([ "//third_party/mkl:intel_binary_blob", - ], + "@mkl_dnn//:mkl_dnn", + ]), ) tf_mkl_kernel_library( @@ -5516,8 +5520,10 @@ tf_mkl_kernel_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:nn_ops_op_lib", + ] + if_mkl([ "//third_party/mkl:intel_binary_blob", - ], + "@mkl_dnn//:mkl_dnn", + ]), ) tf_mkl_kernel_library( @@ -5566,16 +5572,19 @@ tf_mkl_kernel_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:nn_ops_op_lib", + ] + if_mkl([ "//third_party/mkl:intel_binary_blob", - ], + "@mkl_dnn//:mkl_dnn", + ]), ) tf_mkl_kernel_library( name = "mkl_fused_batch_norm_op", srcs = ["mkl_fused_batch_norm_op.cc"], - deps = NN_DEPS + [ + deps = NN_DEPS + if_mkl([ "//third_party/mkl:intel_binary_blob", - ], + "@mkl_dnn//:mkl_dnn", + ]), ) tf_mkl_kernel_library( @@ -5589,9 +5598,10 @@ tf_mkl_kernel_library( tf_mkl_kernel_library( name = "mkl_concat_op", prefix = "mkl_concat_op", - deps = ARRAY_DEPS + [ + deps = ARRAY_DEPS + if_mkl([ "//third_party/mkl:intel_binary_blob", - ], + 
"@mkl_dnn//:mkl_dnn", + ]), ) tf_mkl_kernel_library( @@ -5605,17 +5615,19 @@ tf_mkl_kernel_library( tf_mkl_kernel_library( name = "mkl_identity_op", prefix = "mkl_identity_op", - deps = ARRAY_DEPS + [ + deps = ARRAY_DEPS + if_mkl([ "//third_party/mkl:intel_binary_blob", - ], + "@mkl_dnn//:mkl_dnn", + ]), ) tf_mkl_kernel_library( name = "mkl_lrn_op", prefix = "mkl_lrn_op", - deps = NN_DEPS + [ + deps = NN_DEPS + if_mkl([ "//third_party/mkl:intel_binary_blob", - ], + "@mkl_dnn//:mkl_dnn", + ]), ) tf_mkl_kernel_library( diff --git a/tensorflow/core/kernels/bias_op.cc b/tensorflow/core/kernels/bias_op.cc index 1bdfafb89b..368993c827 100644 --- a/tensorflow/core/kernels/bias_op.cc +++ b/tensorflow/core/kernels/bias_op.cc @@ -39,6 +39,48 @@ typedef Eigen::GpuDevice GPUDevice; typedef Eigen::SyclDevice SYCLDevice; #endif // TENSORFLOW_USE_SYCL +namespace { + +void GetBiasValueDims(const Tensor& value_tensor, TensorFormat data_format, + int32* batch, int32* height, int32* width, + int32* channel) { + *batch = 1; + *width = 1; + *height = 1; + *channel = 1; + if (data_format == FORMAT_NHWC) { + int32 channel_dim = value_tensor.dims() - 1; + *channel = static_cast<int32>(value_tensor.dim_size(channel_dim)); + for (int32 i = 0; i < channel_dim; i++) { + *batch *= static_cast<int32>(value_tensor.dim_size(i)); + } + } else if (data_format == FORMAT_NCHW) { + int32 channel_dim = value_tensor.dims() - 3; + int32 height_dim = value_tensor.dims() - 2; + int32 width_dim = value_tensor.dims() - 1; + *channel = static_cast<int32>(value_tensor.dim_size(channel_dim)); + *height = static_cast<int32>(value_tensor.dim_size(height_dim)); + *width = static_cast<int32>(value_tensor.dim_size(width_dim)); + for (int32 i = 0; i < channel_dim; i++) { + *batch *= static_cast<int32>(value_tensor.dim_size(i)); + } + } +} + +template <class T> +struct AccumulatorType { + typedef T type; +}; + +// float is faster on the CPU than half, and also more precise, +// so use float for the temporary 
accumulators. +template <> +struct AccumulatorType<Eigen::half> { + typedef float type; +}; + +} // namespace + template <typename Device, typename T> class BiasOp : public BinaryOp<T> { public: @@ -50,9 +92,6 @@ class BiasOp : public BinaryOp<T> { } else { data_format_ = FORMAT_NHWC; } - OP_REQUIRES(context, data_format_ == FORMAT_NHWC, - errors::InvalidArgument(context->device()->name() + - " BiasOp only supports NHWC.")); } void Compute(OpKernelContext* context) override { @@ -65,9 +104,21 @@ class BiasOp : public BinaryOp<T> { OP_REQUIRES(context, TensorShapeUtils::IsVector(bias.shape()), errors::InvalidArgument("Biases must be 1D: ", bias.shape().DebugString())); - const auto last_dim = input.shape().dims() - 1; + + // Added by intel_tf to support NCHW on CPU regardless of MKL used or not. + size_t channel_dim; + if (data_format_ == FORMAT_NCHW) { + OP_REQUIRES(context, input.dims() == 4, + errors::InvalidArgument( + "NCHW format supports only 4D input tensor.")); + channel_dim = 1; + } else { + channel_dim = input.shape().dims() - 1; // End of code by intel_tf. + } + OP_REQUIRES( - context, bias.shape().dim_size(0) == input.shape().dim_size(last_dim), + context, + bias.shape().dim_size(0) == input.shape().dim_size(channel_dim), errors::InvalidArgument( "Must provide as many biases as the last dimension " "of the input tensor: ", @@ -78,6 +129,19 @@ class BiasOp : public BinaryOp<T> { {0}, 0, input.shape(), &output)); if (input.NumElements() == 0) return; + // Added by intel_tf to support NCHW on CPU regardless of MKL used or not. 
+ if (data_format_ == FORMAT_NCHW) { + int32 batch, height, width, channel; + GetBiasValueDims(input, data_format_, &batch, &height, &width, &channel); + Eigen::DSizes<int32, 4> four_dims(1, channel, 1, 1); + Eigen::DSizes<int32, 4> broad_cast_dims(batch, 1, height, width); + const Device& d = context->eigen_device<Device>(); + output->tensor<T, 4>().device(d) = + input.tensor<T, 4>() + + bias.tensor<T, 1>().reshape(four_dims).broadcast(broad_cast_dims); + return; + } // End of code by intel_tf. + switch (input.shape().dims()) { case 2: Compute<2>(context, input, bias, output); @@ -137,48 +201,6 @@ REGISTER_KERNEL(double); #undef REGISTER_KERNEL #endif // TENSORFLOW_USE_SYCL -namespace { - -void GetBiasValueDims(const Tensor& value_tensor, TensorFormat data_format, - int32* batch, int32* height, int32* width, - int32* channel) { - *batch = 1; - *width = 1; - *height = 1; - *channel = 1; - if (data_format == FORMAT_NHWC) { - int32 channel_dim = value_tensor.dims() - 1; - *channel = static_cast<int32>(value_tensor.dim_size(channel_dim)); - for (int32 i = 0; i < channel_dim; i++) { - *batch *= static_cast<int32>(value_tensor.dim_size(i)); - } - } else if (data_format == FORMAT_NCHW) { - int32 channel_dim = value_tensor.dims() - 3; - int32 height_dim = value_tensor.dims() - 2; - int32 width_dim = value_tensor.dims() - 1; - *channel = static_cast<int32>(value_tensor.dim_size(channel_dim)); - *height = static_cast<int32>(value_tensor.dim_size(height_dim)); - *width = static_cast<int32>(value_tensor.dim_size(width_dim)); - for (int32 i = 0; i < channel_dim; i++) { - *batch *= static_cast<int32>(value_tensor.dim_size(i)); - } - } -} - -template <class T> -struct AccumulatorType { - typedef T type; -}; - -// float is faster on the CPU than half, and also more precise, -// so use float for the temporary accumulators. 
-template <> -struct AccumulatorType<Eigen::half> { - typedef float type; -}; - -} // namespace - template <typename Device, typename T> class BiasGradOp : public OpKernel { public: @@ -190,9 +212,6 @@ class BiasGradOp : public OpKernel { } else { data_format_ = FORMAT_NHWC; } - OP_REQUIRES(context, data_format_ == FORMAT_NHWC, - errors::InvalidArgument(context->device()->name() + - " BiasGradOp only supports NHWC.")); } void Compute(OpKernelContext* context) override { @@ -222,18 +241,40 @@ class BiasGradOp : public OpKernel { // Eigen often crashes by design on empty tensors, but setZero is safe output->template flat<T>().setZero(); } else { - Eigen::DSizes<int, 2> two_dims(batch * height * width, channel); + // Added by intel_tf to support NCHW on CPU regardless of MKL used or not. + if (data_format_ == FORMAT_NCHW) { + OP_REQUIRES(context, output_backprop.dims() == 4, + errors::InvalidArgument( + "NCHW format supports only 4D input/output tensor.")); + Eigen::DSizes<int, 4> four_dims(batch, channel, height, width); +#ifdef EIGEN_HAS_INDEX_LIST + using idx0 = Eigen::type2index<0>; + using idx2 = Eigen::type2index<2>; + using idx3 = Eigen::type2index<3>; + Eigen::IndexList<idx0, idx2, idx3> reduction_axes; +#else + Eigen::array<int, 3> reduction_axes = {0, 2, 3}; +#endif + output->template flat<T>().device(context->eigen_device<Device>()) = + output_backprop.flat<T>() + .template cast<typename AccumulatorType<T>::type>() + .reshape(four_dims) + .sum(reduction_axes) + .template cast<T>(); // End of code by intel_tf. 
+ } else { + Eigen::DSizes<int, 2> two_dims(batch * height * width, channel); #ifdef EIGEN_HAS_INDEX_LIST - Eigen::IndexList<Eigen::type2index<0> > reduction_axis; + Eigen::IndexList<Eigen::type2index<0> > reduction_axis; #else - Eigen::array<int, 1> reduction_axis = {0}; + Eigen::array<int, 1> reduction_axis = {0}; #endif - output->template flat<T>().device(context->eigen_device<Device>()) = - output_backprop.flat<T>() - .template cast<typename AccumulatorType<T>::type>() - .reshape(two_dims) - .sum(reduction_axis) - .template cast<T>(); + output->template flat<T>().device(context->eigen_device<Device>()) = + output_backprop.flat<T>() + .template cast<typename AccumulatorType<T>::type>() + .reshape(two_dims) + .sum(reduction_axis) + .template cast<T>(); + } } } diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc index 641077ca65..5e09963d2d 100644 --- a/tensorflow/core/kernels/conv_grad_filter_ops.cc +++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc @@ -816,40 +816,35 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()( AlgorithmConfig algorithm_config; if (cudnn_use_autotune && !AutoTuneConvBwdFilter::GetInstance()->Find( conv_parameters, &algorithm_config)) { - std::vector<AlgorithmDesc::Index> algorithms; + std::vector<AlgorithmDesc> algorithms; CHECK(stream->parent()->GetConvolveBackwardFilterAlgorithms( conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms)); ProfileResult best_result; ProfileResult best_result_no_scratch; - // TODO(benbarsdell): Ideally this should not attempt using tensor op math - // if it's not enabled. - for (bool use_tensor_ops : {false, true}) { - for (auto algo_index : algorithms) { - // TODO(zhengxq): profile each algorithm multiple times to better - // accuracy. 
- AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops); - CudnnScratchAllocator scratch_allocator( - ConvolveBackwardFilterScratchSize, ctx); - ProfileResult profile_result; - bool cudnn_launch_status = - stream - ->ThenConvolveBackwardFilterWithAlgorithm( - input_desc, input_ptr, output_desc, out_backprop_ptr, - conv_desc, filter_desc, &filter_backprop_ptr, - &scratch_allocator, AlgorithmConfig(profile_algorithm), - &profile_result) - .ok(); - if (cudnn_launch_status) { - if (profile_result.is_valid()) { - if (profile_result.elapsed_time_in_ms() < - best_result.elapsed_time_in_ms()) { - best_result = profile_result; - } - if (scratch_allocator.TotalByteSize() == 0 && - profile_result.elapsed_time_in_ms() < - best_result_no_scratch.elapsed_time_in_ms()) { - best_result_no_scratch = profile_result; - } + for (auto profile_algorithm : algorithms) { + // TODO(zhengxq): profile each algorithm multiple times to better + // accuracy. + CudnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize, + ctx); + ProfileResult profile_result; + bool cudnn_launch_status = + stream + ->ThenConvolveBackwardFilterWithAlgorithm( + input_desc, input_ptr, output_desc, out_backprop_ptr, + conv_desc, filter_desc, &filter_backprop_ptr, + &scratch_allocator, AlgorithmConfig(profile_algorithm), + &profile_result) + .ok(); + if (cudnn_launch_status) { + if (profile_result.is_valid()) { + if (profile_result.elapsed_time_in_ms() < + best_result.elapsed_time_in_ms()) { + best_result = profile_result; + } + if (scratch_allocator.TotalByteSize() == 0 && + profile_result.elapsed_time_in_ms() < + best_result_no_scratch.elapsed_time_in_ms()) { + best_result_no_scratch = profile_result; } } } diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc index 0732bf4046..0b2d01afa9 100644 --- a/tensorflow/core/kernels/conv_grad_input_ops.cc +++ b/tensorflow/core/kernels/conv_grad_input_ops.cc @@ -870,39 +870,34 @@ void 
LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()( AlgorithmConfig algorithm_config; if (cudnn_use_autotune && !AutoTuneConvBwdData::GetInstance()->Find( conv_parameters, &algorithm_config)) { - std::vector<AlgorithmDesc::Index> algorithms; + std::vector<AlgorithmDesc> algorithms; CHECK(stream->parent()->GetConvolveBackwardDataAlgorithms( conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms)); ProfileResult best_result; ProfileResult best_result_no_scratch; - // TODO(benbarsdell): Ideally this should not attempt using tensor op math - // if it's not enabled. - for (bool use_tensor_ops : {false, true}) { - for (auto algo_index : algorithms) { - // TODO(zhengxq): profile each algorithm multiple times to better - // accuracy. - AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops); - CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, - ctx); - ProfileResult profile_result; - bool cudnn_launch_status = - stream - ->ThenConvolveBackwardDataWithAlgorithm( - filter_desc, filter_ptr, output_desc, out_backprop_ptr, - conv_desc, input_desc, &in_backprop_ptr, &scratch_allocator, - AlgorithmConfig(profile_algorithm), &profile_result) - .ok(); - if (cudnn_launch_status) { - if (profile_result.is_valid()) { - if (profile_result.elapsed_time_in_ms() < - best_result.elapsed_time_in_ms()) { - best_result = profile_result; - } - if (scratch_allocator.TotalByteSize() == 0 && - profile_result.elapsed_time_in_ms() < - best_result_no_scratch.elapsed_time_in_ms()) { - best_result_no_scratch = profile_result; - } + for (auto profile_algorithm : algorithms) { + // TODO(zhengxq): profile each algorithm multiple times to better + // accuracy. 
+ CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, + ctx); + ProfileResult profile_result; + bool cudnn_launch_status = + stream + ->ThenConvolveBackwardDataWithAlgorithm( + filter_desc, filter_ptr, output_desc, out_backprop_ptr, + conv_desc, input_desc, &in_backprop_ptr, &scratch_allocator, + AlgorithmConfig(profile_algorithm), &profile_result) + .ok(); + if (cudnn_launch_status) { + if (profile_result.is_valid()) { + if (profile_result.elapsed_time_in_ms() < + best_result.elapsed_time_in_ms()) { + best_result = profile_result; + } + if (scratch_allocator.TotalByteSize() == 0 && + profile_result.elapsed_time_in_ms() < + best_result_no_scratch.elapsed_time_in_ms()) { + best_result_no_scratch = profile_result; } } } diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc index 8ad56053a8..21f5cb1716 100644 --- a/tensorflow/core/kernels/conv_grad_ops_3d.cc +++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc @@ -654,40 +654,34 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel { AlgorithmConfig algorithm_config; if (cudnn_use_autotune_ && !AutoTuneConv3dBwdData::GetInstance()->Find( conv_parameters, &algorithm_config)) { - std::vector<AlgorithmDesc::Index> algorithms; + std::vector<AlgorithmDesc> algorithms; CHECK(stream->parent()->GetConvolveBackwardDataAlgorithms( conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms)); ProfileResult best_result; ProfileResult best_result_no_scratch; - // TODO(benbarsdell): Ideally this should not attempt using tensor op math - // if it's not enabled. - for (bool use_tensor_ops : {false, true}) { - for (auto algo_index : algorithms) { - // TODO(zhengxq): profile each algorithm multiple times to better - // accuracy. 
- AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops); - CudnnScratchAllocator scratch_allocator( - ConvolveBackwardDataScratchSize, context); - ProfileResult profile_result; - bool cudnn_launch_status = - stream - ->ThenConvolveBackwardDataWithAlgorithm( - filter_desc, filter_ptr, output_desc, out_backprop_ptr, - conv_desc, input_desc, &in_backprop_ptr, - &scratch_allocator, AlgorithmConfig(profile_algorithm), - &profile_result) - .ok(); - if (cudnn_launch_status) { - if (profile_result.is_valid()) { - if (profile_result.elapsed_time_in_ms() < - best_result.elapsed_time_in_ms()) { - best_result = profile_result; - } - if (scratch_allocator.TotalByteSize() == 0 && - profile_result.elapsed_time_in_ms() < - best_result_no_scratch.elapsed_time_in_ms()) { - best_result_no_scratch = profile_result; - } + for (auto profile_algorithm : algorithms) { + // TODO(zhengxq): profile each algorithm multiple times to better + // accuracy. + CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, + context); + ProfileResult profile_result; + bool cudnn_launch_status = + stream + ->ThenConvolveBackwardDataWithAlgorithm( + filter_desc, filter_ptr, output_desc, out_backprop_ptr, + conv_desc, input_desc, &in_backprop_ptr, &scratch_allocator, + AlgorithmConfig(profile_algorithm), &profile_result) + .ok(); + if (cudnn_launch_status) { + if (profile_result.is_valid()) { + if (profile_result.elapsed_time_in_ms() < + best_result.elapsed_time_in_ms()) { + best_result = profile_result; + } + if (scratch_allocator.TotalByteSize() == 0 && + profile_result.elapsed_time_in_ms() < + best_result_no_scratch.elapsed_time_in_ms()) { + best_result_no_scratch = profile_result; } } } @@ -1026,40 +1020,35 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel { AlgorithmConfig algorithm_config; if (cudnn_use_autotune_ && !AutoTuneConv3dBwdFilter::GetInstance()->Find( conv_parameters, &algorithm_config)) { - std::vector<AlgorithmDesc::Index> algorithms; + 
std::vector<AlgorithmDesc> algorithms; CHECK(stream->parent()->GetConvolveBackwardFilterAlgorithms( conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms)); ProfileResult best_result; ProfileResult best_result_no_scratch; - // TODO(benbarsdell): Ideally this should not attempt using tensor op math - // if it's not enabled. - for (bool use_tensor_ops : {false, true}) { - for (auto algo_index : algorithms) { - // TODO(zhengxq): profile each algorithm multiple times to better - // accuracy. - AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops); - CudnnScratchAllocator scratch_allocator( - ConvolveBackwardFilterScratchSize, context); - ProfileResult profile_result; - bool cudnn_launch_status = - stream - ->ThenConvolveBackwardFilterWithAlgorithm( - input_desc, input_ptr, output_desc, out_backprop_ptr, - conv_desc, filter_desc, &filter_backprop_ptr, - &scratch_allocator, AlgorithmConfig(profile_algorithm), - &profile_result) - .ok(); - if (cudnn_launch_status) { - if (profile_result.is_valid()) { - if (profile_result.elapsed_time_in_ms() < - best_result.elapsed_time_in_ms()) { - best_result = profile_result; - } - if (scratch_allocator.TotalByteSize() == 0 && - profile_result.elapsed_time_in_ms() < - best_result_no_scratch.elapsed_time_in_ms()) { - best_result_no_scratch = profile_result; - } + for (auto profile_algorithm : algorithms) { + // TODO(zhengxq): profile each algorithm multiple times to better + // accuracy. 
+ CudnnScratchAllocator scratch_allocator( + ConvolveBackwardFilterScratchSize, context); + ProfileResult profile_result; + bool cudnn_launch_status = + stream + ->ThenConvolveBackwardFilterWithAlgorithm( + input_desc, input_ptr, output_desc, out_backprop_ptr, + conv_desc, filter_desc, &filter_backprop_ptr, + &scratch_allocator, AlgorithmConfig(profile_algorithm), + &profile_result) + .ok(); + if (cudnn_launch_status) { + if (profile_result.is_valid()) { + if (profile_result.elapsed_time_in_ms() < + best_result.elapsed_time_in_ms()) { + best_result = profile_result; + } + if (scratch_allocator.TotalByteSize() == 0 && + profile_result.elapsed_time_in_ms() < + best_result_no_scratch.elapsed_time_in_ms()) { + best_result_no_scratch = profile_result; } } } diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc index dc03eeb658..bb67113fb0 100644 --- a/tensorflow/core/kernels/conv_ops.cc +++ b/tensorflow/core/kernels/conv_ops.cc @@ -662,38 +662,33 @@ void LaunchConv2DOp<GPUDevice, T>::operator()( AlgorithmConfig algorithm_config; if (cudnn_use_autotune && !AutoTuneConv::GetInstance()->Find(conv_parameters, &algorithm_config)) { - std::vector<AlgorithmDesc::Index> algorithms; + std::vector<AlgorithmDesc> algorithms; CHECK(stream->parent()->GetConvolveAlgorithms( conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms)); ProfileResult best_result; ProfileResult best_result_no_scratch; - // TODO(benbarsdell): Ideally this should not attempt using tensor op math - // if it's not enabled. - for (bool use_tensor_ops : {false, true}) { - for (auto algo_index : algorithms) { - // TODO(zhengxq): profile each algorithm multiple times to better - // accuracy. 
- AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops); - CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx); - ProfileResult profile_result; - bool cudnn_launch_status = - stream - ->ThenConvolveWithAlgorithm( - input_desc, input_ptr, filter_desc, filter_ptr, conv_desc, - output_desc, &output_ptr, &scratch_allocator, - AlgorithmConfig(profile_algorithm), &profile_result) - .ok(); - if (cudnn_launch_status) { - if (profile_result.is_valid()) { - if (profile_result.elapsed_time_in_ms() < - best_result.elapsed_time_in_ms()) { - best_result = profile_result; - } - if (scratch_allocator.TotalByteSize() == 0 && - profile_result.elapsed_time_in_ms() < - best_result_no_scratch.elapsed_time_in_ms()) { - best_result_no_scratch = profile_result; - } + for (auto profile_algorithm : algorithms) { + // TODO(zhengxq): profile each algorithm multiple times to better + // accuracy. + CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx); + ProfileResult profile_result; + bool cudnn_launch_status = + stream + ->ThenConvolveWithAlgorithm( + input_desc, input_ptr, filter_desc, filter_ptr, conv_desc, + output_desc, &output_ptr, &scratch_allocator, + AlgorithmConfig(profile_algorithm), &profile_result) + .ok(); + if (cudnn_launch_status) { + if (profile_result.is_valid()) { + if (profile_result.elapsed_time_in_ms() < + best_result.elapsed_time_in_ms()) { + best_result = profile_result; + } + if (scratch_allocator.TotalByteSize() == 0 && + profile_result.elapsed_time_in_ms() < + best_result_no_scratch.elapsed_time_in_ms()) { + best_result_no_scratch = profile_result; } } } diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc index 72758f707a..8a89d564de 100644 --- a/tensorflow/core/kernels/conv_ops_3d.cc +++ b/tensorflow/core/kernels/conv_ops_3d.cc @@ -390,38 +390,33 @@ struct LaunchConvOp<GPUDevice, T> { if (cudnn_use_autotune && !AutoTuneConv3d::GetInstance()->Find( conv_parameters, &algorithm_config)) { - 
std::vector<AlgorithmDesc::Index> algorithms; + std::vector<AlgorithmDesc> algorithms; CHECK(stream->parent()->GetConvolveAlgorithms( conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms)); ProfileResult best_result; ProfileResult best_result_no_scratch; - // TODO(benbarsdell): Ideally this should not attempt using tensor op math - // if it's not enabled. - for (bool use_tensor_ops : {false, true}) { - for (auto algo_index : algorithms) { - AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops); - // TODO(zhengxq): profile each algorithm multiple times to better - // accuracy. - CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx); - ProfileResult profile_result; - bool cudnn_launch_status = - stream - ->ThenConvolveWithAlgorithm( - input_desc, input_ptr, filter_desc, filter_ptr, conv_desc, - output_desc, &output_ptr, &scratch_allocator, - AlgorithmConfig(profile_algorithm), &profile_result) - .ok(); - if (cudnn_launch_status) { - if (profile_result.is_valid()) { - if (profile_result.elapsed_time_in_ms() < - best_result.elapsed_time_in_ms()) { - best_result = profile_result; - } - if (scratch_allocator.TotalByteSize() == 0 && - profile_result.elapsed_time_in_ms() < - best_result_no_scratch.elapsed_time_in_ms()) { - best_result_no_scratch = profile_result; - } + for (auto profile_algorithm : algorithms) { + // TODO(zhengxq): profile each algorithm multiple times to better + // accuracy. 
+ CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx); + ProfileResult profile_result; + bool cudnn_launch_status = + stream + ->ThenConvolveWithAlgorithm( + input_desc, input_ptr, filter_desc, filter_ptr, conv_desc, + output_desc, &output_ptr, &scratch_allocator, + AlgorithmConfig(profile_algorithm), &profile_result) + .ok(); + if (cudnn_launch_status) { + if (profile_result.is_valid()) { + if (profile_result.elapsed_time_in_ms() < + best_result.elapsed_time_in_ms()) { + best_result = profile_result; + } + if (scratch_allocator.TotalByteSize() == 0 && + profile_result.elapsed_time_in_ms() < + best_result_no_scratch.elapsed_time_in_ms()) { + best_result_no_scratch = profile_result; } } } diff --git a/tensorflow/core/kernels/decode_csv_op.cc b/tensorflow/core/kernels/decode_csv_op.cc index 42ea23553b..5e48ae9766 100644 --- a/tensorflow/core/kernels/decode_csv_op.cc +++ b/tensorflow/core/kernels/decode_csv_op.cc @@ -36,8 +36,8 @@ class DecodeCSVOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_quote_delim", &use_quote_delim_)); OP_REQUIRES(ctx, delim.size() == 1, errors::InvalidArgument("field_delim should be only 1 char")); - delim_ = delim[0]; + OP_REQUIRES_OK(ctx, ctx->GetAttr("na_value", &na_value_)); } void Compute(OpKernelContext* ctx) override { @@ -79,9 +79,9 @@ class DecodeCSVOp : public OpKernel { const DataType& dtype = out_type_[f]; switch (dtype) { case DT_INT32: { - // If this field is empty, check if default is given: + // If this field is empty or NA value, check if default is given: // If yes, use default value; Otherwise report error. 
- if (fields[f].empty()) { + if (fields[f].empty() || fields[f] == na_value_) { OP_REQUIRES(ctx, record_defaults[f].NumElements() == 1, errors::InvalidArgument( "Field ", f, @@ -99,9 +99,9 @@ class DecodeCSVOp : public OpKernel { break; } case DT_INT64: { - // If this field is empty, check if default is given: + // If this field is empty or NA value, check if default is given: // If yes, use default value; Otherwise report error. - if (fields[f].empty()) { + if (fields[f].empty() || fields[f] == na_value_) { OP_REQUIRES(ctx, record_defaults[f].NumElements() == 1, errors::InvalidArgument( "Field ", f, @@ -119,9 +119,9 @@ class DecodeCSVOp : public OpKernel { break; } case DT_FLOAT: { - // If this field is empty, check if default is given: + // If this field is empty or NA value, check if default is given: // If yes, use default value; Otherwise report error. - if (fields[f].empty()) { + if (fields[f].empty() || fields[f] == na_value_) { OP_REQUIRES(ctx, record_defaults[f].NumElements() == 1, errors::InvalidArgument( "Field ", f, @@ -138,9 +138,9 @@ class DecodeCSVOp : public OpKernel { break; } case DT_STRING: { - // If this field is empty, check if default is given: + // If this field is empty or NA value, check if default is given: // If yes, use default value; Otherwise report error. 
- if (fields[f].empty()) { + if (fields[f].empty() || fields[f] == na_value_) { OP_REQUIRES(ctx, record_defaults[f].NumElements() == 1, errors::InvalidArgument( "Field ", f, @@ -165,6 +165,7 @@ class DecodeCSVOp : public OpKernel { std::vector<DataType> out_type_; char delim_; bool use_quote_delim_; + string na_value_; void ExtractFields(OpKernelContext* ctx, StringPiece input, std::vector<string>* result) { diff --git a/tensorflow/core/kernels/dense_to_sparse_batch_dataset_op.cc b/tensorflow/core/kernels/dense_to_sparse_batch_dataset_op.cc index 25a6813d59..0174c8dfc8 100644 --- a/tensorflow/core/kernels/dense_to_sparse_batch_dataset_op.cc +++ b/tensorflow/core/kernels/dense_to_sparse_batch_dataset_op.cc @@ -49,10 +49,10 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel { OP_REQUIRES_OK(ctx, ctx->input("row_shape", &row_shape_t)); OP_REQUIRES(ctx, TensorShapeUtils::IsVector(row_shape_t->shape()), errors::InvalidArgument("row_shape must be a vector")); - TensorShape row_shape; - for (size_t i = 0; i < row_shape_t->dim_size(0); ++i) { - row_shape.AddDim(row_shape_t->vec<int64>()(i)); - } + PartialTensorShape row_shape; + OP_REQUIRES_OK(ctx, PartialTensorShape::MakePartialShape( + row_shape_t->vec<int64>().data(), + row_shape_t->NumElements(), &row_shape)); *output = nullptr; @@ -78,7 +78,7 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel { template <class T> class Dataset : public DatasetBase { public: - Dataset(int64 batch_size, const TensorShape& row_shape, + Dataset(int64 batch_size, const PartialTensorShape& row_shape, const DatasetBase* input) : batch_size_(batch_size), row_shape_(row_shape), input_(input) { input_->Ref(); @@ -129,9 +129,22 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel { int64 total_elements = 0; batch_elements.reserve( DatasetIterator<Dataset<T>>::dataset()->batch_size_); - const TensorShape& row_shape = + const PartialTensorShape& row_shape = 
DatasetIterator<Dataset<T>>::dataset()->row_shape_; const int row_ndims = row_shape.dims(); + + // Determine the size of the output tensors: + // * dense_shape will be [`row_shape + 1`]. + Tensor dense_shape(cpu_allocator(), DT_INT64, {row_ndims + 1}); + auto dense_shape_vec = dense_shape.vec<int64>(); + for (size_t i = 0; i < row_ndims; ++i) { + if (row_shape.dim_size(i) == -1) { + dense_shape_vec(i + 1) = 0; + } else { + dense_shape_vec(i + 1) = row_shape.dim_size(i); + } + } + { mutex_lock l(mu_); *end_of_sequence = false; @@ -156,9 +169,14 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel { ") that is incompatible with the row shape (", row_shape.DebugString(), ")."); } - for (int i = 0; i < row_ndims; ++i) { - if (batch_element_tuple[0].shape().dim_size(i) > - row_shape.dim_size(i)) { + for (int j = 0; j < row_ndims; ++j) { + // Take the maximum in the dimension if -1 is given. + if (row_shape.dim_size(j) == -1) { + dense_shape_vec(j + 1) = + std::max(batch_element_tuple[0].dim_size(j), + dense_shape_vec(j + 1)); + } else if (batch_element_tuple[0].dim_size(j) > + row_shape.dim_size(j)) { return errors::DataLoss( "Input element had shape (", batch_element_tuple[0].shape().DebugString(), @@ -175,20 +193,16 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel { return Status::OK(); } - // Determine the size of the output tensors: // * indices will be [`total_elements`, `row_shape + 1`]. // * values will be [`total_elements`]. - // * dense_shape will be [`row_shape + 1`]. 
Tensor indices(cpu_allocator(), DT_INT64, {total_elements, row_ndims + 1}); Tensor values( cpu_allocator(), DatasetIterator<Dataset<T>>::dataset()->output_dtypes()[1], {total_elements}); - Tensor dense_shape(cpu_allocator(), DT_INT64, {row_ndims + 1}); auto indices_matrix = indices.matrix<int64>(); auto values_flat = values.flat<T>(); - auto dense_shape_vec = dense_shape.vec<int64>(); int64 current_position_in_values = 0; for (int64 i = 0; i < batch_elements.size(); ++i) { @@ -220,9 +234,6 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel { } dense_shape_vec(0) = batch_elements.size(); - for (size_t i = 0; i < row_ndims; ++i) { - dense_shape_vec(i + 1) = row_shape.dim_size(i); - } out_tensors->push_back(std::move(indices)); out_tensors->push_back(std::move(values)); @@ -239,7 +250,7 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel { }; const int64 batch_size_; - const TensorShape row_shape_; + const PartialTensorShape row_shape_; const DatasetBase* const input_; std::vector<PartialTensorShape> output_shapes_; }; diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc index f81a448e51..9080bf7be8 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_slice.h" #include "tensorflow/core/kernels/conv_grad_ops.h" +#include "tensorflow/core/kernels/mkl_conv_ops.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/array_slice.h" @@ -41,10 +42,24 @@ limitations under the License. 
#include "mkl_dnn.h" #include "mkl_dnn_types.h" +#ifdef INTEL_MKL_DNN +#include "mkldnn.hpp" + +using mkldnn::prop_kind; +using mkldnn::stream; + +using mkldnn::convolution_backward_weights; +using mkldnn::convolution_direct; +using mkldnn::convolution_forward; + +#endif + namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; +#ifndef INTEL_MKL_DNN + template <typename Device, class T> class MklConv2DCustomBackpropFilterOp : public OpKernel { public: @@ -411,6 +426,172 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel { TensorFormat data_format_; }; +#else + +template <typename Device, class T> +class MklConv2DCustomBackpropFilterOp : public OpKernel { + public: + explicit MklConv2DCustomBackpropFilterOp(OpKernelConstruction* context) + : OpKernel(context) { + string data_format; + OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); + OP_REQUIRES(context, FormatFromString(data_format, &data_format_), + errors::InvalidArgument("Invalid data format")); + + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); + int stride_n = GetTensorDim(strides_, data_format_, 'N'); + int stride_c = GetTensorDim(strides_, data_format_, 'C'); + OP_REQUIRES( + context, (stride_n == 1 && stride_c == 1), + errors::InvalidArgument("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + void Compute(OpKernelContext* context) override { + try { + auto cpu_engine = engine(engine::cpu, 0); + + MklDnnData<T> input(&cpu_engine); + MklDnnData<T> outbackprop(&cpu_engine); + MklDnnData<T> output(&cpu_engine); + + // Input tensors + const Tensor& input_tensor = MklGetInput(context, 0); + const Tensor& filter_tensor = MklGetInput(context, 1); + const Tensor& obp_tensor = MklGetInput(context, 2); // Outbackprop + + // Generate input shapes. 
+ TensorShape filter_shape; + OP_REQUIRES( + context, TensorShapeUtils::IsVector(filter_tensor.shape()), + errors::InvalidArgument( + "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ", + filter_tensor.dims())); + OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape( + filter_tensor.vec<int32>(), &filter_shape)); + TensorShape input_shape = input_tensor.shape(); + TensorShape obp_shape = obp_tensor.shape(); + + // By default, all dims are in MKL order. Only dims in TF order + // are those with prefix tf_order. + memory::dims obp_dims, fwd_input_dims, fwd_filter_dims; + memory::dims padding_l, padding_r, strides, fwd_output_dims; + memory::dims fwd_output_dims_tf_order; + + // Get forward convolution parameters. + MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_); + conv_utl.GetConvFwdSizesInMklOrder( + input_shape, filter_shape, &fwd_input_dims, &fwd_filter_dims, + &strides, &fwd_output_dims_tf_order, &fwd_output_dims, &padding_l, + &padding_r); + if (!context->status().ok()) return; + + // Create Convolution forward descriptor since Convolution backward + // API needs it. For that, we first need to create input, filter + // and output memory descriptors. + auto mkl_data_format = TFDataFormatToMklDnnDataFormat(data_format_); + auto fwd_src_md = + memory::desc(fwd_input_dims, MklDnnType<T>(), mkl_data_format); + auto fwd_filter_md = + memory::desc(fwd_filter_dims, MklDnnType<T>(), memory::format::hwio); + auto fwd_out_md = + memory::desc(fwd_output_dims, MklDnnType<T>(), mkl_data_format); + auto fwd_desc = convolution_forward::desc( + prop_kind::forward, convolution_direct, fwd_src_md, fwd_filter_md, + fwd_out_md, strides, padding_l, padding_r, + TFPaddingToMklDnnPadding(padding_)); + auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine); + + // Allocate output tensor and shape + // TODO(nhasabni): Update this when support for MKL layout is added. + // Shape of output of Conv2DBackpropInput is same as 'input' of Conv2D. 
+ TensorShape tf_output_shape(filter_shape); + MklShape mkl_output_mkl_shape; + mkl_output_mkl_shape.SetMklTensor(false); + Tensor* output_tensor = nullptr; + AllocateOutputSetMklShape(context, 0, &output_tensor, tf_output_shape, + mkl_output_mkl_shape); + + // Create memory for user data. + // Describe how the inputs and outputs of Convolution look like. Also + // specify buffers containing actual input and output data. + // Although input shape required is in MKL-DNN order, the layout is + // Tensorflow's layout (NHWC or NCHW depending on data format). + input.SetUsrMem(fwd_input_dims, mkl_data_format, &input_tensor); + // Outbackprop shape is NHWC or NCHW depending on data format. Since + // GetInputSizeInMklOrder function returns size in that order we just use + // use that function directly. + conv_utl.GetInputSizeInMklOrder(obp_shape, &obp_dims); + if (!context->status().ok()) return; + outbackprop.SetUsrMem(obp_dims, mkl_data_format, &obp_tensor); + // Although output shape required is in MKL-DNN order, + // layout is Tensorflow's filter layout (HWIO) + // Shape of output of Conv2DBackpropInput is same as shape of filter. + memory::dims bwd_output_dims = fwd_filter_dims; + output.SetUsrMem(bwd_output_dims, memory::format::hwio, output_tensor); + + // Create memory descriptors for convolution data w/ no specified format. + input.SetOpMemDesc(fwd_input_dims, memory::format::any); + outbackprop.SetOpMemDesc(obp_dims, memory::format::any); + output.SetOpMemDesc(bwd_output_dims, memory::format::any); + + // Create convolution backward weights primitive. 
+ auto bwd_desc = convolution_backward_weights::desc( + convolution_direct, input.GetOpMemDesc(), output.GetOpMemDesc(), + outbackprop.GetOpMemDesc(), strides, padding_l, padding_r, + TFPaddingToMklDnnPadding(padding_)); + + auto bwd_pd = convolution_backward_weights::primitive_desc( + bwd_desc, cpu_engine, fwd_pd); + + PrepareAndExecutePrimitive(bwd_pd, &input, &outbackprop, &output); + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); + } + } + + private: + std::vector<int32> strides_; + Padding padding_; + TensorFormat data_format_; + + // Prepare and execute net - checks for input and output reorders. + void PrepareAndExecutePrimitive( + const convolution_backward_weights::primitive_desc& conv_pd, + MklDnnData<T>* input, MklDnnData<T>* obp, MklDnnData<T>* output) { + // Create reorders between user layout and MKL layout if it is needed and + // add it to the net before convolution. + std::vector<primitive> net; + input->CheckReorderToOpMem(conv_pd.src_primitive_desc(), &net); + obp->CheckReorderToOpMem(conv_pd.diff_dst_primitive_desc(), &net); + + // Memory for output of convolution. Since we may need reorder on the + // output side, we will prepare reorder primitive in case output + // reorder to user memory is required. + bool output_reorder_required = output->PrepareReorderToUserMemIfReq( + conv_pd.diff_weights_primitive_desc()); + + net.push_back(convolution_backward_weights( + conv_pd, input->GetOpMem(), obp->GetOpMem(), output->GetOpMem())); + + // Insert reorder primitive in the net for output reorder if reorder is + // required. 
+ if (output_reorder_required) { + output->InsertReorderToUserMem(&net); + } + + // Handle output reorder + stream(stream::kind::eager).submit(net).wait(); + } +}; +#endif + #define REGISTER_MKL_FILTER_KERNELS(T) \ REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter") \ .Device(DEVICE_CPU) \ diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc index 00884d0981..4b6bf92e42 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc @@ -23,6 +23,8 @@ limitations under the License. #define EIGEN_USE_THREADS #include <algorithm> #include <vector> +#include "mkl_dnn.h" +#include "mkl_dnn_types.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -30,6 +32,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_slice.h" #include "tensorflow/core/kernels/conv_grad_ops.h" +#include "tensorflow/core/kernels/mkl_conv_ops.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/array_slice.h" @@ -40,13 +43,24 @@ limitations under the License. 
#include "tensorflow/core/util/tensor_format.h" #include "tensorflow/core/util/use_cudnn.h" #include "tensorflow/core/util/work_sharder.h" -#include "mkl_dnn.h" -#include "mkl_dnn_types.h" + +#ifdef INTEL_MKL_DNN +#include "mkldnn.hpp" + +using mkldnn::prop_kind; +using mkldnn::stream; + +using mkldnn::convolution_backward_data; +using mkldnn::convolution_direct; +using mkldnn::convolution_forward; +#endif namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; +#ifndef INTEL_MKL_DNN + template <typename Device, class T> class MklConv2DCustomBackpropInputOp : public OpKernel { public: @@ -345,6 +359,178 @@ class MklConv2DCustomBackpropInputOp : public OpKernel { TensorFormat data_format; }; +#else + +template <typename Device, class T> +class MklConv2DCustomBackpropInputOp : public OpKernel { + public: + ~MklConv2DCustomBackpropInputOp() {} + explicit MklConv2DCustomBackpropInputOp(OpKernelConstruction* context) + : OpKernel(context) { + string data_format_str; + OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str)); + OP_REQUIRES(context, FormatFromString(data_format_str, &data_format_), + errors::InvalidArgument("Invalid data format")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); + int stride_n = GetTensorDim(strides_, data_format_, 'N'); + int stride_c = GetTensorDim(strides_, data_format_, 'C'); + OP_REQUIRES( + context, (stride_n == 1 && stride_c == 1), + errors::InvalidArgument("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + void Compute(OpKernelContext* context) override { + try { + auto cpu_engine = engine(engine::cpu, 0); + + MklDnnData<T> filter(&cpu_engine); + MklDnnData<T> outbackprop(&cpu_engine); + MklDnnData<T> output(&cpu_engine); + + // Input tensors + const Tensor& input_tensor = MklGetInput(context, 0); + const Tensor& filter_tensor = MklGetInput(context, 1); + const 
Tensor& obp_tensor = MklGetInput(context, 2); // Outbackprop + + // Generate input shape. + TensorShape input_shape; + OP_REQUIRES( + context, TensorShapeUtils::IsVector(input_tensor.shape()), + errors::InvalidArgument( + "Conv2DBackpropInput: input_sizes input must be 1-dim, not ", + input_tensor.dims())); + OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape( + input_tensor.vec<int32>(), &input_shape)); + TensorShape filter_shape = filter_tensor.shape(); + TensorShape obp_shape = obp_tensor.shape(); + + // By default, all dims are in MKL order. Only dims in TF order + // are those with prefix tf_order. + memory::dims obp_dims, fwd_input_dims, fwd_filter_dims; + memory::dims padding_l, padding_r, strides, fwd_output_dims; + memory::dims fwd_output_dims_tf_order; + + // Get forward convolution parameters. + MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_); + conv_utl.GetConvFwdSizesInMklOrder( + input_shape, filter_shape, &fwd_input_dims, &fwd_filter_dims, + &strides, &fwd_output_dims_tf_order, &fwd_output_dims, &padding_l, + &padding_r); + if (!context->status().ok()) return; + + // Create Convolution forward descriptor since Convolution backward + // API needs it. For that, we first need to create input, filter + // and output memory descriptors. 
+ auto mkl_data_format = TFDataFormatToMklDnnDataFormat(data_format_); + auto fwd_src_md = + memory::desc(fwd_input_dims, MklDnnType<T>(), mkl_data_format); + auto fwd_filter_md = + memory::desc(fwd_filter_dims, MklDnnType<T>(), memory::format::hwio); + auto fwd_out_md = + memory::desc(fwd_output_dims, MklDnnType<T>(), mkl_data_format); + auto fwd_desc = convolution_forward::desc( + prop_kind::forward, convolution_direct, fwd_src_md, fwd_filter_md, + fwd_out_md, strides, padding_l, padding_r, + TFPaddingToMklDnnPadding(padding_)); + auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine); + + // Allocate output tensor and shape + // TODO(nhasabni): Update this when support for MKL layout is added. + // Shape of output of Conv2DBackpropInput is same as 'input' of Conv2D. + TensorShape tf_output_shape(input_shape); + MklShape mkl_output_mkl_shape; + mkl_output_mkl_shape.SetMklTensor(false); + Tensor* output_tensor = nullptr; + AllocateOutputSetMklShape(context, 0, &output_tensor, tf_output_shape, + mkl_output_mkl_shape); + + // Create memory for user data. + // Describe how the inputs and outputs of Convolution look like. Also + // specify buffers containing actual input and output data. + // Although input shape required is in MKL-DNN order, the layout is + // Tensorflow's layout (NHWC or NCHW depending on data format). + // Although filter shape (filter_dims) required is in MKL-DNN order, + // the layout is Tensorflow's layout (HWIO). + // Shape of Conv2DBackpropInput's filter is same as that of Conv2D filter. + filter.SetUsrMem(fwd_filter_dims, memory::format::hwio, &filter_tensor); + // Outbackprop shape is NHWC or NCHW depending on data format. Since + // GetInputSizeInMklOrder function returns size in that order we just use + // use that function directly. 
+ conv_utl.GetInputSizeInMklOrder(obp_shape, &obp_dims); + if (!context->status().ok()) return; + outbackprop.SetUsrMem(obp_dims, mkl_data_format, &obp_tensor); + // Although output shape required is in MKL-DNN order, + // layout is Tensorflow's layout (NHWC or NCHW depending on data format). + // Shape of output of Conv2DBackpropInput is same as shape of 'input' + // of Conv2D. + memory::dims bwd_output_dims = fwd_input_dims; + output.SetUsrMem(bwd_output_dims, mkl_data_format, output_tensor); + + // Create memory descriptors for convolution data w/ no specified format. + filter.SetOpMemDesc(fwd_filter_dims, memory::format::any); + outbackprop.SetOpMemDesc(obp_dims, memory::format::any); + output.SetOpMemDesc(bwd_output_dims, memory::format::any); + + // Create convolution backward data primitive. + auto bwd_desc = convolution_backward_data::desc( + convolution_direct, output.GetOpMemDesc(), filter.GetOpMemDesc(), + outbackprop.GetOpMemDesc(), strides, padding_l, padding_r, + TFPaddingToMklDnnPadding(padding_)); + + auto bwd_pd = convolution_backward_data::primitive_desc( + bwd_desc, cpu_engine, fwd_pd); + + PrepareAndExecutePrimitive(bwd_pd, &filter, &outbackprop, &output); + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); + } + } + + private: + std::vector<int32> strides_; + Padding padding_; + TensorFormat data_format_; + + // Prepare and execute net - checks for input and output reorders. + void PrepareAndExecutePrimitive( + const convolution_backward_data::primitive_desc& conv_pd, + MklDnnData<T>* filter, MklDnnData<T>* obp, MklDnnData<T>* output) { + // Create reorders between user layout and MKL layout if it is needed and + // add it to the net before convolution. 
+ std::vector<primitive> net; + filter->CheckReorderToOpMem(conv_pd.weights_primitive_desc(), &net); + obp->CheckReorderToOpMem(conv_pd.diff_dst_primitive_desc(), &net); + + // Memory for output of convolution. Since we may need reorder on the + // output side, we will prepare reorder primitive in case output + // reorder to user memory is required. + bool output_reorder_required = + output->PrepareReorderToUserMemIfReq(conv_pd.diff_src_primitive_desc()); + + net.push_back(convolution_backward_data( + conv_pd, obp->GetOpMem(), filter->GetOpMem(), output->GetOpMem())); + + // Insert reorder primitive in the net for output reorder if reorder is + // required. + if (output_reorder_required) { + output->InsertReorderToUserMem(&net); + } + + // Handle output reorder + stream(stream::kind::eager).submit(net).wait(); + } +}; + +#endif // INTEL_MKL_DNN + #define REGISTER_MKL_CPU_KERNELS(T) \ REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput") \ .Device(DEVICE_CPU) \ diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index 7f1555d325..57661e8b10 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -18,7 +18,9 @@ limitations under the License. #include <string.h> #include <map> +#include <string> #include <vector> + #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -26,6 +28,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_slice.h" #include "tensorflow/core/kernels/bounds_check.h" +#include "tensorflow/core/kernels/mkl_conv_ops.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/array_slice.h" @@ -40,10 +43,23 @@ limitations under the License. 
#include "mkl_dnn.h" #include "mkl_dnn_types.h" +#ifdef INTEL_MKL_DNN +#include "mkldnn.hpp" + +using mkldnn::prop_kind; +using mkldnn::stream; + +using mkldnn::convolution_direct; +using mkldnn::convolution_forward; +#endif + namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; +// For now, MKL-ML is default. So making MKL-DNN not a default choice. +#ifndef INTEL_MKL_DNN + template <typename Device, typename T, bool biasEnabled> class MklConv2DOp : public OpKernel { public: @@ -461,6 +477,203 @@ class MklConv2DOp : public OpKernel { TensorFormat data_format_; }; +#else + +template <typename Device, typename T, bool biasEnabled> +class MklConv2DOp : public OpKernel { + public: + ~MklConv2DOp() {} + + explicit MklConv2DOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); + string data_format; + OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); + OP_REQUIRES(context, FormatFromString(data_format, &data_format_), + errors::InvalidArgument("Invalid data format")); + OP_REQUIRES(context, strides_.size() == 4, + errors::InvalidArgument("Sliding window strides field must " + "specify 4 dimensions")); + + const int64 stride_n = GetTensorDim(strides_, data_format_, 'N'); + const int64 stride_c = GetTensorDim(strides_, data_format_, 'C'); + OP_REQUIRES( + context, stride_n == 1 && stride_c == 1, + errors::InvalidArgument("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + void Compute(OpKernelContext* context) override { + try { + auto cpu_engine = engine(engine::cpu, 0); + + // Input tensors + size_t src_idx = 0, filter_idx = 1; + const Tensor& src_tensor = MklGetInput(context, src_idx); + const Tensor& filter_tensor = MklGetInput(context, filter_idx); + + MklDnnData<T> src(&cpu_engine); + MklDnnData<T> filter(&cpu_engine); + MklDnnData<T> 
output(&cpu_engine); + + memory::dims src_dims, filter_dims, padding_l, padding_r, strides; + memory::dims output_dims_tf_order, output_dims_mkl_order; + + // Get shapes of input tensors in MKL-DNN order + MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_); + conv_utl.GetConvFwdSizesInMklOrder( + src_tensor.shape(), filter_tensor.shape(), &src_dims, &filter_dims, + &strides, &output_dims_tf_order, &output_dims_mkl_order, &padding_l, + &padding_r); + if (!context->status().ok()) return; + + // Check for corner case - if there is nothing to compute, return. + TensorShape tf_output_shape( + {output_dims_tf_order[0], output_dims_tf_order[1], + output_dims_tf_order[2], output_dims_tf_order[3]}); + Tensor* output_tensor = nullptr; + MklShape mkl_output_mkl_shape; + mkl_output_mkl_shape.SetMklTensor(false); + AllocateOutputSetMklShape(context, 0, &output_tensor, tf_output_shape, + mkl_output_mkl_shape); + + // Forward filter in TF format from input at index 1 to output at index 1. + ForwardTfTensorInToOut(context, 1, 1); + + if (tf_output_shape.num_elements() == 0) { + // TODO(jbobba): Verify correctness here + // Need semantics for Null MKL tensor + return; + } + + // Corner case to handle 0 batch size. + if (output_dims_tf_order[0] == 0) { + // Nothing to do, allocate output tensor and return + // TODO(nhasabni): remove this code later once serialization + // in MKL-DNN is supported. + AllocateOutputSetMklShape(context, 0, &output_tensor, + src_tensor.shape(), mkl_output_mkl_shape); + return; + } else { + // Otherwise regular output tensor allocation + // Allocate output tensor. + } + CHECK_NOTNULL(output_tensor); + + // Create memory for user data. + // Describe how the inputs and outputs of Convolution look like. Also + // specify buffers containing actual input and output data. + // Although input shape (src_dims) required is in MKL-DNN order, + // the layout is Tensorflow's layout (NHWC or NCHW depending on data + // format). 
+ src.SetUsrMem(src_dims, TFDataFormatToMklDnnDataFormat(data_format_), + const_cast<void*>( + static_cast<const void*>(src_tensor.flat<T>().data()))); + // Although filter shape (filter_dims) required is in MKL-DNN order, + // the layout is Tensorflow's layout (HWIO). + filter.SetUsrMem(filter_dims, memory::format::hwio, + const_cast<void*>(static_cast<const void*>( + filter_tensor.flat<T>().data()))); + // Although output shape (output_dims) required is in MKL-DNN order, + // layout is Tensorflow's layout (NHWC or NCHW depending on data format). + output.SetUsrMem(output_dims_mkl_order, + TFDataFormatToMklDnnDataFormat(data_format_), + output_tensor->flat<T>().data()); + + // Create memory descriptors for convolution data w/ no specified format. + src.SetOpMemDesc(src_dims, memory::format::any); + filter.SetOpMemDesc(filter_dims, memory::format::any); + output.SetOpMemDesc(output_dims_mkl_order, memory::format::any); + + // If bias is enabled, then do the same steps as above for bias. + if (biasEnabled) { + MklDnnData<T> bias(&cpu_engine); + memory::dims bias_size; + conv_utl.GetBiasSizeInMklOrder(2 /* bias idx */, &bias_size); + const Tensor& bias_tensor = MklGetInput(context, 2); + bias.SetUsrMem(bias_size, memory::format::x, + const_cast<void*>(static_cast<const void*>( + bias_tensor.flat<T>().data()))); + bias.SetOpMemDesc(bias_size, memory::format::any); + + // Create convolution primitive with Bias. + auto conv_desc = convolution_forward::desc( + prop_kind::forward, convolution_direct, src.GetOpMemDesc(), + filter.GetOpMemDesc(), bias.GetOpMemDesc(), output.GetOpMemDesc(), + strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_)); + + auto conv_prim_desc = + convolution_forward::primitive_desc(conv_desc, cpu_engine); + PrepareAndExecuteNet(conv_prim_desc, &src, &filter, &bias, &output); + } else { + // Create convolution primitive without Bias. 
+ auto conv_desc = convolution_forward::desc( + prop_kind::forward, convolution_direct, src.GetOpMemDesc(), + filter.GetOpMemDesc(), output.GetOpMemDesc(), strides, padding_l, + padding_r, TFPaddingToMklDnnPadding(padding_)); + + auto conv_prim_desc = + convolution_forward::primitive_desc(conv_desc, cpu_engine); + PrepareAndExecuteNet(conv_prim_desc, &src, &filter, nullptr, &output); + } + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + std::string(e.message) + ", in file " + + std::string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); + } + } + + private: + std::vector<int32> strides_; + Padding padding_; + TensorFormat data_format_; + + // Prepare and execute net - checks for input and output reorders. + void PrepareAndExecuteNet( + const convolution_forward::primitive_desc& conv_prim_desc, + MklDnnData<T>* src, MklDnnData<T>* filter, MklDnnData<T>* bias, + MklDnnData<T>* output) { + // Create reorders between user layout and MKL layout if it is needed and + // add it to the net before convolution. + std::vector<primitive> net; + src->CheckReorderToOpMem(conv_prim_desc.src_primitive_desc(), &net); + filter->CheckReorderToOpMem(conv_prim_desc.weights_primitive_desc(), &net); + + // Memory for output of convolution. Since we may need reorder on the + // output side, we will prepare reorder primitive in case output + // reorder to user memory is required. + bool output_reorder_required = output->PrepareReorderToUserMemIfReq( + conv_prim_desc.dst_primitive_desc()); + + // Create convolution primitive and add it to net. 
+ if (bias) { + CHECK_EQ(biasEnabled, true); + net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(), + filter->GetOpMem(), bias->GetOpMem(), + output->GetOpMem())); + } else { + CHECK_EQ(biasEnabled, false); + net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(), + filter->GetOpMem(), + output->GetOpMem())); + } + + // Insert reorder primitive in the net for output reorder if reorder is + // required. + if (output_reorder_required) { + output->InsertReorderToUserMem(&net); + } + + // Handle output reorder + stream(stream::kind::eager).submit(net).wait(); + } +}; + +#endif + #define REGISTER_MKL_CPU(T) \ REGISTER_KERNEL_BUILDER(Name("_MklConv2D") \ .Device(DEVICE_CPU) \ diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h new file mode 100644 index 0000000000..e29af19ca9 --- /dev/null +++ b/tensorflow/core/kernels/mkl_conv_ops.h @@ -0,0 +1,308 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_ + +#include <limits> +#include <vector> + +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_slice.h" +#include "tensorflow/core/kernels/bounds_check.h" +#include "tensorflow/core/kernels/conv_grad_ops.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" + +#include "tensorflow/core/util/mkl_util.h" + +#ifdef INTEL_MKL_DNN +#include "mkldnn.hpp" +#endif + +namespace tensorflow { + +#ifdef INTEL_MKL_DNN + +class MklDnnConvUtil { + protected: + OpKernelContext *context_; // We don't own this. + std::vector<int32> strides_; + Padding padding_; + TensorFormat data_format_; + + public: + MklDnnConvUtil(OpKernelContext *context, const std::vector<int32> &strides, + Padding pad, TensorFormat fm) + : context_(context), strides_(strides), padding_(pad), data_format_(fm) {} + + virtual ~MklDnnConvUtil() { context_ = nullptr; } + + // Calculate Convolution strides + virtual inline void GetStridesInMklOrder(memory::dims *strides) { + // For now we take the stride from the second and third dimensions only + // (we do not support striding on the batch or depth dimension). 
+ CHECK_NOTNULL(strides); + int stride_rows = GetTensorDim(strides_, data_format_, 'H'); + int stride_cols = GetTensorDim(strides_, data_format_, 'W'); + *strides = {stride_rows, stride_cols}; + } + + // Calculate Convolution input size in MKL-DNN order. MKL-DNN + // requires input in NCHW format. Function does not return anything. + // But errors arising from sanity checks are returned in context's + // status. + virtual inline void GetInputSizeInMklOrder(const TensorShape &input_shape, + memory::dims *input_dims) { +#define CHECK_BOUNDS(val, err_msg) \ + do { \ + OP_REQUIRES(context_, \ + FastBoundsCheck(val, std::numeric_limits<int>::max()), \ + errors::InvalidArgument(err_msg)); \ + } while (0) + + CHECK_NOTNULL(input_dims); + + // Input channel + int64 input_depth_raw = GetTensorDim(input_shape, data_format_, 'C'); + int input_depth = static_cast<int>(input_depth_raw); + + // Input rows/height + int64 input_rows_raw = GetTensorDim(input_shape, data_format_, 'H'); + CHECK_BOUNDS(input_rows_raw, "Input rows too large"); + int input_rows = static_cast<int>(input_rows_raw); + + // Input columns/width + int64 input_cols_raw = GetTensorDim(input_shape, data_format_, 'W'); + CHECK_BOUNDS(input_cols_raw, "Input cols too large"); + int input_cols = static_cast<int>(input_cols_raw); + + // Input batch + int64 input_batch_raw = GetTensorDim(input_shape, data_format_, 'N'); + CHECK_BOUNDS(input_batch_raw, "Input batch too large"); + int input_batch = static_cast<int>(input_batch_raw); + +#undef CHECK_BOUNDS + + // MKL-DNN always requires input in NCHW format. + *input_dims = {input_batch, input_depth, input_rows, input_cols}; + } + + // Calculate Convolution filter size in MKL-DNN order. MKL-DNN + // requires filter in OIHW format. Function does not return anything. + // But errors arising from sanity checks are returned in context's + // status. + // + // Calculate Convolution filter size in MKL-DNN order. MKL-DNN + // requires filter in OIHW format. 
Function does not return anything. + // But errors arising from sanity checks are returned in context's + // status. This function differs from GetConvFilterSizeInMklOrder in + // parameter for input - it accepts src_shape since Convolution Backward + // Input gets shape of input tensor rather than actual tensor (Convolution + // forward gets actual tensor as input). + // + // TODO(nhasabni): Add similar function for input and filter in MklShape. + virtual inline void GetFilterSizeInMklOrder(const TensorShape &input_shape, + const TensorShape &filter_shape, + memory::dims *filter_dims) { + CHECK_NOTNULL(filter_dims); + + OP_REQUIRES(context_, filter_shape.dims() == 4, + errors::InvalidArgument("filter must be 4-dimensional: ", + filter_shape.DebugString())); + + for (int i = 0; i < 3; i++) { + OP_REQUIRES(context_, + FastBoundsCheck(filter_shape.dim_size(i), + std::numeric_limits<int>::max()), + errors::InvalidArgument("filter too large")); + } + + int input_depth = GetTensorDim(input_shape, data_format_, 'C'); + + OP_REQUIRES(context_, input_depth == filter_shape.dim_size(2), + errors::InvalidArgument( + "input and filter must have the same depth: ", input_depth, + " vs ", filter_shape.dim_size(2))); + + // TF filter is always in (rows, cols, in_depth, out_depth) order. + int filter_rows = static_cast<int>(filter_shape.dim_size(0)); + int filter_cols = static_cast<int>(filter_shape.dim_size(1)); + int in_depth = static_cast<int>(filter_shape.dim_size(2)); + int out_depth = static_cast<int>(filter_shape.dim_size(3)); + + // MKL-DNN always needs filter in OIHW format. + // OIHW = (out_depth, in_depth, rows, cols) + *filter_dims = {out_depth, in_depth, filter_rows, filter_cols}; + } + + // Calculate Convolution filter size in MKL-DNN order. MKL-DNN + // requires filter in OIHW format. Function does not return anything. + // But errors arising from sanity checks are returned in context's + // status. 
+ virtual inline void GetFilterSizeInMklOrder(size_t src_index, + size_t filter_index, + memory::dims *filter_dims) { + CHECK_NOTNULL(filter_dims); + const Tensor &input = MklGetInput(context_, src_index); + const Tensor &filter = MklGetInput(context_, filter_index); + GetFilterSizeInMklOrder(input.shape(), filter.shape(), filter_dims); + } + + // Calculate Bias size for 2D Convolution. Function does not return + // anything, but sets error in context status. + virtual inline void GetBiasSizeInMklOrder(size_t bias_index, + memory::dims *bias_dims) { + const Tensor &bias = MklGetInput(context_, bias_index); + OP_REQUIRES(context_, bias.dims() == 1, + errors::InvalidArgument("bias must be 1-dimensional: ", + bias.shape().DebugString())); + + *bias_dims = {static_cast<int>(bias.dim_size(0))}; + } + + // Function to calculate output and padding size for 2D convolution. + // + // Calculate output shape of Convolution in MKL-DNN and TensorFlow order. + // MKL-DNN uses NCHW for output order. But TensorFlow output will be in + // NHWC or NCHW format depending on data format. Function also calculates + // left, right, top and bottom pads. Function does not return any status - + // status is returned via context status. + // + // TODO(nhasabni): Add similar function for input and filter in MklShape. + virtual inline void GetOutputAndPadSizeInMklOrder( + const TensorShape &input_shape, const TensorShape &filter_shape, + const memory::dims &strides, memory::dims *output_dims_tf_order, + memory::dims *output_dims_mkl_order, memory::dims *pad_l, + memory::dims *pad_r) { + CHECK_NOTNULL(output_dims_tf_order); + CHECK_NOTNULL(output_dims_mkl_order); + CHECK_NOTNULL(pad_l); + CHECK_NOTNULL(pad_r); + + int input_rows = GetTensorDim(input_shape, data_format_, 'H'); + int input_cols = GetTensorDim(input_shape, data_format_, 'W'); + + // The first dimension for filter is rows/height. + int filter_rows = filter_shape.dim_size(0); + // The second dimension for filter is cols/width. 
+ int filter_cols = filter_shape.dim_size(1); + + // Stride is vector of 2 elements: {s_r, s_c} + int stride_rows = strides[0]; + int stride_cols = strides[1]; + + // Output batch is same as input batch. + int out_batch = GetTensorDim(input_shape, data_format_, 'N'); + // Output depth is same as last dimension for filter. + int out_depth = filter_shape.dim_size(3); + + int64 out_rows = 0, out_cols = 0; + int64 pad_top = 0, pad_bottom = 0, pad_left, pad_right; + + OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose( + input_rows, filter_rows, stride_rows, padding_, + &out_rows, &pad_top, &pad_bottom)); + OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose( + input_cols, filter_cols, stride_cols, padding_, + &out_cols, &pad_left, &pad_right)); + + // Tensorflow output is in data_format order. (NHWC or NCHW) + TensorShape out_shape = + ShapeFromFormat(data_format_, out_batch, out_rows, out_cols, out_depth); + *output_dims_tf_order = TFShapeToMklDnnDims(out_shape); + + // MKL-DNN always needs output in NCHW format. + *output_dims_mkl_order = {out_batch, out_depth, static_cast<int>(out_rows), + static_cast<int>(out_cols)}; + + // Now handle padding. MKL-DNN uses asymmetric padding. + *pad_l = {static_cast<int>(pad_top), static_cast<int>(pad_left)}; + *pad_r = {static_cast<int>(pad_bottom), static_cast<int>(pad_right)}; + } + + // Calculate output and pad size of forward Convolution operator. + // See comment on GetConvOutputAndPadSizeInMklOrder for parameters. + // + // Function does not return anything, but sets error in context status. 
+ inline void GetOutputAndPadSizeInMklOrder( + size_t src_index, size_t filter_index, const memory::dims &strides, + memory::dims *output_dims_tf_order, memory::dims *output_dims_mkl_order, + memory::dims *pad_l, memory::dims *pad_r) { + CHECK_NOTNULL(output_dims_tf_order); + CHECK_NOTNULL(output_dims_mkl_order); + CHECK_NOTNULL(pad_l); + CHECK_NOTNULL(pad_r); + + const Tensor &input = MklGetInput(context_, src_index); + const Tensor &filter = MklGetInput(context_, filter_index); + + OP_REQUIRES(context_, input.dims() == 4, + errors::InvalidArgument("input must be 4-dimensional", + input.shape().DebugString())); + + GetOutputAndPadSizeInMklOrder(input.shape(), filter.shape(), strides, + output_dims_tf_order, output_dims_mkl_order, + pad_l, pad_r); + } + + // Wrapper function to calculate input, filter, and output sizes of + // 2D Convolution in MKL order (NCHW for input and output; OIHW for filter.) + // Function also calculates output shape in Tensorflow order. Additionally, it + // also calculates strides and paddings for 2D Convolution. + // + // Function does not return anything, but sets error in context status. 
+ inline void GetConvFwdSizesInMklOrder( + const TensorShape &input_shape, const TensorShape &filter_shape, + memory::dims *input_dims, memory::dims *filter_dims, + memory::dims *strides, memory::dims *output_dims_tf_order, + memory::dims *output_dims_mkl_order, memory::dims *pad_l, + memory::dims *pad_r) { + CHECK_NOTNULL(input_dims); + CHECK_NOTNULL(filter_dims); + CHECK_NOTNULL(strides); + CHECK_NOTNULL(output_dims_tf_order); + CHECK_NOTNULL(output_dims_mkl_order); + CHECK_NOTNULL(pad_l); + CHECK_NOTNULL(pad_r); + + GetInputSizeInMklOrder(input_shape, input_dims); + if (!context_->status().ok()) return; + GetFilterSizeInMklOrder(input_shape, filter_shape, filter_dims); + if (!context_->status().ok()) return; + GetStridesInMklOrder(strides); + GetOutputAndPadSizeInMklOrder(input_shape, filter_shape, *strides, + output_dims_tf_order, output_dims_mkl_order, + pad_l, pad_r); + if (!context_->status().ok()) return; + } +}; + +#endif // INTEL_MKL_DNN + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_ diff --git a/tensorflow/core/kernels/mkl_cwise_ops_common.cc b/tensorflow/core/kernels/mkl_cwise_ops_common.cc index 7fc633c254..c065724e0d 100644 --- a/tensorflow/core/kernels/mkl_cwise_ops_common.cc +++ b/tensorflow/core/kernels/mkl_cwise_ops_common.cc @@ -48,7 +48,7 @@ class MklBinaryOp : public BinaryOp<Device, Functor> { auto out = context->mutable_output(0); VLOG(1) << "Shapes (output): " << out->shape().DebugString(); - // Pass input shape through to ouput shape + // Pass input shape through to output shape ForwardMklMetaDataInToOut(context, 0, 0); out = context->mutable_output(0); diff --git a/tensorflow/core/lib/strings/numbers.cc b/tensorflow/core/lib/strings/numbers.cc index 3c85737702..302a6967e3 100644 --- a/tensorflow/core/lib/strings/numbers.cc +++ b/tensorflow/core/lib/strings/numbers.cc @@ -340,7 +340,7 @@ char* FloatToBuffer(float value, char* buffer) { float parsed_value; if (!safe_strtof(buffer, &parsed_value) || 
parsed_value != value) { snprintf_result = - snprintf(buffer, kFastToBufferSize, "%.*g", FLT_DIG + 2, value); + snprintf(buffer, kFastToBufferSize, "%.*g", FLT_DIG + 3, value); // Should never overflow; see above. DCHECK(snprintf_result > 0 && snprintf_result < kFastToBufferSize); diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc index df189af1b8..c0e84c8bb0 100644 --- a/tensorflow/core/ops/dataset_ops.cc +++ b/tensorflow/core/ops/dataset_ops.cc @@ -383,7 +383,8 @@ input_dataset: A handle to an input dataset. Must have a single component. batch_size: A scalar representing the number of elements to accumulate in a batch. row_shape: A vector representing the dense shape of each row in the produced - SparseTensor. + SparseTensor. The shape may be partially specified, using `-1` to indicate + that a particular dimension should use the maximum size of all batch elements. )doc"); REGISTER_OP("RangeDataset") diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index 3dc16ac457..b34dc1a008 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -29,22 +29,6 @@ using shape_inference::ShapeHandle; namespace { -// A shape function that uses the tensor value at <input_idx> as a shape for -// output 0. If the tensor value is not available, it uses a shape with <ndims> -// unknown dims. 
-Status InputTensorShapeOrUnknown(InferenceContext* c, int input_idx, - int ndims) { - ShapeHandle out; - const Tensor* input = c->input_tensor(input_idx); - if (input == nullptr) { - out = c->UnknownShapeOfRank(ndims); - } else { - TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(input_idx, &out)); - } - c->set_output(0, out); - return Status::OK(); -} - Status FractionalPoolShapeFn(InferenceContext* c) { ShapeHandle input; TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input)); @@ -119,11 +103,11 @@ REGISTER_OP("AvgPoolGrad") .Attr(GetConvnetDataFormatAttrString()) .Attr("T: {half, float, double}") .SetShapeFn([](InferenceContext* c) { - // NOTE(mrry): We could in principle work out the shape from the - // gradients and the attrs, but if we do not know orig_input_shape - // statically, then we are unlikely to know the shape of the - // gradients either. - return InputTensorShapeOrUnknown(c, 0 /* input_idx */, 4 /* ndims */); + ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s)); + TF_RETURN_IF_ERROR(c->WithRank(s, 4, &s)); + c->set_output(0, s); + return Status::OK(); }) .Doc(R"doc( Computes gradients of the average pooling function. @@ -583,11 +567,11 @@ REGISTER_OP("Conv2DBackpropInput") .Attr(GetPaddingAttrString()) .Attr(GetConvnetDataFormatAttrString()) .SetShapeFn([](InferenceContext* c) { - // NOTE(mrry): We could in principle work out the shape from the - // gradients and the attrs, but if we do not know orig_input_shape - // statically, then we are unlikely to know the shape of the - // gradients either. - return InputTensorShapeOrUnknown(c, 0 /* input_idx */, 4 /* ndims */); + ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s)); + TF_RETURN_IF_ERROR(c->WithRank(s, 4, &s)); + c->set_output(0, s); + return Status::OK(); }) .Doc(R"doc( Computes the gradients of convolution with respect to the input. 
@@ -625,11 +609,11 @@ REGISTER_OP("Conv2DBackpropFilter") .Attr(GetPaddingAttrString()) .Attr(GetConvnetDataFormatAttrString()) .SetShapeFn([](InferenceContext* c) { - // NOTE(mrry): We could in principle work out the shape from the - // gradients and the attrs, but if we do not know orig_input_shape - // statically, then we are unlikely to know the shape of the - // gradients either. - return InputTensorShapeOrUnknown(c, 1 /* input_idx */, 4 /* ndims */); + ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &s)); + TF_RETURN_IF_ERROR(c->WithRank(s, 4, &s)); + c->set_output(0, s); + return Status::OK(); }) .Doc(R"doc( Computes the gradients of convolution with respect to the filter. @@ -882,11 +866,11 @@ REGISTER_OP("DepthwiseConv2dNativeBackpropInput") .Attr(GetPaddingAttrString()) .Attr(GetConvnetDataFormatAttrString()) .SetShapeFn([](InferenceContext* c) { - // NOTE(mrry): We could in principle work out the shape from the - // gradients and the attrs, but if we do not know orig_input_shape - // statically, then we are unlikely to know the shape of the - // gradients either. - return InputTensorShapeOrUnknown(c, 0 /* input_idx */, 4 /* ndims */); + ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s)); + TF_RETURN_IF_ERROR(c->WithRank(s, 4, &s)); + c->set_output(0, s); + return Status::OK(); }) .Doc(R"doc( Computes the gradients of depthwise convolution with respect to the input. @@ -924,11 +908,11 @@ REGISTER_OP("DepthwiseConv2dNativeBackpropFilter") .Attr(GetPaddingAttrString()) .Attr(GetConvnetDataFormatAttrString()) .SetShapeFn([](InferenceContext* c) { - // NOTE(mrry): We could in principle work out the shape from the - // gradients and the attrs, but if we do not know orig_input_shape - // statically, then we are unlikely to know the shape of the - // gradients either. 
- return InputTensorShapeOrUnknown(c, 1 /* input_idx */, 4 /* ndims */); + ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &s)); + TF_RETURN_IF_ERROR(c->WithRank(s, 4, &s)); + c->set_output(0, s); + return Status::OK(); }) .Doc(R"doc( Computes the gradients of depthwise convolution with respect to the filter. @@ -2870,7 +2854,11 @@ REGISTER_OP("_MklConv2DBackpropFilter") .Attr(GetPaddingAttrString()) .Attr(GetConvnetDataFormatAttrString()) .SetShapeFn([](InferenceContext* c) { - return InputTensorShapeOrUnknown(c, 1 /* input_idx */, 4 /* ndims */); + ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &s)); + TF_RETURN_IF_ERROR(c->WithRank(s, 4, &s)); + c->set_output(0, s); + return Status::OK(); }) .Doc(R"doc( MKL version of Conv2DBackpropFilter. Uses MKL DNN APIs to compute the @@ -2911,7 +2899,11 @@ REGISTER_OP("_MklConv2DBackpropInput") .Attr(GetPaddingAttrString()) .Attr(GetConvnetDataFormatAttrString()) .SetShapeFn([](InferenceContext* c) { - return InputTensorShapeOrUnknown(c, 0 /* input_idx */, 4 /* ndims */); + ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s)); + TF_RETURN_IF_ERROR(c->WithRank(s, 4, &s)); + c->set_output(0, s); + return Status::OK(); }) .Doc(R"doc( MKL version of Convolution2D backward input. Uses MKL DNN APIs to compute the @@ -3034,7 +3026,11 @@ REGISTER_OP("_MklAvgPoolGrad") .Attr(GetConvnetDataFormatAttrString()) .Attr("T: {float, half, double}") .SetShapeFn([](InferenceContext* c) { - return InputTensorShapeOrUnknown(c, 0 /* input_idx */, 4 /* ndims */); + ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s)); + TF_RETURN_IF_ERROR(c->WithRank(s, 4, &s)); + c->set_output(0, s); + return Status::OK(); }) .Doc(R"doc( MKL version of AvgPoolGrad operator. 
Uses MKL DNN APIs to compute gradients diff --git a/tensorflow/core/ops/nn_ops_test.cc b/tensorflow/core/ops/nn_ops_test.cc index 51e4f8bffe..4628b725f8 100644 --- a/tensorflow/core/ops/nn_ops_test.cc +++ b/tensorflow/core/ops/nn_ops_test.cc @@ -81,55 +81,6 @@ TEST(NNOpsTest, TopKV2_ShapeFn) { op, "[1,2,3,4];[]"); } -TEST(NNOpsTest, InputTensorShapeOrUnknown2D_ShapeFn) { - typedef std::pair<const char*, int> NameAndInputIndex; - for (const auto& p : - {NameAndInputIndex("AvgPoolGrad", 0), - NameAndInputIndex("Conv2DBackpropInput", 0), - NameAndInputIndex("Conv2DBackpropFilter", 1), - NameAndInputIndex("DepthwiseConv2dNativeBackpropInput", 0), - NameAndInputIndex("DepthwiseConv2dNativeBackpropFilter", 1)}) { - ShapeInferenceTestOp op(p.first); - op.input_tensors.resize(2); - - // Conv and Depthwise conv have three inputs. - string extra_shapes = (op.name == "AvgPoolGrad" ? "" : ";?"); - - // When the input tensor is not known, the output is 4 unknown dims. - INFER_OK(op, "?;?" + extra_shapes, "[?,?,?,?]"); - INFER_OK(op, "[4];?" + extra_shapes, "[?,?,?,?]"); - - // When input tensor is known, its values determine output shape. - std::vector<int32> shape{1, 2, 3, 4}; - Tensor shape_t = test::AsTensor<int32>(shape); - op.input_tensors[p.second] = &shape_t; - INFER_OK(op, "[4];?" + extra_shapes, "[1,2,3,4]"); - } -} - -TEST(NNOpsTest, InputTensorShapeOrUnknown3D_ShapeFn) { - typedef std::pair<const char*, int> NameAndInputIndex; - for (const auto& p : {NameAndInputIndex("AvgPool3DGrad", 0), - NameAndInputIndex("Conv3DBackpropInputV2", 0), - NameAndInputIndex("Conv3DBackpropFilterV2", 1)}) { - ShapeInferenceTestOp op(p.first); - op.input_tensors.resize(2); - - // Conv3D has an extra shape. - string extra_shapes = (op.name == "AvgPool3DGrad" ? "" : ";?"); - - // When the input tensor is not known, the output is 4 unknown dims. - INFER_OK(op, "?;?" + extra_shapes, "[?,?,?,?,?]"); - INFER_OK(op, "[5];?" 
+ extra_shapes, "[?,?,?,?,?]"); - - // When input tensor is known, its values determine output shape. - std::vector<int32> shape{1, 2, 3, 4, 5}; - Tensor shape_t = test::AsTensor<int32>(shape); - op.input_tensors[p.second] = &shape_t; - INFER_OK(op, "[5];?" + extra_shapes, "[1,2,3,4,5]"); - } -} - TEST(NNOpsTest, BatchNormWithGlobalNormalization_ShapeFn) { ShapeInferenceTestOp op("BatchNormWithGlobalNormalization"); diff --git a/tensorflow/core/ops/parsing_ops.cc b/tensorflow/core/ops/parsing_ops.cc index f23ff083af..b44ea2e080 100644 --- a/tensorflow/core/ops/parsing_ops.cc +++ b/tensorflow/core/ops/parsing_ops.cc @@ -332,6 +332,7 @@ REGISTER_OP("DecodeCSV") .Attr("OUT_TYPE: list({float,int32,int64,string})") .Attr("field_delim: string = ','") .Attr("use_quote_delim: bool = true") + .Attr("na_value: string = ''") .SetShapeFn([](InferenceContext* c) { // Validate the record_defaults inputs. for (int i = 1; i < c->num_inputs(); ++i) { @@ -362,6 +363,7 @@ field_delim: char delimiter to separate fields in a record. use_quote_delim: If false, treats double quotation marks as regular characters inside of the string fields (ignoring RFC 4180, Section 2, Bullet 5). +na_value: Additional string to recognize as NA/NaN. output: Each tensor will have the same shape as records. )doc"); diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index f4bec9524a..1bfa4f83a3 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -26,13 +26,19 @@ limitations under the License. 
#include "mkl_trans.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/util/tensor_format.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/graph/mkl_graph_util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" + +#ifdef INTEL_MKL_DNN +#include "mkldnn.hpp" +#endif // The file contains a number of utility classes and functions used by MKL // enabled kernels @@ -219,19 +225,18 @@ class MklShape { // Location from start of buffer where isMklTensor_ is serialized #define DIMS_OFFSET \ (IS_MKL_TENSOR_OFFSET + sizeof(size_t)) // Location of dimension_ -#define SIZES_OFFSET(dims) \ - (DIMS_OFFSET + \ - sizeof(size_t)) // Location of sizes. Note dim is not used here, left here - // to make macros consistent. +// Location of sizes. Note dim is not used here, left here +// to make macros consistent. +#define SIZES_OFFSET(dims) (DIMS_OFFSET + sizeof(size_t)) #define STRIDES_OFFSET(dims) \ (SIZES_OFFSET(dims) + dims * sizeof(size_t)) // Location of strides #define MKL_LAYOUT_OFFSET(dims) \ (STRIDES_OFFSET(dims) + dims * sizeof(size_t)) // Location of mklLayout_ #define TF_LAYOUT_OFFSET(dims) \ (MKL_LAYOUT_OFFSET(dims) + SIZE_OF_MKL_DNN_BUF) // Location of tfLayout_ +// Location of tf_to_mkl_dim_map_ #define TF_TO_MKL_DIM_MAP_OFFSET(dims) \ - (TF_LAYOUT_OFFSET(dims) + \ - SIZE_OF_MKL_DNN_BUF) // Location of tf_to_mkl_dim_map_ + (TF_LAYOUT_OFFSET(dims) + SIZE_OF_MKL_DNN_BUF) // TODO(agramesh1) make sure to create a const to share with rewrite pass // for min size of MKL metadata tensor. 
@@ -342,58 +347,6 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, return output_tensor; } -// Since our ops are going to produce and also consume N addition tensors -// (Mkl) for N Tensorflow tensors, we can have following different -// orderings among these 2N tensors. -// -// E.g., for Tensorflow tensors A, B, and C, our ops will produce and -// consume A_m, B_m, and C_m additionally. -// -// INTERLEAVED: in this case 2N tensors are interleaved. So for above -// example, the ordering looks like: A, A_m, B, B_m, C, C_m. -// -// CONTIGUOUS: in thi case N Tensorflow tensors are contiguous followed -// by N Mkl tensors. So for above example, the ordering looks -// like: A, B, C, A_m, B_m, C_m -// -// Following APIs map index of original Tensorflow tensors to their appropriate -// position based on selected ordering. For contiguous ordering, we need to know -// the total number of tensors (parameter total). -// -typedef enum { TENSORS_INTERLEAVED, TENSORS_CONTIGUOUS } MklTfTensorOrdering; -// NOTE: Currently, we use contiguous ordering. If you change this, then you -// would need to change Mkl op definitions in nn_ops.cc. -static MklTfTensorOrdering kTensorOrdering = TENSORS_CONTIGUOUS; - -// Get index of MetaData tensor from index 'n' of Data tensor. -inline int DataIndexToMetaDataIndex(int n, int total_tensors) { - if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) { - // For interleaved ordering, Mkl tensor follows immediately after - // Tensorflow tensor. - return n + 1; - } else { - CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); - // For contiguous ordering, Mkl tensor is n+total_tensors / 2 away. 
- return n + total_tensors / 2; - } -} - -int inline GetTensorDataIndex(int n, int total_tensors) { - if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) { - return 2 * n; // index corresponding to nth input/output tensor - } else { - CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); - return n; - } -} - -int inline GetTensorMetaDataIndex(int n, int total_tensors) { - // Get index for TensorData first and then use mapping function - // to get TensorMetaData index from TensorData index. - int tidx = GetTensorDataIndex(n, total_tensors); - return DataIndexToMetaDataIndex(tidx, total_tensors); -} - // Get the MKL shape from the second string tensor inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) { mklshape->DeSerializeMklShape( @@ -480,6 +433,13 @@ inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out, *buf_out = static_cast<void*>(tensor_out->flat<float>().data()); } +template <typename T> +inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out, + TensorShape tf_shape) { + OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::v(), + tf_shape, tensor_out)); +} + inline void GetStridesFromSizes(TensorFormat data_format, size_t* strides, const size_t* sizes) { // MKL requires strides in NCHW @@ -743,56 +703,299 @@ inline void MklNCHWToNHWC(const Tensor& input, Tensor** output) { } } -namespace mkl_op_registry { -static const char* kMklOpLabel = "MklOp"; -static const char* kMklOpLabelPattern = "label='MklOp'"; +// ------------------------------------------------------------------- + +#ifdef INTEL_MKL_DNN + +using mkldnn::engine; +using mkldnn::memory; +using mkldnn::padding_kind; +using mkldnn::primitive; +using mkldnn::reorder; + +/// Return MKL-DNN data type (memory::data_type) for input type T +/// +/// @input None +/// @return memory::data_type corresponding to type T +template <typename T> +static memory::data_type MklDnnType(); + +/// Instantiation for float 
type. Add similar instantiations for other +/// type if needed. +template <> +memory::data_type MklDnnType<float>() { + return memory::data_type::f32; +} + +/// Map TensorFlow's data format into MKL-DNN data format +/// +/// @input: TensorFlow data format +/// @return: memory::format corresponding to TensorFlow data format; +/// Fails with an error if invalid data format. +inline memory::format TFDataFormatToMklDnnDataFormat(TensorFormat format) { + if (format == FORMAT_NHWC) + return memory::format::nhwc; + else if (format == FORMAT_NCHW) + return memory::format::nchw; + TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format")); + // Return to get rid of compiler warning + return memory::format::format_undef; +} -// Get the name of Mkl op from original TensorFlow op -// We prefix 'Mkl' to the original op to get Mkl op. -inline string GetMklOpName(const string& name) { - // Prefix that we add to Tensorflow op name to construct Mkl op name. - const char* const kMklOpPrefix = "_Mkl"; - return string(kMklOpPrefix) + name; +/// Map TensorShape object into memory::dims required by MKL-DNN +/// +/// This function will simply map input TensorShape into MKL-DNN dims +/// naively. So it will preserve the order of dimensions. E.g., if +/// input tensor is in NHWC format, then dims will be in NHWC format +/// also. +/// +/// @input TensorShape object in shape +/// @return memory::dims corresponding to TensorShape +inline memory::dims TFShapeToMklDnnDims(const TensorShape& shape) { + memory::dims dims(shape.dims()); + for (unsigned int d = 0; d < shape.dims(); ++d) { + dims[d] = shape.dim_size(d); + } + return dims; } -// Check whether opname with type T is registered as MKL-compliant. 
-// -// @input: name of the op -// @input: T datatype to be used for checking op -// @return: true if opname is registered as Mkl op; false otherwise -static inline bool IsMklOp(const std::string& op_name, DataType T) { - string kernel = KernelsRegisteredForOp(op_name); - bool result = - kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT); - if (result) { - VLOG(1) << "mkl_op_registry::" << op_name << " is " << kMklOpLabel; - } - return result; +/// Map TensorShape object into memory::dims in NCHW format required by MKL-DNN +/// +/// This function is a specific one than above function. It will map input +/// TensorShape into MKL-DNN dims in NCHW format. So it may not preserve the +/// order of dimensions. E.g., if input tensor is in NHWC format, then dims +/// will be in NCHW format, and not in NHWC format. +/// +/// @input TensorShape object in shape +/// @return memory::dims in MKL-DNN required NCHW format +inline memory::dims TFShapeToMklDnnDimsInNCHW(const TensorShape& shape, + TensorFormat format) { + // Check validity of format. + CHECK_NE(TFDataFormatToMklDnnDataFormat(format), + memory::format::format_undef); + + int n = shape.dim_size(GetTensorDimIndex(format, 'N')); + int c = shape.dim_size(GetTensorDimIndex(format, 'C')); + int h = shape.dim_size(GetTensorDimIndex(format, 'H')); + int w = shape.dim_size(GetTensorDimIndex(format, 'W')); + + // MKL-DNN requires dimensions in NCHW format. + return memory::dims({n, c, h, w}); } -// Check whether opname with type T is registered as MKL-compliant and -// is element-wise. -// -// @input: name of the op -// @input: T datatype to be used for checking op -// @return: true if opname is registered as element-wise Mkl op; false otherwise -static inline bool IsMklElementWiseOp(const std::string& op_name, DataType T) { - if (!IsMklOp(op_name, T)) { +inline padding_kind TFPaddingToMklDnnPadding(Padding pad) { + // MKL-DNN only supports zero padding. 
+ return padding_kind::zero; +} + +/* + * Class to represent all the resources corresponding to a tensor in TensorFlow + * that are required to execute an operation (such as Convolution). + */ +template <typename T> +class MklDnnData { + private: + /// MKL-DNN memory primitive for input user memory + memory* user_memory_; + + /// MKL-DNN memory primitive in case input or output reorder is needed. + memory* reorder_memory_; + + /// Operations memory descriptor + memory::desc* op_md_; + + /// CPU engine on which operation will be executed + const engine* cpu_engine_; + + public: + explicit MklDnnData(const engine* e) + : user_memory_(nullptr), + reorder_memory_(nullptr), + op_md_(nullptr), + cpu_engine_(e) {} + + ~MklDnnData() { + cpu_engine_ = nullptr; // We don't own this. + delete (user_memory_); + delete (reorder_memory_); + delete (op_md_); + } + + void* GetTensorBuffer(const Tensor* tensor) { + CHECK_NOTNULL(tensor); + return const_cast<void*>( + static_cast<const void*>(tensor->flat<T>().data())); + } + + /// Set user memory primitive using specified dimensions, memory format and + /// data_buffer. Function automatically uses element data type by using + /// input type T used for creating call object. + /// + /// In a nutshell, function allows user to describe the input tensor to + /// an operation. E.g., filter of Conv2D is of shape {1, 2, 3, 4}, and + /// memory format HWIO, and the buffer that contains actual values is + /// pointed by data_buffer. + void SetUsrMem(memory::dims dim, memory::format fm, void* data_buffer) { + CHECK_NOTNULL(data_buffer); + CHECK_NOTNULL(cpu_engine_); + // TODO(nhasabni): can we remove dynamic memory allocation? 
+ user_memory_ = + new memory(memory::primitive_desc( + memory::desc(dim, MklDnnType<T>(), fm), *cpu_engine_), + data_buffer); + } + + void SetUsrMem(memory::dims dim, memory::format fm, const Tensor* tensor) { + CHECK_NOTNULL(tensor); + SetUsrMem(dim, fm, GetTensorBuffer(tensor)); + } + + /// A version of function to set user memory primitive that accepts memory + /// descriptor directly, instead of accepting dimensions and format. This + /// function is more generic than the one above, but the function above is + /// sufficient in most cases. + void SetUsrMem(memory::desc md, void* data_buffer) { + CHECK_NOTNULL(data_buffer); + CHECK_NOTNULL(cpu_engine_); + // TODO(nhasabni): can we remove dynamic memory allocation? + user_memory_ = + new memory(memory::primitive_desc(md, *cpu_engine_), data_buffer); + } + + /// A version of SetUsrMem with memory descriptor and tensor + void SetUsrMem(memory::desc md, const Tensor* tensor) { + CHECK_NOTNULL(tensor); + SetUsrMem(md, GetTensorBuffer(tensor)); + } + + /// A version of function to set user memory primitive that accepts primitive + /// descriptor directly, instead of accepting dimensions and format. This + /// function is more generic than the one above, but the function above is + /// sufficient in most cases. + void SetUsrMem(memory::primitive_desc pd, void* data_buffer) { + CHECK_NOTNULL(data_buffer); + CHECK_NOTNULL(cpu_engine_); + // TODO(nhasabni): can we remove dynamic memory allocation? + user_memory_ = new memory(pd, data_buffer); + } + + /// A version of SetUsrMem with primitive descriptor and tensor + void SetUsrMem(memory::primitive_desc pd, const Tensor* tensor) { + CHECK_NOTNULL(tensor); + SetUsrMem(pd, GetTensorBuffer(tensor)); + } + + /// Get function for user memory primitive. + const memory* GetUsrMem() const { return user_memory_; } + + /// Get function for primitive descriptor of user memory primitive. 
+ const memory::primitive_desc GetUsrMemPrimDesc() const { + CHECK_NOTNULL(user_memory_); + return user_memory_->get_primitive_desc(); + } + + /// Get function for descriptor of user memory. + memory::desc GetUsrMemDesc() { + // This is ugly. Why MKL-DNN does not provide desc() method of const type?? + const memory::primitive_desc pd = GetUsrMemPrimDesc(); + return const_cast<memory::primitive_desc*>(&pd)->desc(); + } + + /// Get function for data buffer of user memory primitive. + void* GetUsrMemDataHandle() const { + CHECK_NOTNULL(user_memory_); + return user_memory_->get_data_handle(); + } + + /// Get the memory primitive for input and output of an op. If inputs + /// to an op require reorders, then this function returns memory primitive + /// for reorder. Otherwise, it will return memory primitive for user memory. + /// + /// E.g., Conv2D(I, F) is a primitive with I and F being inputs. Then to + /// execute Conv2D, we need memory primitive for I and F. But if reorder is + /// required for I and F (say I_r is reorder primitive for I; F_r is reorder + /// primitive for F), then we need I_r and F_r to perform Conv2D. + const memory& GetOpMem() const { + return reorder_memory_ ? *reorder_memory_ : *user_memory_; + } + + /// Set memory descriptor of an operation in terms of dimensions and memory + /// format. E.g., For Conv2D, the dimensions would be same as user dimensions + /// but memory::format would be mkldnn::any because we want MKL-DNN to choose + /// best layout/format for given input dimensions. + void SetOpMemDesc(const memory::dims& dim, memory::format fm) { + // TODO(nhasabni): can we remove dynamic memory allocation? + op_md_ = new memory::desc(dim, MklDnnType<T>(), fm); + } + + /// Get function for memory descriptor for an operation + const memory::desc& GetOpMemDesc() const { return *op_md_; } + + /// Function to handle input reordering + /// + /// Check if we need to reorder this input of an operation. 
+ /// Return true and allocate reorder memory primitive if reorder is needed. + /// Otherwise, return false and do not allocate reorder memory primitive. + /// + /// To check if reorder is needed, this function compares memory primitive + /// descriptor of an operation (op_pd) for the given input with the + /// user-specified memory primitive descriptor. + /// + /// @input: op_pd - memory primitive descriptor of the given input of an + /// operation + /// @input: net - net to which to add reorder primitive in case it is needed. + /// @return: true in case reorder of input is needed; false, otherwise. + bool CheckReorderToOpMem(const memory::primitive_desc& op_pd, + std::vector<primitive>* net) { + CHECK_NOTNULL(net); + CHECK_NOTNULL(user_memory_); + if (op_pd != user_memory_->get_primitive_desc()) { + // TODO(nhasabni): can we remove dynamic memory allocation? + reorder_memory_ = new memory(op_pd); + net->push_back(reorder(*user_memory_, *reorder_memory_)); + return true; + } return false; } - bool result = (0 == op_name.compare(GetMklOpName("Add")) || - 0 == op_name.compare(GetMklOpName("Sub")) || - 0 == op_name.compare(GetMklOpName("Mul")) || - 0 == op_name.compare(GetMklOpName("Maximum")) || - 0 == op_name.compare(GetMklOpName("SquaredDifference"))); + /// Function to handle output reorder + /// + /// This function performs very similar functionality as input reordering + /// function above. The only difference is that this function does not add + /// reorder primitive to the net. The reason for this is: the reorder + /// primitive for output needs to be added to the list only after operation + /// has executed. But we need to prepare a temporary buffer in case output + /// reorder is needed. And this temporary buffer will hold the output of + /// an operation before it is fed to reorder primitive. + /// + /// @input memory primitive descriptor for the given output of an operation + /// @return: true in case reorder of output is needed; false, otherwise. 
+ bool PrepareReorderToUserMemIfReq(const memory::primitive_desc& op_pd) { + CHECK_NOTNULL(user_memory_); + if (op_pd != user_memory_->get_primitive_desc()) { + // TODO(nhasabni): can we remove dynamic memory allocation? + reorder_memory_ = new memory(op_pd); + return true; + } + return false; + } - VLOG(1) << "mkl_op_registry::" << op_name - << " is elementwise MKL op: " << result; - return result; -} + /// Function to actually insert reorder primitive in the net + /// + /// This function completes remaining part of output reordering. It inserts + /// a reordering primitive from the temporary buffer that holds the output + /// to the user-specified output buffer. + /// + /// @input: net - net to which to add reorder primitive + void InsertReorderToUserMem(std::vector<primitive>* net) { + CHECK_NOTNULL(net); + CHECK_NOTNULL(user_memory_); + CHECK_NOTNULL(reorder_memory_); + net->push_back(reorder(*reorder_memory_, *user_memory_)); + } +}; -} // namespace mkl_op_registry +#endif // INTEL_MKL_DNN } // namespace tensorflow #endif // INTEL_MKL diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md index d8925d3909..e6a4088656 100644 --- a/tensorflow/docs_src/install/install_sources.md +++ b/tensorflow/docs_src/install/install_sources.md @@ -429,3 +429,41 @@ Stack Overflow and specify the `tensorflow` tag. 
<pre>ImportError: cannot import name pywrap_tensorflow</pre></td> </tr> </table> + +## Tested source configurations +**Linux** +<table> +<tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr> +<tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr> +<tr><td>tensorflow_gpu-1.3.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>6</td><td>8</td></tr> +<tr><td>tensorflow-1.2.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr> +<tr><td>tensorflow_gpu-1.2.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>5.1</td><td>8</td></tr> +<tr><td>tensorflow-1.1.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.2</td><td>N/A</td><td>N/A</td></tr> +<tr><td>tensorflow_gpu-1.1.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.2</td><td>5.1</td><td>8</td></tr> +<tr><td>tensorflow-1.0.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.2</td><td>N/A</td><td>N/A</td></tr> +<tr><td>tensorflow_gpu-1.0.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.2</td><td>5.1</td><td>8</td></tr> +</table> + +**Mac** +<table> +<tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr> +<tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr> +<tr><td>ttensorflow-1.2.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr> +<tr><td>ttensorflow-1.1.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.2</td><td>N/A</td><td>N/A</td></tr> +<tr><td>ttensorflow_gpu-1.1.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 
0.4.2</td><td>5.1</td><td>8</td></tr> +<tr><td>ttensorflow-1.0.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.2</td><td>N/A</td><td>N/A</td></tr> +<tr><td>ttensorflow_gpu-1.0.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.2</td><td>5.1</td><td>8</td></tr> +</table> + +**Windows** +<table> +<tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr> +<tr><td>tensorflow-1.3.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr> +<tr><td>tensorflow_gpu-1.3.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>6</td><td>8</td></tr> +<tr><td>tensorflow-1.2.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr> +<tr><td>tensorflow_gpu-1.2.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>5.1</td><td>8</td></tr> +<tr><td>tensorflow-1.1.0</td><td>CPU</td><td>3.5</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr> +<tr><td>tensorflow_gpu-1.1.0</td><td>GPU</td><td>3.5</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>5.1</td><td>8</td></tr> +<tr><td>tensorflow-1.0.0</td><td>CPU</td><td>3.5</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr> +<tr><td>tensorflow_gpu-1.0.0</td><td>GPU</td><td>3.5</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>5.1</td><td>8</td></tr> +</table> diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/SpeechActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/SpeechActivity.java index eb4dc69d63..184df1bdb4 100644 --- a/tensorflow/examples/android/src/org/tensorflow/demo/SpeechActivity.java +++ b/tensorflow/examples/android/src/org/tensorflow/demo/SpeechActivity.java @@ -37,6 +37,7 @@ import android.content.pm.PackageManager; import 
android.media.AudioFormat; import android.media.AudioRecord; import android.media.MediaRecorder; +import android.os.Build; import android.os.Bundle; import android.util.Log; import android.view.View; @@ -151,12 +152,15 @@ public class SpeechActivity extends Activity { // Start the recording and recognition threads. requestMicrophonePermission(); + startRecording(); startRecognition(); } private void requestMicrophonePermission() { - requestPermissions( - new String[] {android.Manifest.permission.RECORD_AUDIO}, REQUEST_RECORD_AUDIO); + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M) { + requestPermissions( + new String[]{android.Manifest.permission.RECORD_AUDIO}, REQUEST_RECORD_AUDIO); + } } @Override diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py index 6d98c7b85d..1fa2b14869 100644 --- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py +++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py @@ -89,7 +89,7 @@ def build_dataset(words, n_words): # Filling 4 global variables: # data - list of codes (integers from 0 to vocabulary_size-1). 
# This is the original text but words are replaced by their codes -# count - map of words(strings) to count of occurences +# count - map of words(strings) to count of occurrences # dictionary - map of words(strings) to their codes(integers) # reverse_dictionary - maps codes(integers) to words(strings) data, count, dictionary, reverse_dictionary = build_dataset(vocabulary, diff --git a/tensorflow/go/example_inception_inference_test.go b/tensorflow/go/example_inception_inference_test.go index 2162fbe484..f84a588899 100644 --- a/tensorflow/go/example_inception_inference_test.go +++ b/tensorflow/go/example_inception_inference_test.go @@ -28,8 +28,8 @@ import ( "os" "path/filepath" - "github.com/tensorflow/tensorflow/tensorflow/go/op" tf "github.com/tensorflow/tensorflow/tensorflow/go" + "github.com/tensorflow/tensorflow/tensorflow/go/op" ) func Example() { diff --git a/tensorflow/go/tensor.go b/tensorflow/go/tensor.go index a534a0d659..e8fa21a62b 100644 --- a/tensorflow/go/tensor.go +++ b/tensorflow/go/tensor.go @@ -92,7 +92,7 @@ func NewTensor(value interface{}) (*Tensor, error) { raw := tensorData(t.c) buf := bytes.NewBuffer(raw[:0:len(raw)]) if dataType != String { - if err := encodeTensor(buf, val); err != nil { + if err := encodeTensor(buf, val, shape); err != nil { return nil, err } if uintptr(buf.Len()) != nbytes { @@ -100,7 +100,7 @@ func NewTensor(value interface{}) (*Tensor, error) { } } else { e := stringEncoder{offsets: buf, data: raw[nflattened*8 : len(raw)], status: newStatus()} - if err := e.encode(reflect.ValueOf(value)); err != nil { + if err := e.encode(reflect.ValueOf(value), shape); err != nil { return nil, err } if int64(buf.Len()) != nflattened*8 { @@ -236,17 +236,11 @@ func shapeAndDataTypeOf(val reflect.Value) (shape []int64, dt DataType, err erro typ := val.Type() for typ.Kind() == reflect.Array || typ.Kind() == reflect.Slice { shape = append(shape, int64(val.Len())) - // If slice elements are slices, verify that all of them have the same size. 
- // Go's type system makes that guarantee for arrays. if val.Len() > 0 { - if val.Type().Elem().Kind() == reflect.Slice { - expected := val.Index(0).Len() - for i := 1; i < val.Len(); i++ { - if val.Index(i).Len() != expected { - return shape, dt, fmt.Errorf("mismatched slice lengths: %d and %d", val.Index(i).Len(), expected) - } - } - } + // In order to check tensor structure properly in general case we need to iterate over all slices of the tensor to check sizes match + // Since we already going to iterate over all elements in encodeTensor() let's + // 1) do the actual check in encodeTensor() to save some cpu cycles here + // 2) assume the shape is represented by lengths of elements with zero index in each dimension val = val.Index(0) } typ = typ.Elem() @@ -302,7 +296,7 @@ func byteSizeOfEncodedStrings(val interface{}) uintptr { // encodeTensor writes v to the specified buffer using the format specified in // c_api.h. Use stringEncoder for String tensors. -func encodeTensor(w *bytes.Buffer, v reflect.Value) error { +func encodeTensor(w *bytes.Buffer, v reflect.Value, shape []int64) error { switch v.Kind() { case reflect.Bool: b := byte(0) @@ -318,19 +312,18 @@ func encodeTensor(w *bytes.Buffer, v reflect.Value) error { } case reflect.Array, reflect.Slice: - // If slice elements are slices, verify that all of them have the same size. + // If current dimension is a slice, verify that it has the expected size // Go's type system makes that guarantee for arrays. 
- if v.Len() > 0 && v.Type().Elem().Kind() == reflect.Slice { - expected := v.Index(0).Len() - for i := 1; i < v.Len(); i++ { - if v.Index(i).Len() != expected { - return fmt.Errorf("mismatched slice lengths: %d and %d", v.Index(i).Len(), expected) - } + if v.Kind() == reflect.Slice { + expected := int(shape[0]) + if v.Len() != expected { + return fmt.Errorf("mismatched slice lengths: %d and %d", v.Len(), expected) } } + subShape := shape[1:] for i := 0; i < v.Len(); i++ { - err := encodeTensor(w, v.Index(i)) + err := encodeTensor(w, v.Index(i), subShape) if err != nil { return err } @@ -379,7 +372,7 @@ type stringEncoder struct { status *status } -func (e *stringEncoder) encode(v reflect.Value) error { +func (e *stringEncoder) encode(v reflect.Value, shape []int64) error { if v.Kind() == reflect.String { if err := binary.Write(e.offsets, nativeEndian, e.offset); err != nil { return err @@ -395,8 +388,17 @@ func (e *stringEncoder) encode(v reflect.Value) error { C.free(unsafe.Pointer(src)) return e.status.Err() } + + if v.Kind() == reflect.Slice { + expected := int(shape[0]) + if v.Len() != expected { + return fmt.Errorf("mismatched slice lengths: %d and %d", v.Len(), expected) + } + } + + subShape := shape[1:] for i := 0; i < v.Len(); i++ { - if err := e.encode(v.Index(i)); err != nil { + if err := e.encode(v.Index(i), subShape); err != nil { return err } } diff --git a/tensorflow/go/tensor_test.go b/tensorflow/go/tensor_test.go index 2fc7553f87..35bd2fd9a5 100644 --- a/tensorflow/go/tensor_test.go +++ b/tensorflow/go/tensor_test.go @@ -42,6 +42,10 @@ func TestNewTensor(t *testing.T) { {[]int64{2}, []bool{true, false}}, {[]int64{1}, []float64{1}}, {[]int64{1}, [1]float64{1}}, + {[]int64{1, 1}, [1][1]float64{{1}}}, + {[]int64{1, 1, 1}, [1][1][]float64{{{1}}}}, + {[]int64{1, 1, 2}, [1][][2]float64{{{1, 2}}}}, + {[]int64{1, 1, 1, 1}, [1][][1][]float64{{{{1}}}}}, {[]int64{2}, []string{"string", "slice"}}, {[]int64{2}, [2]string{"string", "array"}}, {[]int64{3, 2}, 
[][]float64{{1, 2}, {3, 4}, {5, 6}}}, @@ -74,6 +78,12 @@ func TestNewTensor(t *testing.T) { []uint64{5}, // Mismatched dimensions [][]float32{{1, 2, 3}, {4}}, + // Mismatched dimensions. Should return "mismatched slice lengths" error instead of "BUG" + [][][]float32{{{1, 2}, {3, 4}}, {{1}, {3}}}, + // Mismatched dimensions. Should return error instead of valid tensor + [][][]float32{{{1, 2}, {3, 4}}, {{1}, {3}}, {{1, 2, 3}, {2, 3, 4}}}, + // Mismatched dimensions for strings + [][]string{{"abc"}, {"abcd", "abcd"}}, } for _, test := range tests { diff --git a/tensorflow/java/src/gen/perl/tftypes-runall.pl b/tensorflow/java/src/gen/perl/tftypes-runall.pl index 258c1ff836..a451ce92aa 100644 --- a/tensorflow/java/src/gen/perl/tftypes-runall.pl +++ b/tensorflow/java/src/gen/perl/tftypes-runall.pl @@ -37,4 +37,4 @@ sub locchk { &locchk("$rsrc/tftypes.csv"); system("perl $dir/tftypes.pl -t $rsrc/tftypes.csv $pkg/types"); -# system("perl $dir/tftypes.pl -c $rsrc/tftypes.csv $rsrc/Tensors.java.tmpl > $pkg/op/Tensors.java"); +system("perl $dir/tftypes.pl -c $rsrc/tftypes.csv $rsrc/Tensors.java.tmpl > $pkg/Tensors.java"); diff --git a/tensorflow/java/src/gen/perl/tftypes.pl b/tensorflow/java/src/gen/perl/tftypes.pl index 86867335cb..115723ac8a 100644 --- a/tensorflow/java/src/gen/perl/tftypes.pl +++ b/tensorflow/java/src/gen/perl/tftypes.pl @@ -75,15 +75,23 @@ open (TYPEDESC, $typedesc); my @info = ([]); +sub trim { + (my $ret) = @_; + $ret =~ s/^\s*//g; + $ret =~ s/\s*$//g; + return $ret; +} + while (<TYPEDESC>) { chomp; my $line = $_; if ($line =~ m/^TF type/) { next } $line =~ s/\r$//; - (my $name, my $jtype, my $creat, my $default, my $desc) = - split /,/, $line, 5; - $desc =~ s/^ *//g; - $desc =~ s/ *$//g; + my @items = split /,/, $line, 6; + for (my $i = 0; $i <= $#items; $i++) { + $items[$i] = trim $items[$i]; + } + my $jtype = $items[2]; $jtypecount{$jtype}++; if ($jtypecount{$jtype} > 1) { # currently allowing Java types to stand for more than one TF type, but @@ 
-92,63 +100,85 @@ while (<TYPEDESC>) { # exit 1 } - push @info, [$name, $jtype, $creat, $default, $desc]; + push @info, \@items; +} + +sub article { + (my $s) = @_; + if (substr($s, 0, 1) =~ m/^[aeoiu8]$/i) { + return "an $s" + } else { + return "a $s" + } } for (my $i = 1; $i <= $#info; $i++) { - (my $name, my $jtype, my $creat, my $default, my $desc) = + (my $name, my $builtin, my $jtype, my $creat, my $default, my $desc) = @{$info[$i]}; - my $tfname = "TF".$name; + my $tfname = $name; my $ucname = uc $name; + print STDERR "$name $desc\n"; + if ($option eq '-t') { if ($jtype eq '') { next } + if ($builtin eq 'y') { next } # Generate class declarations # print STDERR "Creating $dirname/$tfname.java\n"; open (CLASSFILE, ">$dirname/$tfname.java") || die "Can't open $tfname.java"; - print CLASSFILE $copyright; - print CLASSFILE "// GENERATED FILE. To update, edit tftypes.pl instead.\n\n"; - - my $fulldesc = $desc; - if (substr($desc, 0, 1) =~ m/^[aeoiu8]$/i) { - $fulldesc = "an $desc" - } else { - $fulldesc = "a $desc" - } - print CLASSFILE "package org.tensorflow.types;\n\n" - ."import org.tensorflow.DataType;\n\n"; + print CLASSFILE $copyright, "\n"; + # print CLASSFILE "// GENERATED FILE. To update, edit tftypes.pl instead.\n\n"; + + my $fulldesc = article($desc); + print CLASSFILE "package org.tensorflow.types;\n\n"; print CLASSFILE "/** Represents $fulldesc. */\n" - ."public class $tfname implements TFType {\n" - ." private $tfname() {}\n" - ." static {\n" - ." Types.typeCodes.put($tfname.class, DataType.$ucname);\n" - ." }\n"; - if ($default ne '') { - print CLASSFILE - " static {\n" - ." Types.scalars.put($tfname.class, $default);\n" - ." }\n"; - } - print CLASSFILE "}\n"; + ."public class $tfname {\n" + ." private $tfname() {\n" + ." 
}\n" + ."}\n"; close(CLASSFILE); } elsif ($option eq '-c') { # Generate creator declarations for Tensors.java if ($jtype ne '' && $creat eq 'y') { - for (my $brackets = ''; length $brackets <= 12; $brackets .= '[]') { + for (my $brackets = '', my $rank = 0; length $brackets <= 12; $brackets .= '[]', $rank++) { + my $datainfo = " * \@param data An array containing the values to put into the new tensor.\n" + ." * The dimensions of the new tensor will match those of the array.\n"; + if ($rank == 0) { + $datainfo = " * \@param data The value to put into the new scalar tensor.\n" + } + + my $trank = $rank; + if ($tfname eq 'String') { + $trank = $rank-1; + next if $trank < 0; + + $datainfo = " * \@param data An array containing the data to put into the new tensor.\n" + ." * String elements are sequences of bytes from the last array dimension.\n"; + } + + + my $intro = ($trank > 0) + ? "Creates a rank-$trank tensor of {\@code $jtype} elements." + : "Creates a scalar tensor containing a single {\@code $jtype} element."; $typeinfo .= - " public static Tensor<$tfname> create($jtype$brackets data) {\n" - ." return Tensor.create(data, $tfname.class);\n" - ." }\n"; + " /**\n" + ." * $intro\n" + ." * \n" + .$datainfo + ." */\n" + ." public static Tensor<$tfname> create($jtype$brackets data) {\n" + ." return Tensor.create(data, $tfname.class);\n" + ." }\n\n"; } } - if ($text =~ m/\b$tfname\b/ || $creat eq 'y') { + if ($text =~ m/\b$tfname\b/ && $builtin eq 'n' && $creat eq 'y') { $imports .= "import org.tensorflow.types.$tfname;\n"; } } } if ($option ne '-t') { - print "// GENERATED FILE. Edits to this file will be lost -- edit $tmpl instead.\n"; +# print "// GENERATED FILE. 
Edits to this file will be lost -- edit $tmpl instead.\n"; $text =~ s/\@TYPEINFO\@/$typeinfo/; $text =~ s/\@IMPORTS\@/$imports/; diff --git a/tensorflow/java/src/gen/resources/Tensors.java.tmpl b/tensorflow/java/src/gen/resources/Tensors.java.tmpl new file mode 100644 index 0000000000..98e1588559 --- /dev/null +++ b/tensorflow/java/src/gen/resources/Tensors.java.tmpl @@ -0,0 +1,31 @@ +package org.tensorflow; + +import static java.nio.charset.StandardCharsets.UTF_8; +import org.tensorflow.Tensor; +@IMPORTS@ + +/** + * Type-safe factory methods for creating {@link Tensor} objects. + */ +public final class Tensors { + private Tensors() {} + + /** Creates a scalar String tensor using the default, UTF-8 encoding. + * + * @param data The string to put into the new scalar tensor. + */ + public static Tensor<String> create(String data) { + return Tensor.create(data.getBytes(UTF_8), String.class); + } + + /** Creates a scalar String tensor using a specified encoding. + * + * @param charset The encoding from String to bytes. + * @param data The string to put into the new scalar tensor. 
+ */ + public static Tensor<String> create(String data, java.nio.charset.Charset charset) { + return Tensor.create(data.getBytes(charset), String.class); + } + +@TYPEINFO@} + diff --git a/tensorflow/java/src/gen/resources/tftypes.csv b/tensorflow/java/src/gen/resources/tftypes.csv index 88acaafd3c..6f26230f27 100644 --- a/tensorflow/java/src/gen/resources/tftypes.csv +++ b/tensorflow/java/src/gen/resources/tftypes.csv @@ -1,21 +1,21 @@ -TF type,Java type,Creator?,Zero value,Description -Float,float,y,0f,32-bit single precision floating point number -Double,double,y,0.0,64-bit double precision floating point number -Int32,int,y,0,32-bit signed integer -UInt8,byte,n,(byte)0,8-bit unsigned integer -Int16,,n,(short)0,16-bit signed integer -Int8,,n,(byte)0,8-bit signed integer -String,byte,n,,arbitrary sequence of bytes -Complex64,,n,,single-precision complex number -Int64,long,y,0L,64-bit signed integer -Bool,boolean,y,false,boolean -QInt8,,n,,quantized int8 -QUInt8,,n,,quantized uint8 -QInt32,,n,,quantized int32 -BFloat16,,n,,float32 truncated to 16 bits. Only for cast ops. -QInt16,,n,,quantized int16 -QUInt16,,n,,quantized uint16 -UInt16,,n,,16-bit unsigned integer -Complex128,,n,,double-precision complex number -Half,,n,, -Resource,,n,, +TF type,Builtin,Java type,Creator?,Zero value,Description +Float,y,float,y,0f,32-bit single precision floating point number +Double,y,double,y,0.0,64-bit double precision floating point number +Integer,y,int,y,0,32-bit signed integer +UInt8,n,byte,n,(byte)0,8-bit unsigned integer +Short,y,,n,(short)0,16-bit signed integer +Byte,y,,n,(byte)0,8-bit signed integer +String,y,byte,y,,arbitrary sequence of bytes +Complex64,n,,n,,single-precision complex number +Long,y,long,y,0L,64-bit signed integer +Boolean,y,boolean,y,false,boolean +QInt8,n,,n,,quantized int8 +QUInt8,n,,n,,quantized uint8 +QInt32,n,,n,,quantized int32 +BFloat16,n,,n,,float32 truncated to 16 bits. Only for cast ops. 
+QInt16,n,,n,,quantized int16 +QUInt16,n,,n,,quantized uint16 +UInt16,n,,n,,16-bit unsigned integer +Complex128,n,,n,,double-precision complex number +Half,n,,n,, +Resource,n,,n,, diff --git a/tensorflow/java/src/main/java/org/tensorflow/DataType.java b/tensorflow/java/src/main/java/org/tensorflow/DataType.java index e67e266ff7..e835101d08 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/DataType.java +++ b/tensorflow/java/src/main/java/org/tensorflow/DataType.java @@ -15,7 +15,13 @@ limitations under the License. package org.tensorflow; -/** Type of elements in a {@link Tensor}. */ +import java.util.HashMap; +import java.util.Map; +import org.tensorflow.types.UInt8; + +/** + * Represents the type of elements in a {@link Tensor} as an enum. + */ public enum DataType { /** 32-bit single precision floating point. */ FLOAT(1), @@ -55,14 +61,41 @@ public enum DataType { } // Cached to avoid copying it - final private static DataType[] values = values(); + private static final DataType[] values = values(); static DataType fromC(int c) { for (DataType t : values) { - if (t.value == c) + if (t.value == c) { return t; + } } throw new IllegalArgumentException( "DataType " + c + " is not recognized in Java (version " + TensorFlow.version() + ")"); } + + /** + * Returns the DataType of a Tensor whose elements have the type specified by class {@code c}. + * + * @param c The class describing the TensorFlow type of interest. 
+ */ + public static DataType fromClass(Class<?> c) { + DataType dtype = typeCodes.get(c); + if (dtype == null) { + throw new IllegalArgumentException( + c.getName() + " objects cannot be used as elements in a TensorFlow Tensor"); + } + return dtype; + } + + private static final Map<Class<?>, DataType> typeCodes = new HashMap<>(); + + static { + typeCodes.put(Float.class, DataType.FLOAT); + typeCodes.put(Double.class, DataType.DOUBLE); + typeCodes.put(Integer.class, DataType.INT32); + typeCodes.put(UInt8.class, DataType.UINT8); + typeCodes.put(Long.class, DataType.INT64); + typeCodes.put(Boolean.class, DataType.BOOL); + typeCodes.put(String.class, DataType.STRING); + } } diff --git a/tensorflow/java/src/main/java/org/tensorflow/Graph.java b/tensorflow/java/src/main/java/org/tensorflow/Graph.java index 58ad3ab193..d4fd3db5f7 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/Graph.java +++ b/tensorflow/java/src/main/java/org/tensorflow/Graph.java @@ -81,8 +81,8 @@ public final class Graph implements AutoCloseable { /** * Iterator over all the {@link Operation}s in the graph. * - * The order of iteration is unspecified. Consumers of the iterator will received no notification - * should the underlying graph change during iteration. + * <p>The order of iteration is unspecified. Consumers of the iterator will receive no + * notification should the underlying graph change during iteration. 
*/ public Iterator<Operation> operations() { return new OperationIterator(this); @@ -245,7 +245,8 @@ public final class Graph implements AutoCloseable { private static native long operation(long handle, String name); - // This method returns the Operation native handle at index 0 and the new value for pos at index 1 (see TF_GraphNextOperation) + // This method returns the Operation native handle at index 0 and the new value for pos at index 1 + // (see TF_GraphNextOperation) private static native long[] nextOperation(long handle, int position); private static native void importGraphDef(long handle, byte[] graphDef, String prefix) diff --git a/tensorflow/java/src/main/java/org/tensorflow/Input.java b/tensorflow/java/src/main/java/org/tensorflow/Input.java index 8e6685ee0f..13bc463e7d 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/Input.java +++ b/tensorflow/java/src/main/java/org/tensorflow/Input.java @@ -34,7 +34,7 @@ package org.tensorflow; * ops.array().concat(0, split); * }</pre> */ -public interface Input { +public interface Input<T> { /** * Returns the symbolic handle of a tensor. 
@@ -44,5 +44,5 @@ public interface Input { * * @see OperationBuilder#addInput(Output) */ - Output asOutput(); + Output<T> asOutput(); } diff --git a/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java b/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java index d2d019babb..2b431eebf5 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java +++ b/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java @@ -122,8 +122,7 @@ final class NativeLibrary { } private static String extractResource( - InputStream resource, String resourceName, String extractToDirectory) - throws IOException { + InputStream resource, String resourceName, String extractToDirectory) throws IOException { final File dst = new File(extractToDirectory, System.mapLibraryName(resourceName)); dst.deleteOnExit(); final String dstPath = dst.toString(); @@ -184,8 +183,7 @@ final class NativeLibrary { // compatibility. private static File createTemporaryDirectory() { File baseDirectory = new File(System.getProperty("java.io.tmpdir")); - String directoryName - = "tensorflow_native_libraries-" + System.currentTimeMillis() + "-"; + String directoryName = "tensorflow_native_libraries-" + System.currentTimeMillis() + "-"; for (int attempt = 0; attempt < 1000; attempt++) { File temporaryDirectory = new File(baseDirectory, directoryName + attempt); if (temporaryDirectory.mkdir()) { @@ -194,7 +192,8 @@ final class NativeLibrary { } throw new IllegalStateException( "Could not create a temporary directory (tried to make " - + directoryName + "*) to extract TensorFlow native libraries."); + + directoryName + + "*) to extract TensorFlow native libraries."); } private NativeLibrary() {} diff --git a/tensorflow/java/src/main/java/org/tensorflow/Operand.java b/tensorflow/java/src/main/java/org/tensorflow/Operand.java index 695c4c1060..61082e83d5 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/Operand.java +++ 
b/tensorflow/java/src/main/java/org/tensorflow/Operand.java @@ -22,19 +22,19 @@ package org.tensorflow; * * <pre>{@code * // The "decodeJpeg" operation can be used as an operand to the "cast" operation - * Operand decodeJpeg = ops.image().decodeJpeg(...); + * Operand<UInt8> decodeJpeg = ops.image().decodeJpeg(...); * ops.math().cast(decodeJpeg, DataType.FLOAT); * * // The output "y" of the "unique" operation can be used as an operand to the "cast" operation - * Output y = ops.array().unique(...).y(); - * ops.math().cast(y, DataType.FLOAT); + * Output<Integer> y = ops.array().unique(...).y(); + * ops.math().cast(y, Float.class); * * // The "split" operation can be used as operand list to the "concat" operation - * Iterable<? extends Operand> split = ops.array().split(...); + * Iterable<? extends Operand<Float>> split = ops.array().split(...); * ops.array().concat(0, split); * }</pre> */ -public interface Operand { +public interface Operand<T> { /** * Returns the symbolic handle of a tensor. @@ -44,5 +44,5 @@ public interface Operand { * * @see OperationBuilder#addInput(Output) */ - Output asOutput(); + Output<T> asOutput(); } diff --git a/tensorflow/java/src/main/java/org/tensorflow/Operation.java b/tensorflow/java/src/main/java/org/tensorflow/Operation.java index ec26309fba..6b82e5780b 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/Operation.java +++ b/tensorflow/java/src/main/java/org/tensorflow/Operation.java @@ -98,16 +98,26 @@ public final class Operation { * @param length number of tensors in the list * @return array of {@code Output} */ - public Output[] outputList(int idx, int length) { - Output[] outputs = new Output[length]; + public Output<?>[] outputList(int idx, int length) { + Output<?>[] outputs = new Output<?>[length]; for (int i = 0; i < length; ++i) { outputs[i] = output(idx + i); } return outputs; } - /** Returns a symbolic handle to one of the tensors produced by this operation. 
*/ - public Output output(int idx) { + /** + * Returns a symbolic handle to one of the tensors produced by this operation. + * + * <p>Warning: Does not check that the type of the tensor matches T. It is recommended to call + * this method with an explicit type parameter rather than letting it be inferred, e.g. {@code + * operation.<Integer>output(0)} + * + * @param <T> The expected element type of the tensors produced by this output. + * @param idx The index of the output among the outputs produced by this operation. + */ + @SuppressWarnings({"rawtypes", "unchecked"}) + public <T> Output<T> output(int idx) { return new Output(this, idx); } diff --git a/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java b/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java index 15077ce439..9a1b7592b3 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java +++ b/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java @@ -63,7 +63,6 @@ public final class OperationBuilder { } } - /** * Returns the builder to create an operation. * @@ -73,7 +72,7 @@ public final class OperationBuilder { * @param input {@link Output} supposed to be the input of the OperationBuilder. * @return the OperationBuilder instance for chaining. 
*/ - public OperationBuilder addInput(Output input) { + public OperationBuilder addInput(Output<?> input) { Graph.Reference r = graph.ref(); try { addInput(unsafeNativeHandle, input.op().getUnsafeNativeHandle(), input.index()); @@ -106,7 +105,7 @@ public final class OperationBuilder { return this; } - public OperationBuilder addInputList(Output[] inputs) { + public OperationBuilder addInputList(Output<?>[] inputs) { Graph.Reference r = graph.ref(); try { long[] opHandles = new long[inputs.length]; @@ -231,7 +230,7 @@ public final class OperationBuilder { return this; } - public OperationBuilder setAttr(String name, Tensor value) { + public OperationBuilder setAttr(String name, Tensor<?> value) { Graph.Reference r = graph.ref(); try { setAttrTensor(unsafeNativeHandle, name, value.getNativeHandle()); @@ -241,10 +240,10 @@ public final class OperationBuilder { return this; } - public OperationBuilder setAttr(String name, Tensor[] value) { + public OperationBuilder setAttr(String name, Tensor<?>[] value) { long[] handles = new long[value.length]; int idx = 0; - for (Tensor t : value) { + for (Tensor<?> t : value) { handles[idx++] = t.getNativeHandle(); } Graph.Reference r = graph.ref(); @@ -266,7 +265,7 @@ public final class OperationBuilder { return this; } - public OperationBuilder setAttr(String name, String[] value) { + public OperationBuilder setAttr(String name, String[] value) { Charset utf8 = Charset.forName("UTF-8"); Object[] objects = new Object[value.length]; for (int i = 0; i < value.length; ++i) { @@ -326,5 +325,4 @@ public final class OperationBuilder { private static native void setAttrShape(long handle, String name, long[] shape, int numDims); private static native void setAttrStringList(long handle, String name, Object[] value); - } diff --git a/tensorflow/java/src/main/java/org/tensorflow/Output.java b/tensorflow/java/src/main/java/org/tensorflow/Output.java index 8dff50fafb..0e17a722ff 100644 --- 
a/tensorflow/java/src/main/java/org/tensorflow/Output.java +++ b/tensorflow/java/src/main/java/org/tensorflow/Output.java @@ -20,13 +20,13 @@ import java.util.Objects; /** * A symbolic handle to a tensor produced by an {@link Operation}. * - * <p>An Output is a symbolic handle to a tensor. The value of the Tensor is computed by executing - * the {@link Operation} in a {@link Session}. + * <p>An Output<T> is a symbolic handle to a Tensor<T>. The value of the tensor is computed by + * executing the {@link Operation} in a {@link Session}. * * <p>By implementing the {@link Operand} interface, instances of this class also act as operands to * {@link org.tensorflow.op.Op Op} instances. */ -public final class Output implements Operand { +public final class Output<T> implements Operand<T> { /** Handle to the idx-th output of the Operation {@code op}. */ public Output(Operation op, int idx) { @@ -55,7 +55,7 @@ public final class Output implements Operand { } @Override - public Output asOutput() { + public Output<T> asOutput() { return this; } @@ -69,8 +69,8 @@ public final class Output implements Operand { if (o == this) { return true; } - if (o instanceof Output) { - Output that = (Output) o; + if (o instanceof Output<?>) { + Output<?> that = (Output<?>) o; return index == that.index && operation.equals(that.operation); } return false; diff --git a/tensorflow/java/src/main/java/org/tensorflow/SavedModelBundle.java b/tensorflow/java/src/main/java/org/tensorflow/SavedModelBundle.java index b4591dd869..c8b9126f03 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/SavedModelBundle.java +++ b/tensorflow/java/src/main/java/org/tensorflow/SavedModelBundle.java @@ -27,8 +27,9 @@ package org.tensorflow; public class SavedModelBundle implements AutoCloseable { /** - * Load a saved model from an export directory. The model that is being loaded should be created using - * the <a href="https://www.tensorflow.org/api_docs/python/tf/saved_model">Saved Model API</a>. 
+ * Load a saved model from an export directory. The model that is being loaded should be created + * using the <a href="https://www.tensorflow.org/api_docs/python/tf/saved_model">Saved Model + * API</a>. * * @param exportDir the directory path containing a saved model. * @param tags the tags identifying the specific metagraphdef to load. diff --git a/tensorflow/java/src/main/java/org/tensorflow/Session.java b/tensorflow/java/src/main/java/org/tensorflow/Session.java index 83a300a560..73324f23e6 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/Session.java +++ b/tensorflow/java/src/main/java/org/tensorflow/Session.java @@ -127,7 +127,7 @@ public final class Session implements AutoCloseable { * {@code SignatureDef} protocol buffer messages that are included in {@link * SavedModelBundle#metaGraphDef()}. */ - public Runner feed(String operation, Tensor t) { + public Runner feed(String operation, Tensor<?> t) { return feed(parseOutput(operation), t); } @@ -138,7 +138,7 @@ public final class Session implements AutoCloseable { * <p>Operations in a {@link Graph} can have multiple outputs, {@code index} identifies which * one {@code t} is being provided for. */ - public Runner feed(String operation, int index, Tensor t) { + public Runner feed(String operation, int index, Tensor<?> t) { Operation op = operationByName(operation); if (op != null) { inputs.add(op.output(index)); @@ -151,7 +151,7 @@ public final class Session implements AutoCloseable { * Use {@code t} instead of the Tensor referred to by executing the operation referred to by * {@code output}. */ - public Runner feed(Output o, Tensor t) { + public Runner feed(Output<?> o, Tensor<?> t) { inputs.add(o); inputTensors.add(t); return this; @@ -186,7 +186,7 @@ public final class Session implements AutoCloseable { } /** Makes {@link #run()} return the Tensor referred to by {@code output}. 
*/ - public Runner fetch(Output output) { + public Runner fetch(Output<?> output) { outputs.add(output); return this; } @@ -240,8 +240,11 @@ public final class Session implements AutoCloseable { * easier for the caller to cleanup (perhaps returning something like AutoCloseableList in * SessionTest.java), and (b) Evaluate whether the return value should be a list, or maybe a * {@code Map<Output, Tensor>}? + * + * <p>TODO(andrewmyers): It would also be good if whatever is returned here made it easier to + * extract output tensors in a type-safe way. */ - public List<Tensor> run() { + public List<Tensor<?>> run() { return runHelper(false).outputs; } @@ -269,17 +272,17 @@ public final class Session implements AutoCloseable { // It's okay to use Operation.getUnsafeNativeHandle() here since the safety depends on the // validity of the Graph and graphRef ensures that. int idx = 0; - for (Tensor t : inputTensors) { + for (Tensor<?> t : inputTensors) { inputTensorHandles[idx++] = t.getNativeHandle(); } idx = 0; - for (Output o : inputs) { + for (Output<?> o : inputs) { inputOpHandles[idx] = o.op().getUnsafeNativeHandle(); inputOpIndices[idx] = o.index(); idx++; } idx = 0; - for (Output o : outputs) { + for (Output<?> o : outputs) { outputOpHandles[idx] = o.op().getUnsafeNativeHandle(); outputOpIndices[idx] = o.index(); idx++; @@ -306,12 +309,12 @@ public final class Session implements AutoCloseable { } finally { runRef.close(); } - List<Tensor> outputs = new ArrayList<Tensor>(); + List<Tensor<?>> outputs = new ArrayList<Tensor<?>>(); for (long h : outputTensorHandles) { try { outputs.add(Tensor.fromHandle(h)); } catch (Exception e) { - for (Tensor t : outputs) { + for (Tensor<?> t : outputs) { t.close(); } outputs.clear(); @@ -355,7 +358,8 @@ public final class Session implements AutoCloseable { return op; } - private Output parseOutput(String opName) { + @SuppressWarnings("rawtypes") + private Output<?> parseOutput(String opName) { int colon = opName.lastIndexOf(':'); if 
(colon == -1 || colon == opName.length() - 1) { return new Output(operationByName(opName), 0); @@ -369,9 +373,9 @@ public final class Session implements AutoCloseable { } } - private ArrayList<Output> inputs = new ArrayList<Output>(); - private ArrayList<Tensor> inputTensors = new ArrayList<Tensor>(); - private ArrayList<Output> outputs = new ArrayList<Output>(); + private ArrayList<Output<?>> inputs = new ArrayList<Output<?>>(); + private ArrayList<Tensor<?>> inputTensors = new ArrayList<Tensor<?>>(); + private ArrayList<Output<?>> outputs = new ArrayList<Output<?>>(); private ArrayList<Operation> targets = new ArrayList<Operation>(); private byte[] runOptions = null; } @@ -388,7 +392,7 @@ public final class Session implements AutoCloseable { */ public static final class Run { /** Tensors from requested fetches. */ - public List<Tensor> outputs; + public List<Tensor<?>> outputs; /** * (Experimental): Metadata about the run. diff --git a/tensorflow/java/src/main/java/org/tensorflow/Tensor.java b/tensorflow/java/src/main/java/org/tensorflow/Tensor.java index c5ad1ee51c..d4b753628b 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/Tensor.java +++ b/tensorflow/java/src/main/java/org/tensorflow/Tensor.java @@ -28,89 +28,117 @@ import java.util.Arrays; import java.util.HashMap; /** - * A typed multi-dimensional array. + * A statically typed multi-dimensional array whose elements are of a type described by T. * * <p>Instances of a Tensor are <b>not</b> thread-safe. * * <p><b>WARNING:</b> Resources consumed by the Tensor object <b>must</b> be explicitly freed by * invoking the {@link #close()} method when the object is no longer needed. 
For example, using a - * try-with-resources block like: + * try-with-resources block: * * <pre>{@code - * try(Tensor t = Tensor.create(...)) { + * try (Tensor t = Tensor.create(...)) { * doSomethingWith(t); * } * }</pre> */ -public final class Tensor implements AutoCloseable { +public final class Tensor<T> implements AutoCloseable { /** - * Create a Tensor from a Java object. + * Creates a Tensor from a Java object. * - * <p>A Tensor is a multi-dimensional array of elements of a limited set of types ({@link - * DataType}). Thus, not all Java objects can be converted to a Tensor. In particular, {@code obj} - * must be either a primitive (float, double, int, long, boolean) or a multi-dimensional array of - * one of those primitives. For example: + * <p>A {@code Tensor} is a multi-dimensional array of elements of a limited set of types ({@link + * types}), so not all Java objects can be converted to a {@code Tensor}. In particular, the + * argument {@code obj} must be either a primitive (float, double, int, long, boolean, byte) or a + * multi-dimensional array of one of those primitives. The argument {@code type} specifies how to + * interpret the first argument as a TensorFlow type. For example: * * <pre>{@code * // Valid: A 64-bit integer scalar. - * Tensor s = Tensor.create(42L); + * Tensor<Long> s = Tensor.create(42L, Long.class); * * // Valid: A 3x2 matrix of floats. * float[][] matrix = new float[3][2]; - * Tensor m = Tensor.create(matrix); + * Tensor<Float> m = Tensor.create(matrix, Float.class); * * // Invalid: Will throw an IllegalArgumentException as an arbitrary Object * // does not fit into the TensorFlow type system. - * Tensor o = Tensor.create(new Object()); + * Tensor<?> o = Tensor.create(new Object()) * * // Invalid: Will throw an IllegalArgumentException since there are * // a differing number of elements in each row of this 2-D array. 
* int[][] twoD = new int[2][]; * twoD[0] = new int[1]; * twoD[1] = new int[2]; - * Tensor x = Tensor.create(twoD); + * Tensor<Integer> x = Tensor.create(twoD, Integer.class); * }</pre> * - * {@link DataType#STRING} typed Tensors are multi-dimensionary arrays of arbitrary byte sequences - * and thus have {@code byte[]} and not {@code String}-valued elements. For example: + * {@link String}-typed Tensors are multi-dimensional arrays of arbitrary byte sequences, so can + * be initialized from arrays of {@code byte[]} elements. For example: * * <pre>{@code - * // Valid: A DataType.STRING tensor. - * Tensor s = Tensor.create(new byte[]{1, 2, 3}); + * // Valid: A String tensor. + * Tensor<String> s = Tensor.create(new byte[]{1, 2, 3}, String.class); * * // Java Strings will need to be encoded into a byte-sequence. * String mystring = "foo"; - * Tensor s = Tensor.create(mystring.getBytes("UTF-8")); + * Tensor<String> s = Tensor.create(mystring.getBytes("UTF-8"), String.class); * - * // Valid: Matrix of DataType.STRING tensors. + * // Valid: Matrix of String tensors. * // Each element might have a different length. * byte[][][] matrix = new byte[2][2][]; * matrix[0][0] = "this".getBytes("UTF-8"); * matrix[0][1] = "is".getBytes("UTF-8"); * matrix[1][0] = "a".getBytes("UTF-8"); * matrix[1][1] = "matrix".getBytes("UTF-8"); - * Tensor m = Tensor.create(matrix); + * Tensor<String> m = Tensor.create(matrix, String.class); * }</pre> * + * @param obj The object to convert to a Tensor<T>. Note that whether it is compatible with the + * type T is not checked by the type system. For type-safe creation of tensors, use {@link + * Tensors}. + * @param type The class object representing the type T. * @throws IllegalArgumentException if {@code obj} is not compatible with the TensorFlow type - * system, or if obj does not disambiguate between multiple DataTypes. In that case, consider - * using {@link #create(DataType, long[], ByteBuffer)} instead. + * system. 
*/ - public static Tensor create(Object obj) { + @SuppressWarnings("unchecked") + public static <T> Tensor<T> create(Object obj, Class<T> type) { + DataType dtype = DataType.fromClass(type); + if (!objectCompatWithType(obj, dtype)) { + throw new IllegalArgumentException( + "DataType of object does not match T (expected " + + dtype + + ", got " + + dataTypeOf(obj) + + ")"); + } + return (Tensor<T>) create(obj, dtype); + } + + /** + * Creates a tensor from an object whose class is inspected to figure out what the underlying data + * type should be. + * + * @throws IllegalArgumentException if {@code obj} is not compatible with the TensorFlow type + * system. + */ + public static Tensor<?> create(Object obj) { return create(obj, dataTypeOf(obj)); } /** - * Create a Tensor of data type {@code dtype} from a Java object. + * Create a Tensor of data type {@code dtype} from a Java object. Requires the parameter {@code T} + * to match {@code type}, but this condition is not checked. * - * @param dtype the intended tensor data type. It must match the the run-time type of the object. + * @param obj the object supplying the tensor data. + * @param dtype the data type of the tensor to create. It must be compatible with the run-time + * type of the object. + * @return the new tensor */ - static Tensor create(Object obj, DataType dtype) { - Tensor t = new Tensor(); - t.dtype = dtype; + private static Tensor<?> create(Object obj, DataType dtype) { + @SuppressWarnings("rawtypes") + Tensor<?> t = new Tensor(dtype); t.shapeCopy = new long[numDimensions(obj, dtype)]; - assert objectCompatWithType(obj, dtype); fillShape(obj, 0, t.shapeCopy); if (t.dtype != DataType.STRING) { int byteSize = elemByteSize(t.dtype) * numElements(t.shapeCopy); @@ -125,7 +153,7 @@ public final class Tensor implements AutoCloseable { } /** - * Create an {@link DataType#INT32} Tensor with data from the given buffer. + * Create a {@link Integer} Tensor with data from the given buffer. 
* * <p>Creates a Tensor with the given shape by copying elements from the buffer (starting from its * current position) into the tensor. For example, if {@code shape = {2,3} } (which represents a @@ -136,14 +164,14 @@ public final class Tensor implements AutoCloseable { * @param data a buffer containing the tensor data. * @throws IllegalArgumentException If the tensor shape is not compatible with the buffer */ - public static Tensor create(long[] shape, IntBuffer data) { - Tensor t = allocateForBuffer(DataType.INT32, shape, data.remaining()); + public static Tensor<Integer> create(long[] shape, IntBuffer data) { + Tensor<Integer> t = allocateForBuffer(DataType.INT32, shape, data.remaining()); t.buffer().asIntBuffer().put(data); return t; } /** - * Create a {@link DataType#FLOAT} Tensor with data from the given buffer. + * Create a {@link Float} Tensor with data from the given buffer. * * <p>Creates a Tensor with the given shape by copying elements from the buffer (starting from its * current position) into the tensor. For example, if {@code shape = {2,3} } (which represents a @@ -154,14 +182,14 @@ public final class Tensor implements AutoCloseable { * @param data a buffer containing the tensor data. * @throws IllegalArgumentException If the tensor shape is not compatible with the buffer */ - public static Tensor create(long[] shape, FloatBuffer data) { - Tensor t = allocateForBuffer(DataType.FLOAT, shape, data.remaining()); + public static Tensor<Float> create(long[] shape, FloatBuffer data) { + Tensor<Float> t = allocateForBuffer(DataType.FLOAT, shape, data.remaining()); t.buffer().asFloatBuffer().put(data); return t; } /** - * Create a {@link DataType#DOUBLE} Tensor with data from the given buffer. + * Create a {@link Double} Tensor with data from the given buffer. * * <p>Creates a Tensor with the given shape by copying elements from the buffer (starting from its * current position) into the tensor. 
For example, if {@code shape = {2,3} } (which represents a @@ -172,14 +200,14 @@ public final class Tensor implements AutoCloseable { * @param data a buffer containing the tensor data. * @throws IllegalArgumentException If the tensor shape is not compatible with the buffer */ - public static Tensor create(long[] shape, DoubleBuffer data) { - Tensor t = allocateForBuffer(DataType.DOUBLE, shape, data.remaining()); + public static Tensor<Double> create(long[] shape, DoubleBuffer data) { + Tensor<Double> t = allocateForBuffer(DataType.DOUBLE, shape, data.remaining()); t.buffer().asDoubleBuffer().put(data); return t; } /** - * Create an {@link DataType#INT64} Tensor with data from the given buffer. + * Create an {@link Long} Tensor with data from the given buffer. * * <p>Creates a Tensor with the given shape by copying elements from the buffer (starting from its * current position) into the tensor. For example, if {@code shape = {2,3} } (which represents a @@ -190,47 +218,87 @@ public final class Tensor implements AutoCloseable { * @param data a buffer containing the tensor data. * @throws IllegalArgumentException If the tensor shape is not compatible with the buffer */ - public static Tensor create(long[] shape, LongBuffer data) { - Tensor t = allocateForBuffer(DataType.INT64, shape, data.remaining()); + public static Tensor<Long> create(long[] shape, LongBuffer data) { + Tensor<Long> t = allocateForBuffer(DataType.INT64, shape, data.remaining()); t.buffer().asLongBuffer().put(data); return t; } /** - * Create a Tensor with data from the given buffer. + * Create a Tensor of any type with data from the given buffer. + * + * <p>Creates a Tensor with the provided shape of any type where the tensor's data has been + * encoded into {@code data} as per the specification of the TensorFlow <a + * href="https://www.tensorflow.org/code/tensorflow/c/c_api.h">C API</a>. 
+ * + * @param <T> the tensor element type + * @param type the tensor element type, represented as a class object. + * @param shape the tensor shape. + * @param data a buffer containing the tensor data. + * @throws IllegalArgumentException If the tensor datatype or shape is not compatible with the + * buffer + */ + public static <T> Tensor<T> create(Class<T> type, long[] shape, ByteBuffer data) { + @SuppressWarnings("unchecked") + Tensor<T> ret = (Tensor<T>) create(DataType.fromClass(type), shape, data); + return ret; + } + + /** + * Creates a Tensor of any type with data from the given buffer. * * <p>Creates a Tensor with the provided shape of any type where the tensor's data has been * encoded into {@code data} as per the specification of the TensorFlow <a * href="https://www.tensorflow.org/code/tensorflow/c/c_api.h">C API</a>. * - * @param dataType the tensor datatype. + * @param <T> The tensor element type + * @param type the tensor element type, specified as a DataType. This must agree with T. * @param shape the tensor shape. * @param data a buffer containing the tensor data. 
* @throws IllegalArgumentException If the tensor datatype or shape is not compatible with the * buffer */ - public static Tensor create(DataType dataType, long[] shape, ByteBuffer data) { + private static Tensor<?> create(DataType dtype, long[] shape, ByteBuffer data) { int nremaining = 0; - if (dataType != DataType.STRING) { - int elemBytes = elemByteSize(dataType); + if (dtype != DataType.STRING) { + int elemBytes = elemByteSize(dtype); if (data.remaining() % elemBytes != 0) { throw new IllegalArgumentException( String.format( "ByteBuffer with %d bytes is not compatible with a %s Tensor (%d bytes/element)", - data.remaining(), dataType.toString(), elemBytes)); + data.remaining(), dtype.toString(), elemBytes)); } nremaining = data.remaining() / elemBytes; } else { nremaining = data.remaining(); } - Tensor t = allocateForBuffer(dataType, shape, nremaining); + Tensor<?> t = allocateForBuffer(dtype, shape, nremaining); t.buffer().put(data); return t; } + /** + * Returns this Tensor object with the type {@code Tensor<U>}. This method is useful when given a + * value of type {@code Tensor<?>}. + * + * @param type any (non-null) array of the correct type. + * @throws IllegalArgumentException if the actual data type of this object does not match the type + * {@code U}. + */ + @SuppressWarnings("unchecked") + public <U> Tensor<U> expect(Class<U> type) { + DataType dt = DataType.fromClass(type); + if (!dt.equals(dtype)) { + throw new IllegalArgumentException( + "Cannot cast from tensor of " + dtype + " to tensor of " + dt); + } + return ((Tensor<U>) this); + } + // Helper function to allocate a Tensor for the create() methods that create a Tensor from // a java.nio.Buffer. 
- private static Tensor allocateForBuffer(DataType dataType, long[] shape, int nBuffered) { + // Requires: dataType matches T + private static <T> Tensor<T> allocateForBuffer(DataType dataType, long[] shape, int nBuffered) { final int nflattened = numElements(shape); int nbytes = 0; if (dataType != DataType.STRING) { @@ -242,8 +310,7 @@ public final class Tensor implements AutoCloseable { // DT_STRING tensor encoded in a ByteBuffer. nbytes = nBuffered; } - Tensor t = new Tensor(); - t.dtype = dataType; + Tensor<T> t = new Tensor<T>(dataType); t.shapeCopy = Arrays.copyOf(shape, shape.length); t.nativeHandle = allocate(t.dtype.c(), t.shapeCopy, nbytes); return t; @@ -300,7 +367,7 @@ public final class Tensor implements AutoCloseable { } /** - * Returns the value in a scalar {@link DataType#FLOAT} tensor. + * Returns the value in a scalar {@link Float} tensor. * * @throws IllegalArgumentException if the Tensor does not represent a float scalar. */ @@ -309,7 +376,7 @@ public final class Tensor implements AutoCloseable { } /** - * Returns the value in a scalar {@link DataType#DOUBLE} tensor. + * Returns the value in a scalar {@link Double} tensor. * * @throws IllegalArgumentException if the Tensor does not represent a double scalar. */ @@ -318,7 +385,7 @@ public final class Tensor implements AutoCloseable { } /** - * Returns the value in a scalar {@link DataType#INT32} tensor. + * Returns the value in a scalar {@link Integer} tensor. * * @throws IllegalArgumentException if the Tensor does not represent a int scalar. */ @@ -327,7 +394,7 @@ public final class Tensor implements AutoCloseable { } /** - * Returns the value in a scalar {@link DataType#INT64} tensor. + * Returns the value in a scalar {@link Long} tensor. * * @throws IllegalArgumentException if the Tensor does not represent a long scalar. */ @@ -336,7 +403,7 @@ public final class Tensor implements AutoCloseable { } /** - * Returns the value in a scalar {@link DataType#BOOL} tensor. 
+ * Returns the value in a scalar {@link Boolean} tensor. * * @throws IllegalArgumentException if the Tensor does not represent a boolean scalar. */ @@ -345,7 +412,7 @@ public final class Tensor implements AutoCloseable { } /** - * Returns the value in a scalar {@link DataType#STRING} tensor. + * Returns the value in a scalar {@link String} tensor. * * @throws IllegalArgumentException if the Tensor does not represent a boolean scalar. */ @@ -377,21 +444,21 @@ public final class Tensor implements AutoCloseable { * @throws IllegalArgumentException if the tensor is a scalar or if {@code dst} is not compatible * with the tensor (for example, mismatched data types or shapes). */ - public <T> T copyTo(T dst) { + public <U> U copyTo(U dst) { throwExceptionIfTypeIsIncompatible(dst); readNDArray(nativeHandle, dst); return dst; } /** - * Write the data of a {@link DataType#INT32} tensor into the given buffer. + * Write the data of a {@link Integer} tensor into the given buffer. * * <p>Copies {@code numElements()} elements to the buffer. * * @param dst the destination buffer * @throws BufferOverflowException If there is insufficient space in the given buffer for the data * in this tensor - * @throws IllegalArgumentException If the tensor datatype is not {@link DataType#INT32} + * @throws IllegalArgumentException If the tensor data type is not {@link Integer} */ public void writeTo(IntBuffer dst) { if (dtype != DataType.INT32) { @@ -402,14 +469,14 @@ public final class Tensor implements AutoCloseable { } /** - * Write the data of a {@link DataType#FLOAT} tensor into the given buffer. + * Write the data of a {@link Float} tensor into the given buffer. * * <p>Copies {@code numElements()} elements to the buffer. 
* * @param dst the destination buffer * @throws BufferOverflowException If there is insufficient space in the given buffer for the data * in this tensor - * @throws IllegalArgumentException If the tensor datatype is not {@link DataType#FLOAT} + * @throws IllegalArgumentException If the tensor datatype is not {@link Float} */ public void writeTo(FloatBuffer dst) { if (dtype != DataType.FLOAT) { @@ -420,14 +487,14 @@ public final class Tensor implements AutoCloseable { } /** - * Write the data of a {@link DataType#DOUBLE} tensor into the given buffer. + * Write the data of a {@link Double} tensor into the given buffer. * * <p>Copies {@code numElements()} elements to the buffer. * * @param dst the destination buffer * @throws BufferOverflowException If there is insufficient space in the given buffer for the data * in this tensor - * @throws IllegalArgumentException If the tensor datatype is not {@link DataType#DOUBLE} + * @throws IllegalArgumentException If the tensor datatype is not {@link Double} */ public void writeTo(DoubleBuffer dst) { if (dtype != DataType.DOUBLE) { @@ -438,14 +505,14 @@ public final class Tensor implements AutoCloseable { } /** - * Write the data of a {@link DataType#INT64} tensor into the given buffer. + * Write the data of a {@link Long} tensor into the given buffer. * * <p>Copies {@code numElements()} elements to the buffer. * * @param dst the destination buffer * @throws BufferOverflowException If there is insufficient space in the given buffer for the data * in this tensor - * @throws IllegalArgumentException If the tensor datatype is not {@link DataType#INT64} + * @throws IllegalArgumentException If the tensor datatype is not {@link Long} */ public void writeTo(LongBuffer dst) { if (dtype != DataType.INT64) { @@ -480,9 +547,9 @@ public final class Tensor implements AutoCloseable { * * <p>Takes ownership of the handle. 
*/ - static Tensor fromHandle(long handle) { - Tensor t = new Tensor(); - t.dtype = DataType.fromC(dtype(handle)); + static Tensor<?> fromHandle(long handle) { + @SuppressWarnings("rawtypes") + Tensor<?> t = new Tensor(DataType.fromC(dtype(handle))); t.shapeCopy = shape(handle); t.nativeHandle = handle; return t; @@ -496,7 +563,9 @@ public final class Tensor implements AutoCloseable { private DataType dtype; private long[] shapeCopy = null; - private Tensor() {} + private Tensor(DataType t) { + dtype = t; + } private ByteBuffer buffer() { return buffer(nativeHandle).order(ByteOrder.nativeOrder()); @@ -564,11 +633,26 @@ public final class Tensor implements AutoCloseable { classDataTypes.put(Boolean.class, DataType.BOOL); } - private static DataType dataTypeOf(Object o) { + /** The class for the data type to which Java object o corresponds. */ + private static Class<?> baseObjType(Object o) { Class<?> c = o.getClass(); while (c.isArray()) { c = c.getComponentType(); } + return c; + } + + /** + * The default TensorFlow data type to which Java object o corresponds. Some Java objects + * represent more than one TensorFlow data type; for example, 'byte' can represent both {@code + * uint8} and {@code string}, with the latter being the default interpretation. + */ + private static DataType dataTypeOf(Object o) { + Class<?> c = baseObjType(o); + return dataTypeFromClass(c); + } + + private static DataType dataTypeFromClass(Class<?> c) { DataType ret = classDataTypes.get(c); if (ret != null) { return ret; @@ -577,7 +661,12 @@ public final class Tensor implements AutoCloseable { } /** - * Returns the number of dimensions of a tensor of type dtype when represented by the object o. + * Return the number of dimensions of the tensor that object {@code o} represents as a tensor + * whose datatype is {@code dtype}. Normally this is the same as the number of dimensions of o + * itself, but is one smaller for tensors of strings. + * + * @param o The object to inspect. 
It must be a valid representation of the given data type. + * @param dtype The expected data type of the tensor. */ private static int numDimensions(Object o, DataType dtype) { int ret = numArrayDimensions(o); @@ -624,7 +713,13 @@ public final class Tensor implements AutoCloseable { /** Returns whether the object {@code obj} can represent a tensor with data type {@code dtype}. */ private static boolean objectCompatWithType(Object obj, DataType dtype) { - DataType dto = dataTypeOf(obj); + Class<?> c = baseObjType(obj); + DataType dto = dataTypeFromClass(c); + int nd = numDimensions(obj, dto); + if (!c.isPrimitive() && c != String.class && nd != 0) { + throw new IllegalArgumentException( + "cannot create non-scalar Tensors from arrays of boxed values"); + } if (dto.equals(dtype)) { return true; } diff --git a/tensorflow/java/src/main/java/org/tensorflow/Tensors.java b/tensorflow/java/src/main/java/org/tensorflow/Tensors.java new file mode 100644 index 0000000000..c828d23efc --- /dev/null +++ b/tensorflow/java/src/main/java/org/tensorflow/Tensors.java @@ -0,0 +1,447 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +package org.tensorflow; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** Type-safe factory methods for creating {@link org.tensorflow.Tensor} objects. 
*/ +public final class Tensors { + private Tensors() {} + + /** + * Creates a scalar String tensor using the default, UTF-8 encoding. + * + * @param data The string to put into the new scalar tensor. + */ + public static Tensor<String> create(String data) { + return Tensor.create(data.getBytes(UTF_8), String.class); + } + + /** + * Creates a scalar String tensor using a specified encoding. + * + * @param charset The encoding from String to bytes. + * @param data The string to put into the new scalar tensor. + */ + public static Tensor<String> create(String data, java.nio.charset.Charset charset) { + return Tensor.create(data.getBytes(charset), String.class); + } + + /** + * Creates a scalar tensor containing a single {@code float} element. + * + * @param data The value to put into the new scalar tensor. + */ + public static Tensor<Float> create(float data) { + return Tensor.create(data, Float.class); + } + + /** + * Creates a rank-1 tensor of {@code float} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Float> create(float[] data) { + return Tensor.create(data, Float.class); + } + + /** + * Creates a rank-2 tensor of {@code float} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Float> create(float[][] data) { + return Tensor.create(data, Float.class); + } + + /** + * Creates a rank-3 tensor of {@code float} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Float> create(float[][][] data) { + return Tensor.create(data, Float.class); + } + + /** + * Creates a rank-4 tensor of {@code float} elements. 
+ * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Float> create(float[][][][] data) { + return Tensor.create(data, Float.class); + } + + /** + * Creates a rank-5 tensor of {@code float} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Float> create(float[][][][][] data) { + return Tensor.create(data, Float.class); + } + + /** + * Creates a rank-6 tensor of {@code float} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Float> create(float[][][][][][] data) { + return Tensor.create(data, Float.class); + } + + /** + * Creates a scalar tensor containing a single {@code double} element. + * + * @param data The value to put into the new scalar tensor. + */ + public static Tensor<Double> create(double data) { + return Tensor.create(data, Double.class); + } + + /** + * Creates a rank-1 tensor of {@code double} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Double> create(double[] data) { + return Tensor.create(data, Double.class); + } + + /** + * Creates a rank-2 tensor of {@code double} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Double> create(double[][] data) { + return Tensor.create(data, Double.class); + } + + /** + * Creates a rank-3 tensor of {@code double} elements. + * + * @param data An array containing the values to put into the new tensor. 
The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Double> create(double[][][] data) { + return Tensor.create(data, Double.class); + } + + /** + * Creates a rank-4 tensor of {@code double} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Double> create(double[][][][] data) { + return Tensor.create(data, Double.class); + } + + /** + * Creates a rank-5 tensor of {@code double} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Double> create(double[][][][][] data) { + return Tensor.create(data, Double.class); + } + + /** + * Creates a rank-6 tensor of {@code double} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Double> create(double[][][][][][] data) { + return Tensor.create(data, Double.class); + } + + /** + * Creates a scalar tensor containing a single {@code int} element. + * + * @param data The value to put into the new scalar tensor. + */ + public static Tensor<Integer> create(int data) { + return Tensor.create(data, Integer.class); + } + + /** + * Creates a rank-1 tensor of {@code int} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Integer> create(int[] data) { + return Tensor.create(data, Integer.class); + } + + /** + * Creates a rank-2 tensor of {@code int} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. 
+ */ + public static Tensor<Integer> create(int[][] data) { + return Tensor.create(data, Integer.class); + } + + /** + * Creates a rank-3 tensor of {@code int} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Integer> create(int[][][] data) { + return Tensor.create(data, Integer.class); + } + + /** + * Creates a rank-4 tensor of {@code int} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Integer> create(int[][][][] data) { + return Tensor.create(data, Integer.class); + } + + /** + * Creates a rank-5 tensor of {@code int} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Integer> create(int[][][][][] data) { + return Tensor.create(data, Integer.class); + } + + /** + * Creates a rank-6 tensor of {@code int} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Integer> create(int[][][][][][] data) { + return Tensor.create(data, Integer.class); + } + + /** + * Creates a scalar tensor containing a single {@code byte} element. + * + * @param data An array containing the data to put into the new tensor. String elements are + * sequences of bytes from the last array dimension. + */ + public static Tensor<String> create(byte[] data) { + return Tensor.create(data, String.class); + } + + /** + * Creates a rank-1 tensor of {@code byte} elements. + * + * @param data An array containing the data to put into the new tensor. String elements are + * sequences of bytes from the last array dimension. 
+ */ + public static Tensor<String> create(byte[][] data) { + return Tensor.create(data, String.class); + } + + /** + * Creates a rank-2 tensor of {@code byte} elements. + * + * @param data An array containing the data to put into the new tensor. String elements are + * sequences of bytes from the last array dimension. + */ + public static Tensor<String> create(byte[][][] data) { + return Tensor.create(data, String.class); + } + + /** + * Creates a rank-3 tensor of {@code byte} elements. + * + * @param data An array containing the data to put into the new tensor. String elements are + * sequences of bytes from the last array dimension. + */ + public static Tensor<String> create(byte[][][][] data) { + return Tensor.create(data, String.class); + } + + /** + * Creates a rank-4 tensor of {@code byte} elements. + * + * @param data An array containing the data to put into the new tensor. String elements are + * sequences of bytes from the last array dimension. + */ + public static Tensor<String> create(byte[][][][][] data) { + return Tensor.create(data, String.class); + } + + /** + * Creates a rank-5 tensor of {@code byte} elements. + * + * @param data An array containing the data to put into the new tensor. String elements are + * sequences of bytes from the last array dimension. + */ + public static Tensor<String> create(byte[][][][][][] data) { + return Tensor.create(data, String.class); + } + + /** + * Creates a scalar tensor containing a single {@code long} element. + * + * @param data The value to put into the new scalar tensor. + */ + public static Tensor<Long> create(long data) { + return Tensor.create(data, Long.class); + } + + /** + * Creates a rank-1 tensor of {@code long} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. 
+ */ + public static Tensor<Long> create(long[] data) { + return Tensor.create(data, Long.class); + } + + /** + * Creates a rank-2 tensor of {@code long} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Long> create(long[][] data) { + return Tensor.create(data, Long.class); + } + + /** + * Creates a rank-3 tensor of {@code long} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Long> create(long[][][] data) { + return Tensor.create(data, Long.class); + } + + /** + * Creates a rank-4 tensor of {@code long} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Long> create(long[][][][] data) { + return Tensor.create(data, Long.class); + } + + /** + * Creates a rank-5 tensor of {@code long} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Long> create(long[][][][][] data) { + return Tensor.create(data, Long.class); + } + + /** + * Creates a rank-6 tensor of {@code long} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Long> create(long[][][][][][] data) { + return Tensor.create(data, Long.class); + } + + /** + * Creates a scalar tensor containing a single {@code boolean} element. + * + * @param data The value to put into the new scalar tensor. 
+ */ + public static Tensor<Boolean> create(boolean data) { + return Tensor.create(data, Boolean.class); + } + + /** + * Creates a rank-1 tensor of {@code boolean} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Boolean> create(boolean[] data) { + return Tensor.create(data, Boolean.class); + } + + /** + * Creates a rank-2 tensor of {@code boolean} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Boolean> create(boolean[][] data) { + return Tensor.create(data, Boolean.class); + } + + /** + * Creates a rank-3 tensor of {@code boolean} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Boolean> create(boolean[][][] data) { + return Tensor.create(data, Boolean.class); + } + + /** + * Creates a rank-4 tensor of {@code boolean} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Boolean> create(boolean[][][][] data) { + return Tensor.create(data, Boolean.class); + } + + /** + * Creates a rank-5 tensor of {@code boolean} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. + */ + public static Tensor<Boolean> create(boolean[][][][][] data) { + return Tensor.create(data, Boolean.class); + } + + /** + * Creates a rank-6 tensor of {@code boolean} elements. + * + * @param data An array containing the values to put into the new tensor. The dimensions of the + * new tensor will match those of the array. 
+ */ + public static Tensor<Boolean> create(boolean[][][][][][] data) { + return Tensor.create(data, Boolean.class); + } +} diff --git a/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java b/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java index 19929188a5..489e95c310 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java +++ b/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java @@ -29,6 +29,7 @@ import org.tensorflow.Output; import org.tensorflow.Session; import org.tensorflow.Tensor; import org.tensorflow.TensorFlow; +import org.tensorflow.types.UInt8; /** Sample use of the TensorFlow Java API to label images using a pre-trained model. */ public class LabelImage { @@ -61,17 +62,17 @@ public class LabelImage { readAllLinesOrExit(Paths.get(modelDir, "imagenet_comp_graph_label_strings.txt")); byte[] imageBytes = readAllBytesOrExit(Paths.get(imageFile)); - try (Tensor image = constructAndExecuteGraphToNormalizeImage(imageBytes)) { + try (Tensor<Float> image = constructAndExecuteGraphToNormalizeImage(imageBytes)) { float[] labelProbabilities = executeInceptionGraph(graphDef, image); int bestLabelIdx = maxIndex(labelProbabilities); System.out.println( - String.format( - "BEST MATCH: %s (%.2f%% likely)", - labels.get(bestLabelIdx), labelProbabilities[bestLabelIdx] * 100f)); + String.format("BEST MATCH: %s (%.2f%% likely)", + labels.get(bestLabelIdx), + labelProbabilities[bestLabelIdx] * 100f)); } } - private static Tensor constructAndExecuteGraphToNormalizeImage(byte[] imageBytes) { + private static Tensor<Float> constructAndExecuteGraphToNormalizeImage(byte[] imageBytes) { try (Graph g = new Graph()) { GraphBuilder b = new GraphBuilder(g); // Some constants specific to the pre-trained model at: @@ -88,28 +89,29 @@ public class LabelImage { // Since the graph is being constructed once per execution here, we can use a constant for the // input image. 
If the graph were to be re-used for multiple input images, a placeholder would // have been more appropriate. - final Output input = b.constant("input", imageBytes); - final Output output = + final Output<String> input = b.constant("input", imageBytes); + final Output<Float> output = b.div( b.sub( b.resizeBilinear( b.expandDims( - b.cast(b.decodeJpeg(input, 3), DataType.FLOAT), + b.cast(b.decodeJpeg(input, 3), Float.class), b.constant("make_batch", 0)), b.constant("size", new int[] {H, W})), b.constant("mean", mean)), b.constant("scale", scale)); try (Session s = new Session(g)) { - return s.runner().fetch(output.op().name()).run().get(0); + return s.runner().fetch(output.op().name()).run().get(0).expect(Float.class); } } } - private static float[] executeInceptionGraph(byte[] graphDef, Tensor image) { + private static float[] executeInceptionGraph(byte[] graphDef, Tensor<Float> image) { try (Graph g = new Graph()) { g.importGraphDef(graphDef); try (Session s = new Session(g); - Tensor result = s.runner().feed("input", image).fetch("output").run().get(0)) { + Tensor<Float> result = + s.runner().feed("input", image).fetch("output").run().get(0).expect(Float.class)) { final long[] rshape = result.shape(); if (result.numDimensions() != 2 || rshape[0] != 1) { throw new RuntimeException( @@ -161,48 +163,71 @@ public class LabelImage { this.g = g; } - Output div(Output x, Output y) { + Output<Float> div(Output<Float> x, Output<Float> y) { return binaryOp("Div", x, y); } - Output sub(Output x, Output y) { + <T> Output<T> sub(Output<T> x, Output<T> y) { return binaryOp("Sub", x, y); } - Output resizeBilinear(Output images, Output size) { - return binaryOp("ResizeBilinear", images, size); + <T> Output<Float> resizeBilinear(Output<T> images, Output<Integer> size) { + return binaryOp3("ResizeBilinear", images, size); } - Output expandDims(Output input, Output dim) { - return binaryOp("ExpandDims", input, dim); + <T> Output<T> expandDims(Output<T> input, Output<Integer> dim) { 
+ return binaryOp3("ExpandDims", input, dim); } - Output cast(Output value, DataType dtype) { - return g.opBuilder("Cast", "Cast").addInput(value).setAttr("DstT", dtype).build().output(0); + <T, U> Output<U> cast(Output<T> value, Class<U> type) { + DataType dtype = DataType.fromClass(type); + return g.opBuilder("Cast", "Cast") + .addInput(value) + .setAttr("DstT", dtype) + .build() + .<U>output(0); } - Output decodeJpeg(Output contents, long channels) { + Output<UInt8> decodeJpeg(Output<String> contents, long channels) { return g.opBuilder("DecodeJpeg", "DecodeJpeg") .addInput(contents) .setAttr("channels", channels) .build() - .output(0); + .<UInt8>output(0); } - Output constant(String name, Object value) { - try (Tensor t = Tensor.create(value)) { + <T> Output<T> constant(String name, Object value, Class<T> type) { + try (Tensor<T> t = Tensor.<T>create(value, type)) { return g.opBuilder("Const", name) - .setAttr("dtype", t.dataType()) + .setAttr("dtype", DataType.fromClass(type)) .setAttr("value", t) .build() - .output(0); + .<T>output(0); } } + Output<String> constant(String name, byte[] value) { + return this.constant(name, value, String.class); + } - private Output binaryOp(String type, Output in1, Output in2) { - return g.opBuilder(type, type).addInput(in1).addInput(in2).build().output(0); + Output<Integer> constant(String name, int value) { + return this.constant(name, value, Integer.class); } + Output<Integer> constant(String name, int[] value) { + return this.constant(name, value, Integer.class); + } + + Output<Float> constant(String name, float value) { + return this.constant(name, value, Float.class); + } + + private <T> Output<T> binaryOp(String type, Output<T> in1, Output<T> in2) { + return g.opBuilder(type, type).addInput(in1).addInput(in2).build().<T>output(0); + } + + private <T, U, V> Output<T> binaryOp3(String type, Output<U> in1, Output<V> in2) { + return g.opBuilder(type, type).addInput(in1).addInput(in2).build().<T>output(0); + } private Graph 
g; } } diff --git a/tensorflow/java/src/main/java/org/tensorflow/op/Operands.java b/tensorflow/java/src/main/java/org/tensorflow/op/Operands.java index 5971103d6d..ac48da8032 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/op/Operands.java +++ b/tensorflow/java/src/main/java/org/tensorflow/op/Operands.java @@ -33,12 +33,12 @@ public final class Operands { * @param inputs an iteration of input operands * @return an array of outputs */ - public static Output[] asOutputs(Iterable<? extends Operand> inputs) { - List<Output> outputList = new ArrayList<>(); - for (Operand input : inputs) { + public static Output<?>[] asOutputs(Iterable<? extends Operand<?>> inputs) { + List<Output<?>> outputList = new ArrayList<>(); + for (Operand<?> input : inputs) { outputList.add(input.asOutput()); } - return outputList.toArray(new Output[outputList.size()]); + return outputList.toArray(new Output<?>[outputList.size()]); } // Disabled constructor diff --git a/tensorflow/java/src/main/java/org/tensorflow/op/core/Constant.java b/tensorflow/java/src/main/java/org/tensorflow/op/core/Constant.java index cd7931d3bb..725c81765a 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/op/core/Constant.java +++ b/tensorflow/java/src/main/java/org/tensorflow/op/core/Constant.java @@ -31,7 +31,7 @@ import org.tensorflow.op.annotation.Operator; /** An operator producing a constant value. */ @Operator -public final class Constant extends PrimitiveOp implements Operand { +public final class Constant<T> extends PrimitiveOp implements Operand<T> { /** * Create a constant from a Java object. * @@ -47,8 +47,8 @@ public final class Constant extends PrimitiveOp implements Operand { * @param object a Java object representing the constant. 
* @see org.tensorflow.Tensor#create(Object) Tensor.create */ - public static Constant create(Scope scope, Object object) { - try (Tensor value = Tensor.create(object)) { + public static <T> Constant<T> create(Scope scope, Object object, Class<T> type) { + try (Tensor<T> value = Tensor.create(object, type)) { return createWithTensor(scope, value); } } @@ -66,8 +66,8 @@ public final class Constant extends PrimitiveOp implements Operand { * @param data a buffer containing the tensor data. * @throws IllegalArgumentException If the tensor shape is not compatible with the buffer */ - public static Constant create(Scope scope, long[] shape, IntBuffer data) { - try (Tensor value = Tensor.create(shape, data)) { + public static Constant<Integer> create(Scope scope, long[] shape, IntBuffer data) { + try (Tensor<Integer> value = Tensor.create(shape, data)) { return createWithTensor(scope, value); } } @@ -85,8 +85,8 @@ public final class Constant extends PrimitiveOp implements Operand { * @param data a buffer containing the tensor data. * @throws IllegalArgumentException If the tensor shape is not compatible with the buffer */ - public static Constant create(Scope scope, long[] shape, FloatBuffer data) { - try (Tensor value = Tensor.create(shape, data)) { + public static Constant<Float> create(Scope scope, long[] shape, FloatBuffer data) { + try (Tensor<Float> value = Tensor.create(shape, data)) { return createWithTensor(scope, value); } } @@ -104,8 +104,8 @@ public final class Constant extends PrimitiveOp implements Operand { * @param data a buffer containing the tensor data. 
* @throws IllegalArgumentException If the tensor shape is not compatible with the buffer */ - public static Constant create(Scope scope, long[] shape, DoubleBuffer data) { - try (Tensor value = Tensor.create(shape, data)) { + public static Constant<Double> create(Scope scope, long[] shape, DoubleBuffer data) { + try (Tensor<Double> value = Tensor.create(shape, data)) { return createWithTensor(scope, value); } } @@ -123,8 +123,8 @@ public final class Constant extends PrimitiveOp implements Operand { * @param data a buffer containing the tensor data. * @throws IllegalArgumentException If the tensor shape is not compatible with the buffer */ - public static Constant create(Scope scope, long[] shape, LongBuffer data) { - try (Tensor value = Tensor.create(shape, data)) { + public static Constant<Long> create(Scope scope, long[] shape, LongBuffer data) { + try (Tensor<Long> value = Tensor.create(shape, data)) { return createWithTensor(scope, value); } } @@ -143,14 +143,14 @@ public final class Constant extends PrimitiveOp implements Operand { * @throws IllegalArgumentException If the tensor datatype or shape is not compatible with the * buffer */ - public static Constant create(Scope scope, DataType dataType, long[] shape, ByteBuffer data) { - try (Tensor value = Tensor.create(dataType, shape, data)) { + public static <T> Constant<T> create(Scope scope, Class<T> type, long[] shape, ByteBuffer data) { + try (Tensor<T> value = Tensor.create(type, shape, data)) { return createWithTensor(scope, value); } } - private static Constant createWithTensor(Scope scope, Tensor value) { - return new Constant( + private static <T> Constant<T> createWithTensor(Scope scope, Tensor<T> value) { + return new Constant<T>( scope .graph() .opBuilder("Const", scope.makeOpName("Const")) @@ -160,7 +160,7 @@ public final class Constant extends PrimitiveOp implements Operand { } @Override - public Output asOutput() { + public Output<T> asOutput() { return output; } @@ -169,5 +169,5 @@ public final 
class Constant extends PrimitiveOp implements Operand { output = operation.output(0); } - private final Output output; + private final Output<T> output; } diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/UInt8.java b/tensorflow/java/src/main/java/org/tensorflow/types/UInt8.java new file mode 100644 index 0000000000..0c751aed9f --- /dev/null +++ b/tensorflow/java/src/main/java/org/tensorflow/types/UInt8.java @@ -0,0 +1,21 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +package org.tensorflow.types; + +/** Represents an 8-bit unsigned integer. */ +public class UInt8 { + private UInt8() {} +} diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/package-info.java b/tensorflow/java/src/main/java/org/tensorflow/types/package-info.java index f1410a760e..96018c5366 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/types/package-info.java +++ b/tensorflow/java/src/main/java/org/tensorflow/types/package-info.java @@ -15,13 +15,15 @@ limitations under the License. /** * Defines classes that represent TensorFlow data types. For each possible data type - * that can be used in a tensor, there is a corresponding class in this package that + * that can be used in a tensor, there is a corresponding class that * is used to represent it. 
For example, the TensorFlow int32 type is represented by - * the type TFInt32 and by the class object TFInt32.class. The former is used to - * support compile-time checking of tensor data types and the latter is used for - * run-time checking of data types. All such classes implement the TFType interface. - * TensorFlow data types are also separately represented by the DataType enum, with - * one enum value per data type. The enum representation should rarely be needed, but - * the Types class can be used to obtain it from the class object representation. + * the type {@link Integer} and by the class object {@code Integer.class}. The former is used to + * support compile-time checking of tensor element types and the latter is used for + * run-time checking of element types. Classes appearing in this package, such as + * UInt8, represent TensorFlow data types for which there is no existing Java equivalent. + * + * <p>TensorFlow element types are also separately represented by the {@link DataType} enum, with + * one enum value per element type. The enum representation is not usually needed, but + * can be obtained using {@link DataType.fromClass}. 
*/ package org.tensorflow.types; diff --git a/tensorflow/java/src/test/java/org/tensorflow/GraphTest.java b/tensorflow/java/src/test/java/org/tensorflow/GraphTest.java index 4adc861bf1..c540299bdc 100644 --- a/tensorflow/java/src/test/java/org/tensorflow/GraphTest.java +++ b/tensorflow/java/src/test/java/org/tensorflow/GraphTest.java @@ -22,7 +22,6 @@ import static org.junit.Assert.assertTrue; import java.util.HashSet; import java.util.Iterator; - import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; diff --git a/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java b/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java index b3bc3aaef9..6dc233987b 100644 --- a/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java +++ b/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java @@ -34,8 +34,8 @@ public class OperationBuilderTest { public void failWhenMixingOperationsOnDifferentGraphs() { try (Graph g1 = new Graph(); Graph g2 = new Graph()) { - Output c1 = TestUtil.constant(g1, "C1", 3); - Output c2 = TestUtil.constant(g2, "C2", 3); + Output<Integer> c1 = TestUtil.constant(g1, "C1", 3); + Output<Integer> c2 = TestUtil.constant(g2, "C2", 3); TestUtil.addN(g1, c1, c1); try { TestUtil.addN(g2, c1, c2); @@ -48,7 +48,7 @@ public class OperationBuilderTest { @Test public void failOnUseAfterBuild() { try (Graph g = new Graph(); - Tensor t = Tensor.create(1)) { + Tensor<Integer> t = Tensors.create(1)) { OperationBuilder b = g.opBuilder("Const", "Const").setAttr("dtype", t.dataType()).setAttr("value", t); b.build(); @@ -64,7 +64,7 @@ public class OperationBuilderTest { public void failOnUseAfterGraphClose() { OperationBuilder b = null; try (Graph g = new Graph(); - Tensor t = Tensor.create(1)) { + Tensor<Integer> t = Tensors.create(1)) { b = g.opBuilder("Const", "Const").setAttr("dtype", t.dataType()).setAttr("value", t); } try { @@ -85,7 +85,7 @@ public class 
OperationBuilderTest { // types that aren't inferred from the input arguments. try (Graph g = new Graph()) { // dtype, tensor attributes. - try (Tensor t = Tensor.create(1)) { + try (Tensor<Integer> t = Tensors.create(1)) { g.opBuilder("Const", "DataTypeAndTensor") .setAttr("dtype", DataType.INT32) .setAttr("value", t) @@ -101,7 +101,7 @@ public class OperationBuilderTest { assertTrue(hasNode(g, "StringAndBool")); // int (TF "int" attributes are 64-bit signed, so a Java long). g.opBuilder("RandomUniform", "Int") - .addInput(TestUtil.constant(g, "RandomUniformShape", new int[]{1})) + .addInput(TestUtil.constant(g, "RandomUniformShape", new int[] {1})) .setAttr("seed", 10) .setAttr("dtype", DataType.FLOAT) .build(); @@ -127,7 +127,7 @@ public class OperationBuilderTest { @Test public void setAttrShape() { try (Graph g = new Graph()) { - Output n = + Output<?> n = g.opBuilder("Placeholder", "unknown") .setAttr("dtype", DataType.FLOAT) .setAttr("shape", Shape.unknown()) @@ -136,8 +136,7 @@ public class OperationBuilderTest { assertEquals(-1, n.shape().numDimensions()); assertEquals(DataType.FLOAT, n.dataType()); - n = - g.opBuilder("Placeholder", "batch_of_vectors") + n = g.opBuilder("Placeholder", "batch_of_vectors") .setAttr("dtype", DataType.FLOAT) .setAttr("shape", Shape.make(-1, 784)) .build() @@ -153,13 +152,13 @@ public class OperationBuilderTest { public void addControlInput() { try (Graph g = new Graph(); Session s = new Session(g); - Tensor yes = Tensor.create(true); - Tensor no = Tensor.create(false)) { - Output placeholder = TestUtil.placeholder(g, "boolean", DataType.BOOL); + Tensor<Boolean> yes = Tensors.create(true); + Tensor<Boolean> no = Tensors.create(false)) { + Output<Boolean> placeholder = TestUtil.placeholder(g, "boolean", Boolean.class); Operation check = g.opBuilder("Assert", "assert") .addInput(placeholder) - .addInputList(new Output[] {placeholder}) + .addInputList(new Output<?>[] {placeholder}) .build(); Operation noop = g.opBuilder("NoOp", 
"noop").addControlInput(check).build(); diff --git a/tensorflow/java/src/test/java/org/tensorflow/OperationTest.java b/tensorflow/java/src/test/java/org/tensorflow/OperationTest.java index aade375db8..6fe3b3c327 100644 --- a/tensorflow/java/src/test/java/org/tensorflow/OperationTest.java +++ b/tensorflow/java/src/test/java/org/tensorflow/OperationTest.java @@ -24,7 +24,6 @@ import static org.junit.Assert.fail; import java.util.Arrays; import java.util.HashSet; import java.util.Set; - import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -104,9 +103,9 @@ public class OperationTest { @Test public void outputEquality() { try (Graph g = new Graph()) { - Output output = TestUtil.constant(g, "c", 1); - Output output1 = output.op().output(0); - Output output2 = g.operation("c").output(0); + Output<Integer> output = TestUtil.constant(g, "c", 1); + Output<Integer> output1 = output.op().<Integer>output(0); + Output<Integer> output2 = g.operation("c").<Integer>output(0); assertEquals(output, output1); assertEquals(output.hashCode(), output1.hashCode()); assertEquals(output, output2); @@ -117,10 +116,10 @@ public class OperationTest { @Test public void outputCollection() { try (Graph g = new Graph()) { - Output output = TestUtil.constant(g, "c", 1); - Output output1 = output.op().output(0); - Output output2 = g.operation("c").output(0); - Set<Output> ops = new HashSet<>(); + Output<Integer> output = TestUtil.constant(g, "c", 1); + Output<Integer> output1 = output.op().<Integer>output(0); + Output<Integer> output2 = g.operation("c").<Integer>output(0); + Set<Output<Integer>> ops = new HashSet<>(); ops.addAll(Arrays.asList(output, output1, output2)); assertEquals(1, ops.size()); assertTrue(ops.contains(output)); @@ -132,7 +131,7 @@ public class OperationTest { @Test public void outputToString() { try (Graph g = new Graph()) { - Output output = TestUtil.constant(g, "c", new int[] {1}); + Output<Integer> output = TestUtil.constant(g, "c", new 
int[] {1}); assertNotNull(output.toString()); } } @@ -158,7 +157,7 @@ public class OperationTest { public void outputList() { try (Graph g = new Graph()) { Operation split = TestUtil.split(g, "split", new int[] {0, 1, 2}, 3); - Output[] outputs = split.outputList(1, 2); + Output<?>[] outputs = split.outputList(1, 2); assertNotNull(outputs); assertEquals(2, outputs.length); for (int i = 0; i < outputs.length; ++i) { diff --git a/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java b/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java index 50bdf351e3..a86b4dd117 100644 --- a/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java +++ b/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java @@ -35,9 +35,9 @@ public class SessionTest { try (Graph g = new Graph(); Session s = new Session(g)) { TestUtil.transpose_A_times_X(g, new int[][] {{2}, {3}}); - try (Tensor x = Tensor.create(new int[][] {{5}, {7}}); - AutoCloseableList<Tensor> outputs = - new AutoCloseableList<Tensor>(s.runner().feed("X", x).fetch("Y").run())) { + try (Tensor<Integer> x = Tensors.create(new int[][] {{5}, {7}}); + AutoCloseableList<Tensor<?>> outputs = + new AutoCloseableList<Tensor<?>>(s.runner().feed("X", x).fetch("Y").run())) { assertEquals(1, outputs.size()); final int[][] expected = {{31}}; assertArrayEquals(expected, outputs.get(0).copyTo(new int[1][1])); @@ -50,11 +50,11 @@ public class SessionTest { try (Graph g = new Graph(); Session s = new Session(g)) { TestUtil.transpose_A_times_X(g, new int[][] {{2}, {3}}); - Output feed = g.operation("X").output(0); - Output fetch = g.operation("Y").output(0); - try (Tensor x = Tensor.create(new int[][] {{5}, {7}}); - AutoCloseableList<Tensor> outputs = - new AutoCloseableList<Tensor>(s.runner().feed(feed, x).fetch(fetch).run())) { + Output<Integer> feed = g.operation("X").output(0); + Output<Integer> fetch = g.operation("Y").output(0); + try (Tensor<Integer> x = Tensors.create(new int[][] {{5}, {7}}); + 
AutoCloseableList<Tensor<?>> outputs = + new AutoCloseableList<Tensor<?>>(s.runner().feed(feed, x).fetch(fetch).run())) { assertEquals(1, outputs.size()); final int[][] expected = {{31}}; assertArrayEquals(expected, outputs.get(0).copyTo(new int[1][1])); @@ -78,14 +78,21 @@ public class SessionTest { .build() .output(0); // Fetch using colon separated names. - try (Tensor fetched = s.runner().fetch("Split:1").run().get(0)) { + try (Tensor<Integer> fetched = + s.runner().fetch("Split:1").run().get(0).expect(Integer.class)) { final int[] expected = {3, 4}; assertArrayEquals(expected, fetched.copyTo(new int[2])); } // Feed using colon separated names. - try (Tensor fed = Tensor.create(new int[] {4, 3, 2, 1}); - Tensor fetched = - s.runner().feed("Split:0", fed).feed("Split:1", fed).fetch("Add").run().get(0)) { + try (Tensor<Integer> fed = Tensors.create(new int[] {4, 3, 2, 1}); + Tensor<Integer> fetched = + s.runner() + .feed("Split:0", fed) + .feed("Split:1", fed) + .fetch("Add") + .run() + .get(0) + .expect(Integer.class)) { final int[] expected = {8, 6, 4, 2}; assertArrayEquals(expected, fetched.copyTo(new int[4])); } @@ -97,7 +104,7 @@ public class SessionTest { try (Graph g = new Graph(); Session s = new Session(g)) { TestUtil.transpose_A_times_X(g, new int[][] {{2}, {3}}); - try (Tensor x = Tensor.create(new int[][] {{5}, {7}})) { + try (Tensor<Integer> x = Tensors.create(new int[][] {{5}, {7}})) { Session.Run result = s.runner() .feed("X", x) @@ -105,7 +112,7 @@ public class SessionTest { .setOptions(fullTraceRunOptions()) .runAndFetchMetadata(); // Sanity check on outputs. 
- AutoCloseableList<Tensor> outputs = new AutoCloseableList<Tensor>(result.outputs); + AutoCloseableList<Tensor<?>> outputs = new AutoCloseableList<Tensor<?>>(result.outputs); assertEquals(1, outputs.size()); final int[][] expected = {{31}}; assertArrayEquals(expected, outputs.get(0).copyTo(new int[1][1])); @@ -117,6 +124,7 @@ public class SessionTest { assertTrue(md.toString(), md.hasStepStats()); */ assertTrue(result.metadata.length > 0); + outputs.close(); } } } @@ -127,11 +135,12 @@ public class SessionTest { Session s = new Session(g)) { TestUtil.constant(g, "c1", 2718); TestUtil.constant(g, "c2", 31415); - AutoCloseableList<Tensor> outputs = - new AutoCloseableList<Tensor>(s.runner().fetch("c2").fetch("c1").run()); + AutoCloseableList<Tensor<?>> outputs = + new AutoCloseableList<Tensor<?>>(s.runner().fetch("c2").fetch("c1").run()); assertEquals(2, outputs.size()); assertEquals(31415, outputs.get(0).intValue()); assertEquals(2718, outputs.get(1).intValue()); + outputs.close(); } } diff --git a/tensorflow/java/src/test/java/org/tensorflow/ShapeTest.java b/tensorflow/java/src/test/java/org/tensorflow/ShapeTest.java index fe46c0184c..3b027700c5 100644 --- a/tensorflow/java/src/test/java/org/tensorflow/ShapeTest.java +++ b/tensorflow/java/src/test/java/org/tensorflow/ShapeTest.java @@ -61,7 +61,7 @@ public class ShapeTest { @Test public void nodesInAGraph() { try (Graph g = new Graph()) { - Output n = TestUtil.placeholder(g, "feed", DataType.FLOAT); + Output<Float> n = TestUtil.placeholder(g, "feed", Float.class); assertEquals(-1, n.shape().numDimensions()); n = TestUtil.constant(g, "scalar", 3); diff --git a/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java b/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java index 036db04503..6538359d11 100644 --- a/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java +++ b/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java @@ -30,6 +30,7 @@ import java.nio.LongBuffer; import 
org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; +import org.tensorflow.types.UInt8; /** Unit tests for {@link org.tensorflow.Tensor}. */ @RunWith(JUnit4.class) @@ -47,7 +48,7 @@ public class TensorTest { byte[] strings = "test".getBytes(UTF_8); long[] strings_shape = {}; byte[] strings_; // raw TF_STRING - try (Tensor t = Tensor.create(strings)) { + try (Tensor<String> t = Tensors.create(strings)) { ByteBuffer to = ByteBuffer.allocate(t.numBytes()); t.writeTo(to); strings_ = to.array(); @@ -55,7 +56,7 @@ public class TensorTest { // validate creating a tensor using a byte buffer { - try (Tensor t = Tensor.create(DataType.BOOL, bools_shape, ByteBuffer.wrap(bools_))) { + try (Tensor<Boolean> t = Tensor.create(Boolean.class, bools_shape, ByteBuffer.wrap(bools_))) { boolean[] actual = t.copyTo(new boolean[bools_.length]); for (int i = 0; i < bools.length; ++i) { assertEquals("" + i, bools[i], actual[i]); @@ -63,7 +64,8 @@ public class TensorTest { } // note: the buffer is expected to contain raw TF_STRING (as per C API) - try (Tensor t = Tensor.create(DataType.STRING, strings_shape, ByteBuffer.wrap(strings_))) { + try (Tensor<String> t = + Tensor.create(String.class, strings_shape, ByteBuffer.wrap(strings_))) { assertArrayEquals(strings, t.bytesValue()); } } @@ -72,15 +74,15 @@ public class TensorTest { { ByteBuffer buf = ByteBuffer.allocateDirect(8 * doubles.length).order(ByteOrder.nativeOrder()); buf.asDoubleBuffer().put(doubles); - try (Tensor t = Tensor.create(DataType.DOUBLE, doubles_shape, buf)) { + try (Tensor<Double> t = Tensor.create(Double.class, doubles_shape, buf)) { double[] actual = new double[doubles.length]; assertArrayEquals(doubles, t.copyTo(actual), EPSILON); } } // validate shape checking - try (Tensor t = - Tensor.create(DataType.BOOL, new long[bools_.length * 2], ByteBuffer.wrap(bools_))) { + try (Tensor<Boolean> t = + Tensor.create(Boolean.class, new long[bools_.length * 2], ByteBuffer.wrap(bools_))) { 
fail("should have failed on incompatible buffer"); } catch (IllegalArgumentException e) { // expected @@ -99,7 +101,7 @@ public class TensorTest { .asDoubleBuffer() .put(doubles); buf.flip(); - try (Tensor t = Tensor.create(new long[] {doubles.length}, buf)) { + try (Tensor<Double> t = Tensor.create(new long[] {doubles.length}, buf)) { double[] actual = new double[doubles.length]; assertArrayEquals(doubles, t.copyTo(actual), EPSILON); } @@ -115,19 +117,19 @@ public class TensorTest { // validate creating a tensor using a typed buffer { - try (Tensor t = Tensor.create(shape, DoubleBuffer.wrap(doubles))) { + try (Tensor<Double> t = Tensor.create(shape, DoubleBuffer.wrap(doubles))) { double[] actual = new double[doubles.length]; assertArrayEquals(doubles, t.copyTo(actual), EPSILON); } - try (Tensor t = Tensor.create(shape, FloatBuffer.wrap(floats))) { + try (Tensor<Float> t = Tensor.create(shape, FloatBuffer.wrap(floats))) { float[] actual = new float[floats.length]; assertArrayEquals(floats, t.copyTo(actual), EPSILON_F); } - try (Tensor t = Tensor.create(shape, IntBuffer.wrap(ints))) { + try (Tensor<Integer> t = Tensor.create(shape, IntBuffer.wrap(ints))) { int[] actual = new int[ints.length]; assertArrayEquals(ints, t.copyTo(actual)); } - try (Tensor t = Tensor.create(shape, LongBuffer.wrap(longs))) { + try (Tensor<Long> t = Tensor.create(shape, LongBuffer.wrap(longs))) { long[] actual = new long[longs.length]; assertArrayEquals(longs, t.copyTo(actual)); } @@ -135,22 +137,23 @@ public class TensorTest { // validate shape-checking { - try (Tensor t = Tensor.create(new long[doubles.length + 1], DoubleBuffer.wrap(doubles))) { + try (Tensor<Double> t = + Tensor.create(new long[doubles.length + 1], DoubleBuffer.wrap(doubles))) { fail("should have failed on incompatible buffer"); } catch (IllegalArgumentException e) { // expected } - try (Tensor t = Tensor.create(new long[floats.length + 1], FloatBuffer.wrap(floats))) { + try (Tensor<Float> t = Tensor.create(new 
long[floats.length + 1], FloatBuffer.wrap(floats))) { fail("should have failed on incompatible buffer"); } catch (IllegalArgumentException e) { // expected } - try (Tensor t = Tensor.create(new long[ints.length + 1], IntBuffer.wrap(ints))) { + try (Tensor<Integer> t = Tensor.create(new long[ints.length + 1], IntBuffer.wrap(ints))) { fail("should have failed on incompatible buffer"); } catch (IllegalArgumentException e) { // expected } - try (Tensor t = Tensor.create(new long[longs.length + 1], LongBuffer.wrap(longs))) { + try (Tensor<Long> t = Tensor.create(new long[longs.length + 1], LongBuffer.wrap(longs))) { fail("should have failed on incompatible buffer"); } catch (IllegalArgumentException e) { // expected @@ -166,11 +169,11 @@ public class TensorTest { long[] longs = {1L, 2L, 3L}; boolean[] bools = {true, false, true}; - try (Tensor tints = Tensor.create(ints); - Tensor tfloats = Tensor.create(floats); - Tensor tdoubles = Tensor.create(doubles); - Tensor tlongs = Tensor.create(longs); - Tensor tbools = Tensor.create(bools)) { + try (Tensor<Integer> tints = Tensors.create(ints); + Tensor<Float> tfloats = Tensors.create(floats); + Tensor<Double> tdoubles = Tensors.create(doubles); + Tensor<Long> tlongs = Tensors.create(longs); + Tensor<Boolean> tbools = Tensors.create(bools)) { // validate that any datatype is readable with ByteBuffer (content, position) { @@ -293,35 +296,35 @@ public class TensorTest { @Test public void scalars() { - try (Tensor t = Tensor.create(2.718f)) { + try (Tensor<Float> t = Tensors.create(2.718f)) { assertEquals(DataType.FLOAT, t.dataType()); assertEquals(0, t.numDimensions()); assertEquals(0, t.shape().length); assertEquals(2.718f, t.floatValue(), EPSILON_F); } - try (Tensor t = Tensor.create(3.1415)) { + try (Tensor<Double> t = Tensors.create(3.1415)) { assertEquals(DataType.DOUBLE, t.dataType()); assertEquals(0, t.numDimensions()); assertEquals(0, t.shape().length); assertEquals(3.1415, t.doubleValue(), EPSILON); } - try (Tensor t = 
Tensor.create(-33)) { + try (Tensor<Integer> t = Tensors.create(-33)) { assertEquals(DataType.INT32, t.dataType()); assertEquals(0, t.numDimensions()); assertEquals(0, t.shape().length); assertEquals(-33, t.intValue()); } - try (Tensor t = Tensor.create(8589934592L)) { + try (Tensor<Long> t = Tensors.create(8589934592L)) { assertEquals(DataType.INT64, t.dataType()); assertEquals(0, t.numDimensions()); assertEquals(0, t.shape().length); assertEquals(8589934592L, t.longValue()); } - try (Tensor t = Tensor.create(true)) { + try (Tensor<Boolean> t = Tensors.create(true)) { assertEquals(DataType.BOOL, t.dataType()); assertEquals(0, t.numDimensions()); assertEquals(0, t.shape().length); @@ -329,7 +332,7 @@ public class TensorTest { } final byte[] bytes = {1, 2, 3, 4}; - try (Tensor t = Tensor.create(bytes)) { + try (Tensor<String> t = Tensors.create(bytes)) { assertEquals(DataType.STRING, t.dataType()); assertEquals(0, t.numDimensions()); assertEquals(0, t.shape().length); @@ -340,7 +343,7 @@ public class TensorTest { @Test public void nDimensional() { double[] vector = {1.414, 2.718, 3.1415}; - try (Tensor t = Tensor.create(vector)) { + try (Tensor<Double> t = Tensors.create(vector)) { assertEquals(DataType.DOUBLE, t.dataType()); assertEquals(1, t.numDimensions()); assertArrayEquals(new long[] {3}, t.shape()); @@ -350,7 +353,7 @@ public class TensorTest { } int[][] matrix = {{1, 2, 3}, {4, 5, 6}}; - try (Tensor t = Tensor.create(matrix)) { + try (Tensor<Integer> t = Tensors.create(matrix)) { assertEquals(DataType.INT32, t.dataType()); assertEquals(2, t.numDimensions()); assertArrayEquals(new long[] {2, 3}, t.shape()); @@ -362,7 +365,7 @@ public class TensorTest { long[][][] threeD = { {{1}, {3}, {5}, {7}, {9}}, {{2}, {4}, {6}, {8}, {0}}, }; - try (Tensor t = Tensor.create(threeD)) { + try (Tensor<Long> t = Tensors.create(threeD)) { assertEquals(DataType.INT64, t.dataType()); assertEquals(3, t.numDimensions()); assertArrayEquals(new long[] {2, 5, 1}, t.shape()); @@ 
-376,7 +379,7 @@ public class TensorTest { {{{false, false, true, true}, {false, true, false, false}}}, {{{false, true, false, true}, {false, true, true, false}}}, }; - try (Tensor t = Tensor.create(fourD)) { + try (Tensor<Boolean> t = Tensors.create(fourD)) { assertEquals(DataType.BOOL, t.dataType()); assertEquals(4, t.numDimensions()); assertArrayEquals(new long[] {3, 1, 2, 4}, t.shape()); @@ -394,7 +397,7 @@ public class TensorTest { matrix[i][j] = String.format("(%d, %d) = %d", i, j, i << j).getBytes(UTF_8); } } - try (Tensor t = Tensor.create(matrix)) { + try (Tensor<String> t = Tensors.create(matrix)) { assertEquals(DataType.STRING, t.dataType()); assertEquals(2, t.numDimensions()); assertArrayEquals(new long[] {4, 3}, t.shape()); @@ -412,14 +415,24 @@ public class TensorTest { @Test public void testUInt8Tensor() { - byte[] vector = new byte[] { 1, 2, 3, 4 }; - try (Tensor t = Tensor.create(vector, DataType.UINT8)) { + byte[] vector = new byte[] {1, 2, 3, 4}; + try (Tensor<UInt8> t = Tensor.create(vector, UInt8.class)) { assertEquals(DataType.UINT8, t.dataType()); assertEquals(1, t.numDimensions()); assertArrayEquals(new long[] {4}, t.shape()); byte[] got = t.copyTo(new byte[4]); - assertArrayEquals(got, vector); + assertArrayEquals(vector, got); + } + } + + @Test + public void testCreateFromArrayOfBoxed() { + Integer[] vector = new Integer[] {1, 2, 3, 4}; + try (Tensor<Integer> t = Tensor.create(vector, Integer.class)) { + fail("Tensor.create() should fail because it was given an array of boxed values"); + } catch (IllegalArgumentException e) { + // The expected exception } } @@ -431,7 +444,7 @@ public class TensorTest { invalid[x][y] = new int[x + y + 1]; } } - try (Tensor t = Tensor.create(invalid)) { + try (Tensor<?> t = Tensor.create(invalid)) { fail("Tensor.create() should fail because of differing sizes in the 3rd dimension"); } catch (IllegalArgumentException e) { // The expected exception. 
@@ -440,7 +453,7 @@ public class TensorTest { @Test public void failCopyToOnIncompatibleDestination() { - try (final Tensor matrix = Tensor.create(new int[][] {{1, 2}, {3, 4}})) { + try (final Tensor<Integer> matrix = Tensors.create(new int[][] {{1, 2}, {3, 4}})) { try { matrix.copyTo(new int[2]); fail("should have failed on dimension mismatch"); @@ -466,7 +479,7 @@ public class TensorTest { @Test public void failCopyToOnScalar() { - try (final Tensor scalar = Tensor.create(3)) { + try (final Tensor<Integer> scalar = Tensors.create(3)) { try { scalar.copyTo(3); fail("copyTo should fail on scalar tensors, suggesting use of primitive accessors instead"); @@ -478,8 +491,8 @@ public class TensorTest { @Test public void failOnArbitraryObject() { - try (Tensor t = Tensor.create(new Object())) { - fail("should fail on creating a Tensor with a Java object that has not equivalent DataType"); + try (Tensor<?> t = Tensor.create(new Object())) { + fail("should fail on creating a Tensor with a Java object that has no equivalent DataType"); } catch (IllegalArgumentException e) { // The expected exception. } @@ -487,7 +500,7 @@ public class TensorTest { @Test public void failOnZeroDimension() { - try (Tensor t = Tensor.create(new int[3][0][1])) { + try (Tensor<Integer> t = Tensors.create(new int[3][0][1])) { fail("should fail on creating a Tensor where one of the dimensions is 0"); } catch (IllegalArgumentException e) { // The expected exception. @@ -497,7 +510,7 @@ public class TensorTest { @Test public void useAfterClose() { int n = 4; - Tensor t = Tensor.create(n); + Tensor<?> t = Tensor.create(n); t.close(); try { t.intValue(); @@ -515,8 +528,8 @@ public class TensorTest { // An exception is made for this test, where the pitfalls of this is avoided by not calling // close() on both Tensors. 
final float[][] matrix = {{1, 2, 3}, {4, 5, 6}}; - try (Tensor src = Tensor.create(matrix)) { - Tensor cpy = Tensor.fromHandle(src.getNativeHandle()); + try (Tensor<Float> src = Tensors.create(matrix)) { + Tensor<Float> cpy = Tensor.fromHandle(src.getNativeHandle()).expect(Float.class); assertEquals(src.dataType(), cpy.dataType()); assertEquals(src.numDimensions(), cpy.numDimensions()); assertArrayEquals(src.shape(), cpy.shape()); diff --git a/tensorflow/java/src/test/java/org/tensorflow/TestUtil.java b/tensorflow/java/src/test/java/org/tensorflow/TestUtil.java index e3415a696d..c973b5a3d8 100644 --- a/tensorflow/java/src/test/java/org/tensorflow/TestUtil.java +++ b/tensorflow/java/src/test/java/org/tensorflow/TestUtil.java @@ -19,33 +19,36 @@ import java.lang.reflect.Array; /** Static utility functions. */ public class TestUtil { - public static Output constant(Graph g, String name, Object value) { - try (Tensor t = Tensor.create(value)) { + public static <T> Output<T> constant(Graph g, String name, Object value) { + try (Tensor<?> t = Tensor.create(value)) { return g.opBuilder("Const", name) .setAttr("dtype", t.dataType()) .setAttr("value", t) .build() - .output(0); + .<T>output(0); } } - public static Output placeholder(Graph g, String name, DataType dtype) { - return g.opBuilder("Placeholder", name).setAttr("dtype", dtype).build().output(0); + public static <T> Output<T> placeholder(Graph g, String name, Class<T> type) { + return g.opBuilder("Placeholder", name) + .setAttr("dtype", DataType.fromClass(type)) + .build() + .<T>output(0); } - public static Output addN(Graph g, Output... inputs) { + public static Output<?> addN(Graph g, Output<?>... 
inputs) { return g.opBuilder("AddN", "AddN").addInputList(inputs).build().output(0); } - public static Output matmul( - Graph g, String name, Output a, Output b, boolean transposeA, boolean transposeB) { + public static <T> Output<T> matmul( + Graph g, String name, Output<T> a, Output<T> b, boolean transposeA, boolean transposeB) { return g.opBuilder("MatMul", name) .addInput(a) .addInput(b) .setAttr("transpose_a", transposeA) .setAttr("transpose_b", transposeB) .build() - .output(0); + .<T>output(0); } public static Operation split(Graph g, String name, int[] values, int numSplit) { @@ -57,7 +60,8 @@ public class TestUtil { } public static void transpose_A_times_X(Graph g, int[][] a) { - matmul(g, "Y", constant(g, "A", a), placeholder(g, "X", DataType.INT32), true, false); + Output<Integer> aa = constant(g, "A", a); + matmul(g, "Y", aa, placeholder(g, "X", Integer.class), true, false); } /** diff --git a/tensorflow/java/src/test/java/org/tensorflow/op/OperandsTest.java b/tensorflow/java/src/test/java/org/tensorflow/op/OperandsTest.java index 4fdd150acc..79bfcc8354 100644 --- a/tensorflow/java/src/test/java/org/tensorflow/op/OperandsTest.java +++ b/tensorflow/java/src/test/java/org/tensorflow/op/OperandsTest.java @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 +http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -36,8 +36,9 @@ public class OperandsTest { public void createOutputArrayFromOperandList() { try (Graph g = new Graph()) { Operation split = TestUtil.split(g, "split", new int[] {0, 1, 2}, 3); - List<Output> list = Arrays.asList(split.output(0), split.output(2)); - Output[] array = Operands.asOutputs(list); + List<Output<Integer>> list = + Arrays.asList(split.<Integer>output(0), split.<Integer>output(2)); + Output<?>[] array = Operands.asOutputs(list); assertEquals(list.size(), array.length); assertSame(array[0], list.get(0)); assertSame(array[1], list.get(1)); diff --git a/tensorflow/java/src/test/java/org/tensorflow/op/PrimitiveOpTest.java b/tensorflow/java/src/test/java/org/tensorflow/op/PrimitiveOpTest.java index b24bf5a476..e02c38ed22 100644 --- a/tensorflow/java/src/test/java/org/tensorflow/op/PrimitiveOpTest.java +++ b/tensorflow/java/src/test/java/org/tensorflow/op/PrimitiveOpTest.java @@ -36,7 +36,7 @@ public class PrimitiveOpTest { @Test public void equalsHashcode() { try (Graph g = new Graph()) { - Output array = TestUtil.constant(g, "array", new int[2]); + Output<Integer> array = TestUtil.constant(g, "array", new int[2]); PrimitiveOp test1 = new PrimitiveOp(g.opBuilder("Shape", "shape1").addInput(array).build()) {}; diff --git a/tensorflow/java/src/test/java/org/tensorflow/op/ScopeTest.java b/tensorflow/java/src/test/java/org/tensorflow/op/ScopeTest.java index 9256cb281d..125de73554 100644 --- a/tensorflow/java/src/test/java/org/tensorflow/op/ScopeTest.java +++ b/tensorflow/java/src/test/java/org/tensorflow/op/ScopeTest.java @@ -19,6 +19,8 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.fail; +import 
java.util.HashMap; +import java.util.Map; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -26,6 +28,8 @@ import org.tensorflow.Graph; import org.tensorflow.Output; import org.tensorflow.Session; import org.tensorflow.Tensor; +import org.tensorflow.Tensors; +import org.tensorflow.types.UInt8; /** Unit tests for {@link org.tensorflow.Scope}. */ @RunWith(JUnit4.class) @@ -122,13 +126,13 @@ public class ScopeTest { public void basic() { try (Graph g = new Graph()) { Scope s = new Scope(g); - Const c1 = Const.create(s, 42); + Const<Integer> c1 = Const.create(s, 42); assertEquals("Const", c1.output().op().name()); - Const c2 = Const.create(s, 7); + Const<Integer> c2 = Const.create(s, 7); assertEquals("Const_1", c2.output().op().name()); - Const c3 = Const.create(s.withName("four"), 4); + Const<Integer> c3 = Const.create(s.withName("four"), 4); assertEquals("four", c3.output().op().name()); - Const c4 = Const.create(s.withName("four"), 4); + Const<Integer> c4 = Const.create(s.withName("four"), 4); assertEquals("four_1", c4.output().op().name()); } } @@ -148,122 +152,164 @@ public class ScopeTest { try (Graph g = new Graph(); Session sess = new Session(g)) { Scope s = new Scope(g); - Output data = Const.create(s.withName("data"), new int[] {600, 470, 170, 430, 300}).output(); + Output<Integer> data = + Const.create(s.withName("data"), new int[] {600, 470, 170, 430, 300}).output(); // Create a composite op with a customized name - Variance var1 = Variance.create(s.withName("example"), data); + Variance<Integer> var1 = Variance.create(s.withName("example"), data, Integer.class); assertEquals("example/variance", var1.output().op().name()); // Confirm internally added ops have the right names. 
assertNotNull(g.operation("example/squared_deviation")); assertNotNull(g.operation("example/Mean")); - assertNotNull(g.operation("example/zero")); + // assertNotNull(g.operation("example/zero")); // Same composite op with a default name - Variance var2 = Variance.create(s, data); + Variance<Integer> var2 = Variance.create(s, data, Integer.class); assertEquals("variance/variance", var2.output().op().name()); // Confirm internally added ops have the right names. assertNotNull(g.operation("variance/squared_deviation")); assertNotNull(g.operation("variance/Mean")); - assertNotNull(g.operation("variance/zero")); + // assertNotNull(g.operation("variance/zero")); // Verify correct results as well. - Tensor result = sess.runner().fetch(var1.output()).run().get(0); + Tensor<Integer> result = + sess.runner().fetch(var1.output()).run().get(0).expect(Integer.class); assertEquals(21704, result.intValue()); - result = sess.runner().fetch(var2.output()).run().get(0); + result = sess.runner().fetch(var2.output()).run().get(0).expect(Integer.class); assertEquals(21704, result.intValue()); } } // "handwritten" sample operator classes - private static final class Const { - private final Output output; + private static final class Const<T> { + private final Output<T> output; - static Const create(Scope s, Object v) { - try (Tensor value = Tensor.create(v)) { - return new Const( + static Const<Integer> create(Scope s, int v) { + return create(s, Tensors.create(v)); + } + + static Const<Integer> create(Scope s, int[] v) { + return create(s, Tensors.create(v)); + } + + static <T> Const<T> create(Scope s, Tensor<T> value) { + return new Const<T>( + s.graph() + .opBuilder("Const", s.makeOpName("Const")) + .setAttr("dtype", value.dataType()) + .setAttr("value", value) + .build() + .<T>output(0)); + } + + static <T> Const<T> create(Scope s, Object v, Class<T> type) { + try (Tensor<T> value = Tensor.create(v, type)) { + return new Const<T>( s.graph() .opBuilder("Const", s.makeOpName("Const")) 
.setAttr("dtype", value.dataType()) .setAttr("value", value) .build() - .output(0)); + .<T>output(0)); } } - Const(Output o) { + Const(Output<T> o) { output = o; } - Output output() { + Output<T> output() { return output; } } - private static final class Mean { - private final Output output; + private static final class Mean<T> { + private final Output<T> output; - static Mean create(Scope s, Output input, Output reductionIndices) { - return new Mean( + static <T> Mean<T> create(Scope s, Output<T> input, Output<T> reductionIndices) { + return new Mean<T>( s.graph() .opBuilder("Mean", s.makeOpName("Mean")) .addInput(input) .addInput(reductionIndices) .build() - .output(0)); + .<T>output(0)); } - Mean(Output o) { + Mean(Output<T> o) { output = o; } - Output output() { + Output<T> output() { return output; } } - private static final class SquaredDifference { - private final Output output; + private static final class SquaredDifference<T> { + private final Output<T> output; - static SquaredDifference create(Scope s, Output x, Output y) { - return new SquaredDifference( + static <T> SquaredDifference<T> create(Scope s, Output<T> x, Output<T> y) { + return new SquaredDifference<T>( s.graph() .opBuilder("SquaredDifference", s.makeOpName("SquaredDifference")) .addInput(x) .addInput(y) .build() - .output(0)); + .<T>output(0)); } - SquaredDifference(Output o) { + SquaredDifference(Output<T> o) { output = o; } - Output output() { + Output<T> output() { return output; } } - private static final class Variance { - private final Output output; + /** + * Returns the zero value of type described by {@code c}, or null if the type (e.g., string) is + * not numeric and therefore has no zero value. + * + * @param c The class describing the TensorFlow type of interest. 
+ */ + public static Object zeroValue(Class<?> c) { + return zeros.get(c); + } + + private static final Map<Class<?>, Object> zeros = new HashMap<>(); + + static { + zeros.put(Float.class, 0.0f); + zeros.put(Double.class, 0.0); + zeros.put(Integer.class, 0); + zeros.put(UInt8.class, (byte) 0); + zeros.put(Long.class, 0L); + zeros.put(Boolean.class, false); + zeros.put(String.class, null); // no zero value + } + + private static final class Variance<T> { + private final Output<T> output; - static Variance create(Scope base, Output x) { + static <T> Variance<T> create(Scope base, Output<T> x, Class<T> type) { Scope s = base.withSubScope("variance"); - Output zero = Const.create(s.withName("zero"), new int[] {0}).output(); - Output sqdiff = + Output<T> zero = Const.create(base, zeroValue(type), type).output(); + Output<T> sqdiff = SquaredDifference.create( s.withName("squared_deviation"), x, Mean.create(s, x, zero).output()) .output(); - return new Variance(Mean.create(s.withName("variance"), sqdiff, zero).output()); + return new Variance<T>(Mean.create(s.withName("variance"), sqdiff, zero).output()); } - Variance(Output o) { + Variance(Output<T> o) { output = o; } - Output output() { + Output<T> output() { return output; } } diff --git a/tensorflow/java/src/test/java/org/tensorflow/op/core/ConstantTest.java b/tensorflow/java/src/test/java/org/tensorflow/op/core/ConstantTest.java index ec23792485..ca54214e06 100644 --- a/tensorflow/java/src/test/java/org/tensorflow/op/core/ConstantTest.java +++ b/tensorflow/java/src/test/java/org/tensorflow/op/core/ConstantTest.java @@ -29,7 +29,6 @@ import java.nio.LongBuffer; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; -import org.tensorflow.DataType; import org.tensorflow.Graph; import org.tensorflow.Session; import org.tensorflow.Tensor; @@ -47,8 +46,9 @@ public class ConstantTest { try (Graph g = new Graph(); Session sess = new Session(g)) { Scope scope = new Scope(g); - Constant op = 
Constant.create(scope, shape, IntBuffer.wrap(ints)); - Tensor result = sess.runner().fetch(op.asOutput()).run().get(0); + Constant<Integer> op = Constant.create(scope, shape, IntBuffer.wrap(ints)); + Tensor<Integer> result = sess.runner().fetch(op.asOutput()) + .run().get(0).expect(Integer.class); int[] actual = new int[ints.length]; assertArrayEquals(ints, result.copyTo(actual)); } @@ -62,8 +62,8 @@ public class ConstantTest { try (Graph g = new Graph(); Session sess = new Session(g)) { Scope scope = new Scope(g); - Constant op = Constant.create(scope, shape, FloatBuffer.wrap(floats)); - Tensor result = sess.runner().fetch(op.asOutput()).run().get(0); + Constant<Float> op = Constant.create(scope, shape, FloatBuffer.wrap(floats)); + Tensor<Float> result = sess.runner().fetch(op.asOutput()).run().get(0).expect(Float.class); float[] actual = new float[floats.length]; assertArrayEquals(floats, result.copyTo(actual), EPSILON); } @@ -77,8 +77,8 @@ public class ConstantTest { try (Graph g = new Graph(); Session sess = new Session(g)) { Scope scope = new Scope(g); - Constant op = Constant.create(scope, shape, DoubleBuffer.wrap(doubles)); - Tensor result = sess.runner().fetch(op.asOutput()).run().get(0); + Constant<Double> op = Constant.create(scope, shape, DoubleBuffer.wrap(doubles)); + Tensor<Double> result = sess.runner().fetch(op.asOutput()).run().get(0).expect(Double.class); double[] actual = new double[doubles.length]; assertArrayEquals(doubles, result.copyTo(actual), EPSILON); } @@ -92,8 +92,8 @@ public class ConstantTest { try (Graph g = new Graph(); Session sess = new Session(g)) { Scope scope = new Scope(g); - Constant op = Constant.create(scope, shape, LongBuffer.wrap(longs)); - Tensor result = sess.runner().fetch(op.asOutput()).run().get(0); + Constant<Long> op = Constant.create(scope, shape, LongBuffer.wrap(longs)); + Tensor<Long> result = sess.runner().fetch(op.asOutput()).run().get(0).expect(Long.class); long[] actual = new long[longs.length]; 
assertArrayEquals(longs, result.copyTo(actual)); } @@ -123,8 +123,8 @@ public class ConstantTest { try (Graph g = new Graph(); Session sess = new Session(g)) { Scope scope = new Scope(g); - Constant op = Constant.create(scope, DataType.STRING, shape, ByteBuffer.wrap(content)); - Tensor result = sess.runner().fetch(op.asOutput()).run().get(0); + Constant<String> op = Constant.create(scope, String.class, shape, ByteBuffer.wrap(content)); + Tensor<String> result = sess.runner().fetch(op.asOutput()).run().get(0).expect(String.class); assertArrayEquals(data, result.bytesValue()); } } diff --git a/tensorflow/python/debug/lib/debug_graphs.py b/tensorflow/python/debug/lib/debug_graphs.py index 486e659158..87033d53a4 100644 --- a/tensorflow/python/debug/lib/debug_graphs.py +++ b/tensorflow/python/debug/lib/debug_graphs.py @@ -231,8 +231,8 @@ def _infer_device_name(graph_def): break if device_name is None: logging.warn( - "Failed to infer device name from partiton GraphDef: none of the nodes " - "of the GraphDef has a non-empty device name.") + "Failed to infer device name from partition GraphDef: none of the " + "nodes of the GraphDef has a non-empty device name.") return device_name diff --git a/tensorflow/python/estimator/inputs/queues/feeding_functions.py b/tensorflow/python/estimator/inputs/queues/feeding_functions.py index d7fe4bbfa1..c0a287e922 100644 --- a/tensorflow/python/estimator/inputs/queues/feeding_functions.py +++ b/tensorflow/python/estimator/inputs/queues/feeding_functions.py @@ -49,7 +49,7 @@ except ImportError: def _fill_array(arr, seq, fillvalue=0): """ Recursively fills padded arr with elements from seq. - If lenght of seq is less then arr padded length, fillvalue used. + If length of seq is less than arr padded length, fillvalue used. Args: arr: Padded tensor of shape [batch_size, ..., max_padded_dim_len]. 
diff --git a/tensorflow/python/keras/_impl/keras/engine/topology_test.py b/tensorflow/python/keras/_impl/keras/engine/topology_test.py index 97bef2965c..32e692ba7c 100644 --- a/tensorflow/python/keras/_impl/keras/engine/topology_test.py +++ b/tensorflow/python/keras/_impl/keras/engine/topology_test.py @@ -200,7 +200,7 @@ class TopologyConstructionTest(test.TestCase): with self.assertRaises(ValueError): _ = keras.layers.Input(shape=(32,), batch_shape=(10, 32)) with self.assertRaises(ValueError): - _ = keras.layers.Input(shape=(32,), unknwon_kwarg=None) + _ = keras.layers.Input(shape=(32,), unknown_kwarg=None) self.assertListEqual(a.get_shape().as_list(), [None, 32]) a_layer, a_node_index, a_tensor_index = a._keras_history diff --git a/tensorflow/python/kernel_tests/conv2d_transpose_test.py b/tensorflow/python/kernel_tests/conv2d_transpose_test.py index 18184a0ee0..7d0bc54b69 100644 --- a/tensorflow/python/kernel_tests/conv2d_transpose_test.py +++ b/tensorflow/python/kernel_tests/conv2d_transpose_test.py @@ -24,8 +24,12 @@ from six.moves import xrange # pylint: disable=redefined-builtin from tensorflow.python.client import device_lib from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import nn_ops +from tensorflow.python.ops import random_ops +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables import tensorflow.python.ops.nn_grad # pylint: disable=unused-import from tensorflow.python.platform import test @@ -289,6 +293,16 @@ class Conv2DTransposeTest(test.TestCase): self.assertAllClose(cache_values, value) + def testConv2DTransposeShapeInference(self): + # Test case for 8972 + initializer = random_ops.truncated_normal( + [3, 3, 5, 1], mean=0.0, stddev=0.01, dtype=dtypes.float32) + x = variables.Variable(random_ops.random_normal([3, 10, 5, 1])) + f = 
variable_scope.get_variable("f", initializer=initializer) + f_shape = array_ops.stack([array_ops.shape(x)[0], 10, 5, 5]) + output = nn_ops.conv2d_transpose( + x, f, f_shape, strides=[1, 1, 1, 1], padding="SAME") + self.assertEqual(output.get_shape().as_list(), [None, 10, 5, 5]) if __name__ == "__main__": test.main() diff --git a/tensorflow/python/kernel_tests/decode_csv_op_test.py b/tensorflow/python/kernel_tests/decode_csv_op_test.py index 3853379328..7d9e57c8e5 100644 --- a/tensorflow/python/kernel_tests/decode_csv_op_test.py +++ b/tensorflow/python/kernel_tests/decode_csv_op_test.py @@ -116,6 +116,17 @@ class DecodeCSVOpTest(test.TestCase): self._test(args, expected_out) + def testNA(self): + args = { + "records": ["2.0,NA,aa", "NA,5,bb", "3,6,NA"], + "record_defaults": [[0.0], [0], [""]], + "na_value": "NA" + } + + expected_out = [[2.0, 0.0, 3], [0, 5, 6], [b"aa", b"bb", b""]] + + self._test(args, expected_out) + def testWithDefaults(self): args = { "records": [",1,", "0.2,3,bcd", "3.0,,"], diff --git a/tensorflow/python/kernel_tests/summary_tensor_op_test.py b/tensorflow/python/kernel_tests/summary_tensor_op_test.py index 3584637865..d534aadb79 100644 --- a/tensorflow/python/kernel_tests/summary_tensor_op_test.py +++ b/tensorflow/python/kernel_tests/summary_tensor_op_test.py @@ -154,7 +154,7 @@ class SummaryOpsTest(test.TestCase): self.assertEqual(descr.display_name, "my name") self.assertEqual(descr.summary_description, "my description") - # If both SummmaryMetadata and explicit args are provided, the args win + # If both SummaryMetadata and explicit args are provided, the args win overwrite = summary_ops.tensor_summary( "simple", const, diff --git a/tensorflow/python/ops/hidden_ops.txt b/tensorflow/python/ops/hidden_ops.txt index 6e7122db5e..d27e867583 100644 --- a/tensorflow/python/ops/hidden_ops.txt +++ b/tensorflow/python/ops/hidden_ops.txt @@ -207,6 +207,7 @@ TextLineReaderV2 TFRecordReaderV2 WholeFileReaderV2 LMDBReader +DecodeCSV # linalg_ops 
BatchCholesky diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py index c5fd15bae4..ea7132791c 100644 --- a/tensorflow/python/ops/parsing_ops.py +++ b/tensorflow/python/ops/parsing_ops.py @@ -1166,3 +1166,42 @@ def _parse_single_sequence_example_raw(serialized, feature_list_sparse_tensors + feature_list_dense_values)) return (context_output, feature_list_output) + + +# Swap `name` and `na_value` for backward compatibility. +def decode_csv(records, record_defaults, field_delim=",", + use_quote_delim=True, name=None, na_value=""): + # pylint: disable=protected-access + """Convert CSV records to tensors. Each column maps to one tensor. + + RFC 4180 format is expected for the CSV records. + (https://tools.ietf.org/html/rfc4180) + Note that we allow leading and trailing spaces with int or float field. + + Args: + records: A `Tensor` of type `string`. + Each string is a record/row in the csv and all records should have + the same format. + record_defaults: A list of `Tensor` objects with specific types. + Acceptable types are `float32`, `int32`, `int64`, `string`. + One tensor per column of the input record, with either a + scalar default value for that column or empty if the column is required. + field_delim: An optional `string`. Defaults to `","`. + char delimiter to separate fields in a record. + use_quote_delim: An optional `bool`. Defaults to `True`. + If false, treats double quotation marks as regular + characters inside of the string fields (ignoring RFC 4180, Section 2, + Bullet 5). + name: A name for the operation (optional). + na_value: Additional string to recognize as NA/NaN. + + Returns: + A list of `Tensor` objects. Has the same type as `record_defaults`. + Each tensor will have the same shape as records. + """ + # TODO(martinwicke), remove the wrapper when new Python API generator is done. 
+ return gen_parsing_ops._decode_csv( + records=records, record_defaults=record_defaults, + field_delim=field_delim, use_quote_delim=use_quote_delim, + na_value=na_value, name=name) + # pylint: enable=protected-access diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index bf8380ebbd..0a1a748c40 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -562,7 +562,7 @@ static bool TensorOpMathEnabled() { bool ret; TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DISABLE_TENSOR_OP_MATH", /*default=*/false, &ret)); - return ret; + return !ret; }(); return is_enabled; } @@ -2474,58 +2474,73 @@ struct WinogradNonfused { }; bool CudnnSupport::GetConvolveAlgorithms( - bool with_winograd_nonfused, - std::vector<dnn::AlgorithmDesc::Index>* out_algorithms) { - out_algorithms->assign({ - // clang-format off - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, - CUDNN_CONVOLUTION_FWD_ALGO_FFT, + bool with_winograd_nonfused, int cc_major, int cc_minor, + std::vector<dnn::AlgorithmDesc>* out_algorithms) { + std::vector<dnn::AlgorithmDesc::Index> algo_types = { + // clang-format off + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, + CUDNN_CONVOLUTION_FWD_ALGO_FFT, #if CUDNN_VERSION >= 5000 - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, #endif - // clang-format on - }); + // clang-format on + }; if (CudnnEnvVar<FftTilingForward>::IsEnabled()) { - out_algorithms->push_back(CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING); + algo_types.push_back(CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING); } #if CUDNN_VERSION >= 5100 if (CudnnEnvVar<WinogradNonfused>::IsEnabled() && with_winograd_nonfused) { - 
out_algorithms->push_back(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED); + algo_types.push_back(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED); } #endif + + out_algorithms->clear(); + for (auto i : algo_types) { + out_algorithms->push_back({i, /*use_tensor_ops=*/false}); + if (cc_major >= 7 && CUDNN_VERSION >= 7000 && TensorOpMathEnabled()) { + out_algorithms->push_back({i, /*use_tensor_ops=*/true}); + } + } return true; } bool CudnnSupport::GetConvolveBackwardDataAlgorithms( - bool with_winograd_nonfused, - std::vector<dnn::AlgorithmDesc::Index>* out_algorithms) { - out_algorithms->assign({ - // clang-format off - CUDNN_CONVOLUTION_BWD_DATA_ALGO_0, - CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT, - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING, + bool with_winograd_nonfused, int cc_major, int cc_minor, + std::vector<dnn::AlgorithmDesc>* out_algorithms) { + std::vector<dnn::AlgorithmDesc::Index> algo_types = { + // clang-format off + CUDNN_CONVOLUTION_BWD_DATA_ALGO_0, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING, #if CUDNN_VERSION >= 5000 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD, #endif - // clang-format on - }); + // clang-format on + }; #if CUDNN_VERSION >= 5100 if (CudnnEnvVar<WinogradNonfused>::IsEnabled() && with_winograd_nonfused) { - out_algorithms->push_back( - CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED); + algo_types.push_back(CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED); } #endif + + out_algorithms->clear(); + for (auto i : algo_types) { + out_algorithms->push_back({i, /*use_tensor_ops=*/false}); + if (cc_major >= 7 && CUDNN_VERSION >= 7000 && TensorOpMathEnabled()) { + out_algorithms->push_back({i, /*use_tensor_ops=*/true}); + } + } return true; } bool CudnnSupport::GetConvolveBackwardFilterAlgorithms( - bool with_winograd_nonfused, - std::vector<dnn::AlgorithmDesc::Index>* out_algorithms) { - 
out_algorithms->assign({ + bool with_winograd_nonfused, int cc_major, int cc_minor, + std::vector<dnn::AlgorithmDesc>* out_algorithms) { + std::vector<dnn::AlgorithmDesc::Index> algo_types = { // clang-format off CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, @@ -2534,13 +2549,20 @@ bool CudnnSupport::GetConvolveBackwardFilterAlgorithms( // Based on cudnn.h, the following is not implemented. // CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD, // clang-format on - }); + }; #if CUDNN_VERSION >= 5110 if (CudnnEnvVar<WinogradNonfused>::IsEnabled() && with_winograd_nonfused) { - out_algorithms->push_back( - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED); + algo_types.push_back(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED); } #endif + + out_algorithms->clear(); + for (auto i : algo_types) { + out_algorithms->push_back({i, /*use_tensor_ops=*/false}); + if (cc_major >= 7 && CUDNN_VERSION >= 7000 && TensorOpMathEnabled()) { + out_algorithms->push_back({i, /*use_tensor_ops=*/true}); + } + } return true; } diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h index beb2f7d050..8d7069a902 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.h +++ b/tensorflow/stream_executor/cuda/cuda_dnn.h @@ -145,16 +145,16 @@ class CudnnSupport : public dnn::DnnSupport { ScratchAllocator* workspace_allocator) override; bool GetConvolveAlgorithms( - bool with_winograd_nonfused, - std::vector<dnn::AlgorithmDesc::Index>* out_algorithms) override; + bool with_winograd_nonfused, int cc_major, int cc_minor, + std::vector<dnn::AlgorithmDesc>* out_algorithms) override; bool GetConvolveBackwardDataAlgorithms( - bool with_winograd_nonfused, - std::vector<dnn::AlgorithmDesc::Index>* out_algorithms) override; + bool with_winograd_nonfused, int cc_major, int cc_minor, + std::vector<dnn::AlgorithmDesc>* out_algorithms) override; bool GetConvolveBackwardFilterAlgorithms( - bool with_winograd_nonfused, - 
std::vector<dnn::AlgorithmDesc::Index>* out_algorithms) override; + bool with_winograd_nonfused, int cc_major, int cc_minor, + std::vector<dnn::AlgorithmDesc>* out_algorithms) override; bool DoBatchNormalizationForward( Stream* stream, const DeviceMemory<float>& x, diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc index 2c40e18f5c..07fe8a85f4 100644 --- a/tensorflow/stream_executor/dnn.cc +++ b/tensorflow/stream_executor/dnn.cc @@ -23,20 +23,20 @@ namespace gputools { namespace dnn { bool DnnSupport::GetConvolveAlgorithms( - bool with_winograd_nonfused, - std::vector<AlgorithmDesc::Index>* out_algorithms) { + bool with_winograd_nonfused, int cc_major, int cc_minor, + std::vector<AlgorithmDesc>* out_algorithms) { return false; } bool DnnSupport::GetConvolveBackwardDataAlgorithms( - bool with_winograd_nonfused, - std::vector<AlgorithmDesc::Index>* out_algorithms) { + bool with_winograd_nonfused, int cc_major, int cc_minor, + std::vector<AlgorithmDesc>* out_algorithms) { return false; } bool DnnSupport::GetConvolveBackwardFilterAlgorithms( - bool with_winograd_nonfused, - std::vector<AlgorithmDesc::Index>* out_algorithms) { + bool with_winograd_nonfused, int cc_major, int cc_minor, + std::vector<AlgorithmDesc>* out_algorithms) { return false; } diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h index 5fe523602a..624357b82f 100644 --- a/tensorflow/stream_executor/dnn.h +++ b/tensorflow/stream_executor/dnn.h @@ -1183,8 +1183,8 @@ class DnnSupport { // Return a list of algorithms supported by the forward convolution pass. virtual bool GetConvolveAlgorithms( - bool with_winograd_nonfused, - std::vector<AlgorithmDesc::Index>* out_algorithms); + bool with_winograd_nonfused, int cc_major, int cc_minor, + std::vector<AlgorithmDesc>* out_algorithms); // Version of DoConvolve that uses pre-quantized 8 bit coefficients. 
// coefficient_scales specifies the scaling of each column of coefficients: @@ -1263,8 +1263,8 @@ class DnnSupport { // Return a list of algorithms supported by the backward convolution pass for // data. virtual bool GetConvolveBackwardDataAlgorithms( - bool with_winograd_nonfused, - std::vector<AlgorithmDesc::Index>* out_algorithms); + bool with_winograd_nonfused, int cc_major, int cc_minor, + std::vector<AlgorithmDesc>* out_algorithms); virtual bool DoConvolveBackwardData( Stream* stream, const FilterDescriptor& filter_descriptor, @@ -1312,8 +1312,8 @@ class DnnSupport { // Return a list of algorithms supported by the backward convolution pass for // filters. virtual bool GetConvolveBackwardFilterAlgorithms( - bool with_winograd_nonfused, - std::vector<AlgorithmDesc::Index>* out_algorithms); + bool with_winograd_nonfused, int cc_major, int cc_minor, + std::vector<AlgorithmDesc>* out_algorithms); virtual bool DoConvolveBackwardFilter( Stream* stream, const BatchDescriptor& input_descriptor, diff --git a/tensorflow/stream_executor/platform.h b/tensorflow/stream_executor/platform.h index ed12982e30..f0a0e60e02 100644 --- a/tensorflow/stream_executor/platform.h +++ b/tensorflow/stream_executor/platform.h @@ -96,7 +96,7 @@ class Platform { // each platform is required to expose an ID to ensure unique registration and // as a target against which plugins can register. // - // The macro below is provided to help generate a [process-unique] identifer. + // The macro below is provided to help generate a [process-unique] identifier. using Id = void*; // Helper macro to define a plugin ID. 
To be used only inside plugin diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h index a72ee804c1..21172d5a16 100644 --- a/tensorflow/stream_executor/stream.h +++ b/tensorflow/stream_executor/stream.h @@ -70,7 +70,7 @@ class BatchDescriptor; class FilterDescriptor; class ConvolutionDescriptor; class ProfileResult; -struct AlgorithmDesc; +class AlgorithmDesc; } // namespace dnn class StreamExecutor; diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc index 199a908914..9bbfe7f04a 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.cc +++ b/tensorflow/stream_executor/stream_executor_pimpl.cc @@ -286,35 +286,41 @@ bool StreamExecutor::SupportsDnn() const { bool StreamExecutor::GetConvolveAlgorithms( bool with_winograd_nonfused, - std::vector<dnn::AlgorithmDesc::Index> *out_algorithms) { + std::vector<dnn::AlgorithmDesc> *out_algorithms) { dnn::DnnSupport *dnn_support = AsDnn(); if (!dnn_support) { return false; } - return dnn_support->GetConvolveAlgorithms(with_winograd_nonfused, - out_algorithms); + int cc_major, cc_minor; + GetDeviceDescription().cuda_compute_capability(&cc_major, &cc_minor); + return dnn_support->GetConvolveAlgorithms(with_winograd_nonfused, cc_major, + cc_minor, out_algorithms); } bool StreamExecutor::GetConvolveBackwardDataAlgorithms( bool with_winograd_nonfused, - std::vector<dnn::AlgorithmDesc::Index> *out_algorithms) { + std::vector<dnn::AlgorithmDesc> *out_algorithms) { dnn::DnnSupport *dnn_support = AsDnn(); if (!dnn_support) { return false; } - return dnn_support->GetConvolveBackwardDataAlgorithms(with_winograd_nonfused, - out_algorithms); + int cc_major, cc_minor; + GetDeviceDescription().cuda_compute_capability(&cc_major, &cc_minor); + return dnn_support->GetConvolveBackwardDataAlgorithms( + with_winograd_nonfused, cc_major, cc_minor, out_algorithms); } bool StreamExecutor::GetConvolveBackwardFilterAlgorithms( bool 
with_winograd_nonfused, - std::vector<dnn::AlgorithmDesc::Index> *out_algorithms) { + std::vector<dnn::AlgorithmDesc> *out_algorithms) { dnn::DnnSupport *dnn_support = AsDnn(); if (!dnn_support) { return false; } + int cc_major, cc_minor; + GetDeviceDescription().cuda_compute_capability(&cc_major, &cc_minor); return dnn_support->GetConvolveBackwardFilterAlgorithms( - with_winograd_nonfused, out_algorithms); + with_winograd_nonfused, cc_major, cc_minor, out_algorithms); } bool StreamExecutor::GetBlasGemmAlgorithms( diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h index 98136a92a0..f354317a6e 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.h +++ b/tensorflow/stream_executor/stream_executor_pimpl.h @@ -343,20 +343,19 @@ class StreamExecutor { bool SupportsDnn() const; // Get the list of supported algorithms for the forward convolution opeartion. - bool GetConvolveAlgorithms( - bool with_winograd_nonfused, - std::vector<dnn::AlgorithmDesc::Index> *out_algorithms); + bool GetConvolveAlgorithms(bool with_winograd_nonfused, + std::vector<dnn::AlgorithmDesc> *out_algorithms); // Get the list of supported algorithms for the backward convolution on data. bool GetConvolveBackwardDataAlgorithms( bool with_winograd_nonfused, - std::vector<dnn::AlgorithmDesc::Index> *out_algorithms); + std::vector<dnn::AlgorithmDesc> *out_algorithms); // Get the list of supported algorithms for the backward convolution on the // filter. bool GetConvolveBackwardFilterAlgorithms( bool with_winograd_nonfused, - std::vector<dnn::AlgorithmDesc::Index> *out_algorithms); + std::vector<dnn::AlgorithmDesc> *out_algorithms); // Get the list of supported algorithms for BLAS gemm. 
bool GetBlasGemmAlgorithms(std::vector<blas::AlgorithmType> *out_algorithms); diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index a308688790..0f074151db 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -526,6 +526,7 @@ def tf_cc_test(name, extra_copts=[], suffix="", linkopts=[], + nocopts=None, **kwargs): native.cc_test( name="%s%s" % (name, suffix), @@ -547,6 +548,7 @@ def tf_cc_test(name, clean_dep("//tensorflow:darwin"): 1, "//conditions:default": 0, }), + nocopts=nocopts, **kwargs) @@ -649,7 +651,8 @@ def tf_cc_tests(srcs, tags=[], size="medium", args=None, - linkopts=[]): + linkopts=[], + nocopts=None): for src in srcs: tf_cc_test( name=src_to_test_name(src), @@ -659,7 +662,8 @@ def tf_cc_tests(srcs, tags=tags, size=size, args=args, - linkopts=linkopts) + linkopts=linkopts, + nocopts=nocopts) def tf_cc_test_mkl(srcs, @@ -669,7 +673,7 @@ def tf_cc_test_mkl(srcs, tags=[], size="medium", args=None): - if_mkl(tf_cc_tests(srcs, deps, linkstatic, tags=tags, size=size, args=args)) + if_mkl(tf_cc_tests(srcs, deps, name, linkstatic=linkstatic, tags=tags, size=size, args=args, nocopts="-fno-exceptions")) def tf_cc_tests_gpu(srcs, @@ -867,18 +871,33 @@ def tf_mkl_kernel_library(name, deps=None, alwayslink=1, copts=tf_copts(), + nocopts="-fno-exceptions", **kwargs): + """A rule to build MKL-based TensorFlow kernel libraries.""" + gpu_srcs = gpu_srcs # unused argument + kwargs = kwargs # unused argument + + if not bool(srcs): + srcs = [] + if not bool(hdrs): + hdrs = [] + + if prefix: + srcs = srcs + native.glob( + [prefix + "*.cc"]) + hdrs = hdrs + native.glob( + [prefix + "*.h"]) + if_mkl( - tf_kernel_library( - name, - prefix=prefix, + native.cc_library( + name=name, srcs=srcs, - gpu_srcs=gpu_srcs, hdrs=hdrs, deps=deps, alwayslink=alwayslink, copts=copts, - **kwargs)) + nocopts=nocopts + )) # Bazel rules for building swig files. 
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt index 32a86e420a..6e03f9e8fb 100644 --- a/tensorflow/tools/api/golden/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.pbtxt @@ -874,7 +874,7 @@ tf_module { } member_method { name: "decode_csv" - argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'name\'], varargs=None, keywords=None, defaults=[\',\', \'True\', \'None\'], " + argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'name\', \'na_value\'], varargs=None, keywords=None, defaults=[\',\', \'True\', \'None\', \'\'], " } member_method { name: "decode_json_example" diff --git a/tensorflow/tools/ci_build/install/install_golang.sh b/tensorflow/tools/ci_build/install/install_golang.sh index 88bc2960e3..596265b069 100755 --- a/tensorflow/tools/ci_build/install/install_golang.sh +++ b/tensorflow/tools/ci_build/install/install_golang.sh @@ -16,7 +16,7 @@ set -ex -GOLANG_URL="https://storage.googleapis.com/golang/go1.8.3.linux-amd64.tar.gz" +GOLANG_URL="https://storage.googleapis.com/golang/go1.9.linux-amd64.tar.gz" sudo mkdir -p /usr/local wget -q -O - "${GOLANG_URL}" | sudo tar -C /usr/local -xz diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu index f5364d803a..04773376e9 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-gpu +++ b/tensorflow/tools/docker/Dockerfile.devel-gpu @@ -78,10 +78,12 @@ WORKDIR /tensorflow # Configure the build for our CUDA configuration. 
ENV CI_BUILD_PYTHON python -ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH +ENV LD_LIBRARY_PATH /usr/local/cuda/lib64/stubs:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH ENV TF_NEED_CUDA 1 ENV TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2,6.0,6.1 +RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 + RUN tensorflow/tools/ci_build/builds/configured GPU \ bazel build -c opt --config=cuda --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \ tensorflow/tools/pip_package:build_pip_package && \ diff --git a/tensorflow/tools/docker/jupyter_notebook_config.py b/tensorflow/tools/docker/jupyter_notebook_config.py index 747beb8251..0acbf6fcee 100644 --- a/tensorflow/tools/docker/jupyter_notebook_config.py +++ b/tensorflow/tools/docker/jupyter_notebook_config.py @@ -18,7 +18,6 @@ from IPython.lib import passwd c.NotebookApp.ip = '*' c.NotebookApp.port = int(os.getenv('PORT', 8888)) c.NotebookApp.open_browser = False -c.MultiKernelManager.default_kernel_name = 'python2' # sets a password if PASSWORD is set in the environment if 'PASSWORD' in os.environ: diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py index ca3b778c29..1015103077 100644 --- a/tensorflow/tools/docs/parser.py +++ b/tensorflow/tools/docs/parser.py @@ -923,7 +923,7 @@ class _ClassPageInfo(object): """Sets the `aliases` list. Args: - aliases: A list of strings. Containing all the obejct's full names. + aliases: A list of strings. Containing all the object's full names. """ assert self.aliases is None self._aliases = aliases @@ -1438,7 +1438,7 @@ class _PythonBuiltin(object): class _PythonFile(object): """This class indicates that the object is defined in a regular python file. - This can be used for the `defined_in` slot of the `PageInfo` obejcts. + This can be used for the `defined_in` slot of the `PageInfo` objects. 
""" def __init__(self, path, parser_config): diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions_lib_test.cc b/tensorflow/tools/proto_text/gen_proto_text_functions_lib_test.cc index 81f85e0009..6f0b4f47de 100644 --- a/tensorflow/tools/proto_text/gen_proto_text_functions_lib_test.cc +++ b/tensorflow/tools/proto_text/gen_proto_text_functions_lib_test.cc @@ -93,13 +93,15 @@ TEST(CreateProtoDebugStringLibTest, ValidSimpleTypes) { proto.set_optional_int64(std::numeric_limits<protobuf_int64>::max()); proto.set_optional_uint32(std::numeric_limits<uint32>::max()); proto.set_optional_uint64(std::numeric_limits<uint64>::max()); - proto.set_optional_float(std::numeric_limits<float>::max()); + // TODO(b/67475677): Re-enable after resolving float precision issue + // proto.set_optional_float(std::numeric_limits<float>::max()); proto.set_optional_double(std::numeric_limits<double>::max()); EXPECT_TEXT_TRANSFORMS_MATCH(); // Least positive numeric values. proto.Clear(); - proto.set_optional_float(std::numeric_limits<float>::min()); + // TODO(b/67475677): Re-enable after resolving float precision issue + // proto.set_optional_float(std::numeric_limits<float>::min()); proto.set_optional_double(std::numeric_limits<double>::min()); EXPECT_TEXT_TRANSFORMS_MATCH(); @@ -107,7 +109,8 @@ TEST(CreateProtoDebugStringLibTest, ValidSimpleTypes) { proto.Clear(); proto.set_optional_int32(std::numeric_limits<int32>::lowest()); proto.set_optional_int64(std::numeric_limits<protobuf_int64>::lowest()); - proto.set_optional_float(std::numeric_limits<float>::lowest()); + // TODO(b/67475677): Re-enable after resolving float precision issue + // proto.set_optional_float(std::numeric_limits<float>::lowest()); proto.set_optional_double(std::numeric_limits<double>::lowest()); EXPECT_TEXT_TRANSFORMS_MATCH(); diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index b226184261..de0084613b 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -171,6 +171,17 @@ def 
tf_workspace(path_prefix="", tf_repo_name=""): "and will be removed in the future.") native.new_http_archive( + name = "mkl_dnn", + urls = [ + "https://github.com/01org/mkl-dnn/archive/b01e3a55a07be62172e713bcd2644c5176360212.tar.gz", + "http://mirror.bazel.build/github.com/01org/mkl-dnn/archive/b01e3a55a07be62172e713bcd2644c5176360212.tar.gz", + ], + sha256 = "0d529ad4c49dc799e6df07c2b88b115d0668735da15fb3b3862d28d33fa68165", + strip_prefix = "mkl-dnn-b01e3a55a07be62172e713bcd2644c5176360212", + build_file = str(Label("//third_party/mkl_dnn:mkldnn.BUILD")), + ) + + native.new_http_archive( name = "eigen_archive", urls = [ "https://bitbucket.org/eigen/eigen/get/429aa5254200.tar.gz", @@ -373,10 +384,10 @@ def tf_workspace(path_prefix="", tf_repo_name=""): patched_http_archive( name = "protobuf_archive", urls = [ - "http://mirror.bazel.build/github.com/google/protobuf/archive/0b059a3d8a8f8aa40dde7bea55edca4ec5dfea66.tar.gz", + "http://mirror.bazel.build/github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz", ], - sha256 = "6d43b9d223ce09e5d4ce8b0060cb8a7513577a35a64c7e3dad10f0703bf3ad93", - strip_prefix = "protobuf-0b059a3d8a8f8aa40dde7bea55edca4ec5dfea66", + sha256 = "e178a25c52efcb6b05988bdbeace4c0d3f2d2fe5b46696d1d9898875c3803d6a", + strip_prefix = "protobuf-b04e5cba356212e4e8c66c61bbe0c3a20537c5b9", # TODO: remove patching when tensorflow stops linking same protos into # multiple shared libraries loaded in runtime by python. 
# This patch fixes a runtime crash when tensorflow is compiled diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index baa6e01bca..31a4bfabf6 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -117,7 +117,7 @@ def get_cxx_inc_directories(repository_ctx, cc): includes_cpp = _get_cxx_inc_directories_impl(repository_ctx, cc, True) includes_c = _get_cxx_inc_directories_impl(repository_ctx, cc, False) - includes_cpp_set = set(includes_cpp) + includes_cpp_set = depset(includes_cpp) return includes_cpp + [inc for inc in includes_c if inc not in includes_cpp_set] diff --git a/third_party/mkl_dnn/BUILD b/third_party/mkl_dnn/BUILD new file mode 100644 index 0000000000..5b01f6e3e4 --- /dev/null +++ b/third_party/mkl_dnn/BUILD @@ -0,0 +1 @@ +licenses(["notice"]) diff --git a/third_party/mkl_dnn/mkldnn.BUILD b/third_party/mkl_dnn/mkldnn.BUILD new file mode 100644 index 0000000000..58bb7a6a5d --- /dev/null +++ b/third_party/mkl_dnn/mkldnn.BUILD @@ -0,0 +1,25 @@ +exports_files(["LICENSE"]) + +cc_library( + name = "mkl_dnn", + srcs = glob([ + "src/common/*.cpp", + "src/cpu/*.cpp", + ]), + hdrs = glob(["include/*"]), + copts = ["-fexceptions"] + select({ + "@org_tensorflow//tensorflow:linux_x86_64": [ + "-fopenmp", + ], + "//conditions:default": [], + }), + includes = [ + "include", + "src", + "src/common", + "src/cpu", + "src/cpu/xbyak", + ], + nocopts = "-fno-exceptions", + visibility = ["//visibility:public"], +) |