-rw-r--r--  README.md | 2
-rw-r--r--  RELEASE.md | 2
-rwxr-xr-x  configure | 34
-rw-r--r--  tensorflow/BUILD | 7
-rw-r--r--  tensorflow/c/c_api.h | 2
-rw-r--r--  tensorflow/contrib/BUILD | 1
-rw-r--r--  tensorflow/contrib/__init__.py | 1
-rw-r--r--  tensorflow/contrib/cmake/CMakeLists.txt | 5
-rw-r--r--  tensorflow/contrib/cmake/external/grpc.cmake | 2
-rw-r--r--  tensorflow/contrib/cmake/external/png.cmake | 2
-rw-r--r--  tensorflow/contrib/cmake/external/protobuf.cmake | 2
-rw-r--r--  tensorflow/contrib/cmake/external/zlib.cmake | 46
-rw-r--r--  tensorflow/contrib/cmake/tf_python.cmake | 3
-rw-r--r--  tensorflow/contrib/distributions/python/kernel_tests/beta_test.py | 36
-rw-r--r--  tensorflow/contrib/distributions/python/kernel_tests/categorical_test.py | 29
-rw-r--r--  tensorflow/contrib/distributions/python/ops/beta.py | 37
-rw-r--r--  tensorflow/contrib/distributions/python/ops/categorical.py | 22
-rw-r--r--  tensorflow/contrib/layers/python/layers/layers.py | 113
-rw-r--r--  tensorflow/contrib/layers/python/layers/utils.py | 7
-rw-r--r--  tensorflow/contrib/learn/python/learn/dataframe/transforms/batch.py | 2
-rw-r--r--  tensorflow/contrib/learn/python/learn/dataframe/transforms/in_memory_source.py | 2
-rw-r--r--  tensorflow/contrib/metrics/python/ops/confusion_matrix_ops.py | 37
-rw-r--r--  tensorflow/contrib/metrics/python/ops/metric_ops.py | 2
-rw-r--r--  tensorflow/contrib/seq2seq/BUILD | 52
-rw-r--r--  tensorflow/contrib/seq2seq/README.md | 9
-rw-r--r--  tensorflow/contrib/seq2seq/__init__.py | 26
-rw-r--r--  tensorflow/contrib/seq2seq/python/__init__.py | 19
-rw-r--r--  tensorflow/contrib/seq2seq/python/kernel_tests/layers_test.py | 36
-rw-r--r--  tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py | 33
-rw-r--r--  tensorflow/contrib/seq2seq/python/ops/layers.py | 35
-rw-r--r--  tensorflow/contrib/seq2seq/python/ops/loss.py | 30
-rw-r--r--  tensorflow/contrib/session_bundle/README.md | 6
-rw-r--r--  tensorflow/core/BUILD | 109
-rw-r--r--  tensorflow/core/framework/common_shape_fns.cc | 4
-rw-r--r--  tensorflow/core/framework/op.h | 6
-rw-r--r--  tensorflow/core/framework/op_kernel.h | 5
-rw-r--r--  tensorflow/core/kernels/BUILD | 104
-rw-r--r--  tensorflow/core/kernels/decode_raw_op.cc | 1
-rw-r--r--  tensorflow/core/kernels/fused_batch_norm_op.cu.cc | 4
-rw-r--r--  tensorflow/core/kernels/gather_functor.cc | 4
-rw-r--r--  tensorflow/core/kernels/scatter_functor.cc | 4
-rw-r--r--  tensorflow/core/ops/nn_ops.cc | 21
-rw-r--r--  tensorflow/core/ops/ops.pbtxt | 4
-rw-r--r--  tensorflow/core/ops/parsing_ops.cc | 2
-rw-r--r--  tensorflow/core/platform/default/build_config.bzl | 36
-rw-r--r--  tensorflow/core/platform/windows/env.cc | 2
-rw-r--r--  tensorflow/core/platform/windows/port.cc | 1
-rw-r--r--  tensorflow/core/util/tensor_bundle/BUILD | 4
-rw-r--r--  tensorflow/examples/learn/BUILD | 10
-rw-r--r--  tensorflow/examples/learn/README.md | 1
-rwxr-xr-x  tensorflow/examples/learn/examples_test.sh | 9
-rwxr-xr-x  tensorflow/examples/learn/resnet.py (renamed from tensorflow/examples/skflow/resnet.py) | 100
-rw-r--r--  tensorflow/examples/skflow/BUILD | 9
-rw-r--r--  tensorflow/examples/tutorials/mnist/mnist_softmax.py | 2
-rw-r--r--  tensorflow/g3doc/api_docs/python/client.md | 7
-rw-r--r--  tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.nn.sampled_softmax_loss.md | 4
-rw-r--r--  tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.Graph.md | 4
-rw-r--r--  tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.nn.fractional_max_pool.md | 2
-rw-r--r--  tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.nce_loss.md | 12
-rw-r--r--  tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.parse_example.md | 4
-rw-r--r--  tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.train.RMSPropOptimizer.md | 3
-rw-r--r--  tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.atrous_conv2d.md | 9
-rw-r--r--  tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.fractional_avg_pool.md | 2
-rw-r--r--  tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.local_response_normalization.md | 4
-rw-r--r--  tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.Session.md | 7
-rw-r--r--  tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.image.resize_images.md | 9
-rw-r--r--  tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.scan.md | 4
-rw-r--r--  tensorflow/g3doc/get_started/os_setup.md | 22
-rw-r--r--  tensorflow/g3doc/resources/xla_prerelease.md | 2
-rw-r--r--  tensorflow/go/genop/generate.sh | 5
-rw-r--r--  tensorflow/python/BUILD | 9
-rw-r--r--  tensorflow/python/client/session.py | 7
-rw-r--r--  tensorflow/python/framework/dtypes.py | 31
-rw-r--r--  tensorflow/python/framework/ops.py | 4
-rw-r--r--  tensorflow/python/framework/python_op_gen_main.cc | 1
-rw-r--r--  tensorflow/python/kernel_tests/decode_raw_op_test.py | 15
-rw-r--r--  tensorflow/python/ops/functional_ops.py | 4
-rw-r--r--  tensorflow/python/ops/image_ops.py | 52
-rw-r--r--  tensorflow/python/ops/image_ops_test.py | 74
-rw-r--r--  tensorflow/python/ops/nn.py | 8
-rw-r--r--  tensorflow/python/ops/nn_grad.py | 17
-rw-r--r--  tensorflow/python/ops/nn_ops.py | 6
-rw-r--r--  tensorflow/python/ops/parsing_ops.py | 6
-rw-r--r--  tensorflow/python/ops/rnn.py | 3
-rw-r--r--  tensorflow/python/ops/special_math_ops.py | 94
-rw-r--r--  tensorflow/python/ops/special_math_ops_test.py | 86
-rw-r--r--  tensorflow/python/training/rmsprop.py | 3
-rw-r--r--  tensorflow/stream_executor/cuda/cuda_blas.cc | 19
-rw-r--r--  tensorflow/stream_executor/cuda/cuda_dnn.cc | 58
-rw-r--r--  tensorflow/stream_executor/cuda/cuda_driver.cc | 15
-rw-r--r--  tensorflow/stream_executor/cuda/cuda_fft.cc | 13
-rw-r--r--  tensorflow/stream_executor/cuda/cuda_rng.cc | 17
-rw-r--r--  tensorflow/stream_executor/dso_loader.cc | 20
-rw-r--r--  tensorflow/tensorflow.bzl | 39
-rwxr-xr-x  tensorflow/tools/ci_build/install/install_deb_packages.sh | 5
-rwxr-xr-x  tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh | 12
-rw-r--r--  tensorflow/tools/gcs_test/python/gcs_smoke.py | 23
-rwxr-xr-x  tensorflow/tools/git/gen_git_source.py | 11
-rw-r--r--  tensorflow/tools/pip_package/BUILD | 56
-rwxr-xr-x  tensorflow/tools/pip_package/build_pip_package.sh | 40
-rw-r--r--  tensorflow/tools/pip_package/setup.py | 6
-rw-r--r--  tensorflow/tools/pip_package/simple_console_for_windows.py | 33
-rw-r--r--  tensorflow/tools/proto_text/BUILD | 17
-rw-r--r--  tensorflow/workspace.bzl | 12
-rw-r--r--  third_party/gpus/cuda/platform.bzl.tpl | 17
-rwxr-xr-x  util/python/python_config.sh | 59
106 files changed, 1573 insertions, 503 deletions
diff --git a/README.md b/README.md
index e9b9228492..49b1983731 100644
--- a/README.md
+++ b/README.md
@@ -61,6 +61,6 @@ Hello, TensorFlow!
* [TensorFlow website](http://tensorflow.org)
* [TensorFlow whitepaper](http://download.tensorflow.org/paper/whitepaper2015.pdf)
* [TensorFlow Model Zoo](https://github.com/tensorflow/models)
-* [TensorFlow MOOC on Udacity] (https://www.udacity.com/course/deep-learning--ud730)
+* [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730)
The TensorFlow community has created amazing things with TensorFlow, please see the [resources section of tensorflow.org](https://www.tensorflow.org/versions/master/resources#community) for an incomplete list.
diff --git a/RELEASE.md b/RELEASE.md
index 0b37c7b092..7d0a68654c 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -84,7 +84,7 @@ Snyder, @jpangburn, Jules Gagnon-Marchand, Karen Brems, @kborer, Kirill Bobyrev,
Laurent Mazare, Longqi Yang, Malith Yapa, Maniteja Nandana, Martin Englund,
Matthias Winkelmann, @mecab, Mu-Ik Jeon, Nand Dalal, Niels Ole Salscheider,
Nikhil Mishra, Park Jiin, Pieter De Rijk, @raix852, Ritwik Gupta, Sahil Sharma,
-@Sangheum, @SergejsRk, Shinichiro Hamaji, Simon Denel, @Steve, @suiyuan2009,
+Sangheum Hwang, @SergejsRk, Shinichiro Hamaji, Simon Denel, @Steve, @suiyuan2009,
Tiago Jorge, Tijmen Tieleman, @tvn, @tyfkda, Wang Yang, Wei-Ting Kuo, Wenjian
Huang, Yan Chen, @YenChenLin, Yuan (Terry) Tang, Yuncheng Li, Yunfeng Wang, Zack
Polizzi, @zhongzyd, Ziming Dong, @perhapszzy
diff --git a/configure b/configure
index 8bc271aaf3..8f0a77dcae 100755
--- a/configure
+++ b/configure
@@ -8,8 +8,22 @@ pushd `dirname $0` #> /dev/null
SOURCE_BASE_DIR=`pwd -P`
popd > /dev/null
+PLATFORM="$(uname -s | tr 'A-Z' 'a-z')"
+function is_windows() {
+ # On windows, the shell script is actually running in msys
+ if [[ "${PLATFORM}" =~ msys_nt* ]]; then
+ true
+ else
+ false
+ fi
+}
+
function bazel_clean_and_fetch() {
- bazel clean --expunge
+ # bazel clean --expunge currently doesn't work on Windows
+ # TODO(pcloudy): Re-enable it after bazel clean --expunge is fixed.
+ if ! is_windows; then
+ bazel clean --expunge
+ fi
bazel fetch //tensorflow/...
}
@@ -35,6 +49,12 @@ while true; do
# Retry
done
+if is_windows; then
+ TF_NEED_GCP=0
+ TF_NEED_HDFS=0
+ TF_NEED_CUDA=0
+fi
+
while [ "$TF_NEED_GCP" == "" ]; do
read -p "Do you wish to build TensorFlow with "\
"Google Cloud Platform support? [y/N] " INPUT
@@ -89,12 +109,16 @@ fi
## Find swig path
if [ -z "$SWIG_PATH" ]; then
- SWIG_PATH=`type -p swig 2> /dev/null`
+ SWIG_PATH=`type -p swig 2> /dev/null || true`
fi
if [[ ! -e "$SWIG_PATH" ]]; then
echo "Can't find swig. Ensure swig is in \$PATH or set \$SWIG_PATH."
exit 1
fi
+# Convert swig path to Windows style before writing into swig_path
+if is_windows; then
+ SWIG_PATH="$(cygpath -m "$SWIG_PATH")"
+fi
echo "$SWIG_PATH" > tensorflow/tools/swig/swig_path
# Invoke python_config and set up symlinks to python includes
@@ -104,7 +128,7 @@ echo "$SWIG_PATH" > tensorflow/tools/swig/swig_path
# git hash propagation
GEN_GIT_SOURCE=tensorflow/tools/git/gen_git_source.py
chmod a+x ${GEN_GIT_SOURCE}
-${PYTHON_BIN_PATH} ${GEN_GIT_SOURCE} --configure ${SOURCE_BASE_DIR}
+"${PYTHON_BIN_PATH}" ${GEN_GIT_SOURCE} --configure "${SOURCE_BASE_DIR}"
## Set up Cuda-related environment settings
@@ -255,8 +279,8 @@ while true; do
CUDA_DNN_LIB_PATH="lib64/libcudnn.so${TF_CUDNN_EXT}"
CUDA_DNN_LIB_ALT_PATH="libcudnn.so${TF_CUDNN_EXT}"
elif [ "$OSNAME" == "Darwin" ]; then
- CUDA_DNN_LIB_PATH="lib/libcudnn${TF_CUDNN_EXT}.dylib"
- CUDA_DNN_LIB_ALT_PATH="libcudnn${TF_CUDNN_EXT}.dylib"
+ CUDA_DNN_LIB_PATH="lib/libcudnn${TF_CUDNN_EXT}"
+ CUDA_DNN_LIB_ALT_PATH="libcudnn${TF_CUDNN_EXT}"
fi
if [ -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_ALT_PATH}" -o -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_PATH}" ]; then
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 2c53e7aefc..6a173d040b 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -45,6 +45,12 @@ config_setting(
)
config_setting(
+ name = "windows",
+ values = {"cpu": "x64_windows_msvc"},
+ visibility = ["//visibility:public"],
+)
+
+config_setting(
name = "ios",
values = {
"crosstool_top": "//tools/osx/crosstool:crosstool",
@@ -109,6 +115,7 @@ filegroup(
"//tensorflow/contrib/ndlstm:all_files",
"//tensorflow/contrib/opt:all_files",
"//tensorflow/contrib/rnn:all_files",
+ "//tensorflow/contrib/seq2seq:all_files",
"//tensorflow/contrib/session_bundle:all_files",
"//tensorflow/contrib/session_bundle/example:all_files",
"//tensorflow/contrib/slim:all_files",
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index 4954b4c43b..3a3a265439 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -965,8 +965,6 @@ typedef struct TF_Library TF_Library;
// Pass "library_filename" to a platform-specific mechanism for dynamically
// loading a library. The rules for determining the exact location of the
// library are platform-specific and are not documented here.
-// Expects the symbols "RegisterOps", "RegisterKernels", and "GetOpList", to be
-// defined in the library.
//
// On success, place OK in status and return the newly created library handle.
// The caller owns the library handle.
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 6497b9c91d..b9b5db2587 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -32,6 +32,7 @@ py_library(
"//tensorflow/contrib/opt:opt_py",
"//tensorflow/contrib/quantization:quantization_py",
"//tensorflow/contrib/rnn:rnn_py",
+ "//tensorflow/contrib/seq2seq:seq2seq_py",
"//tensorflow/contrib/slim",
"//tensorflow/contrib/slim:nets",
"//tensorflow/contrib/tensor_forest:tensor_forest_py",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index 4eb6ec09a2..75a40d7975 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -37,6 +37,7 @@ from tensorflow.contrib import metrics
from tensorflow.contrib import opt
from tensorflow.contrib import quantization
from tensorflow.contrib import rnn
+from tensorflow.contrib import seq2seq
from tensorflow.contrib import slim
from tensorflow.contrib import tensor_forest
from tensorflow.contrib import tensorboard
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 1a1c30f526..7679ada189 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -48,7 +48,9 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_definitions(-DEIGEN_AVOID_STL_ARRAY)
if(WIN32)
add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC -D__VERSION__=\"MSVC\")
- set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} /MP)
+ # Suppress warnings to reduce build log size.
+ add_definitions(/wd4267 /wd4244 /wd4800 /wd4503 /wd4554 /wd4996 /wd4348 /wd4018)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
endif()
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
@@ -56,6 +58,7 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
endif()
# External dependencies
+include(zlib)
include(gif)
include(png)
include(jpeg)
diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index ecb381f115..1e5178d15c 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -19,7 +19,7 @@ endif()
ExternalProject_Add(grpc
PREFIX grpc
- DEPENDS protobuf
+ DEPENDS protobuf zlib
GIT_REPOSITORY ${GRPC_URL}
GIT_TAG ${GRPC_TAG}
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
diff --git a/tensorflow/contrib/cmake/external/png.cmake b/tensorflow/contrib/cmake/external/png.cmake
index 2be5aa70af..bbf626f87b 100644
--- a/tensorflow/contrib/cmake/external/png.cmake
+++ b/tensorflow/contrib/cmake/external/png.cmake
@@ -19,6 +19,7 @@ set(png_HEADERS
ExternalProject_Add(png
PREFIX png
+ DEPENDS zlib
URL ${png_URL}
URL_HASH ${png_HASH}
INSTALL_DIR ${png_INSTALL}
@@ -28,6 +29,7 @@ ExternalProject_Add(png
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_INSTALL_PREFIX:STRING=${png_INSTALL}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+ -DZLIB_ROOT:STRING=${ZLIB_INSTALL}
)
## put png includes in the directory where they are expected
diff --git a/tensorflow/contrib/cmake/external/protobuf.cmake b/tensorflow/contrib/cmake/external/protobuf.cmake
index 28f946ec3e..27f9c2e313 100644
--- a/tensorflow/contrib/cmake/external/protobuf.cmake
+++ b/tensorflow/contrib/cmake/external/protobuf.cmake
@@ -15,6 +15,7 @@ endif()
ExternalProject_Add(protobuf
PREFIX protobuf
+ DEPENDS zlib
GIT_REPOSITORY ${PROTOBUF_URL}
GIT_TAG ${PROTOBUF_TAG}
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
@@ -29,4 +30,5 @@ ExternalProject_Add(protobuf
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+ -DZLIB_ROOT:STRING=${ZLIB_INSTALL}
)
diff --git a/tensorflow/contrib/cmake/external/zlib.cmake b/tensorflow/contrib/cmake/external/zlib.cmake
new file mode 100644
index 0000000000..ded2e41770
--- /dev/null
+++ b/tensorflow/contrib/cmake/external/zlib.cmake
@@ -0,0 +1,46 @@
+include (ExternalProject)
+
+set(zlib_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/zlib_archive)
+set(ZLIB_URL https://github.com/madler/zlib)
+set(ZLIB_BUILD ${CMAKE_CURRENT_BINARY_DIR}/zlib/src/zlib)
+set(ZLIB_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/zlib/install)
+set(ZLIB_TAG 50893291621658f355bc5b4d450a8d06a563053d)
+
+if(WIN32)
+ set(zlib_STATIC_LIBRARIES
+ ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlib.lib)
+else()
+ set(zlib_STATIC_LIBRARIES
+ ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/libz.a)
+endif()
+
+set(ZLIB_HEADERS
+ "${ZLIB_INSTALL}/include/zconf.h"
+ "${ZLIB_INSTALL}/include/zlib.h"
+)
+
+ExternalProject_Add(zlib
+ PREFIX zlib
+ GIT_REPOSITORY ${ZLIB_URL}
+ GIT_TAG ${ZLIB_TAG}
+ INSTALL_DIR ${ZLIB_INSTALL}
+ BUILD_IN_SOURCE 1
+ DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+ CMAKE_CACHE_ARGS
+ -DCMAKE_BUILD_TYPE:STRING=Release
+ -DCMAKE_INSTALL_PREFIX:STRING=${ZLIB_INSTALL}
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+)
+
+# put zlib includes in the directory where they are expected
+add_custom_target(zlib_create_destination_dir
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${zlib_INCLUDE_DIR}
+ DEPENDS zlib)
+
+add_custom_target(zlib_copy_headers_to_destination
+ DEPENDS zlib_create_destination_dir)
+
+foreach(header_file ${ZLIB_HEADERS})
+ add_custom_command(TARGET zlib_copy_headers_to_destination PRE_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy ${header_file} ${zlib_INCLUDE_DIR})
+endforeach()
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 321528c584..d1029d3e52 100644
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -41,9 +41,6 @@ if(NOT NUMPY_INCLUDE_DIR)
endif(${NUMPY_NOT_FOUND})
endif(NOT NUMPY_INCLUDE_DIR)
-# 3. Resolve the installed version of zlib (for libz.so).
-find_package(ZLIB REQUIRED)
-
########################################################
# Build the Python directory structure.
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/beta_test.py b/tensorflow/contrib/distributions/python/kernel_tests/beta_test.py
index e1a8a6d602..c0eee548ff 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/beta_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/beta_test.py
@@ -17,7 +17,7 @@ from __future__ import division
from __future__ import print_function
import numpy as np
-from scipy import stats
+from scipy import stats, special
import tensorflow as tf
@@ -308,6 +308,40 @@ class BetaTest(tf.test.TestCase):
self.assertAllClose(tf.nn.softplus(a).eval(), dist.a.eval())
self.assertAllClose(tf.nn.softplus(b).eval(), dist.b.eval())
+ def testBetaBetaKL(self):
+ with self.test_session() as sess:
+ for shape in [(10,), (4,5)]:
+ a1 = 6.0*np.random.random(size=shape) + 1e-4
+ b1 = 6.0*np.random.random(size=shape) + 1e-4
+ a2 = 6.0*np.random.random(size=shape) + 1e-4
+ b2 = 6.0*np.random.random(size=shape) + 1e-4
+ # Take inverse softplus of values to test BetaWithSoftplusAB
+ a1_sp = np.log(np.exp(a1) - 1.0)
+ b1_sp = np.log(np.exp(b1) - 1.0)
+ a2_sp = np.log(np.exp(a2) - 1.0)
+ b2_sp = np.log(np.exp(b2) - 1.0)
+
+ d1 = tf.contrib.distributions.Beta(a=a1, b=b1)
+ d2 = tf.contrib.distributions.Beta(a=a2, b=b2)
+ d1_sp = tf.contrib.distributions.BetaWithSoftplusAB(a=a1_sp, b=b1_sp)
+ d2_sp = tf.contrib.distributions.BetaWithSoftplusAB(a=a2_sp, b=b2_sp)
+
+ kl_expected = (special.betaln(a2, b2) - special.betaln(a1, b1)
+ + (a1 - a2)*special.digamma(a1)
+ + (b1 - b2)*special.digamma(b1)
+ + (a2 - a1 + b2 - b1)*special.digamma(a1 + b1))
+
+ for dist1 in [d1, d1_sp]:
+ for dist2 in [d2, d2_sp]:
+ kl = tf.contrib.distributions.kl(dist1, dist2)
+ kl_val = sess.run(kl)
+ self.assertEqual(kl.get_shape(), shape)
+ self.assertAllClose(kl_val, kl_expected)
+
+ # Make sure KL(d1||d1) is 0
+ kl_same = sess.run(tf.contrib.distributions.kl(d1, d1))
+ self.assertAllClose(kl_same, np.zeros_like(kl_expected))
+
if __name__ == "__main__":
tf.test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/categorical_test.py b/tensorflow/contrib/distributions/python/kernel_tests/categorical_test.py
index 066d8edd6e..ee9c1b5401 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/categorical_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/categorical_test.py
@@ -222,5 +222,34 @@ class CategoricalTest(tf.test.TestCase):
dist = tf.contrib.distributions.Categorical(tf.log(histograms) - 50.)
self.assertAllEqual(dist.mode().eval(), [[1, 0]])
+ def testCategoricalCategoricalKL(self):
+ def np_softmax(logits):
+ exp_logits = np.exp(logits)
+ return exp_logits / exp_logits.sum(axis=-1, keepdims=True)
+
+ with self.test_session() as sess:
+ for categories in [2, 4]:
+ for batch_size in [1, 10]:
+ a_logits = np.random.randn(batch_size, categories)
+ b_logits = np.random.randn(batch_size, categories)
+
+ a = tf.contrib.distributions.Categorical(logits=a_logits)
+ b = tf.contrib.distributions.Categorical(logits=b_logits)
+
+ kl = tf.contrib.distributions.kl(a, b)
+ kl_val = sess.run(kl)
+ # Make sure KL(a||a) is 0
+ kl_same = sess.run(tf.contrib.distributions.kl(a, a))
+
+ prob_a = np_softmax(a_logits)
+ prob_b = np_softmax(b_logits)
+ kl_expected = np.sum(
+ prob_a * (np.log(prob_a) - np.log(prob_b)), axis=-1)
+
+ self.assertEqual(kl.get_shape(), (batch_size,))
+ self.assertAllClose(kl_val, kl_expected)
+ self.assertAllClose(kl_same, np.zeros_like(kl_expected))
+
+
if __name__ == "__main__":
tf.test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/beta.py b/tensorflow/contrib/distributions/python/ops/beta.py
index 34eec7f69c..2ccd3519c4 100644
--- a/tensorflow/contrib/distributions/python/ops/beta.py
+++ b/tensorflow/contrib/distributions/python/ops/beta.py
@@ -22,6 +22,7 @@ import numpy as np
from tensorflow.contrib.distributions.python.ops import distribution
from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.contrib.distributions.python.ops import kullback_leibler
from tensorflow.contrib.framework.python.framework import tensor_util as contrib_tensor_util
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
@@ -299,3 +300,39 @@ class BetaWithSoftplusAB(Beta):
allow_nan_stats=allow_nan_stats,
name=ns)
self._parameters = parameters
+
+
+def _kl_beta_beta(d1, d2, name=None):
+ """Calculate the batched KL divergence KL(d1 || d2) with d1 and d2 Beta.
+
+ Args:
+ d1: instance of a Beta distribution object.
+ d2: instance of a Beta distribution object.
+ name: (optional) Name to use for created operations.
+ default is "kl_beta_beta".
+
+ Returns:
+ Batchwise KL(d1 || d2)
+ """
+ inputs = [d1.a, d1.b, d1.a_b_sum, d2.a_b_sum]
+ with ops.name_scope(name, "kl_beta_beta", inputs):
+ # ln(B(a', b') / B(a, b))
+ log_betas = (math_ops.lgamma(d2.a) + math_ops.lgamma(d2.b)
+ - math_ops.lgamma(d2.a_b_sum) + math_ops.lgamma(d1.a_b_sum)
+ - math_ops.lgamma(d1.a) - math_ops.lgamma(d1.b))
+ # (a - a')*psi(a) + (b - b')*psi(b) + (a' - a + b' - b)*psi(a + b)
+ digammas = ((d1.a - d2.a)*math_ops.digamma(d1.a)
+ + (d1.b - d2.b)*math_ops.digamma(d1.b)
+ + (d2.a_b_sum - d1.a_b_sum)*math_ops.digamma(d1.a_b_sum))
+ return log_betas + digammas
+
+
+# Register KL divergences.
+kl_classes = [
+ Beta,
+ BetaWithSoftplusAB,
+]
+
+for beta_aa in kl_classes:
+ for beta_bb in kl_classes:
+ kullback_leibler.RegisterKL(beta_aa, beta_bb)(_kl_beta_beta)
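For reference, the closed form implemented by `_kl_beta_beta` above, with B the Beta function and ψ the digamma function (the `log_betas` and `digammas` terms in the code, respectively); it matches the `kl_expected` computed with `scipy.special` in the kernel test earlier in this diff:

```latex
\mathrm{KL}\bigl(\mathrm{Beta}(a_1,b_1)\,\|\,\mathrm{Beta}(a_2,b_2)\bigr)
  = \ln\frac{B(a_2,b_2)}{B(a_1,b_1)}
  + (a_1-a_2)\,\psi(a_1)
  + (b_1-b_2)\,\psi(b_1)
  + \bigl[(a_2+b_2)-(a_1+b_1)\bigr]\,\psi(a_1+b_1)
```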
diff --git a/tensorflow/contrib/distributions/python/ops/categorical.py b/tensorflow/contrib/distributions/python/ops/categorical.py
index f8cd137869..908690c1ce 100644
--- a/tensorflow/contrib/distributions/python/ops/categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/categorical.py
@@ -20,6 +20,7 @@ from __future__ import print_function
from tensorflow.contrib.distributions.python.ops import distribution
from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.contrib.distributions.python.ops import kullback_leibler
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
@@ -223,3 +224,24 @@ class Categorical(distribution.Distribution):
ret = math_ops.cast(ret, self.dtype)
ret.set_shape(self.get_batch_shape())
return ret
+
+
+@kullback_leibler.RegisterKL(Categorical, Categorical)
+def _kl_categorical_categorical(a, b, name=None):
+ """Calculate the batched KL divergence KL(a || b) with a and b Categorical.
+
+ Args:
+ a: instance of a Categorical distribution object.
+ b: instance of a Categorical distribution object.
+ name: (optional) Name to use for created operations.
+ default is "kl_categorical_categorical".
+
+ Returns:
+ Batchwise KL(a || b)
+ """
+ with ops.name_scope(
+ name, "kl_categorical_categorical", [a.logits, b.logits]):
+ # sum(p*ln(p/q))
+ return math_ops.reduce_sum(
+ nn_ops.softmax(a.logits)*(nn_ops.log_softmax(a.logits)
+ - nn_ops.log_softmax(b.logits)), reduction_indices=[-1])
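The implementation computes both log probabilities with `log_softmax` rather than `log(softmax(...))`, which avoids overflow for large logits. A minimal NumPy sketch of that stability trick (illustrative, not the contrib code):

```python
import numpy as np

def log_softmax(logits):
    # Shifting by the row max keeps exp() in range; the result equals
    # np.log(softmax(logits)) wherever the naive form doesn't overflow.
    shifted = logits - logits.max(axis=-1, keepdims=True)
    return shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))

a, b = np.array([1000.0, 0.0]), np.array([0.0, 1.0])
# Naive np.log(np.exp(a) / np.exp(a).sum()) yields nan here; this stays finite.
kl = np.sum(np.exp(log_softmax(a)) * (log_softmax(a) - log_softmax(b)))
```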
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 095de031b9..5c4dfb27c6 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -162,7 +162,7 @@ def _fused_batch_norm(
updates = tf.group(*update_ops)
total_loss = control_flow_ops.with_dependencies([updates], total_loss)
- One can set update_collections=None to force the updates in place, but that
+ One can set updates_collections=None to force the updates in place, but that
can have speed penalty, specially in distributed settings.
Args:
@@ -204,24 +204,36 @@ def _fused_batch_norm(
Raises:
ValueError: if `data_format` is neither `NHWC` nor `NCHW`.
ValueError: if the rank of `inputs` is undefined.
- ValueError: if rank or last dimension of `inputs` is undefined.
+ ValueError: if the rank of `inputs` is neither 2 nor 4.
+ ValueError: if rank or `C` dimension of `inputs` is undefined.
"""
if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
raise ValueError('data_format has to be either NCHW or NHWC.')
with variable_scope.variable_scope(
scope, 'BatchNorm', [inputs], reuse=reuse) as sc:
inputs = ops.convert_to_tensor(inputs)
+ original_shape = inputs.get_shape()
+ original_rank = original_shape.ndims
+ if original_rank is None:
+ raise ValueError('Inputs %s has undefined rank' % inputs.name)
+ elif original_rank not in [2, 4]:
+ raise ValueError('Inputs %s has unsupported rank. '
+ 'Expected 2 or 4 but got %d' % (inputs.name, original_rank))
+ if original_rank == 2:
+ channels = inputs.get_shape()[-1].value
+ if channels is None:
+ raise ValueError('`C` dimension must be known but is None')
+ new_shape = [-1, channels, 1, 1] if data_format == DATA_FORMAT_NCHW else \
+ [-1, 1, 1, channels]
+ inputs = array_ops.reshape(inputs, new_shape)
inputs_shape = inputs.get_shape()
- inputs_rank = inputs_shape.ndims
- if inputs_rank is None:
- raise ValueError('Inputs %s has undefined rank.' % inputs.name)
dtype = inputs.dtype.base_dtype
if data_format == DATA_FORMAT_NHWC:
params_shape = inputs_shape[-1:]
else:
params_shape = inputs_shape[1:2]
if not params_shape.is_fully_defined():
- raise ValueError('Inputs %s has undefined last dimension %s.' %
+ raise ValueError('Inputs %s has undefined `C` dimension %s.' %
(inputs.name, params_shape))
# Allocate parameters for the beta and gamma of the normalization.
@@ -277,31 +289,31 @@ def _fused_batch_norm(
trainable=False,
collections=moving_variance_collections)
+ def _fused_batch_norm_training():
+ return nn.fused_batch_norm(
+ inputs, gamma, beta, epsilon=epsilon, data_format=data_format)
+ def _fused_batch_norm_inference():
+ return nn.fused_batch_norm(
+ inputs,
+ gamma,
+ beta,
+ mean=moving_mean,
+ variance=moving_variance,
+ epsilon=epsilon,
+ is_training=False,
+ data_format=data_format)
+ outputs, mean, variance = utils.smart_cond(is_training,
+ _fused_batch_norm_training,
+ _fused_batch_norm_inference)
+
# If `is_training` doesn't have a constant value, because it is a `Tensor`,
# a `Variable` or `Placeholder` then is_training_value will be None and
- # `needs_moments` will be true.
+ # `need_updates` will be true.
is_training_value = utils.constant_value(is_training)
- need_moments = is_training_value is None or is_training_value
- if need_moments:
- # Calculate the moments based on the individual batch.
- def _fused_batch_norm_training():
- return nn.fused_batch_norm(
- inputs, gamma, beta, epsilon=epsilon, data_format=data_format)
- def _fused_batch_norm_inference():
- return nn.fused_batch_norm(
- inputs,
- gamma,
- beta,
- mean=moving_mean,
- variance=moving_variance,
- epsilon=epsilon,
- is_training=False,
- data_format=data_format)
- outputs, mean, variance = utils.smart_cond(is_training,
- _fused_batch_norm_training,
- _fused_batch_norm_inference)
- moving_vars_fn = lambda: (moving_mean, moving_variance)
+ need_updates = is_training_value is None or is_training_value
+ if need_updates:
if updates_collections is None:
+ _no_updates = lambda: outputs
def _force_updates():
"""Internal function forces updates moving_vars if is_training."""
update_moving_mean = moving_averages.assign_moving_average(
@@ -310,12 +322,10 @@ def _fused_batch_norm(
moving_variance, variance, decay)
with ops.control_dependencies(
[update_moving_mean, update_moving_variance]):
- return array_ops.identity(mean), array_ops.identity(variance)
- mean, variance = utils.smart_cond(is_training, _force_updates,
- moving_vars_fn)
- with ops.control_dependencies([mean, variance]):
- outputs = array_ops.identity(outputs)
+ return array_ops.identity(outputs)
+ outputs = utils.smart_cond(is_training, _force_updates, _no_updates)
else:
+ moving_vars_fn = lambda: (moving_mean, moving_variance)
def _delay_updates():
"""Internal function that delay updates moving_vars if is_training."""
update_moving_mean = moving_averages.assign_moving_average(
@@ -328,22 +338,10 @@ def _fused_batch_norm(
moving_vars_fn)
ops.add_to_collections(updates_collections, update_mean)
ops.add_to_collections(updates_collections, update_variance)
- # Use computed moments during training and moving_vars otherwise.
- vars_fn = lambda: (mean, variance)
- mean, variance = utils.smart_cond(is_training, vars_fn, moving_vars_fn)
- else:
- mean, variance = moving_mean, moving_variance
- outputs, _, _ = nn.fused_batch_norm(
- inputs,
- gamma,
- beta,
- mean=moving_mean,
- variance=moving_variance,
- epsilon=epsilon,
- is_training=False,
- data_format=data_format)
outputs.set_shape(inputs_shape)
+ if original_shape.ndims == 2:
+ outputs = array_ops.reshape(outputs, original_shape)
if activation_fn is not None:
outputs = activation_fn(outputs)
return utils.collect_named_outputs(outputs_collections,
@@ -610,6 +608,7 @@ def bias_add(inputs,
variables_collections=None,
outputs_collections=None,
trainable=True,
+ data_format=DATA_FORMAT_NHWC,
scope=None):
"""Adds a bias to the inputs.
@@ -629,16 +628,34 @@ def bias_add(inputs,
outputs_collections: collections to add the outputs.
trainable: If `True` also add variables to the graph collection
`GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
+ data_format: A string. 'NHWC' and 'NCHW' are supported.
scope: Optional scope for variable_scope.
Returns:
a tensor representing the result of adding biases to the inputs.
+
+ Raises:
+ ValueError: if `data_format` is neither `NHWC` nor `NCHW`.
+ ValueError: if `data_format` is `NCHW` and rank of `inputs` is not 4.
+ ValueError: if the rank of `inputs` is undefined.
+ ValueError: if rank or `C` dimension of `inputs` is undefined.
"""
+ if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
+ raise ValueError('data_format has to be either NCHW or NHWC.')
with variable_scope.variable_scope(scope, 'BiasAdd', [inputs],
reuse=reuse) as sc:
inputs = ops.convert_to_tensor(inputs)
dtype = inputs.dtype.base_dtype
- num_features = utils.last_dimension(inputs.get_shape(), min_rank=2)
+ inputs_shape = inputs.get_shape()
+ inputs_rank = inputs_shape.ndims
+ if inputs_rank is None:
+ raise ValueError('Dims of shape must be known but is None')
+ elif inputs_rank != 4 and data_format == DATA_FORMAT_NCHW:
+ raise ValueError('Data format NCHW only supports 4D Tensor')
+ axis = 1 if data_format==DATA_FORMAT_NCHW else -1
+ num_features = inputs_shape[axis].value
+ if num_features is None:
+ raise ValueError('`C` dimension must be known but is None')
biases_collections = utils.get_variable_collections(variables_collections,
'biases')
biases = variables.model_variable('biases',
@@ -648,7 +665,7 @@ def bias_add(inputs,
regularizer=regularizer,
collections=biases_collections,
trainable=trainable)
- outputs = nn.bias_add(inputs, biases)
+ outputs = nn.bias_add(inputs, biases, data_format=data_format)
if activation_fn is not None:
outputs = activation_fn(outputs)
return utils.collect_named_outputs(outputs_collections,
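The batch-norm rewrite above leans on `utils.smart_cond`, which resolves the conditional at graph-construction time whenever `is_training` has a statically known value, so the unused branch adds no ops at all. A minimal sketch of that behavior, assuming a `constant_value` helper like the one in `utils`:

```python
def smart_cond(pred, true_fn, false_fn):
    # constant_value() returns a Python value when `pred` is a bool or a
    # constant tensor, and None when it can only be known at runtime.
    pred_value = constant_value(pred)
    if pred_value is not None:
        # Statically known: build only the branch that will actually run.
        return true_fn() if pred_value else false_fn()
    # Unknown until runtime: emit a tf.cond carrying both branches.
    return control_flow_ops.cond(pred, true_fn, false_fn)
```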
diff --git a/tensorflow/contrib/layers/python/layers/utils.py b/tensorflow/contrib/layers/python/layers/utils.py
index 2d111ab150..9738fe192c 100644
--- a/tensorflow/contrib/layers/python/layers/utils.py
+++ b/tensorflow/contrib/layers/python/layers/utils.py
@@ -19,6 +19,7 @@ from __future__ import division
from __future__ import print_function
from collections import namedtuple
+from collections import OrderedDict
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.framework import tensor_util
@@ -99,15 +100,15 @@ def get_tensor_alias(tensor):
def convert_collection_to_dict(collection):
- """Returns a dict of Tensors using get_tensor_alias as key.
+ """Returns an OrderedDict of Tensors using get_tensor_alias as key.
Args:
collection: A collection.
Returns:
- A dictionary of {get_tensor_alias(tensor): tensor}
+ An OrderedDict of {get_tensor_alias(tensor): tensor}
"""
- return {get_tensor_alias(t): t for t in ops.get_collection(collection)}
+ return OrderedDict((get_tensor_alias(t), t) for t in ops.get_collection(collection))
def constant_value(value_or_tensor_or_var, dtype=None):
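The `OrderedDict` switch makes the returned mapping iterate in collection order, so callers that pair aliases with other per-tensor data get a deterministic ordering. For example (hypothetical aliases):

```python
from collections import OrderedDict

aliases = OrderedDict([('conv1', 't1'), ('conv2', 't2'), ('fc', 't3')])
# Iteration follows insertion order on every run; a plain dict on the
# Python versions of this era does not guarantee that.
assert list(aliases) == ['conv1', 'conv2', 'fc']
```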
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/batch.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/batch.py
index 7793e296e2..200ec57b67 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/batch.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/batch.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/in_memory_source.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/in_memory_source.py
index 575577d140..674107e496 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/in_memory_source.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/in_memory_source.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/metrics/python/ops/confusion_matrix_ops.py b/tensorflow/contrib/metrics/python/ops/confusion_matrix_ops.py
index 6dcec77c81..1820f6bf17 100644
--- a/tensorflow/contrib/metrics/python/ops/confusion_matrix_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/confusion_matrix_ops.py
@@ -33,26 +33,33 @@ def confusion_matrix(predictions, labels, num_classes=None, dtype=dtypes.int32,
Calculate the Confusion Matrix for a pair of prediction and
label 1-D int arrays.
- Considering a prediction array such as: `[1, 2, 3]`
- And a label array such as: `[2, 2, 3]`
+ The matrix rows represent the prediction labels and the columns
+ represent the real labels. The confusion matrix is always a 2-D array
+ of shape `[n, n]`, where `n` is the number of valid labels for a given
+ classification task. Both predictions and labels must be 1-D arrays of
+ the same shape in order for this function to work.
+
+ If `num_classes` is None, then `num_classes` will be set to one plus
+ the maximum value in either predictions or labels.
+ Class labels are expected to start at 0. E.g., if `num_classes` was
+ three, then the possible labels would be `[0, 1, 2]`.
+
+ If `weights` is not `None`, then each prediction contributes its
+ corresponding weight to the total value of the confusion matrix cell.
- The confusion matrix returned would be the following one:
+ For example:
```python
- [[0, 0, 0, 0]
- [0, 0, 1, 0]
- [0, 0, 1, 0]
- [0, 0, 0, 1]]
+ tf.contrib.metrics.confusion_matrix([1, 2, 4], [2, 2, 4]) ==>
+ [[0 0 0 0 0]
+ [0 0 1 0 0]
+ [0 0 1 0 0]
+ [0 0 0 0 0]
+ [0 0 0 0 1]]
```
- If `weights` is not None, then the confusion matrix elements are the
- corresponding `weights` elements.
-
- Where the matrix rows represent the prediction labels and the columns
- represents the real labels. The confusion matrix is always a 2-D array
- of shape [n, n], where n is the number of valid labels for a given
- classification task. Both prediction and labels must be 1-D arrays of
- the same shape in order for this function to work.
+ Note that the possible labels are assumed to be `[0, 1, 2, 3, 4]`,
+ resulting in a 5x5 confusion matrix.
Args:
predictions: A 1-D array representing the predictions for a given
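A minimal NumPy sketch of the semantics documented above (rows indexed by prediction, columns by real label; each entry accumulates the matching `weights` value, defaulting to 1). The function name is ours, not the contrib API:

```python
import numpy as np

def confusion_matrix_np(predictions, labels, num_classes=None, weights=None):
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    if num_classes is None:
        # One plus the maximum value in either array, as documented above.
        num_classes = int(max(predictions.max(), labels.max())) + 1
    if weights is None:
        weights = np.ones_like(predictions)
    cm = np.zeros((num_classes, num_classes), dtype=np.int64)
    for p, l, w in zip(predictions, labels, weights):
        cm[p, l] += w  # rows: predictions, columns: real labels
    return cm

print(confusion_matrix_np([1, 2, 4], [2, 2, 4]))  # the 5x5 matrix shown above
```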
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 98a00adf2d..ffdfbccaae 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -3080,6 +3080,7 @@ def aggregate_metric_map(names_to_tuples):
This function is useful for pairing metric names with their associated value
and update ops when the list of metrics is long. For example:
+ ```python
metrics_to_values, metrics_to_updates = slim.metrics.aggregate_metric_map({
'Mean Absolute Error': new_slim.metrics.streaming_mean_absolute_error(
predictions, labels, weights),
@@ -3090,6 +3091,7 @@ def aggregate_metric_map(names_to_tuples):
'RMSE Log': new_slim.metrics.streaming_root_mean_squared_error(
predictions, labels, weights),
})
+ ```
Args:
names_to_tuples: a map of metric names to tuples, each of which contain the
diff --git a/tensorflow/contrib/seq2seq/BUILD b/tensorflow/contrib/seq2seq/BUILD
new file mode 100644
index 0000000000..d5f0abe6a7
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/BUILD
@@ -0,0 +1,52 @@
+# Description:
+# contains parts of TensorFlow that are experimental or unstable and which are not supported.
+
+licenses(["notice"]) # Apache 2.0
+
+exports_files(["LICENSE"])
+
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
+py_library(
+ name = "seq2seq_py",
+ srcs = ["__init__.py"] + glob(["python/ops/*.py"]),
+ srcs_version = "PY2AND3",
+)
+
+cuda_py_test(
+ name = "layers_test",
+ size = "medium",
+ srcs = ["python/kernel_tests/layers_test.py"],
+ additional_deps = [
+ ":seq2seq_py",
+ "//tensorflow:tensorflow_py",
+ "//tensorflow/python:framework_test_lib",
+ "//tensorflow/python:platform_test",
+ ],
+)
+
+cuda_py_test(
+ name = "loss_test",
+ size = "medium",
+ srcs = ["python/kernel_tests/loss_test.py"],
+ additional_deps = [
+ ":seq2seq_py",
+ "//tensorflow:tensorflow_py",
+ "//tensorflow/python:framework_test_lib",
+ "//tensorflow/python:platform_test",
+ ],
+)
+
+filegroup(
+ name = "all_files",
+ srcs = glob(
+ ["**/*"],
+ exclude = [
+ "**/METADATA",
+ "**/OWNERS",
+ ],
+ ),
+ visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/seq2seq/README.md b/tensorflow/contrib/seq2seq/README.md
new file mode 100644
index 0000000000..50ac32ec15
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/README.md
@@ -0,0 +1,9 @@
+# TensorFlow contrib seq2seq layers and losses
+
+## Layers
+
+Information to be added.
+
+## Losses
+
+Information to be added.
diff --git a/tensorflow/contrib/seq2seq/__init__.py b/tensorflow/contrib/seq2seq/__init__.py
new file mode 100644
index 0000000000..8861cb94d0
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/__init__.py
@@ -0,0 +1,26 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Ops for building neural network seq2seq layers and losses."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+
+# pylint: disable=unused-import,wildcard-import,line-too-long
+from tensorflow.contrib.seq2seq.python.ops import layers
+from tensorflow.contrib.seq2seq.python.ops import loss
diff --git a/tensorflow/contrib/seq2seq/python/__init__.py b/tensorflow/contrib/seq2seq/python/__init__.py
new file mode 100644
index 0000000000..c5ca3a623f
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ops module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/layers_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/layers_test.py
new file mode 100644
index 0000000000..b4eaec658a
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/layers_test.py
@@ -0,0 +1,36 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for contrib.seq2seq.python.seq2seq.layers_ops."""
+# pylint: disable=unused-import,g-bad-import-order
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+# pylint: enable=unused-import
+
+import tensorflow as tf
+
+
+class LayersTest(tf.test.TestCase):
+
+ def testRNNDecoder(self):
+ pass
+
+ def testRNNDecoderAttention(self):
+ pass
+
+
+if __name__ == '__main__':
+ tf.test.main()
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py
new file mode 100644
index 0000000000..f99de76f17
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py
@@ -0,0 +1,33 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for contrib.seq2seq.python.seq2seq.loss_ops."""
+# pylint: disable=unused-import,g-bad-import-order
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+# pylint: enable=unused-import
+
+import tensorflow as tf
+
+
+class LossTest(tf.test.TestCase):
+
+ def testLoss(self):
+ pass
+
+
+if __name__ == '__main__':
+ tf.test.main()
diff --git a/tensorflow/contrib/seq2seq/python/ops/layers.py b/tensorflow/contrib/seq2seq/python/ops/layers.py
new file mode 100644
index 0000000000..4ee2df6073
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/ops/layers.py
@@ -0,0 +1,35 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Seq2seq layer operations for use in neural networks.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import array_ops
+
+
+__all__ = ["rnn_decoder",
+ "rnn_decoder_attention"]
+
+
+def rnn_decoder(*args, **kwargs):
+ pass
+
+
+def rnn_decoder_attention(*args, **kwargs):
+ pass
diff --git a/tensorflow/contrib/seq2seq/python/ops/loss.py b/tensorflow/contrib/seq2seq/python/ops/loss.py
new file mode 100644
index 0000000000..b8a33b3f6f
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/ops/loss.py
@@ -0,0 +1,30 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Seq2seq loss operations for use in neural networks.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import array_ops
+
+
+__all__ = ["seq2seq_loss"]
+
+
+def seq2seq_loss(*args, **kwargs):
+ pass
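`seq2seq_loss` is only a stub at this point. For orientation, losses of this kind are typically per-timestep cross-entropy averaged under a padding mask; the sketch below is purely illustrative of that shape and not the eventual contrib API:

```python
import numpy as np

def masked_sequence_loss(logits, targets, weights):
    """Average cross-entropy over [batch, time], masked by `weights`.

    logits: [batch, time, vocab] scores; targets: [batch, time] int ids;
    weights: [batch, time], e.g. a 0/1 padding mask. Illustrative only.
    """
    # Stable log-softmax over the vocab axis.
    shifted = logits - logits.max(axis=-1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))
    b, t = targets.shape
    # Negative log-likelihood of each target token.
    nll = -log_probs[np.arange(b)[:, None], np.arange(t)[None, :], targets]
    return (nll * weights).sum() / np.maximum(weights.sum(), 1e-12)
```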
diff --git a/tensorflow/contrib/session_bundle/README.md b/tensorflow/contrib/session_bundle/README.md
index 64328fe596..6df63cba80 100644
--- a/tensorflow/contrib/session_bundle/README.md
+++ b/tensorflow/contrib/session_bundle/README.md
@@ -34,12 +34,10 @@ definition that's needed for serving.
* `assets` -- Asset file directory
* Holds auxiliary files for the graph (e.g., vocabularies)
* `export.meta` -- MetaGraph Definition
- * Binary [`tensorflow::MetaGraphDef`]
- (https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/protobuf/meta_graph.proto)
+ * Binary [`tensorflow::MetaGraphDef`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/protobuf/meta_graph.proto)
* `export-?????-of-?????`
* A checkpoint of the Graph Variables
- * Outputs from Python [`Saver`]
- (https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python/training/saver.py)
+ * Outputs from Python [`Saver`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python/training/saver.py)
with `sharded=True`.
## Exporting (Python code)
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 7141ddf804..438935e934 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -64,6 +64,7 @@ load(
"//tensorflow:tensorflow.bzl",
"if_android",
"if_ios",
+ "if_not_windows",
"tf_copts",
"tf_cc_test",
"tf_cc_tests",
@@ -140,6 +141,7 @@ cc_library(
"platform/protobuf.h",
"platform/types.h",
] + glob(tf_additional_proto_hdrs()),
+ copts = tf_copts(),
deps = [
":protos_all_cc",
"//tensorflow/core/platform/default/build_config:proto_parsing",
@@ -294,8 +296,6 @@ tf_cuda_library(
"util/example_proto_fast_parsing.h",
"util/example_proto_helper.h",
"util/guarded_philox_random.h",
- "util/memmapped_file_system.h",
- "util/memmapped_file_system_writer.h",
"util/mirror_pad_mode.h",
"util/padding.h",
"util/port.h",
@@ -312,7 +312,13 @@ tf_cuda_library(
"util/use_cudnn.h",
"util/util.h",
"util/work_sharder.h",
- ],
+ ] + select({
+ "//tensorflow:windows": [],
+ "//conditions:default": [
+ "util/memmapped_file_system.h",
+ "util/memmapped_file_system_writer.h",
+ ],
+ }),
visibility = ["//visibility:public"],
deps = [":framework_internal"],
)
@@ -513,7 +519,6 @@ cc_library(
"//tensorflow/core/kernels:control_flow_ops",
"//tensorflow/core/kernels:ctc_ops",
"//tensorflow/core/kernels:data_flow",
- "//tensorflow/core/kernels:fact_op",
"//tensorflow/core/kernels:function_ops",
"//tensorflow/core/kernels:image",
"//tensorflow/core/kernels:io",
@@ -524,7 +529,6 @@ cc_library(
"//tensorflow/core/kernels:nn",
"//tensorflow/core/kernels:parameterized_truncated_normal_op",
"//tensorflow/core/kernels:parsing",
- "//tensorflow/core/kernels:quantized_ops",
"//tensorflow/core/kernels:random_ops",
"//tensorflow/core/kernels:required",
"//tensorflow/core/kernels:sdca_ops",
@@ -533,7 +537,12 @@ cc_library(
"//tensorflow/core/kernels:string",
"//tensorflow/core/kernels:training_ops",
"//tensorflow/models/embedding:word2vec_kernels",
- ],
+ ] + if_not_windows([
+ "//tensorflow/core/kernels:fact_op",
+ "//tensorflow/core/kernels:array_not_windows",
+ "//tensorflow/core/kernels:math_not_windows",
+ "//tensorflow/core/kernels:quantized_ops",
+ ]),
)
tf_cuda_library(
@@ -874,12 +883,12 @@ cc_library(
# Libraries with GPU facilities that are useful for writing kernels.
cc_library(
name = "gpu_lib",
- srcs = [
+ srcs = if_not_windows([
"common_runtime/gpu/gpu_event_mgr.cc",
- ],
- hdrs = [
+ ]),
+ hdrs = if_not_windows([
"common_runtime/gpu/gpu_event_mgr.h",
- ],
+ ]),
copts = tf_copts(),
visibility = ["//visibility:public"],
deps = [
@@ -889,8 +898,7 @@ cc_library(
":lib_internal",
":proto_text",
":protos_all_cc",
- ":stream_executor",
- ],
+ ] + if_not_windows([":stream_executor"]),
)
cc_library(
@@ -950,26 +958,47 @@ tf_proto_library_cc(
cc_library(
name = "lib_internal",
- srcs = glob(
- [
- "lib/**/*.h",
- "lib/**/*.cc",
- "platform/*.h",
- "platform/*.cc",
- "platform/profile_utils/**/*.h",
- "platform/profile_utils/**/*.cc",
- ] + tf_additional_lib_srcs(),
- exclude =
+ srcs = select({
+ "//tensorflow:windows": glob(
+ [
+ "lib/**/*.h",
+ "lib/**/*.cc",
+ "platform/*.h",
+ "platform/*.cc",
+ ],
+ exclude = [
+ "**/*test*",
+ "platform/**/cuda.h",
+ "platform/**/stream_executor.h",
+ "platform/load_library.cc",
+ ],
+ ),
+ "//conditions:default": glob(
[
+ "lib/**/*.h",
+ "lib/**/*.cc",
+ "platform/*.h",
+ "platform/*.cc",
+ "platform/profile_utils/**/*.h",
+ "platform/profile_utils/**/*.cc",
+ ],
+ exclude = [
"**/*test*",
"platform/**/cuda.h",
"platform/**/stream_executor.h",
- ] +
- # Protobuf deps already included through the ":lib_proto_parsing"
- # dependency.
- tf_additional_proto_srcs(),
+ ],
+ ),
+ }) + tf_additional_lib_srcs(
+ exclude = [
+ "**/*test*",
+ "platform/**/cuda.h",
+ "platform/**/stream_executor.h",
+ ] +
+ # Protobuf deps already included through the ":lib_proto_parsing"
+ # dependency.
+ tf_additional_proto_srcs(),
),
- hdrs = glob(tf_additional_lib_hdrs()) + [
+ hdrs = tf_additional_lib_hdrs() + [
"lib/core/blocking_counter.h",
"lib/core/refcount.h",
"lib/gif/gif_io.h",
@@ -1039,6 +1068,7 @@ tf_version_info_genrule()
cc_library(
name = "version_lib",
srcs = ["util/version_info.cc"],
+ copts = tf_copts(),
)
tf_cuda_library(
@@ -1060,8 +1090,18 @@ tf_cuda_library(
"util/reporter.h",
"util/reporter.cc",
"framework/fake_input.*",
+ "util/memmapped_file_system.*",
+ "util/memmapped_file_system_writer.*",
],
- ),
+ ) + select({
+ "//tensorflow:windows": [],
+ "//conditions:default": [
+ "util/memmapped_file_system.h",
+ "util/memmapped_file_system.cc",
+ "util/memmapped_file_system_writer.h",
+ "util/memmapped_file_system_writer.cc",
+ ],
+ }),
hdrs = [
"framework/op_segment.h",
"framework/rendezvous.h", # only needed for tests
@@ -1335,7 +1375,7 @@ tf_cuda_library(
tf_cuda_library(
name = "gpu_runtime",
- srcs = [
+ srcs = if_not_windows([
"common_runtime/gpu/gpu_bfc_allocator.cc",
"common_runtime/gpu/gpu_debug_allocator.cc",
"common_runtime/gpu/gpu_device.cc",
@@ -1347,8 +1387,8 @@ tf_cuda_library(
"common_runtime/gpu/pool_allocator.cc",
"common_runtime/gpu/process_state.cc",
"common_runtime/gpu_device_context.h",
- ],
- hdrs = [
+ ]),
+ hdrs = if_not_windows([
"common_runtime/gpu/gpu_bfc_allocator.h",
"common_runtime/gpu/gpu_debug_allocator.h",
"common_runtime/gpu/gpu_device.h",
@@ -1357,7 +1397,7 @@ tf_cuda_library(
"common_runtime/gpu/gpu_util.h",
"common_runtime/gpu/pool_allocator.h",
"common_runtime/gpu/process_state.h",
- ],
+ ]),
copts = tf_copts(),
linkstatic = 1,
deps = [
@@ -1369,9 +1409,10 @@ tf_cuda_library(
":lib",
":lib_internal",
":protos_all_cc",
- ":stream_executor",
"//third_party/eigen3",
- ],
+ ] + if_not_windows([
+ ":stream_executor",
+ ]),
alwayslink = 1,
)
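The `if_not_windows` helper threaded through these BUILD rules is, in all likelihood, a thin `select()` wrapper in `tensorflow.bzl` keyed on the `//tensorflow:windows` config_setting added earlier in this diff; a sketch of that definition:

```python
# In tensorflow.bzl (Starlark): drop the given targets on Windows builds.
def if_not_windows(a):
    return select({
        "//tensorflow:windows": [],
        "//conditions:default": a,
    })
```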
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index c124947411..6183e9fe26 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -457,7 +457,7 @@ Status MaxPoolShape(shape_inference::InferenceContext* c) {
TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides));
if (strides.size() != 4) {
return errors::InvalidArgument(
- "AvgPool requires the stride attribute to contain 4 values, but "
+ "MaxPool requires the stride attribute to contain 4 values, but "
"got: ",
strides.size());
}
@@ -466,7 +466,7 @@ Status MaxPoolShape(shape_inference::InferenceContext* c) {
TF_RETURN_IF_ERROR(c->GetAttr("ksize", &kernel_sizes));
if (kernel_sizes.size() != 4) {
return errors::InvalidArgument(
- "AvgPool requires the ksize attribute to contain 4 values, but got: ",
+ "MaxPool requires the ksize attribute to contain 4 values, but got: ",
kernel_sizes.size());
}
diff --git a/tensorflow/core/framework/op.h b/tensorflow/core/framework/op.h
index abee32b3ad..f047ddb12a 100644
--- a/tensorflow/core/framework/op.h
+++ b/tensorflow/core/framework/op.h
@@ -171,12 +171,6 @@ class OpListOpRegistry : public OpRegistryInterface {
std::unordered_map<string, const OpRegistrationData*> index_;
};
-// Treats 'registry_ptr' as a pointer to OpRegistry, and calls
-// registry_ptr->Register(op_def) for each op_def that has been registered with
-// the current library's global op registry (obtained by calling
-// OpRegistry::Global().
-extern "C" void RegisterOps(void* registry_ptr);
-
// Support for defining the OpDef (specifying the semantics of the Op and how
// it should be created) and registering it in the OpRegistry::Global()
// registry. Usage:
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index a2a76f6047..4c14918ea7 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -1105,11 +1105,6 @@ void* GlobalKernelRegistry();
Status FindKernelDef(DeviceType device_type, const NodeDef& node_def,
const KernelDef** def, string* kernel_class_name);
-// Treats 'registry_ptr' as a pointer to KernelRegistry. For each kernel 'k'
-// registered with the current library's global kernel registry (obtained by
-// calling GlobalKernelRegistry()), inserts 'k' into registry_ptr.
-extern "C" void RegisterKernels(void* registry_ptr);
-
// Writes a list of all registered kernels to LOG(INFO), to help users debug
// missing kernel errors.
void LogAllRegisteredKernels();
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index add489a293..b2f97bd7b8 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -25,6 +25,7 @@ package_group(
load(
"//tensorflow:tensorflow.bzl",
+ "if_not_windows",
"tf_cc_test",
"tf_cc_tests",
"tf_copts",
@@ -386,6 +387,41 @@ cc_header_only_library(
# OpKernel libraries ----------------------------------------------------------
+ARRAY_DEPS = [
+ ":batch_space_ops",
+ ":bounds_check",
+ ":concat_lib",
+ ":cuda_device_array",
+ ":depth_space_ops",
+ ":extract_image_patches_op",
+ ":fill_functor",
+ ":gather_functor",
+ ":ops_util",
+ ":split_lib",
+ ":strided_slice_op",
+ ":transpose_functor",
+ "//tensorflow/core:array_grad",
+ "//tensorflow/core:array_ops_op_lib",
+ "//tensorflow/core:core_cpu",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:gpu_runtime",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:lib_internal",
+ "//tensorflow/core:proto_text",
+ "//tensorflow/core:protos_all_cc",
+ "//tensorflow/core/debug:debug_io_utils",
+ "//third_party/eigen3",
+]
+
+tf_kernel_libraries(
+ name = "array_not_windows",
+ prefixes = [
+ "debug_ops",
+ "immutable_constant_op",
+ ],
+ deps = ARRAY_DEPS,
+)
+
tf_kernel_libraries(
name = "array",
prefixes = [
@@ -393,7 +429,6 @@ tf_kernel_libraries(
"bitcast_op",
"concat_op",
"constant_op",
- "debug_ops",
"diag_op",
"matrix_band_part_op",
"matrix_diag_op",
@@ -402,7 +437,6 @@ tf_kernel_libraries(
"gather_op",
"gather_nd_op",
"identity_op",
- "immutable_constant_op",
"listdiff_op",
"mirror_pad_op",
"one_hot_op",
@@ -421,31 +455,7 @@ tf_kernel_libraries(
"unpack_op",
"where_op",
],
- deps = [
- ":batch_space_ops",
- ":bounds_check",
- ":concat_lib",
- ":cuda_device_array",
- ":depth_space_ops",
- ":extract_image_patches_op",
- ":fill_functor",
- ":gather_functor",
- ":ops_util",
- ":split_lib",
- ":strided_slice_op",
- ":transpose_functor",
- "//tensorflow/core:array_grad",
- "//tensorflow/core:array_ops_op_lib",
- "//tensorflow/core:core_cpu",
- "//tensorflow/core:framework",
- "//tensorflow/core:gpu_runtime",
- "//tensorflow/core:lib",
- "//tensorflow/core:lib_internal",
- "//tensorflow/core:proto_text",
- "//tensorflow/core:protos_all_cc",
- "//tensorflow/core/debug:debug_io_utils",
- "//third_party/eigen3",
- ],
+ deps = ARRAY_DEPS,
)
tf_cc_test(
@@ -1264,6 +1274,27 @@ tf_cc_tests(
],
)
+MATH_DEPS = [
+ ":bounds_check",
+ ":fill_functor",
+ ":transpose_functor",
+ "//tensorflow/core:core_cpu",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:lib_internal",
+ "//tensorflow/core:math_grad",
+ "//tensorflow/core:math_ops_op_lib",
+ "//third_party/eigen3",
+]
+
+tf_kernel_libraries(
+ name = "math_not_windows",
+ prefixes = [
+ "sparse_matmul_op",
+ ],
+ deps = MATH_DEPS,
+)
+
tf_kernel_libraries(
name = "math",
prefixes = [
@@ -1281,20 +1312,8 @@ tf_kernel_libraries(
"segment_reduction_ops",
"scan_ops",
"sequence_ops",
- "sparse_matmul_op",
- ],
- deps = [
- ":bounds_check",
- ":fill_functor",
- ":transpose_functor",
- "//tensorflow/core:core_cpu",
- "//tensorflow/core:framework",
- "//tensorflow/core:lib",
- "//tensorflow/core:lib_internal",
- "//tensorflow/core:math_grad",
- "//tensorflow/core:math_ops_op_lib",
- "//third_party/eigen3",
],
+ deps = MATH_DEPS,
)
tf_cuda_cc_test(
@@ -1574,7 +1593,6 @@ tf_kernel_libraries(
":conv_2d",
":conv_ops",
":depthwise_conv_grad_op",
- ":depthwise_conv_op",
":dilation_ops",
":fused_batch_norm_util_gpu",
":ops_util",
@@ -1585,7 +1603,9 @@ tf_kernel_libraries(
"//tensorflow/core:nn_grad",
"//tensorflow/core:nn_ops_op_lib",
"//third_party/eigen3",
- ],
+ ] + if_not_windows([
+ ":depthwise_conv_op",
+ ]),
)
tf_cuda_cc_test(
diff --git a/tensorflow/core/kernels/decode_raw_op.cc b/tensorflow/core/kernels/decode_raw_op.cc
index d3bd9913e0..280c2dc71c 100644
--- a/tensorflow/core/kernels/decode_raw_op.cc
+++ b/tensorflow/core/kernels/decode_raw_op.cc
@@ -93,6 +93,7 @@ class DecodeRawOp : public OpKernel {
Name("DecodeRaw").Device(DEVICE_CPU).TypeConstraint<type>("out_type"), \
DecodeRawOp<type>)
+REGISTER(Eigen::half);
REGISTER(float);
REGISTER(double);
REGISTER(int32);
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc
index 40c6a3ab00..5146ca626a 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc
@@ -67,4 +67,8 @@ template class InvVarianceToVariance<float>;
} // namespace functor
} // namespace tensorflow
+#else
+
+#include "tensorflow/core/kernels/fused_batch_norm_op.h"
+
#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/gather_functor.cc b/tensorflow/core/kernels/gather_functor.cc
index d0ac1d4dd8..be220d5c95 100644
--- a/tensorflow/core/kernels/gather_functor.cc
+++ b/tensorflow/core/kernels/gather_functor.cc
@@ -45,4 +45,8 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
} // namespace functor
} // namespace tensorflow
+#else
+
+#include "tensorflow/core/kernels/gather_functor.h"
+
#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/scatter_functor.cc b/tensorflow/core/kernels/scatter_functor.cc
index b432092245..7eba82899f 100644
--- a/tensorflow/core/kernels/scatter_functor.cc
+++ b/tensorflow/core/kernels/scatter_functor.cc
@@ -55,4 +55,8 @@ TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DECLARE_GPU_SPECS);
} // namespace functor
} // namespace tensorflow
+#else
+
+#include "tensorflow/core/kernels/scatter_functor.h"
+
#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index e66a5fd317..8d04ae85c6 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -351,13 +351,16 @@ REGISTER_OP("FusedBatchNormGrad")
.Attr("T: numbertype")
.Attr("epsilon: float = 0.0001")
.Attr("data_format: string = 'NHWC'")
+ .Attr("is_training: bool = true")
.SetShapeFn([](InferenceContext* c) {
ShapeHandle y_backprop;
TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &y_backprop));
ShapeHandle x;
TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 4, &x));
+ bool is_training;
string data_format;
+ c->GetAttr("is_training", &is_training);
c->GetAttr("data_format", &data_format);
DimensionHandle channel_dim = (data_format == "NHWC")
? c->Dim(y_backprop, 3)
@@ -386,8 +389,16 @@ REGISTER_OP("FusedBatchNormGrad")
c->set_output(0, x_backprop);
c->set_output(1, c->Vector(channel_dim));
c->set_output(2, c->Vector(channel_dim));
- c->set_output(3, c->Vector(0));
- c->set_output(4, c->Vector(0));
+ // Set the correct shapes for the reserve spaces so that gradients
+ // can still be computed when the op runs inside a symbolic
+ // conditional (e.g. tf.cond).
+ if (is_training) {
+ c->set_output(3, c->Vector(0));
+ c->set_output(4, c->Vector(0));
+ } else {
+ c->set_output(3, c->Vector(channel_dim));
+ c->set_output(4, c->Vector(channel_dim));
+ }
return Status::OK();
})
.Doc(R"doc(
@@ -412,6 +423,8 @@ T: The data type for the elements of input and output Tensors.
epsilon: A small float number added to the variance of x.
data_format: The data format for y_backprop, x, x_backprop.
Either "NHWC" (default) or "NCHW".
+is_training: A bool value indicating whether the operation is used for
+  training (default) or inference.
)doc");
// --------------------------------------------------------------------------
@@ -1835,7 +1848,7 @@ pooling_ratio: Pooling ratio for each dimension of `value`, currently only
respectively.
pseudo_random: When set to True, generates the pooling sequence in a
pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
- Graham, Fractional Max-Pooling] (http://arxiv.org/abs/1412.6071) for
+ Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
difference between pseudorandom and random.
overlapping: When set to True, it means when pooling, the values at the boundary
of adjacent pooling cells are used by both cells. For example:
@@ -1925,7 +1938,7 @@ pooling_ratio: Pooling ratio for each dimension of `value`, currently only
respectively.
pseudo_random: When set to True, generates the pooling sequence in a
pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
- Graham, Fractional Max-Pooling] (http://arxiv.org/abs/1412.6071) for
+ Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
difference between pseudorandom and random.
overlapping: When set to True, it means when pooling, the values at the boundary
of adjacent pooling cells are used by both cells. For example:
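The `is_training` attribute above changes only the inferred shapes of the two reserve-space outputs. A minimal Python sketch of that rule (illustration only, not TensorFlow code):

```python
# Shapes of outputs 3 and 4 of FusedBatchNormGrad as inferred above:
# empty vectors while training, per-channel vectors for inference.
def reserve_space_shapes(channel_dim, is_training=True):
    if is_training:
        return [0], [0]                      # c->Vector(0)
    return [channel_dim], [channel_dim]      # c->Vector(channel_dim)

assert reserve_space_shapes(64, is_training=False) == ([64], [64])
```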
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 04feaede61..30140508df 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -6976,7 +6976,7 @@ op {
default_value {
b: false
}
- description: "When set to True, generates the pooling sequence in a\npseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin\nGraham, Fractional Max-Pooling] (http://arxiv.org/abs/1412.6071) for\ndifference between pseudorandom and random."
+ description: "When set to True, generates the pooling sequence in a\npseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin\nGraham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for\ndifference between pseudorandom and random."
}
attr {
name: "overlapping"
@@ -7110,7 +7110,7 @@ op {
default_value {
b: false
}
- description: "When set to True, generates the pooling sequence in a\npseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin\nGraham, Fractional Max-Pooling] (http://arxiv.org/abs/1412.6071) for\ndifference between pseudorandom and random."
+ description: "When set to True, generates the pooling sequence in a\npseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin\nGraham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for\ndifference between pseudorandom and random."
}
attr {
name: "overlapping"
diff --git a/tensorflow/core/ops/parsing_ops.cc b/tensorflow/core/ops/parsing_ops.cc
index b588127d97..4ca3f2e07e 100644
--- a/tensorflow/core/ops/parsing_ops.cc
+++ b/tensorflow/core/ops/parsing_ops.cc
@@ -27,7 +27,7 @@ using shape_inference::ShapeHandle;
REGISTER_OP("DecodeRaw")
.Input("bytes: string")
.Output("output: out_type")
- .Attr("out_type: {float,double,int32,uint8,int16,int8,int64}")
+ .Attr("out_type: {half,float,double,int32,uint8,int16,int8,int64}")
.Attr("little_endian: bool = true")
.SetShapeFn([](InferenceContext* c) {
// Note: last dimension is data dependent.
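With `half` added to `out_type`, raw float16 bytes decode directly. A usage sketch mirroring the new `testToFloat16` test further below:

```python
import numpy as np
import tensorflow as tf

# Decode raw little-endian float16 bytes into a float16 tensor.
with tf.Session() as sess:
    in_bytes = tf.placeholder(tf.string, shape=[None])
    decoded = tf.decode_raw(in_bytes, out_type=tf.float16)
    expected = np.array([[1, -2, -3, 4]], dtype=np.float16)
    print(sess.run(decoded, feed_dict={in_bytes: [expected.tobytes()]}))
    # [[ 1. -2. -3.  4.]]
```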
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 534561a008..01ef13caa0 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -91,17 +91,31 @@ def tf_proto_library(name, srcs = [], has_services = None,
visibility = visibility,
)
-def tf_additional_lib_hdrs():
- return [
- "platform/default/*.h",
- "platform/posix/*.h",
- ]
-
-def tf_additional_lib_srcs():
- return [
- "platform/default/*.cc",
- "platform/posix/*.cc",
- ]
+def tf_additional_lib_hdrs(exclude = []):
+ return select({
+ "//tensorflow:windows" : native.glob([
+ "platform/default/*.h",
+ "platform/windows/*.h",
+ "platform/posix/error.h",
+ ], exclude = exclude),
+ "//conditions:default" : native.glob([
+ "platform/default/*.h",
+ "platform/posix/*.h",
+ ], exclude = exclude),
+ })
+
+def tf_additional_lib_srcs(exclude = []):
+ return select({
+ "//tensorflow:windows" : native.glob([
+ "platform/default/*.cc",
+ "platform/windows/*.cc",
+ "platform/posix/error.cc",
+ ], exclude = exclude),
+ "//conditions:default" : native.glob([
+ "platform/default/*.cc",
+ "platform/posix/*.cc",
+ ], exclude = exclude),
+ })
def tf_additional_minimal_lib_srcs():
return [
diff --git a/tensorflow/core/platform/windows/env.cc b/tensorflow/core/platform/windows/env.cc
index a2182a831c..09edc10a94 100644
--- a/tensorflow/core/platform/windows/env.cc
+++ b/tensorflow/core/platform/windows/env.cc
@@ -32,6 +32,8 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/windows/windows_file_system.h"
+#pragma comment(lib, "Shlwapi.lib")
+
namespace tensorflow {
namespace {
diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc
index b08c1cf9f4..0721976f3e 100644
--- a/tensorflow/core/platform/windows/port.cc
+++ b/tensorflow/core/platform/windows/port.cc
@@ -20,6 +20,7 @@ limitations under the License.
#include <snappy.h>
#endif
#include <WinSock2.h>
+#pragma comment(lib, "Ws2_32.lib")
#include "tensorflow/core/platform/cpu_info.h"
#include "tensorflow/core/platform/demangle.h"
diff --git a/tensorflow/core/util/tensor_bundle/BUILD b/tensorflow/core/util/tensor_bundle/BUILD
index f1c6f2056f..98e6a9f346 100644
--- a/tensorflow/core/util/tensor_bundle/BUILD
+++ b/tensorflow/core/util/tensor_bundle/BUILD
@@ -9,6 +9,8 @@ licenses(["notice"]) # Apache 2.0
exports_files(["LICENSE"])
+load("//tensorflow:tensorflow.bzl", "tf_copts")
+
# To be exported to tensorflow/core:android_srcs.
filegroup(
name = "android_srcs",
@@ -24,7 +26,7 @@ cc_library(
name = "tensor_bundle",
srcs = ["tensor_bundle.cc"],
hdrs = ["tensor_bundle.h"],
- copts = ["-Wno-sign-compare"],
+ copts = tf_copts() + ["-Wno-sign-compare"],
deps = [
":naming",
"//tensorflow/core:core_cpu_internal",
diff --git a/tensorflow/examples/learn/BUILD b/tensorflow/examples/learn/BUILD
index 79fdd78891..b7eb2ce1cb 100644
--- a/tensorflow/examples/learn/BUILD
+++ b/tensorflow/examples/learn/BUILD
@@ -92,6 +92,15 @@ py_binary(
)
py_binary(
+ name = "resnet",
+ srcs = ["resnet.py"],
+ srcs_version = "PY2AND3",
+ deps = [
+ "//tensorflow:tensorflow_py",
+ ],
+)
+
+py_binary(
name = "text_classification",
srcs = ["text_classification.py"],
srcs_version = "PY2AND3",
@@ -160,6 +169,7 @@ sh_test(
":iris_val_based_early_stopping",
":iris_with_pipeline",
":random_forest_mnist",
+ ":resnet",
":text_classification",
":text_classification_builtin_rnn_model",
":text_classification_character_cnn",
diff --git a/tensorflow/examples/learn/README.md b/tensorflow/examples/learn/README.md
index feb08cbbac..0ae72ae8da 100644
--- a/tensorflow/examples/learn/README.md
+++ b/tensorflow/examples/learn/README.md
@@ -23,6 +23,7 @@ Some examples use the `pandas` library for data processing (`sudo pip install pa
## Specialized Models
* [Building a Random Forest Model](random_forest.py)
* [Building a Wide & Deep Model](wide_n_deep_tutorial.py)
+* [Building a Residual Network Model](resnet.py)
## Text classification
diff --git a/tensorflow/examples/learn/examples_test.sh b/tensorflow/examples/learn/examples_test.sh
index 7dfcc9b7c9..317e70d574 100755
--- a/tensorflow/examples/learn/examples_test.sh
+++ b/tensorflow/examples/learn/examples_test.sh
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-# This script exercises the examples of using SkFlow.
+# This script exercises the TF.Learn examples.
DIR="$TEST_SRCDIR"
@@ -28,20 +28,18 @@ then
DIR="$DIR"/"$TEST_WORKSPACE"
fi
-SKFLOW_EXAMPLE_BASE_DIR=$DIR/tensorflow/examples/learn
+TFLEARN_EXAMPLE_BASE_DIR=$DIR/tensorflow/examples/learn
function test() {
echo "Test "$1":"
- $SKFLOW_EXAMPLE_BASE_DIR/$1 $2
+ $TFLEARN_EXAMPLE_BASE_DIR/$1 $2
if [ $? -eq 0 ]
then
echo "Test passed."
- echo
return 0
else
echo "Test failed."
- echo
exit 1
fi
}
@@ -53,6 +51,7 @@ test iris_custom_decay_dnn
test iris_run_config
test iris_val_based_early_stopping
test iris_with_pipeline
+test resnet
test text_classification --test_with_fake_data
test text_classification_builtin_rnn_model --test_with_fake_data
test text_classification_cnn --test_with_fake_data
diff --git a/tensorflow/examples/skflow/resnet.py b/tensorflow/examples/learn/resnet.py
index d67022d457..3e9b579b35 100755
--- a/tensorflow/examples/skflow/resnet.py
+++ b/tensorflow/examples/learn/resnet.py
@@ -28,10 +28,9 @@ from collections import namedtuple
from math import sqrt
import os
-from sklearn import metrics
import tensorflow as tf
from tensorflow.contrib import learn
-from tensorflow.examples.tutorials.mnist import input_data
+from tensorflow.contrib.layers import batch_norm, convolution2d
def res_net(x, y, activation=tf.nn.relu):
@@ -69,8 +68,9 @@ def res_net(x, y, activation=tf.nn.relu):
# First convolution expands to 64 channels
with tf.variable_scope('conv_layer1'):
- net = learn.ops.conv2d(x, 64, [7, 7], batch_norm=True,
- activation=activation, bias=False)
+ net = convolution2d(x, 64, 7,
+ normalizer_fn=batch_norm,
+ activation_fn=activation)
# Max pool
net = tf.nn.max_pool(
@@ -78,9 +78,8 @@ def res_net(x, y, activation=tf.nn.relu):
# First chain of resnets
with tf.variable_scope('conv_layer2'):
- net = learn.ops.conv2d(net, groups[0].num_filters,
- [1, 1], [1, 1, 1, 1],
- padding='VALID', bias=True)
+ net = convolution2d(net, groups[0].num_filters, 1,
+ padding='VALID')
# Create the bottleneck groups, each of which contains `num_blocks`
# bottleneck groups.
@@ -90,30 +89,24 @@ def res_net(x, y, activation=tf.nn.relu):
# 1x1 convolution responsible for reducing dimension
with tf.variable_scope(name + '/conv_in'):
- conv = learn.ops.conv2d(net, group.bottleneck_size,
- [1, 1], [1, 1, 1, 1],
- padding='VALID',
- activation=activation,
- batch_norm=True,
- bias=False)
+ conv = convolution2d(net, group.bottleneck_size, 1,
+ padding='VALID',
+ activation_fn=activation,
+ normalizer_fn=batch_norm)
with tf.variable_scope(name + '/conv_bottleneck'):
- conv = learn.ops.conv2d(conv, group.bottleneck_size,
- [3, 3], [1, 1, 1, 1],
- padding='SAME',
- activation=activation,
- batch_norm=True,
- bias=False)
+ conv = convolution2d(conv, group.bottleneck_size, 3,
+ padding='SAME',
+ activation_fn=activation,
+ normalizer_fn=batch_norm)
# 1x1 convolution responsible for restoring dimension
with tf.variable_scope(name + '/conv_out'):
input_dim = net.get_shape()[-1].value
- conv = learn.ops.conv2d(conv, input_dim,
- [1, 1], [1, 1, 1, 1],
- padding='VALID',
- activation=activation,
- batch_norm=True,
- bias=False)
+ conv = convolution2d(conv, input_dim, 1,
+ padding='VALID',
+ activation_fn=activation,
+ normalizer_fn=batch_norm)
# shortcut connections that turn the network into its counterpart
# residual function (identity shortcut)
@@ -123,10 +116,10 @@ def res_net(x, y, activation=tf.nn.relu):
# upscale to the next group size
next_group = groups[group_i + 1]
with tf.variable_scope('block_%d/conv_upscale' % group_i):
- net = learn.ops.conv2d(net, next_group.num_filters,
- [1, 1], [1, 1, 1, 1],
- bias=False,
- padding='SAME')
+ net = convolution2d(net, next_group.num_filters, 1,
+ activation_fn=None,
+ biases_initializer=None,
+ padding='SAME')
except IndexError:
pass
@@ -138,21 +131,38 @@ def res_net(x, y, activation=tf.nn.relu):
net_shape = net.get_shape().as_list()
net = tf.reshape(net, [-1, net_shape[1] * net_shape[2] * net_shape[3]])
- return learn.models.logistic_regression(net, y)
+ target = tf.one_hot(y, depth=10, dtype=tf.float32)
+ return learn.models.logistic_regression(net, target)
-# Download and load MNIST data.
-mnist = input_data.read_data_sets('MNIST_data')
-
-# Restore model if graph is saved into a folder.
-if os.path.exists('models/resnet/graph.pbtxt'):
- classifier = learn.TensorFlowEstimator.restore('models/resnet/')
+def res_net_model(x, y):
+ prediction, loss = res_net(x, y)
+ predicted = tf.argmax(prediction, 1)
+ accuracy = tf.equal(predicted, tf.cast(y, tf.int64))
+ predictions = {'prob': prediction, 'class': predicted, 'accuracy': accuracy}
+ train_op = tf.contrib.layers.optimize_loss(
+ loss, tf.contrib.framework.get_global_step(),
+ optimizer='Adagrad', learning_rate=0.001)
+ return predictions, loss, train_op
-while True:
- # Train model and save summaries into logdir.
- classifier.fit(
- mnist.train.images, mnist.train.labels, logdir='models/resnet/')
-
- # Calculate accuracy.
- score = metrics.accuracy_score(
- mnist.test.labels, classifier.predict(mnist.test.images, batch_size=64))
- print('Accuracy: {0:f}'.format(score))
+# Download and load MNIST data.
+mnist = learn.datasets.load_dataset('mnist')
+
+# Create a new resnet classifier.
+classifier = learn.Estimator(model_fn=res_net_model)
+
+tf.logging.set_verbosity(tf.logging.INFO)  # Show training logs.
+
+# Train model and save summaries into logdir.
+classifier.fit(
+ mnist.train.images, mnist.train.labels, batch_size=100, steps=1000)
+
+# Calculate accuracy.
+result = classifier.evaluate(
+ x=mnist.test.images, y=mnist.test.labels,
+ metrics={
+ 'accuracy': learn.metric_spec.MetricSpec(
+ metric_fn=tf.contrib.metrics.streaming_accuracy,
+ prediction_key='accuracy'),
+ })
+score = result['accuracy']
+print('Accuracy: {0:f}'.format(score))
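The rewrite above moves the example onto `learn.Estimator`, whose `model_fn` returns a `(predictions, loss, train_op)` tuple. A minimal sketch of that contract, using a linear model as a hypothetical stand-in for the resnet:

```python
import tensorflow as tf
from tensorflow.contrib import learn

def linear_model_fn(x, y):
    # Same (predictions, loss, train_op) contract as res_net_model above.
    prediction, loss = learn.models.logistic_regression(
        x, tf.one_hot(y, depth=10, dtype=tf.float32))
    train_op = tf.contrib.layers.optimize_loss(
        loss, tf.contrib.framework.get_global_step(),
        optimizer='Adagrad', learning_rate=0.001)
    return {'class': tf.argmax(prediction, 1)}, loss, train_op

classifier = learn.Estimator(model_fn=linear_model_fn)
```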
diff --git a/tensorflow/examples/skflow/BUILD b/tensorflow/examples/skflow/BUILD
index 0a3c7edf84..e18e9b1c1c 100644
--- a/tensorflow/examples/skflow/BUILD
+++ b/tensorflow/examples/skflow/BUILD
@@ -25,15 +25,6 @@ py_binary(
],
)
-py_binary(
- name = "resnet",
- srcs = ["resnet.py"],
- srcs_version = "PY2AND3",
- deps = [
- "//tensorflow:tensorflow_py",
- ],
-)
-
filegroup(
name = "all_files",
srcs = glob(
diff --git a/tensorflow/examples/tutorials/mnist/mnist_softmax.py b/tensorflow/examples/tutorials/mnist/mnist_softmax.py
index 1791f97a06..785ef5767d 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_softmax.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_softmax.py
@@ -46,7 +46,7 @@ def main(_):
# The raw formulation of cross-entropy,
#
- # tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.softmax(y)),
+ # tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)),
# reduction_indices=[1]))
#
# can be numerically unstable.
diff --git a/tensorflow/g3doc/api_docs/python/client.md b/tensorflow/g3doc/api_docs/python/client.md
index 2d4c52a004..14338f71bc 100644
--- a/tensorflow/g3doc/api_docs/python/client.md
+++ b/tensorflow/g3doc/api_docs/python/client.md
@@ -52,8 +52,7 @@ with tf.Session() as sess:
sess.run(...)
```
-The [`ConfigProto`]
-(https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
+The [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
protocol buffer exposes various configuration options for a
session. For example, to create a session that uses soft constraints
for device placement, and log the resulting placement decisions,
@@ -84,8 +83,8 @@ the session constructor.
* <b>`target`</b>: (Optional.) The execution engine to connect to.
- Defaults to using an in-process engine. See [Distributed Tensorflow]
- (https://www.tensorflow.org/how_tos/distributed/index.html)
+ Defaults to using an in-process engine. See
+ [Distributed Tensorflow](https://www.tensorflow.org/how_tos/distributed/index.html)
for more examples.
* <b>`graph`</b>: (Optional.) The `Graph` to be launched (described above).
* <b>`config`</b>: (Optional.) A [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
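For the soft-placement example the text describes, the corresponding options are (a sketch, assuming the standard `ConfigProto` fields):

```python
import tensorflow as tf

# Soft device placement plus logging of placement decisions.
config = tf.ConfigProto(allow_soft_placement=True,
                        log_device_placement=True)
sess = tf.Session(config=config)
```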
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.nn.sampled_softmax_loss.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.nn.sampled_softmax_loss.md
index 6d22f67352..44388cce0c 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.nn.sampled_softmax_loss.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.nn.sampled_softmax_loss.md
@@ -11,8 +11,8 @@ the full softmax loss.
At inference time, you can compute full softmax probabilities with the
expression `tf.nn.softmax(tf.matmul(inputs, tf.transpose(weights)) + biases)`.
-See our [Candidate Sampling Algorithms Reference]
-(../../extras/candidate_sampling.pdf)
+See our
+[Candidate Sampling Algorithms Reference](../../extras/candidate_sampling.pdf)
Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007)
([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.Graph.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.Graph.md
index 0faea32646..27258ff899 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.Graph.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.Graph.md
@@ -632,8 +632,8 @@ Note that this is unrelated to the
The GraphDef version information of this graph.
-For details on the meaning of each version, see [`GraphDef`]
-(https://www.tensorflow.org/code/tensorflow/core/framework/graph.proto).
+For details on the meaning of each version, see
+[`GraphDef`](https://www.tensorflow.org/code/tensorflow/core/framework/graph.proto).
##### Returns:
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.nn.fractional_max_pool.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.nn.fractional_max_pool.md
index 7f043e078c..8f8fb0237c 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.nn.fractional_max_pool.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.nn.fractional_max_pool.md
@@ -46,7 +46,7 @@ For more details on fractional max pooling, see this paper:
* <b>`pseudo_random`</b>: An optional `bool`. Defaults to `False`.
When set to True, generates the pooling sequence in a
pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
- Graham, Fractional Max-Pooling] (http://arxiv.org/abs/1412.6071) for
+ Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
difference between pseudorandom and random.
* <b>`overlapping`</b>: An optional `bool`. Defaults to `False`.
When set to True, it means when pooling, the values at the boundary
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.nce_loss.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.nce_loss.md
index 3de6d1ae3f..bfa01aeaba 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.nce_loss.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.nce_loss.md
@@ -2,11 +2,10 @@
Computes and returns the noise-contrastive estimation training loss.
-See [Noise-contrastive estimation: A new estimation principle for
-unnormalized statistical models]
-(http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
-Also see our [Candidate Sampling Algorithms Reference]
-(../../extras/candidate_sampling.pdf)
+See
+[Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
+Also see our
+[Candidate Sampling Algorithms Reference](../../extras/candidate_sampling.pdf)
Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
so your labels must be sorted in order of decreasing frequency to achieve
@@ -44,8 +43,7 @@ with an otherwise unused class.
where a sampled class equals one of the target classes. If set to
`True`, this is a "Sampled Logistic" loss instead of NCE, and we are
learning to generate log-odds instead of log probabilities. See
- our [Candidate Sampling Algorithms Reference]
- (../../extras/candidate_sampling.pdf).
+ our [Candidate Sampling Algorithms Reference](../../extras/candidate_sampling.pdf).
Default is False.
* <b>`partition_strategy`</b>: A string specifying the partitioning strategy, relevant
if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.parse_example.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.parse_example.md
index 2f2f511196..b52ea6ad3c 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.parse_example.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.parse_example.md
@@ -2,8 +2,8 @@
Parses `Example` protos into a `dict` of tensors.
-Parses a number of serialized [`Example`]
-(https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
+Parses a number of serialized
+[`Example`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
protos given in `serialized`.
`example_names` may contain descriptive names for the corresponding serialized
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.train.RMSPropOptimizer.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.train.RMSPropOptimizer.md
index 4fcce3cbff..499b65cc84 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.train.RMSPropOptimizer.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.train.RMSPropOptimizer.md
@@ -1,7 +1,6 @@
Optimizer that implements the RMSProp algorithm.
-See the [paper]
-(http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
+See the [paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
- - -
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.atrous_conv2d.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.atrous_conv2d.md
index c73c76ab50..e5f0d9e567 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.atrous_conv2d.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.atrous_conv2d.md
@@ -32,11 +32,10 @@ Convolutional Nets and Fully Connected CRFs](http://arxiv.org/abs/1412.7062).
The same operation is investigated further in [Multi-Scale Context Aggregation
by Dilated Convolutions](http://arxiv.org/abs/1511.07122). Previous works
that effectively use atrous convolution in different ways are, among others,
-[OverFeat: Integrated Recognition, Localization and Detection using
-Convolutional Networks](http://arxiv.org/abs/1312.6229) and [Fast Image
-Scanning with Deep Max-Pooling Convolutional Neural Networks]
-(http://arxiv.org/abs/1302.1700). Atrous convolution is also closely related
-to the so-called noble identities in multi-rate signal processing.
+[OverFeat: Integrated Recognition, Localization and Detection using Convolutional Networks](http://arxiv.org/abs/1312.6229)
+and [Fast Image Scanning with Deep Max-Pooling Convolutional Neural Networks](http://arxiv.org/abs/1302.1700).
+Atrous convolution is also closely related to the so-called noble identities in
+multi-rate signal processing.
There are many different ways to implement atrous convolution (see the refs
above). The implementation here reduces
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.fractional_avg_pool.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.fractional_avg_pool.md
index 595e664973..367205ffd6 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.fractional_avg_pool.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.fractional_avg_pool.md
@@ -22,7 +22,7 @@ pooling region.
* <b>`pseudo_random`</b>: An optional `bool`. Defaults to `False`.
When set to True, generates the pooling sequence in a
pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
- Graham, Fractional Max-Pooling] (http://arxiv.org/abs/1412.6071) for
+ Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
difference between pseudorandom and random.
* <b>`overlapping`</b>: An optional `bool`. Defaults to `False`.
When set to True, it means when pooling, the values at the boundary
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.local_response_normalization.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.local_response_normalization.md
index 81134df29f..2738a61f9d 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.local_response_normalization.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.local_response_normalization.md
@@ -11,8 +11,8 @@ each component is divided by the weighted, squared sum of inputs within
sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
output = input / (bias + alpha * sqr_sum) ** beta
-For details, see [Krizhevsky et al., ImageNet classification with deep
-convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+For details, see
+[Krizhevsky et al., ImageNet classification with deep convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
##### Args:
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.Session.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.Session.md
index 7293205505..d9de06d5d0 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.Session.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.Session.md
@@ -36,8 +36,7 @@ with tf.Session() as sess:
sess.run(...)
```
-The [`ConfigProto`]
-(https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
+The [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
protocol buffer exposes various configuration options for a
session. For example, to create a session that uses soft constraints
for device placement, and log the resulting placement decisions,
@@ -68,8 +67,8 @@ the session constructor.
* <b>`target`</b>: (Optional.) The execution engine to connect to.
- Defaults to using an in-process engine. See [Distributed Tensorflow]
- (https://www.tensorflow.org/how_tos/distributed/index.html)
+ Defaults to using an in-process engine. See
+ [Distributed Tensorflow](https://www.tensorflow.org/how_tos/distributed/index.html)
for more examples.
* <b>`graph`</b>: (Optional.) The `Graph` to be launched (described above).
* <b>`config`</b>: (Optional.) A [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.image.resize_images.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.image.resize_images.md
index 48d9cf1648..a4b7e8f57a 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.image.resize_images.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.image.resize_images.md
@@ -8,12 +8,9 @@ the same as `size`. To avoid distortions see
`method` can be one of:
-* <b>`ResizeMethod.BILINEAR`</b>: [Bilinear interpolation.]
- (https://en.wikipedia.org/wiki/Bilinear_interpolation)
-* <b>`ResizeMethod.NEAREST_NEIGHBOR`</b>: [Nearest neighbor interpolation.]
- (https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation)
-* <b>`ResizeMethod.BICUBIC`</b>: [Bicubic interpolation.]
- (https://en.wikipedia.org/wiki/Bicubic_interpolation)
+* <b>`ResizeMethod.BILINEAR`</b>: [Bilinear interpolation.](https://en.wikipedia.org/wiki/Bilinear_interpolation)
+* <b>`ResizeMethod.NEAREST_NEIGHBOR`</b>: [Nearest neighbor interpolation.](https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation)
+* <b>`ResizeMethod.BICUBIC`</b>: [Bicubic interpolation.](https://en.wikipedia.org/wiki/Bicubic_interpolation)
* <b>`ResizeMethod.AREA`</b>: Area interpolation.
##### Args:
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.scan.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.scan.md
index 2e27268594..047971e260 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.scan.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.scan.md
@@ -36,9 +36,9 @@ For example, if `elems` is `(t1, [t2, t3])` and `initializer` is
* <b>`fn`</b>: The callable to be performed. It accepts two arguments. The first
- will have the same (possibly nested) structure as `elems`. The second
will have the same structure as `initializer` if one is provided,
- otherwise it will have the same structure as `elems`. Its output
+ otherwise it will have the same structure as `elems`. The second
+ will have the same (possibly nested) structure as `elems`. Its output
must have the same structure as `initializer` if one is provided,
otherwise it must have the same structure as `elems`.
* <b>`elems`</b>: A tensor or (possibly nested) sequence of tensors, each of which
diff --git a/tensorflow/g3doc/get_started/os_setup.md b/tensorflow/g3doc/get_started/os_setup.md
index cbf64a32de..a280b25c88 100644
--- a/tensorflow/g3doc/get_started/os_setup.md
+++ b/tensorflow/g3doc/get_started/os_setup.md
@@ -1089,3 +1089,25 @@ This can be resolved by creating a symbolic link:
```bash
ln -sf /usr/local/cuda/lib/libcuda.dylib /usr/local/cuda/lib/libcuda.1.dylib
```
+
+### Mac OS X: RuntimeError: Broken toolchain: cannot link a simple C program
+
+On Mac OS X, when installing TensorFlow you might see many warnings and errors, ending with a `Broken toolchain: cannot link a simple C program` message:
+
+```
+$ sudo pip install --upgrade $TF_BINARY_URL
+
+...<lots more warnings and errors>
+
+You have not agreed to the Xcode license agreements, please run 'xcodebuild -license' (for user-level acceptance) or 'sudo xcodebuild -license' (for system-wide acceptance) from within a Terminal window to review and agree to the Xcode license agreements.
+
+...<more stack trace output>
+
+ File "numpy/core/setup.py", line 653, in get_mathlib_info
+
+ raise RuntimeError("Broken toolchain: cannot link a simple C program")
+
+RuntimeError: Broken toolchain: cannot link a simple C program
+```
+
+This is typically because you have the Xcode build tools installed, but you still need to accept the license agreements. To resolve it, accept the license agreement by opening Xcode, or by running `xcodebuild -license` from the command line.
diff --git a/tensorflow/g3doc/resources/xla_prerelease.md b/tensorflow/g3doc/resources/xla_prerelease.md
index 932b5a2945..1a6cf1d5a4 100644
--- a/tensorflow/g3doc/resources/xla_prerelease.md
+++ b/tensorflow/g3doc/resources/xla_prerelease.md
@@ -1733,7 +1733,7 @@ degenerate dimensions to produce a 4x3x2 array result.
for floats. However, if the range of the data is limited, floating-point
addition is close enough to being associative for most practical uses. It
is possible to conceive of some completely non-associative reductions, however,
- and these will produce wrong results in TLA reductions.
+ and these will produce wrong results in XLA reductions.
## C++ interface
diff --git a/tensorflow/go/genop/generate.sh b/tensorflow/go/genop/generate.sh
index 272c550fbd..e9d562e60e 100644
--- a/tensorflow/go/genop/generate.sh
+++ b/tensorflow/go/genop/generate.sh
@@ -16,7 +16,8 @@
set -e
-go get github.com/golang/protobuf/{proto,protoc-gen-go}
+go get github.com/golang/protobuf/proto
+go get github.com/golang/protobuf/protoc-gen-go
cd $(dirname $0)
TF_DIR=${GOPATH}/src/github.com/tensorflow/tensorflow
@@ -32,7 +33,7 @@ then
echo "bazel build -c opt @protobuf//:protoc"
exit 1
fi
- PROTOC=PATH_PROTOC
+ PROTOC=$PATH_PROTOC
fi
# Ensure that protoc-gen-go is available in $PATH
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index ea56e52dea..4df088dbaf 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -10,6 +10,7 @@ licenses(["notice"]) # Apache 2.0
exports_files(["LICENSE"])
+load("//tensorflow:tensorflow.bzl", "if_not_windows")
load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
load("//tensorflow:tensorflow.bzl", "tf_py_test")
@@ -39,9 +40,11 @@ py_library(
":platform_test",
":summary",
":training",
- "//tensorflow/contrib:contrib_py",
+ ":ops",
"//tensorflow/python/debug:debug_py",
- ],
+ ] + if_not_windows([
+ "//tensorflow/contrib:contrib_py",
+ ]),
)
py_library(
@@ -1434,7 +1437,7 @@ cuda_py_test(
cuda_py_test(
name = "gradient_checker_test",
- size = "small",
+ size = "medium",
srcs = ["ops/gradient_checker_test.py"],
additional_deps = [
":array_ops",
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index a23a9b7eba..c139f87c32 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -1088,8 +1088,7 @@ class Session(BaseSession):
sess.run(...)
```
- The [`ConfigProto`]
- (https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
+ The [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
protocol buffer exposes various configuration options for a
session. For example, to create a session that uses soft constraints
for device placement, and log the resulting placement decisions,
@@ -1127,8 +1126,8 @@ class Session(BaseSession):
Args:
target: (Optional.) The execution engine to connect to.
- Defaults to using an in-process engine. See [Distributed Tensorflow]
- (https://www.tensorflow.org/how_tos/distributed/index.html)
+ Defaults to using an in-process engine. See
+ [Distributed Tensorflow](https://www.tensorflow.org/how_tos/distributed/index.html)
for more examples.
graph: (Optional.) The `Graph` to be launched (described above).
config: (Optional.) A [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index b0dbdf12c4..84abe8030a 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -69,6 +69,8 @@ class DType(object):
@@as_numpy_dtype
@@as_datatype_enum
+
+ @@limits
"""
def __init__(self, type_enum):
@@ -222,6 +224,22 @@ class DType(object):
except:
raise TypeError("Cannot find maximum value of %s." % self)
+  @property
+  def limits(self, clip_negative=True):
+    """Return intensity limits, i.e. (min, max) tuple, of the dtype.
+
+    Args:
+      clip_negative: bool, optional. If True, clip the negative range
+        (i.e. return 0 for min intensity) even if the image dtype
+        allows negative values.
+
+    Returns:
+      min, max: tuple of lower and upper intensity limits.
+    """
+    min, max = dtype_range[self.as_numpy_dtype]
+    if clip_negative:
+      min = 0
+    return min, max
+
def is_compatible_with(self, other):
"""Returns True if the `other` DType will be converted to this DType.
@@ -277,6 +295,19 @@ class DType(object):
def size(self):
return np.dtype(self.as_numpy_dtype).itemsize
+# Define data type range of numpy dtype
+dtype_range = {np.bool_: (False, True),
+ np.bool8: (False, True),
+ np.uint8: (0, 255),
+ np.uint16: (0, 65535),
+ np.int8: (-128, 127),
+ np.int16: (-32768, 32767),
+ np.int64: (-2**63, 2**63 - 1),
+ np.uint64: (0, 2**64 - 1),
+ np.int32: (-2**31, 2**31 - 1),
+ np.uint32: (0, 2**32 - 1),
+ np.float32: (-1, 1),
+ np.float64: (-1, 1)}
# Define standard wrappers for the types_pb2.DataType enum.
resource = DType(types_pb2.DT_RESOURCE)
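With this change, `limits` is readable on any wrapped dtype. Note that because it is a `@property`, `clip_negative` always takes its default of `True`. A usage sketch based on the `dtype_range` table above:

```python
import tensorflow as tf

# dtype_range maps float32 to (-1, 1); clip_negative=True clips min to 0.
print(tf.float32.limits)  # (0, 1)
print(tf.uint8.limits)    # (0, 255)
```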
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index e3b1396570..e5084b0d65 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -2130,8 +2130,8 @@ class Graph(object):
def graph_def_versions(self):
"""The GraphDef version information of this graph.
- For details on the meaning of each version, see [`GraphDef`]
- (https://www.tensorflow.org/code/tensorflow/core/framework/graph.proto).
+ For details on the meaning of each version, see
+ [`GraphDef`](https://www.tensorflow.org/code/tensorflow/core/framework/graph.proto).
Returns:
A `VersionDef`.
diff --git a/tensorflow/python/framework/python_op_gen_main.cc b/tensorflow/python/framework/python_op_gen_main.cc
index ca99f5fdc4..1d7097d5c2 100644
--- a/tensorflow/python/framework/python_op_gen_main.cc
+++ b/tensorflow/python/framework/python_op_gen_main.cc
@@ -89,6 +89,7 @@ void PrintAllPythonOps(const std::vector<string>& hidden_ops,
int main(int argc, char* argv[]) {
tensorflow::port::InitMain(argv[0], &argc, &argv);
+
// Usage:
// gen_main [ @FILENAME | OpName[,OpName]* ] (0 | 1)
if (argc == 2) {
diff --git a/tensorflow/python/kernel_tests/decode_raw_op_test.py b/tensorflow/python/kernel_tests/decode_raw_op_test.py
index ccc7ef4e6a..f3cf0643fa 100644
--- a/tensorflow/python/kernel_tests/decode_raw_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_raw_op_test.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+import numpy as np
import tensorflow as tf
@@ -57,5 +58,19 @@ class DecodeRawOpTest(tf.test.TestCase):
"size of int16"):
decode.eval(feed_dict={in_bytes: ["123", "456"]})
+ def testToFloat16(self):
+ with self.test_session():
+ in_bytes = tf.placeholder(tf.string, shape=[None])
+ decode = tf.decode_raw(in_bytes, out_type=tf.float16)
+ self.assertEqual([None, None], decode.get_shape().as_list())
+
+ expected_result = np.matrix([[1, -2, -3, 4]], dtype=np.float16)
+ result = decode.eval(
+ feed_dict={
+ in_bytes: [expected_result.tobytes()]
+ })
+
+ self.assertAllEqual(expected_result, result)
+
if __name__ == "__main__":
tf.test.main()
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index 38b672f47c..d765989f49 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -417,9 +417,9 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
Args:
fn: The callable to be performed. It accepts two arguments. The first
- will have the same (possibly nested) structure as `elems`. The second
will have the same structure as `initializer` if one is provided,
- otherwise it will have the same structure as `elems`. Its output
+ otherwise it will have the same structure as `elems`. The second
+ will have the same (possibly nested) structure as `elems`. Its output
must have the same structure as `initializer` if one is provided,
otherwise it must have the same structure as `elems`.
elems: A tensor or (possibly nested) sequence of tensors, each of which
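The corrected order matters when `initializer` is given: `fn`'s first argument matches the accumulator, the second the current element. For example:

```python
import tensorflow as tf

# Running sum: acc has the structure of `initializer`, x of `elems`.
elems = tf.constant([1, 2, 3, 4, 5, 6])
cumsum = tf.scan(lambda acc, x: acc + x, elems,
                 initializer=tf.constant(0))
with tf.Session() as sess:
    print(sess.run(cumsum))  # [ 1  3  6 10 15 21]
```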
diff --git a/tensorflow/python/ops/image_ops.py b/tensorflow/python/ops/image_ops.py
index 39e4227c9b..451b3e5bf0 100644
--- a/tensorflow/python/ops/image_ops.py
+++ b/tensorflow/python/ops/image_ops.py
@@ -147,6 +147,8 @@ type and representation (RGB or HSV).
@@adjust_hue
@@random_hue
+@@adjust_gamma
+
@@adjust_saturation
@@random_saturation
@@ -163,6 +165,7 @@ from __future__ import division
from __future__ import print_function
from tensorflow.python.framework import common_shapes
+from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_util
@@ -737,12 +740,9 @@ def resize_images(images,
`method` can be one of:
- * <b>`ResizeMethod.BILINEAR`</b>: [Bilinear interpolation.]
- (https://en.wikipedia.org/wiki/Bilinear_interpolation)
- * <b>`ResizeMethod.NEAREST_NEIGHBOR`</b>: [Nearest neighbor interpolation.]
- (https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation)
- * <b>`ResizeMethod.BICUBIC`</b>: [Bicubic interpolation.]
- (https://en.wikipedia.org/wiki/Bicubic_interpolation)
+ * <b>`ResizeMethod.BILINEAR`</b>: [Bilinear interpolation.](https://en.wikipedia.org/wiki/Bilinear_interpolation)
+ * <b>`ResizeMethod.NEAREST_NEIGHBOR`</b>: [Nearest neighbor interpolation.](https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation)
+ * <b>`ResizeMethod.BICUBIC`</b>: [Bicubic interpolation.](https://en.wikipedia.org/wiki/Bicubic_interpolation)
* <b>`ResizeMethod.AREA`</b>: Area interpolation.
Args:
@@ -1005,6 +1005,46 @@ def adjust_contrast(images, contrast_factor):
return convert_image_dtype(adjusted, orig_dtype, saturate=True)
+def adjust_gamma(image, gamma=1, gain=1):
+ """Performs Gamma Correction on the input image.
+ Also known as Power Law Transform. This function transforms the
+ input image pixelwise according to the equation Out = In**gamma
+ after scaling each pixel to the range 0 to 1.
+
+  Args:
+    image: A Tensor.
+    gamma: A scalar. A non-negative real number.
+    gain: A scalar. The constant multiplier.
+
+  Returns:
+    A Tensor. The gamma-corrected output image.
+
+  Notes:
+    For gamma greater than 1, the histogram will shift towards the left
+    and the output image will be darker than the input image.
+    For gamma less than 1, the histogram will shift towards the right
+    and the output image will be brighter than the input image.
+
+  References:
+    [1] http://en.wikipedia.org/wiki/Gamma_correction
+ """
+
+ with ops.op_scope([image, gamma, gain], None, 'adjust_gamma') as name:
+ # Convert pixel value to DT_FLOAT for computing adjusted image
+ img = ops.convert_to_tensor(image, name='img', dtype=dtypes.float32)
+ # Keep image dtype for computing the scale of corresponding dtype
+ image = ops.convert_to_tensor(image, name='image')
+
+ if gamma < 0:
+ raise ValueError("Gamma should be a non-negative real number")
+ # scale = max(dtype) - min(dtype)
+    scale = constant_op.constant(
+        image.dtype.limits[1] - image.dtype.limits[0],
+        dtype=dtypes.float32)
+ # According to the definition of gamma correction
+ adjusted_img = (img / scale) ** gamma * scale * gain
+
+ return adjusted_img
+
+
ops.RegisterShape('AdjustContrast')(common_shapes.call_cpp_shape_fn)
ops.RegisterShape('AdjustContrastv2')(common_shapes.call_cpp_shape_fn)
ops.RegisterShape('DrawBoundingBoxes')(common_shapes.call_cpp_shape_fn)
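The transform above is `out = (in / scale) ** gamma * scale * gain`, with `scale` taken from the dtype's intensity limits. A NumPy restatement (illustration only):

```python
import numpy as np

def adjust_gamma_np(img, gamma=1.0, gain=1.0, limits=(0.0, 1.0)):
    # Scale to [0, 1], apply the power law, rescale.
    scale = float(limits[1] - limits[0])
    return (img.astype(np.float32) / scale) ** gamma * scale * gain

img = np.arange(0, 255, 4, dtype=np.uint8).reshape(8, 8)
darker = adjust_gamma_np(img, gamma=2.0, limits=(0, 255))  # gamma > 1 darkens
```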
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index f1738fd779..4d379fb388 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -164,6 +164,80 @@ class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
self.assertFalse(rgb_unknown.get_shape())
+class AdjustGamma(test_util.TensorFlowTestCase):
+
+ def test_adjust_gamma_one(self):
+ """Same image should be returned for gamma equal to one"""
+ with self.test_session():
+ x_data = np.random.uniform(0, 255, (8, 8))
+ x_np = np.array(x_data, dtype=np.float32)
+
+ x = constant_op.constant(x_np, shape=x_np.shape)
+ y = image_ops.adjust_gamma(x, gamma=1)
+
+ y_tf = y.eval()
+ y_np = x_np
+
+ self.assertAllClose(y_tf, y_np, 1e-6)
+
+
+ def test_adjust_gamma_zero(self):
+ """White image should be returned for gamma equal to zero"""
+ with self.test_session():
+ x_data = np.random.uniform(0, 255, (8, 8))
+ x_np = np.array(x_data, dtype=np.float32)
+
+ x = constant_op.constant(x_np, shape=x_np.shape)
+ y = image_ops.adjust_gamma(x, gamma=0)
+
+ y_tf = y.eval()
+
+ dtype = x.dtype.as_numpy_dtype
+ y_np = np.array([dtypes.dtype_range[dtype][1]] * x_np.size)
+ y_np = y_np.reshape((8,8))
+
+ self.assertAllClose(y_tf, y_np, 1e-6)
+
+
+ def test_adjust_gamma_less_one(self):
+ """Verifying the output with expected results for gamma
+ correction with gamma equal to half"""
+ with self.test_session():
+ x_np = np.arange(0, 255, 4, np.uint8).reshape(8,8)
+ y = image_ops.adjust_gamma(x_np, gamma=0.5)
+ y_tf = np.trunc(y.eval())
+
+ y_np = np.array([[ 0, 31, 45, 55, 63, 71, 78, 84],
+ [ 90, 95, 100, 105, 110, 115, 119, 123],
+ [127, 131, 135, 139, 142, 146, 149, 153],
+ [156, 159, 162, 165, 168, 171, 174, 177],
+ [180, 183, 186, 188, 191, 194, 196, 199],
+ [201, 204, 206, 209, 211, 214, 216, 218],
+ [221, 223, 225, 228, 230, 232, 234, 236],
+ [238, 241, 243, 245, 247, 249, 251, 253]], dtype=np.float32)
+
+ self.assertAllClose(y_tf, y_np, 1e-6)
+
+ def test_adjust_gamma_greater_one(self):
+ """Verifying the output with expected results for gamma
+ correction with gamma equal to two"""
+ with self.test_session():
+ x_np = np.arange(0, 255, 4, np.uint8).reshape(8,8)
+ y = image_ops.adjust_gamma(x_np, gamma=2)
+ y_tf = np.trunc(y.eval())
+
+ y_np = np.array([[ 0, 0, 0, 0, 1, 1, 2, 3],
+ [ 4, 5, 6, 7, 9, 10, 12, 14],
+ [ 16, 18, 20, 22, 25, 27, 30, 33],
+ [ 36, 39, 42, 45, 49, 52, 56, 60],
+ [ 64, 68, 72, 76, 81, 85, 90, 95],
+ [100, 105, 110, 116, 121, 127, 132, 138],
+ [144, 150, 156, 163, 169, 176, 182, 189],
+ [196, 203, 211, 218, 225, 233, 241, 249]], dtype=np.float32)
+
+ self.assertAllClose(y_tf, y_np, 1e-6)
+
+
class AdjustHueTest(test_util.TensorFlowTestCase):
def testAdjustNegativeHue(self):
diff --git a/tensorflow/python/ops/nn.py b/tensorflow/python/ops/nn.py
index b3c0cfd24c..ceb55d6daf 100644
--- a/tensorflow/python/ops/nn.py
+++ b/tensorflow/python/ops/nn.py
@@ -1029,6 +1029,8 @@ def fused_batch_norm(x, scale, offset, # pylint: disable=invalid-name
mean = constant_op.constant([])
if variance is None:
variance = constant_op.constant([])
+  # Add 1e-12 to epsilon when epsilon <= 1e-5 to prevent a cuDNN exception.
+ epsilon = epsilon if epsilon > 1e-5 else epsilon + 1e-12
y, batch_mean, batch_var, _, _ = gen_nn_ops.fused_batch_norm(
x,
scale,
@@ -1271,10 +1273,8 @@ def nce_loss(weights,
"""Computes and returns the noise-contrastive estimation training loss.
See [Noise-contrastive estimation: A new estimation principle for
- unnormalized statistical models]
- (http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
- Also see our [Candidate Sampling Algorithms Reference]
- (../../extras/candidate_sampling.pdf)
+ unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
+ Also see our [Candidate Sampling Algorithms Reference](../../extras/candidate_sampling.pdf)
Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
so your labels must be sorted in order of decreasing frequency to achieve
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index ec5024e445..6a35cfb63e 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -210,36 +210,34 @@ def _BiasAddGradGrad(op, received_grad):
Args:
op: BiasAddGrad op for which we are calculating gradients.
received_grad: The gradients passed to the BiasAddGrad op.
-
+
Returns:
A single gradient Tensor for the input to BiasAddGrad (which
is the gradient of the bias term in BiasAdd)
"""
-
+
try:
data_format = op.get_attr("data_format")
except ValueError:
data_format = None
-
+
shape = array_ops.shape(op.inputs[0])
rank = array_ops.rank(op.inputs[0])
bias_shape = array_ops.shape(received_grad)
-
+
if data_format == b"NCHW":
expanded_shape = array_ops.concat(
0,
[array_ops.ones_like(shape[:-3]), bias_shape, array_ops.ones_like(shape[-2:])]
)
-
tile_mults = array_ops.concat(0, [shape[:-3], [1], shape[-2:]])
-
else:
expanded_shape = array_ops.concat(0, [array_ops.ones_like(shape[:-1]), bias_shape])
tile_mults = array_ops.concat(0, [shape[:-1], [1]])
-
+
expanded_grad = array_ops.reshape(received_grad, expanded_shape)
return array_ops.tile(expanded_grad, tile_mults)
-
+
@ops.RegisterGradient("BiasAddV1")
def _BiasAddGradV1(unused_bias_op, received_grad):
@@ -498,7 +496,8 @@ def _FusedBatchNormGrad(op, *grad):
op.outputs[3],
op.outputs[4],
epsilon=op.get_attr("epsilon"),
- data_format=op.get_attr("data_format"))
+ data_format=op.get_attr("data_format"),
+ is_training=op.get_attr("is_training"))
@ops.RegisterGradient("L2Loss")
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index a49f57c8f7..421e767ef3 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -842,9 +842,9 @@ def atrous_conv2d(value, filters, rate, padding, name=None):
that effectively use atrous convolution in different ways are, among others,
[OverFeat: Integrated Recognition, Localization and Detection using
Convolutional Networks](http://arxiv.org/abs/1312.6229) and [Fast Image
- Scanning with Deep Max-Pooling Convolutional Neural Networks]
- (http://arxiv.org/abs/1302.1700). Atrous convolution is also closely related
- to the so-called noble identities in multi-rate signal processing.
+ Scanning with Deep Max-Pooling Convolutional Neural Networks](http://arxiv.org/abs/1302.1700).
+ Atrous convolution is also closely related to the so-called noble identities
+ in multi-rate signal processing.
There are many different ways to implement atrous convolution (see the refs
above). The implementation here reduces
diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py
index 71b946018d..992fe0c331 100644
--- a/tensorflow/python/ops/parsing_ops.py
+++ b/tensorflow/python/ops/parsing_ops.py
@@ -153,8 +153,7 @@ def parse_example(serialized, features, name=None, example_names=None):
# pylint: disable=line-too-long
"""Parses `Example` protos into a `dict` of tensors.
- Parses a number of serialized [`Example`]
- (https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
+ Parses a number of serialized [`Example`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
protos given in `serialized`.
`example_names` may contain descriptive names for the corresponding serialized
@@ -549,8 +548,7 @@ def parse_single_sequence_example(
# pylint: disable=line-too-long
"""Parses a single `SequenceExample` proto.
- Parses a single serialized [`SequenceExample`]
- (https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
+ Parses a single serialized [`SequenceExample`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
proto given in `serialized`.
This op parses a serialized sequence example into a tuple of dictionaries
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index 27e3225fea..cc5464d572 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -609,9 +609,6 @@ def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
most TensorFlow data is batch-major, so by default this function
accepts input and emits output in batch-major form.
dtype: (optional) The data type for the initial state. Required if
- initial_state is not provided.
- sequence_length: An int32/int64 vector, size `[batch_size]`,
- containing the actual lengths for each of the sequences.
either of the initial states are not provided.
scope: VariableScope for the created subgraph; defaults to "BiRNN"
diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py
index 905d6de61b..0a6a4f9612 100644
--- a/tensorflow/python/ops/special_math_ops.py
+++ b/tensorflow/python/ops/special_math_ops.py
@@ -94,48 +94,100 @@ def einsum(axes, *inputs):
"""
A generalized contraction between tensors of arbitrary dimension.
- Like numpy.einsum.
+ Like `numpy.einsum`, but does not support:
+ * Ellipses (subscripts like `ij...,jk...->ik...`)
+ * Subscripts where an axis appears more than once for a single input (e.g. `ijj,jk->ik`).
+
+ Args:
+ axes: a `str` describing the contraction, in the same format as `numpy.einsum`.
+ inputs: the inputs to contract (each one a `Tensor`), whose shapes should be consistent with `axes`.
+
+ Returns:
+ The contracted `Tensor`, with shape determined by `axes`.
+
+ Raises:
+ ValueError: If the format of `axes` is incorrect,
+ or the number of inputs implied by `axes` does not match `len(inputs)`,
+ or an axis appears in the output subscripts but not in any of the inputs,
+ or the number of dimensions of an input differs from the number of indices in its subscript,
+ or the input shapes are inconsistent along a particular axis.
"""
+ if '...' in axes:
+ raise ValueError("Subscripts with ellipses are not yet supported.")
- match = re.match('([a-z,]+)->([a-z]+)', axes)
- assert match, \
- "Indices have incorrect format: %s" % axes
+ match = re.match('([a-z,]+)(->[a-z]*)?', axes)
+ if not match:
+ raise ValueError(
+ "Indices have incorrect format: %s" % axes
+ )
inputs = list(inputs)
idx_in = match.group(1).split(',')
- idx_out = match.group(2)
idx_all = set(''.join(idx_in))
+ indices = ''.join(sorted(idx_all))
+
+ if match.group(2):
+ idx_out = match.group(2)[2:]
+ else:
+    # Infer the output subscripts when not given: keep the axes that appear
+    # exactly once across inputs, in alphabetical order.
+ counts = {ax: 0 for ax in indices}
+ for axes_ in idx_in:
+ for ax in axes_:
+ counts[ax] += 1
- assert len(idx_in) == len(inputs), \
- "Expected %d inputs but only got %d" % (len(idx_in), len(inputs))
+ idx_out = ''.join(sorted(
+ ax for ax in indices
+ if counts[ax] == 1
+ ))
- # transpose inputs so axes are in alphabetical order
+ if len(idx_in) != len(inputs):
+ raise ValueError(
+ "Expected %d inputs but got %d" % (len(idx_in), len(inputs))
+ )
+
+ missing_idx = set(idx_out).difference(idx_all)
+ if missing_idx:
+ raise ValueError(
+        "Unknown output axes: %s" % missing_idx
+ )
+
+ axis_order = {}
+ for ax in indices:
+ if ax not in idx_out:
+ axis_order[ax] = len(axis_order)
+ for ax in idx_out:
+ axis_order[ax] = len(axis_order)
+
+ # transpose inputs so axes are in order
for i, (input_, axes_) in enumerate(zip(inputs, idx_in)):
- assert input_.get_shape().ndims == len(axes_), \
- "Input %d with axes %s has incorrect" \
- " number of dimensions (expected %d, got %d)" % (
- i, axes_, len(axes_), input_.get_shape().ndims
+ if input_.get_shape().ndims != len(axes_):
+ raise ValueError(
+          "Input %d with axes %s has incorrect"
+ " number of dimensions (expected %d, got %d)" % (
+ i, axes_, len(axes_), input_.get_shape().ndims
+ )
)
- sorted_idx = sorted(axes_)
+ sorted_idx = sorted(axes_, key=axis_order.get)
+
+ if len(set(axes_)) != len(axes_):
+ raise ValueError(
+ "Subscript not supported: an axis appears more than once: %s" % axes_
+ )
if list(axes_) != sorted_idx:
permuted = [axes_.find(ax) for ax in sorted_idx]
inputs[i] = array_ops.transpose(input_, permuted)
idx_in[i] = sorted_idx
- missing_idx = set(idx_out).difference(idx_all)
- assert not missing_idx, \
- "Unknown ouput axes: %s" % missing_idx
-
reduction_idx = []
shapes = [[dim if dim else -1
for dim in tensor.get_shape().as_list()]
for tensor in inputs]
# validate shapes for broadcasting
- for j, ax in enumerate(sorted(idx_all)):
+ for j, ax in enumerate(sorted(idx_all, key=axis_order.get)):
dims = []
for i, idx in enumerate(idx_in):
if ax not in idx:
@@ -145,8 +197,10 @@ def einsum(axes, *inputs):
if isinstance(dim, int) and dim > 1:
dims.append(dim)
- assert len(set(dims)) <= 1, \
- "Dimension mismatch on axis: %s" % ax
+ if len(set(dims)) > 1:
+ raise ValueError(
+ "Dimension mismatch on axis: %s" % ax
+ )
if ax not in idx_out:
reduction_idx.append(j)
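To illustrate the behavior the rewritten einsum describes, a short usage sketch (shapes are hypothetical; the error type matches the new ValueError-based validation above):

import numpy as np
import tensorflow as tf

a = tf.constant(np.random.rand(2, 3), dtype=tf.float32)
b = tf.constant(np.random.rand(3, 4), dtype=tf.float32)

# Explicit output subscripts: ordinary matrix multiplication.
c = tf.einsum('ij,jk->ik', a, b)

# Output subscripts omitted: axes appearing exactly once ('i' and 'k') are
# kept, in alphabetical order, so this is equivalent to 'ij,jk->ik'.
d = tf.einsum('ij,jk', a, b)

# Malformed or unsupported subscripts now raise ValueError rather than
# tripping an assert:
try:
  tf.einsum('ij...,jk...->ik...', a, b)  # ellipses are not yet supported
except ValueError as e:
  print(e)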
diff --git a/tensorflow/python/ops/special_math_ops_test.py b/tensorflow/python/ops/special_math_ops_test.py
index c190c589b7..cd7903bc35 100644
--- a/tensorflow/python/ops/special_math_ops_test.py
+++ b/tensorflow/python/ops/special_math_ops_test.py
@@ -113,39 +113,77 @@ class LBetaTestGpu(LBetaTest):
class EinsumTest(tf.test.TestCase):
- # standard cases
simple_cases = [
'ij,jk->ik',
'ijk,jklm->il',
'ij,jk,kl->il',
'ijk->i',
- ]
-
- # where axes are not in order
- misordered_cases = [
+ 'ijk->kji',
'ji,kj->ik',
+
'ikl,kji->kl',
'klj,lki->ij',
- ]
-
- # more than two arguments
- multiarg_cases = [
+ 'ijk,ilj->kli',
+ 'kij,mkb->ijmb',
'ijk,ijl,ikl->i',
'i,ijk,j->k',
'ij,ij,jk,kl->il',
+ 'ij,kj,il,jm->ml',
+
+ 'a,ab,abc->abc',
+ 'a,b,ab->ab',
+ 'ab,ab,c->',
+ 'ab,ab,c->c',
+ 'ab,ab,cd,cd->',
+ 'ab,ab,cd,cd->ac',
+ 'ab,ab,cd,cd->cd',
+ 'ab,ab,cd,cd,ef,ef->',
+
+ 'ab,cd,ef->abcdef',
+ 'ab,cd,ef->acdf',
+ 'ab,cd,de->abcde',
+ 'ab,cd,de->be',
+ 'ab,bcd,cd->abcd',
+ 'ab,bcd,cd->abd',
+
+ 'eb,cb,fb->cef',
+ 'abcd,ad',
+ 'bd,db,eac->ace',
+ 'ba,ac,da->bcd',
+
+ 'ab,ab',
+ 'ab,ba',
+ 'abc,abc',
+ 'abc,bac',
+ 'abc,cba',
+
+ 'dba,ead,cad->bce',
+ 'aef,fbc,dca->bde',
+ ]
+
+ long_cases = [
+ 'bca,cdb,dbf,afc->',
+ 'efc,dbc,acf,fd->abe',
+ 'ea,fb,gc,hd,abcd->efgh',
+ 'ea,fb,abcd,gc,hd->efgh',
+ 'abhe,hidj,jgba,hiab,gab',
]
invalid_cases = [
# bad formats
+ '',
'ijk ijk',
- 'ij,jk,kl'
- 'ij->',
+ 'ij.jk->ik',
+ 'ij...,jk...->ik...',
# axis in output that does not exist
'ij,jk->im',
# incorrect number of dimensions
'ij,jkl->kl',
+
+ # this is allowed in numpy but not implemented here yet
+ 'iij,jk'
]
dim_mismatch_cases = [
@@ -158,28 +196,18 @@ class EinsumTest(tf.test.TestCase):
for case in self.simple_cases:
self.run_test(case)
- def test_misordered(self):
- for case in self.misordered_cases:
- self.run_test(case)
-
- def test_multiarg(self):
- for case in self.multiarg_cases:
+ def test_long(self):
+ for case in self.long_cases:
self.run_test(case)
def test_invalid(self):
for axes in self.invalid_cases:
- result = None
inputs = [
tf.placeholder(tf.float32, shape=(3,4)),
tf.placeholder(tf.float32, shape=(3,4)),
]
-
- try:
- result = tf.einsum(axes, *inputs)
- except AssertionError as e:
- print(e)
- assert result is None, \
- "An exception should have been thrown."
+ with self.assertRaises(ValueError):
+ _ = tf.einsum(axes, *inputs)
def test_dim_mismatch(self):
for axes, input_shapes in self.dim_mismatch_cases:
@@ -187,12 +215,8 @@ class EinsumTest(tf.test.TestCase):
tf.placeholder(tf.float32, shape=shape)
for shape in input_shapes
]
- result = None
- try:
- result = tf.einsum(axes, *inputs)
- except AssertionError:
- pass
- assert result is None, "An exception should have been thrown."
+ with self.assertRaises(ValueError):
+ _ = tf.einsum(axes, *inputs)
def run_test(self, axes):
all_axes = {ax: np.random.randint(4, 12)
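run_test above is truncated by the diff context; the following is a rough sketch of the numpy cross-check it presumably performs (the helper below is illustrative, not a quote of the real run_test):

import numpy as np
import tensorflow as tf

def check_against_numpy(axes):
  # Assign a random size in [4, 12) to each axis label, build matching
  # random inputs, and compare tf.einsum against np.einsum.
  input_subs = axes.split('->')[0].split(',')
  all_axes = {ax: np.random.randint(4, 12)
              for ax in set(''.join(input_subs))}
  inputs = [np.random.rand(*[all_axes[ax] for ax in sub])
            for sub in input_subs]
  expected = np.einsum(axes, *inputs)
  with tf.Session() as sess:
    actual = sess.run(tf.einsum(axes, *[tf.constant(x) for x in inputs]))
  np.testing.assert_allclose(expected, actual, rtol=1e-5)

check_against_numpy('ij,jk->ik')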
diff --git a/tensorflow/python/training/rmsprop.py b/tensorflow/python/training/rmsprop.py
index 2a2d185a07..3a9685135b 100644
--- a/tensorflow/python/training/rmsprop.py
+++ b/tensorflow/python/training/rmsprop.py
@@ -50,8 +50,7 @@ from tensorflow.python.training import training_ops
class RMSPropOptimizer(optimizer.Optimizer):
"""Optimizer that implements the RMSProp algorithm.
- See the [paper]
- (http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
+ See the [paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
@@__init__
"""
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index e2611cd3d0..965f8b46f2 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -33,8 +33,6 @@ limitations under the License.
#include "tensorflow/stream_executor/cuda/cuda_blas.h"
-#include <dlfcn.h>
-
#include <complex>
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
@@ -44,6 +42,7 @@ limitations under the License.
#include "tensorflow/stream_executor/cuda/cuda_stream.h"
#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/dso_loader.h"
+#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/status_macros.h"
@@ -71,14 +70,20 @@ namespace dynload {
static auto status = internal::CachedDsoLoader::GetCublasDsoHandle(); \
return status.ValueOrDie(); \
} \
- static FuncPointerT DynLoad() { \
- static void *f = dlsym(GetDsoHandle(), kName); \
- CHECK(f != nullptr) << "could not find " << kName \
- << " in cuBLAS DSO; dlerror: " << dlerror(); \
+ static FuncPointerT LoadOrDie() { \
+ void *f; \
+ port::Status s = port::Env::Default()->GetSymbolFromLibrary( \
+ GetDsoHandle(), kName, &f); \
+ CHECK(s.ok()) << "could not find " << kName \
+ << " in cuBLAS DSO; dlerror: " << s.error_message(); \
return reinterpret_cast<FuncPointerT>(f); \
} \
+ static FuncPointerT DynLoad() { \
+ static FuncPointerT f = LoadOrDie(); \
+ return f; \
+ } \
template <typename... Args> \
- cublasStatus_t operator()(CUDAExecutor * parent, Args... args) { \
+ cublasStatus_t operator()(CUDAExecutor *parent, Args... args) { \
cuda::ScopedActivateExecutorContext sac{parent}; \
return DynLoad()(args...); \
} \
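The same refactoring recurs below for cuDNN, libcuda, cuFFT, and cuRAND: the raw dlsym call is replaced by the portable port::Env::GetSymbolFromLibrary, and (except in cuda_fft.cc) the one-time lookup is memoized behind a LoadOrDie/DynLoad pair. A rough Python/ctypes analogue of that caching pattern, with hypothetical library and symbol names:

import ctypes
import functools

@functools.lru_cache(maxsize=None)
def _get_dso_handle(lib_path):
  # Analogue of GetDsoHandle(): open the DSO once and cache the handle.
  return ctypes.CDLL(lib_path)

@functools.lru_cache(maxsize=None)
def _load_or_die(lib_path, name):
  # Analogue of LoadOrDie(): resolve the symbol once, failing loudly.
  try:
    return getattr(_get_dso_handle(lib_path), name)
  except AttributeError:
    raise RuntimeError('could not find %s in %s' % (name, lib_path))

def dyn_load(name, lib_path='libcublas.so'):  # hypothetical DSO name
  # Analogue of DynLoad(): every call after the first hits the cache.
  return _load_or_die(lib_path, name)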
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 6c3e4e90e8..1c13379c8c 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -15,7 +15,6 @@ limitations under the License.
#include "tensorflow/stream_executor/cuda/cuda_dnn.h"
-#include <dlfcn.h>
#include <functional>
#include <memory>
@@ -137,36 +136,47 @@ void* GetDsoHandle() {
return result.ValueOrDie();
}
-// Calls cudnnGetVersion in the loaded DSO.
-size_t cudnnGetVersion() {
- static void* f = dlsym(GetDsoHandle(), "cudnnGetVersion");
+static void* DynLoadGetVersionOrDie() {
+ void* f;
+ port::Status s = port::Env::Default()->GetSymbolFromLibrary(
+ GetDsoHandle(), "cudnnGetVersion", &f);
if (f == nullptr) {
LOG(FATAL) << "could not find cudnnGetVersion in cudnn DSO; dlerror: "
- << dlerror();
+ << s.error_message();
}
+ return f;
+}
+
+// Calls cudnnGetVersion in the loaded DSO.
+size_t cudnnGetVersion() {
+ static void* f = DynLoadGetVersionOrDie();
auto callable = reinterpret_cast<size_t (*)(void)>(f);
return callable();
}
-#define PERFTOOLS_GPUTOOLS_CUDNN_WRAP(__name) \
- struct DynLoadShim__##__name { \
- static const char* kName; \
- typedef std::add_pointer<decltype(::__name)>::type FuncPointerT; \
- static FuncPointerT DynLoad() { \
- static void* f = dlsym(GetDsoHandle(), kName); \
- if (f == nullptr) { \
- LOG(FATAL) << "could not find " << kName \
- << " in cudnn DSO; dlerror: " << dlerror(); \
- } \
- return reinterpret_cast<FuncPointerT>(f); \
- } \
- template <typename... Args> \
- cudnnStatus_t operator()(CUDAExecutor* parent, Args... args) { \
- cuda::ScopedActivateExecutorContext sac{parent}; \
- cudnnStatus_t retval = DynLoad()(args...); \
- return retval; \
- } \
- } __name; \
+#define PERFTOOLS_GPUTOOLS_CUDNN_WRAP(__name) \
+ struct DynLoadShim__##__name { \
+ static const char* kName; \
+ typedef std::add_pointer<decltype(::__name)>::type FuncPointerT; \
+ static FuncPointerT LoadOrDie() { \
+ void* f; \
+ port::Status s = port::Env::Default()->GetSymbolFromLibrary( \
+ GetDsoHandle(), kName, &f); \
+ CHECK(s.ok()) << "could not find " << kName \
+ << " in cudnn DSO; dlerror: " << s.error_message(); \
+ return reinterpret_cast<FuncPointerT>(f); \
+ } \
+ static FuncPointerT DynLoad() { \
+ static FuncPointerT f = LoadOrDie(); \
+ return f; \
+ } \
+ template <typename... Args> \
+ cudnnStatus_t operator()(CUDAExecutor* parent, Args... args) { \
+ cuda::ScopedActivateExecutorContext sac{parent}; \
+ cudnnStatus_t retval = DynLoad()(args...); \
+ return retval; \
+ } \
+ } __name; \
const char* DynLoadShim__##__name::kName = #__name;
// clang-format off
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index b3b65cdefa..095c82a104 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -15,7 +15,6 @@ limitations under the License.
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
-#include <dlfcn.h>
#include <map>
#include <stdint.h>
#include <stdlib.h>
@@ -61,12 +60,18 @@ namespace dynload {
static auto status = internal::CachedDsoLoader::GetLibcudaDsoHandle(); \
return status.ValueOrDie(); \
} \
- static FuncPointerT DynLoad() { \
- static void *f = dlsym(GetDsoHandle(), kName); \
- CHECK(f != nullptr) << "could not find " << kName \
- << "in libcuda DSO; dlerror: " << dlerror(); \
+ static FuncPointerT LoadOrDie() { \
+ void *f; \
+ port::Status s = port::Env::Default()->GetSymbolFromLibrary( \
+ GetDsoHandle(), kName, &f); \
+ CHECK(s.ok()) << "could not find " << kName \
+ << " in libcuda DSO; dlerror: " << s.error_message(); \
return reinterpret_cast<FuncPointerT>(f); \
} \
+ static FuncPointerT DynLoad() { \
+ static FuncPointerT f = LoadOrDie(); \
+ return f; \
+ } \
template <typename... Args> \
CUresult operator()(Args... args) { \
return DynLoad()(args...); \
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.cc b/tensorflow/stream_executor/cuda/cuda_fft.cc
index 59ecf9f168..a7093dfeb0 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.cc
+++ b/tensorflow/stream_executor/cuda/cuda_fft.cc
@@ -15,8 +15,6 @@ limitations under the License.
#include "tensorflow/stream_executor/cuda/cuda_fft.h"
-#include <dlfcn.h>
-
#include <complex>
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
@@ -26,6 +24,7 @@ limitations under the License.
#include "tensorflow/stream_executor/cuda/cuda_stream.h"
#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/dso_loader.h"
+#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/platform/logging.h"
@@ -55,13 +54,15 @@ namespace dynload {
return status.ValueOrDie(); \
} \
static FuncPointerT DynLoad() { \
- static void *f = dlsym(GetDsoHandle(), kName); \
- CHECK(f != nullptr) << "could not find " << kName \
- << " in cuFFT DSO; dlerror: " << dlerror(); \
+ static void *f; \
+ port::Status s = port::Env::Default()->GetSymbolFromLibrary( \
+ GetDsoHandle(), kName, &f); \
+ CHECK(s.ok()) << "could not find " << kName \
+ << " in cuFFT DSO; dlerror: " << s.error_message(); \
return reinterpret_cast<FuncPointerT>(f); \
} \
template <typename... Args> \
- cufftResult operator()(CUDAExecutor * parent, Args... args) { \
+ cufftResult operator()(CUDAExecutor *parent, Args... args) { \
cuda::ScopedActivateExecutorContext sac{parent}; \
return DynLoad()(args...); \
} \
diff --git a/tensorflow/stream_executor/cuda/cuda_rng.cc b/tensorflow/stream_executor/cuda/cuda_rng.cc
index 367eba4d51..a0ee677cea 100644
--- a/tensorflow/stream_executor/cuda/cuda_rng.cc
+++ b/tensorflow/stream_executor/cuda/cuda_rng.cc
@@ -24,6 +24,7 @@ limitations under the License.
#include "tensorflow/stream_executor/cuda/cuda_stream.h"
#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/dso_loader.h"
+#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/platform/logging.h"
@@ -72,14 +73,20 @@ namespace dynload {
static auto status = internal::CachedDsoLoader::GetCurandDsoHandle(); \
return status.ValueOrDie(); \
} \
- static FuncPointerT DynLoad() { \
- static void *f = dlsym(GetDsoHandle(), kName); \
- CHECK(f != nullptr) << "could not find " << kName \
- << " in curand DSO; dlerror: " << dlerror(); \
+ static FuncPointerT LoadOrDie() { \
+ void *f; \
+ port::Status s = port::Env::Default()->GetSymbolFromLibrary( \
+ GetDsoHandle(), kName, &f); \
+ CHECK(s.ok()) << "could not find " << kName \
+ << " in curand DSO; dlerror: " << s.error_message(); \
return reinterpret_cast<FuncPointerT>(f); \
} \
+ static FuncPointerT DynLoad() { \
+ static FuncPointerT f = LoadOrDie(); \
+ return f; \
+ } \
template <typename... Args> \
- curandStatus_t operator()(CUDAExecutor * parent, Args... args) { \
+ curandStatus_t operator()(CUDAExecutor *parent, Args... args) { \
cuda::ScopedActivateExecutorContext sac{parent}; \
return DynLoad()(args...); \
} \
diff --git a/tensorflow/stream_executor/dso_loader.cc b/tensorflow/stream_executor/dso_loader.cc
index 4a96b048c4..c9b305a32a 100644
--- a/tensorflow/stream_executor/dso_loader.cc
+++ b/tensorflow/stream_executor/dso_loader.cc
@@ -29,9 +29,9 @@ limitations under the License.
#include <vector>
#include "tensorflow/core/platform/load_library.h"
+#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/str_util.h"
-#include "tensorflow/stream_executor/lib/str_util.h"
#include "tensorflow/stream_executor/lib/strcat.h"
#include "tensorflow/stream_executor/lib/stringprintf.h"
#include "tensorflow/stream_executor/platform/logging.h"
@@ -97,19 +97,23 @@ string GetCudnnVersion() { return TF_CUDNN_VERSION; }
/* static */ port::Status DsoLoader::GetDsoHandle(port::StringPiece path,
void** dso_handle,
LoadKind load_kind) {
+ if (load_kind != LoadKind::kLocal) {
+ return port::Status(port::error::INVALID_ARGUMENT,
+ "Only LoadKind::kLocal is currently supported");
+ }
int dynload_flags =
RTLD_LAZY | (load_kind == LoadKind::kLocal ? RTLD_LOCAL : RTLD_GLOBAL);
string path_string = path.ToString();
- *dso_handle = dlopen(path_string.c_str(), dynload_flags);
- if (*dso_handle == nullptr) {
+ port::Status s =
+ port::Env::Default()->LoadLibrary(path_string.c_str(), dso_handle);
+ if (!s.ok()) {
LOG(INFO) << "Couldn't open CUDA library " << path
<< ". LD_LIBRARY_PATH: " << getenv("LD_LIBRARY_PATH");
- return port::Status(
- port::error::FAILED_PRECONDITION,
- port::StrCat("could not dlopen DSO: ", path, "; dlerror: ", dlerror()));
+ return port::Status(port::error::FAILED_PRECONDITION,
+ port::StrCat("could not dlopen DSO: ", path,
+ "; dlerror: ", s.error_message()));
}
- LOG(INFO) << "successfully opened CUDA library " << path
- << (load_kind == LoadKind::kLocal ? " locally" : " globally");
+ LOG(INFO) << "successfully opened CUDA library " << path << " locally";
return port::Status::OK();
}
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 44d75e9ce3..6a44ac658b 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -141,6 +141,12 @@ def if_not_mobile(a):
"//conditions:default": a,
})
+def if_not_windows(a):
+ return select({
+ "//tensorflow:windows": [],
+ "//conditions:default": a,
+ })
+
def tf_copts():
return (["-DEIGEN_AVOID_STL_ARRAY",
"-Iexternal/gemmlowp",
@@ -156,6 +162,10 @@ def tf_copts():
"-O2",
],
"//tensorflow:darwin": [],
+ "//tensorflow:windows": [
+ "/DLANG_CXX11",
+ "/D__VERSION__=\\\"MSVC\\\"",
+ ],
"//tensorflow:ios": ["-std=c++11"],
"//conditions:default": ["-pthread"]}))
@@ -565,12 +575,15 @@ def _py_wrap_cc_impl(ctx):
args += ["-outdir", py_out.dirname]
args += [src.path]
outputs = [cc_out, py_out]
- ctx.action(executable=ctx.executable.swig_binary,
- arguments=args,
+ # TODO(pcloudy): Move args to arguments after
+ # https://github.com/bazelbuild/bazel/issues/1926 is fixed
+ ctx.action(command=" ".join(["tensorflow/tools/swig/swig.sh"] + args),
+ arguments=[],
mnemonic="PythonSwig",
inputs=sorted(set([src]) + cc_includes + ctx.files.swig_includes +
ctx.attr.swig_deps.files),
outputs=outputs,
+ use_default_shell_env=True,
progress_message="SWIGing {input}".format(input=src.path))
return struct(files=set(outputs))
@@ -593,12 +606,6 @@ _py_wrap_cc = rule(
)),
"module_name": attr.string(mandatory = True),
"py_module_name": attr.string(mandatory = True),
- "swig_binary": attr.label(
- default = Label("//tensorflow:swig"),
- cfg = "host",
- executable = True,
- allow_files = True,
- ),
},
outputs = {
"cc_out": "%{module_name}.cc",
@@ -743,6 +750,7 @@ def tf_py_wrap_cc(name, srcs, swig_includes=[], deps=[], copts=[], **kwargs):
# Convert a rule name such as foo/bar/baz to foo/bar/_baz.so
# and use that as the name for the rule producing the .so file.
cc_library_name = "/".join(name.split("/")[:-1] + ["_" + module_name + ".so"])
+ cc_library_pyd_name = "/".join(name.split("/")[:-1] + ["_" + module_name + ".pyd"])
extra_deps = []
_py_wrap_cc(name=name + "_py_wrap",
srcs=srcs,
@@ -755,6 +763,8 @@ def tf_py_wrap_cc(name, srcs, swig_includes=[], deps=[], copts=[], **kwargs):
"-Wl,-exported_symbols_list",
"//tensorflow:tf_exported_symbols.lds"
],
+ "//tensorflow:windows": [
+ ],
"//conditions:default": [
"-Wl,--version-script",
"//tensorflow:tf_version_script.lds"
@@ -763,6 +773,8 @@ def tf_py_wrap_cc(name, srcs, swig_includes=[], deps=[], copts=[], **kwargs):
"@local_config_cuda//cuda:darwin": [
"//tensorflow:tf_exported_symbols.lds"
],
+ "//tensorflow:windows": [
+ ],
"//conditions:default": [
"//tensorflow:tf_version_script.lds"
]
@@ -779,10 +791,19 @@ def tf_py_wrap_cc(name, srcs, swig_includes=[], deps=[], copts=[], **kwargs):
linkstatic=1,
linkshared=1,
deps=deps + extra_deps)
+ native.genrule(
+ name = "gen_" + cc_library_pyd_name,
+ srcs = [":" + cc_library_name],
+ outs = [cc_library_pyd_name],
+ cmd = "cp $< $@",
+ )
native.py_library(name=name,
srcs=[":" + name + ".py"],
srcs_version="PY2AND3",
- data=[":" + cc_library_name])
+ data=select({
+ "//tensorflow:windows": [":" + cc_library_pyd_name],
+ "//conditions:default": [":" + cc_library_name],
+ }))
def tf_py_test(name, srcs, size="medium", data=[], main=None, args=[],
tags=[], shard_count=1, additional_deps=[], flaky=0):
diff --git a/tensorflow/tools/ci_build/install/install_deb_packages.sh b/tensorflow/tools/ci_build/install/install_deb_packages.sh
index 4dc58c8ce4..ffa276adf2 100755
--- a/tensorflow/tools/ci_build/install/install_deb_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_deb_packages.sh
@@ -44,5 +44,10 @@ apt-get install -y --no-install-recommends \
wget \
zip \
zlib1g-dev
+
+# Install Java ca-certificates and update the certificate store.
+apt-get install -y ca-certificates-java
+update-ca-certificates -f
+
apt-get clean
rm -rf /var/lib/apt/lists/*
diff --git a/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh b/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh
index 89a0dd9169..68800d67af 100755
--- a/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh
+++ b/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh
@@ -95,4 +95,14 @@ if [[ -z ${NEW_TFREC_URL} ]]; then
fi
"${GSUTIL_BIN}" rm "${NEW_TFREC_URL}" && \
echo "Cleaned up new tfrecord file in GCS: ${NEW_TFREC_URL}" || \
- die "FAIL: Unable to clean up new tfrecord file in GCS: ${NEW_TFREC_URL}" \ No newline at end of file
+ die "FAIL: Unable to clean up new tfrecord file in GCS: ${NEW_TFREC_URL}"
+
+# Also clean up newly created GCS dir.
+NEW_DIR_URL=$(grep "Creating dir" "${LOG_FILE}" | \
+ awk '{print $NF}')
+if [[ -z ${NEW_DIR_URL} ]]; then
+ die "FAIL: Unable to determine the URL to the new directory created in GCS."
+fi
+"${GSUTIL_BIN}" rm -r "${NEW_DIR_URL}" && \
+ echo "Cleaned up new directory created in GCS: ${NEW_DIR_URL}" || \
+ die "FAIL: Unable to clean up new directory created in GCS: ${NEW_DIR_URL}"
diff --git a/tensorflow/tools/gcs_test/python/gcs_smoke.py b/tensorflow/tools/gcs_test/python/gcs_smoke.py
index 5db03afb4d..0e0018fc2f 100644
--- a/tensorflow/tools/gcs_test/python/gcs_smoke.py
+++ b/tensorflow/tools/gcs_test/python/gcs_smoke.py
@@ -19,10 +19,12 @@ from __future__ import print_function
import random
import sys
+import time
import numpy as np
import tensorflow as tf
from tensorflow.core.example import example_pb2
+from tensorflow.python.lib.io import file_io
flags = tf.app.flags
flags.DEFINE_string("gcs_bucket_url", "",
@@ -48,6 +50,25 @@ def create_examples(num_examples, input_mean):
examples.append(ex)
return examples
+def create_dir_test():
+  """Verifies file_io directory handling methods."""
+
+ starttime = int(round(time.time() * 1000))
+ dir_name = "%s/tf_gcs_test_%s" % (FLAGS.gcs_bucket_url, starttime)
+ print("Creating dir %s" % dir_name)
+ file_io.create_dir(dir_name)
+ elapsed = int(round(time.time() * 1000)) - starttime
+ print("Created directory in: %d milliseconds" % elapsed)
+ # Check that the directory exists.
+ dir_exists = file_io.is_directory(dir_name)
+ print("%s directory exists: %s" % (dir_name, dir_exists))
+
+ # List contents of just created directory.
+ starttime = int(round(time.time() * 1000))
+ print("Listing directory %s." % dir_name)
+ print(file_io.list_directory(dir_name))
+ elapsed = int(round(time.time() * 1000)) - starttime
+ print("Listed directory %s in %s milliseconds" % (dir_name, elapsed))
if __name__ == "__main__":
# Sanity check on the GCS bucket URL.
@@ -110,3 +131,5 @@ if __name__ == "__main__":
except tf.errors.OutOfRangeError:
print("Successfully caught the expected OutOfRangeError while "
"reading one more record than is available")
+
+ create_dir_test()
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index 8b02c09064..db9de3f816 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -112,9 +112,14 @@ def configure(src_base_path, debug=False):
if src is None:
open(os.path.join(gen_path, target), "w").write("")
else:
- if hasattr(os, 'symlink'):
- os.symlink(src, os.path.join(gen_path, target))
- else:
+ try:
+      # In Python 3.5, os.symlink exists even on Windows, but it requires
+      # Windows Admin privileges; otherwise an OSError is thrown.
+ if hasattr(os, 'symlink'):
+ os.symlink(src, os.path.join(gen_path, target))
+ else:
+ shutil.copy2(src, os.path.join(gen_path, target))
+ except OSError:
shutil.copy2(src, os.path.join(gen_path, target))
json.dump(spec, open(os.path.join(gen_path, "spec.json"), "w"), indent=2)
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index dea9159ce6..2ee9564454 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -20,15 +20,17 @@ py_binary(
deps = ["//tensorflow:tensorflow_py"],
)
-sh_binary(
- name = "build_pip_package",
- srcs = ["build_pip_package.sh"],
+# On Windows, a py_binary is a zip file of its runfiles tree.
+# Add everything to its data dependencies so that a full runfiles tree
+# is generated for building the pip package on Windows.
+py_binary(
+ name = "simple_console_for_windows",
+ srcs = ["simple_console_for_windows.py"],
data = [
"MANIFEST.in",
"README",
"setup.py",
":other_headers",
- ":simple_console",
"//tensorflow:tensorflow_py",
"//tensorflow/contrib/ndlstm:all_files",
"//tensorflow/contrib/session_bundle:all_files",
@@ -44,13 +46,55 @@ sh_binary(
"//tensorflow/models/image/alexnet:all_files",
"//tensorflow/models/image/cifar10:all_files",
"//tensorflow/models/image/imagenet:all_files",
- "//tensorflow/models/image/mnist:convolutional",
"//tensorflow/models/rnn:package",
"//tensorflow/models/rnn/ptb:package",
"//tensorflow/models/rnn/translate:package",
"//tensorflow/python:util_example_parser_configuration",
"//tensorflow/python/debug:all_files",
"//tensorflow/python/saved_model:all_files",
- "//tensorflow/tensorboard",
+    # The following two targets have an issue when archived into
+    # the Python zip; exclude them for now.
+ # "//tensorflow/models/image/mnist:convolutional",
+ # "//tensorflow/tensorboard",
],
+ srcs_version = "PY2AND3",
+ deps = ["//tensorflow:tensorflow_py"],
+)
+
+sh_binary(
+ name = "build_pip_package",
+ srcs = ["build_pip_package.sh"],
+ data = select({
+ "//tensorflow:windows": [":simple_console_for_windows"],
+ "//conditions:default": [
+ "MANIFEST.in",
+ "README",
+ "setup.py",
+ ":other_headers",
+ ":simple_console",
+ "//tensorflow:tensorflow_py",
+ "//tensorflow/contrib/ndlstm:all_files",
+ "//tensorflow/contrib/session_bundle:all_files",
+ "//tensorflow/contrib/slim:all_files",
+ "//tensorflow/contrib/slim/python/slim/data:all_files",
+ "//tensorflow/contrib/slim/python/slim/nets:all_files",
+ "//tensorflow/contrib/specs:all_files",
+ "//tensorflow/contrib/tensor_forest:all_files",
+ "//tensorflow/contrib/tensor_forest/hybrid:all_files",
+ "//tensorflow/core:framework_headers",
+ "//tensorflow/examples/tutorials/mnist:package",
+ "//tensorflow/models/embedding:package",
+ "//tensorflow/models/image/alexnet:all_files",
+ "//tensorflow/models/image/cifar10:all_files",
+ "//tensorflow/models/image/imagenet:all_files",
+ "//tensorflow/models/image/mnist:convolutional",
+ "//tensorflow/models/rnn:package",
+ "//tensorflow/models/rnn/ptb:package",
+ "//tensorflow/models/rnn/translate:package",
+ "//tensorflow/python:util_example_parser_configuration",
+ "//tensorflow/python/debug:all_files",
+ "//tensorflow/python/saved_model:all_files",
+ "//tensorflow/tensorboard",
+ ],
+ }),
)
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index 40f7ae9829..34b6a58ce9 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -25,6 +25,16 @@ function cp_external() {
done
}
+PLATFORM="$(uname -s | tr 'A-Z' 'a-z')"
+function is_windows() {
+  # On Windows, this shell script actually runs under msys.
+ if [[ "${PLATFORM}" =~ msys_nt* ]]; then
+ true
+ else
+ false
+ fi
+}
+
function main() {
if [ $# -lt 1 ] ; then
echo "No destination dir provided"
@@ -41,7 +51,23 @@ function main() {
exit 1
fi
- if [ ! -d bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow ]; then
+ if is_windows; then
+ rm -rf ./bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip
+ mkdir -p ./bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip
+ echo "Unzipping simple_console_for_windows.zip to create runfiles tree..."
+ unzip -o -q ./bazel-bin/tensorflow/tools/pip_package/simple_console_for_windows.zip -d ./bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip
+ echo "Unzip finished."
+      # Copy out of the runfiles tree produced by unzipping the Python binary.
+ cp -R \
+ bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles/org_tensorflow/tensorflow \
+ "${TMPDIR}"
+ mkdir "${TMPDIR}/external"
+ # Note: this makes an extra copy of org_tensorflow.
+ cp_external \
+ bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles \
+ "${TMPDIR}/external"
+ RUNFILES=bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles/org_tensorflow
+ elif [ ! -d bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow ]; then
# Really old (0.2.1-) runfiles, without workspace name.
cp -R \
bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/tensorflow \
@@ -78,9 +104,13 @@ function main() {
# protobuf pip package doesn't ship with header files. Copy the headers
# over so user defined ops can be compiled.
mkdir -p ${TMPDIR}/google
- rsync --include "*/" --include "*.h" --exclude "*" --prune-empty-dirs -a \
- $RUNFILES/external/protobuf ${TMPDIR}/google
- rsync -a $RUNFILES/third_party/eigen3 ${TMPDIR}/third_party
+ mkdir -p ${TMPDIR}/third_party
+ pushd ${RUNFILES%org_tensorflow}
+ for header in $(find protobuf -name \*.h); do
+ cp --parents "$header" ${TMPDIR}/google;
+ done
+ popd
+ cp -R $RUNFILES/third_party/eigen3 ${TMPDIR}/third_party
cp tensorflow/tools/pip_package/MANIFEST.in ${TMPDIR}
cp tensorflow/tools/pip_package/README ${TMPDIR}
@@ -93,7 +123,7 @@ function main() {
pushd ${TMPDIR}
rm -f MANIFEST
echo $(date) : "=== Building wheel"
- ${PYTHON_BIN_PATH:-python} setup.py bdist_wheel >/dev/null
+ "${PYTHON_BIN_PATH:-python}" setup.py bdist_wheel >/dev/null
mkdir -p ${DEST}
cp dist/* ${DEST}
popd
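The header copy above switches from rsync to a find/cp --parents loop, presumably because rsync is not available in the Windows (msys) build environment. A small Python sketch of the equivalent operation (paths hypothetical):

import os
import shutil

def copy_headers(src_root, dst_root):
  # Copy only *.h files under src_root into dst_root, preserving their
  # relative directory structure, like `cp --parents` in the loop above.
  for dirpath, _, filenames in os.walk(src_root):
    for fname in filenames:
      if not fname.endswith('.h'):
        continue
      rel_dir = os.path.relpath(dirpath, os.path.dirname(src_root))
      dst_dir = os.path.join(dst_root, rel_dir)
      if not os.path.isdir(dst_dir):
        os.makedirs(dst_dir)
      shutil.copy2(os.path.join(dirpath, fname), dst_dir)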
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index b7d8f52617..e458f12302 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -141,6 +141,10 @@ def find_files(pattern, root):
matches = ['../' + x for x in find_files('*', 'external') if '.py' not in x]
+if os.name == 'nt':
+ EXTENSION_NAME = 'python/_pywrap_tensorflow.pyd'
+else:
+ EXTENSION_NAME = 'python/_pywrap_tensorflow.so'
headers = (list(find_files('*.h', 'tensorflow/core')) +
list(find_files('*.h', 'google/protobuf/src')) +
@@ -167,7 +171,7 @@ setup(
# Add in any packaged data.
include_package_data=True,
package_data={
- 'tensorflow': ['python/_pywrap_tensorflow.so',
+ 'tensorflow': [EXTENSION_NAME,
'tensorboard/dist/bazel-html-imports.html',
'tensorboard/dist/index.html',
'tensorboard/dist/tf-tensorboard.html',
diff --git a/tensorflow/tools/pip_package/simple_console_for_windows.py b/tensorflow/tools/pip_package/simple_console_for_windows.py
new file mode 100644
index 0000000000..106528bbc8
--- /dev/null
+++ b/tensorflow/tools/pip_package/simple_console_for_windows.py
@@ -0,0 +1,33 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Start a simple interactive console with TensorFlow available."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import code
+import sys
+
+
+def main(_):
+ """Run an interactive console."""
+ code.interact()
+ return 0
+
+
+if __name__ == '__main__':
+ sys.exit(main(sys.argv))
diff --git a/tensorflow/tools/proto_text/BUILD b/tensorflow/tools/proto_text/BUILD
index af80d3a441..d439c9abfd 100644
--- a/tensorflow/tools/proto_text/BUILD
+++ b/tensorflow/tools/proto_text/BUILD
@@ -42,12 +42,17 @@ cc_library(
name = "gen_proto_text_functions_lib",
srcs = ["gen_proto_text_functions_lib.cc"],
hdrs = ["gen_proto_text_functions_lib.h"],
- linkopts = [
- "-lm",
- "-lpthread",
- ] + select({
- "//tensorflow:darwin": [],
- "//conditions:default": ["-lrt"],
+ linkopts = select({
+ "//tensorflow:windows": [],
+ "//tensorflow:darwin": [
+ "-lm",
+ "-lpthread",
+ ],
+ "//conditions:default": [
+ "-lm",
+ "-lpthread",
+ "-lrt",
+ ],
}),
deps = [
"//tensorflow/core:lib_proto_parsing",
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 07fd0b3cff..4c199e75fc 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -27,9 +27,9 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
native.http_archive(
name = "com_googlesource_code_re2",
- url = "http://github.com/google/re2/archive/7bab3dc83df6a838cc004cc7a7f51d5fe1a427d5.tar.gz",
- sha256 = "ef91af8850f734c8be65f2774747f4c2d8d81e556ba009faa79b4dd8b2759555",
- strip_prefix = "re2-7bab3dc83df6a838cc004cc7a7f51d5fe1a427d5",
+ url = "http://github.com/google/re2/archive/b94b7cd42e9f02673cd748c1ac1d16db4052514c.tar.gz",
+ sha256 = "bd63550101e056427c9e7ff12a408c1c8b74e9803f393ca916b2926fc2c4906f",
+ strip_prefix = "re2-b94b7cd42e9f02673cd748c1ac1d16db4052514c",
)
native.http_archive(
@@ -98,9 +98,9 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
native.http_archive(
name = "protobuf",
- url = "http://github.com/google/protobuf/archive/v3.1.0.tar.gz",
- sha256 = "0a0ae63cbffc274efb573bdde9a253e3f32e458c41261df51c5dbc5ad541e8f7",
- strip_prefix = "protobuf-3.1.0",
+ url = "http://github.com/google/protobuf/archive/c2b3e70efd2038a54ef8973771ac58192885125e.tar.gz",
+ sha256 = "eafc1bc4c27970d62effe64ba6610823fdd66711f440d8ca4a168167786a2fcb",
+ strip_prefix = "protobuf-c2b3e70efd2038a54ef8973771ac58192885125e",
)
native.new_http_archive(
diff --git a/third_party/gpus/cuda/platform.bzl.tpl b/third_party/gpus/cuda/platform.bzl.tpl
index 7565dfc129..539ed58d2c 100644
--- a/third_party/gpus/cuda/platform.bzl.tpl
+++ b/third_party/gpus/cuda/platform.bzl.tpl
@@ -14,6 +14,11 @@ def cuda_library_path(name, version = cuda_sdk_version()):
return "lib/lib{}.dylib".format(name)
else:
return "lib/lib{}.{}.dylib".format(name, version)
+ elif PLATFORM == "Windows":
+ if not version:
+ return "lib/{}.dll".format(name)
+ else:
+ return "lib/{}{}.dll".format(name, version)
else:
if not version:
return "lib64/lib{}.so".format(name)
@@ -23,6 +28,8 @@ def cuda_library_path(name, version = cuda_sdk_version()):
def cuda_static_library_path(name):
if PLATFORM == "Darwin":
return "lib/lib{}_static.a".format(name)
+ elif PLATFORM == "Windows":
+ return "lib/{}_static.lib".format(name)
else:
return "lib64/lib{}_static.a".format(name)
@@ -32,6 +39,11 @@ def cudnn_library_path(version = cudnn_sdk_version()):
return "lib/libcudnn.dylib"
else:
return "lib/libcudnn.{}.dylib".format(version)
+ elif PLATFORM == "Windows":
+ if not version:
+ return "lib/cudnn.dll"
+ else:
+ return "lib/cudnn{}.dll".format(version)
else:
if not version:
return "lib64/libcudnn.so"
@@ -44,6 +56,11 @@ def cupti_library_path(version = cuda_sdk_version()):
return "extras/CUPTI/lib/libcupti.dylib"
else:
return "extras/CUPTI/lib/libcupti.{}.dylib".format(version)
+ elif PLATFORM == "Windows":
+ if not version:
+ return "extras/CUPTI/lib/cupti.dll"
+ else:
+ return "extras/CUPTI/lib/cupti{}.dll".format(version)
else:
if not version:
return "extras/CUPTI/lib64/libcupti.so"
diff --git a/util/python/python_config.sh b/util/python/python_config.sh
index d75a4d62dd..50f6398f47 100755
--- a/util/python/python_config.sh
+++ b/util/python/python_config.sh
@@ -46,7 +46,7 @@ function main {
}
function python_path {
- $PYTHON_BIN_PATH - <<END
+ "$PYTHON_BIN_PATH" - <<END
from __future__ import print_function
import site
import os
@@ -74,13 +74,13 @@ for path in all_paths:
if len(paths) == 1:
print(paths[0])
else:
- ret_paths = " ".join(paths)
+ ret_paths = ",".join(paths)
print(ret_paths)
END
}
function default_python_path {
- PYTHON_ARG="$1" $PYTHON_BIN_PATH - <<END
+ PYTHON_ARG="$1" "$PYTHON_BIN_PATH" - <<END
from __future__ import print_function
import os
@@ -108,26 +108,29 @@ function setup_python {
exit 1
fi
- local python_include=$("${PYTHON_BIN_PATH}" -c 'from __future__ import print_function; from distutils import sysconfig; print(sysconfig.get_python_inc());')
+ local python_include="$("${PYTHON_BIN_PATH}" -c 'from __future__ import print_function; from distutils import sysconfig; print(sysconfig.get_python_inc());')"
if [ "$python_include" == "" ]; then
echo -e "\n\nERROR: Problem getting python include path. Is distutils installed?"
exit 1
fi
-
- local python_lib_path=$(python_path)
+ local python_lib_path
+  # Split python_path into an array of paths; this allows paths containing spaces.
+ IFS=','
+ python_lib_path=($(python_path))
+ unset IFS
echo "Found possible Python library paths:"
- for x in $python_lib_path; do
+ for x in "${python_lib_path[@]}"; do
echo " $x"
done
- set -- $python_lib_path
+ set -- "${python_lib_path[@]}"
echo "Please input the desired Python library path to use. Default is ["$1"]"
read b || true
if [ "$b" == "" ]; then
- python_lib="$(default_python_path $python_lib_path)"
+ python_lib="$(default_python_path "${python_lib_path[0]}")"
echo $python_lib
else
- if test -d $b -a -x $b; then
- python_lib=$b
+ if test -d "$b" -a -x "$b"; then
+ python_lib="$b"
else
echo -e "\n\nERROR: The path you have entered does not exist."
exit 1
@@ -142,24 +145,34 @@ function setup_python {
for x in $EXPECTED_PATHS; do
if [ -e "$x" ]; then
- # This makes ./configure slow on Windows, but it works.
rm -rf "$x"
fi
done
-# ln -sf is acutally implemented as copying in msys since creating symbolic links is privileged on Windows
-# So we need -rf to remove them above.
- ln -sf "${python_include}" util/python/python_include
- ln -sf "${python_lib}" util/python/python_lib
- ln -sf "${numpy_include}" third_party/py/numpy/numpy_include
+# ln -sf is actually implemented as copying in msys since creating symbolic
+# links is privileged on Windows. But copying is too slow, so invoke mklink
+# to create junctions on Windows.
+ if is_windows; then
+ cmd /c "mklink /J util\\python\\python_include \"${python_include}\""
+ cmd /c "mklink /J util\\python\\python_lib \"${python_lib}\""
+ cmd /c "mklink /J third_party\\py\\numpy\\numpy_include \"${numpy_include}\""
+ else
+ ln -sf "${python_include}" util/python/python_include
+ ln -sf "${python_lib}" util/python/python_lib
+ ln -sf "${numpy_include}" third_party/py/numpy/numpy_include
+ fi
+ # Convert python path to Windows style before writing into bazel.rc
+ if is_windows; then
+ PYTHON_BIN_PATH="$(cygpath -m "$PYTHON_BIN_PATH")"
+ fi
# Write tools/bazel.rc
echo "# Autogenerated by configure: DO NOT EDIT" > tools/bazel.rc
sed -e "s/\$PYTHON_MAJOR_VERSION/$python_major_version/g" \
- -e "s[\$PYTHON_BINARY[$PYTHON_BIN_PATH[g" \
+ -e "s[\$PYTHON_BINARY[\"$PYTHON_BIN_PATH\"[g" \
tools/bazel.rc.template >> tools/bazel.rc
# Write tools/python_bin_path.sh
- echo "export PYTHON_BIN_PATH=$PYTHON_BIN_PATH" > tools/python_bin_path.sh
+ echo "export PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\"" > tools/python_bin_path.sh
}
PLATFORM="$(uname -s | tr 'A-Z' 'a-z')"
@@ -183,7 +196,13 @@ function check_python {
echo -e "\n\nERROR: '${x}' is not a symbolic link. Internal error.\n\n" 1>&2
exit 1
fi
- true_path=$(readlink "${x}")
+ if is_windows; then
+      # In msys, readlink <path> doesn't work because there are no symbolic
+      # links on Windows. readlink -f <path> returns the real path of a junction.
+ true_path=$(readlink -f "${x}")
+ else
+ true_path=$(readlink "${x}")
+ fi
if [ ! -d "${true_path}" ]; then
echo -e "\n\nERROR: '${x}' does not refer to an existing directory: ${true_path}. Do you need to rerun configure?\n\n" 1>&2
exit 1