aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rwxr-xr-xconfigure65
-rw-r--r--tensorflow/workspace.bzl56
-rw-r--r--third_party/gpus/crosstool/CROSSTOOL_clang.tpl292
-rw-r--r--third_party/gpus/crosstool/CROSSTOOL_nvcc.tpl249
-rw-r--r--third_party/gpus/cuda/build_defs.bzl.tpl4
-rw-r--r--third_party/gpus/cuda_configure.bzl108
-rw-r--r--third_party/nccl/BUILD0
-rw-r--r--third_party/nccl/fix_clang_compilation.patch85
-rw-r--r--third_party/nccl/nccl.BUILD66
-rw-r--r--third_party/protobuf/BUILD0
-rw-r--r--third_party/protobuf/add_noinlines.patch30
-rw-r--r--tools/bazel.rc.template5
12 files changed, 920 insertions, 40 deletions
diff --git a/configure b/configure
index 081db20d75..e59ee2a925 100755
--- a/configure
+++ b/configure
@@ -38,7 +38,7 @@ function is_windows() {
fi
}
-function bazel_clean_and_fetch() {
+function bazel_fetch() {
if [ -z "$TF_BAZEL_TARGETS" ]; then
bazel fetch "//tensorflow/... -//tensorflow/contrib/nccl/... -//tensorflow/examples/android/..."
else
@@ -279,18 +279,40 @@ while [ "$TF_NEED_CUDA" == "" ]; do
esac
done
+sed_hyphen_i -e "/--action_env TF_NEED_CUDA/d" .bazelrc
+sed_hyphen_i -e "/--action_env CUD/d" .bazelrc
+sed_hyphen_i -e "/--action_env GCC_HOST/d" .bazelrc
+sed_hyphen_i -e "/--action_env TF_CUD/d" .bazelrc
+sed_hyphen_i -e "/--action_env CLANG_CUDA/d" .bazelrc
+
export TF_NEED_CUDA
+echo "build --action_env TF_NEED_CUDA=$TF_NEED_CUDA" >>.bazelrc
+
export TF_NEED_OPENCL
+
if [[ "$TF_NEED_CUDA" == "0" ]] && [[ "$TF_NEED_OPENCL" == "0" ]]; then
echo "Configuration finished"
- bazel_clean_and_fetch
+ bazel_fetch
exit
fi
if [ "$TF_NEED_CUDA" == "1" ]; then
+while [[ "$TF_CUDA_CLANG" == "" ]]; do
+ read -p "Do you want to use clang as CUDA compiler? [y/N] " INPUT
+ case $INPUT in
+ [Yy]* ) echo "Clang will be used as CUDA compiler"; TF_CUDA_CLANG=1;;
+ [Nn]* ) echo "nvcc will be used as CUDA compiler"; TF_CUDA_CLANG=0;;
+ "" ) echo "nvcc will be used as CUDA compiler"; TF_CUDA_CLANG=0;;
+ * ) echo "Invalid selection: " $INPUT;;
+ esac
+done
+
+export TF_CUDA_CLANG
+echo "build --action_env TF_CUDA_CLANG=$TF_CUDA_CLANG" >>.bazelrc
+
# Set up which gcc nvcc should use as the host compiler
# No need to set this on Windows
-while ! is_windows && true; do
+while [[ "$TF_CUDA_CLANG" != "1" ]] && ! is_windows && true; do
fromuser=""
if [ -z "$GCC_HOST_COMPILER_PATH" ]; then
default_gcc_host_compiler_path=$(which gcc || true)
@@ -302,6 +324,7 @@ while ! is_windows && true; do
fi
if [ -e "$GCC_HOST_COMPILER_PATH" ]; then
export GCC_HOST_COMPILER_PATH
+ echo "build --action_env GCC_HOST_COMPILER_PATH=\"$GCC_HOST_COMPILER_PATH\"" >>.bazelrc
break
fi
echo "Invalid gcc path. ${GCC_HOST_COMPILER_PATH} cannot be found" 1>&2
@@ -312,6 +335,30 @@ while ! is_windows && true; do
# Retry
done
+# Set up which clang we should use as the cuda / host compiler.
+while [[ "$TF_CUDA_CLANG" == "1" ]] && true; do
+ fromuser=""
+ if [ -z "$CLANG_CUDA_COMPILER_PATH" ]; then
+ default_clang_host_compiler_path=$(which clang || true)
+ read -p "Please specify which clang should be used as device and host compiler. [Default is $default_clang_host_compiler_path]: " CLANG_CUDA_COMPILER_PATH
+ fromuser="1"
+ if [ -z "$CLANG_CUDA_COMPILER_PATH" ]; then
+ CLANG_CUDA_COMPILER_PATH="$default_clang_host_compiler_path"
+ fi
+ fi
+ if [ -e "$CLANG_CUDA_COMPILER_PATH" ]; then
+ export CLANG_CUDA_COMPILER_PATH
+ echo "build --action_env CLANG_CUDA_COMPILER_PATH=\"$CLANG_CUDA_COMPILER_PATH\"" >>.bazelrc
+ break
+ fi
+ echo "Invalid clang path. ${CLANG_CUDA_COMPILER_PATH} cannot be found" 1>&2
+ if [ -z "$fromuser" ]; then
+ exit 1
+ fi
+ CLANG_CUDA_COMPILER_PATH=""
+ # Retry
+done
+
# Find out where the CUDA toolkit is installed
while true; do
# Configure the Cuda SDK version to use.
@@ -352,7 +399,10 @@ while true; do
if [ -e "${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH}" ]; then
export CUDA_TOOLKIT_PATH
+ echo "build --action_env CUDA_TOOLKIT_PATH=\"$CUDA_TOOLKIT_PATH\"" >>.bazelrc
+
export TF_CUDA_VERSION
+ echo "build --action_env TF_CUDA_VERSION=$TF_CUDA_VERSION" >>.bazelrc
break
fi
echo "Invalid path to CUDA $TF_CUDA_VERSION toolkit. ${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH} cannot be found"
@@ -404,7 +454,10 @@ while true; do
if [ -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_ALT_PATH}" -o -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_PATH}" ]; then
export TF_CUDNN_VERSION
+ echo "build --action_env TF_CUDNN_VERSION=$TF_CUDNN_VERSION" >>.bazelrc
+
export CUDNN_INSTALL_PATH
+ echo "build --action_env CUDNN_INSTALL_PATH=\"$CUDNN_INSTALL_PATH\"" >>.bazelrc
break
fi
@@ -417,7 +470,10 @@ while true; do
CUDNN_PATH_FROM_LDCONFIG="$($LDCONFIG_BIN -p | sed -n 's/.*libcudnn.so .* => \(.*\)/\1/p')"
if [ -e "${CUDNN_PATH_FROM_LDCONFIG}${TF_CUDNN_EXT}" ]; then
export TF_CUDNN_VERSION
+ echo "build --action_env TF_CUDNN_VERSION=$TF_CUDNN_VERSION" >>.bazelrc
+
export CUDNN_INSTALL_PATH="$(dirname ${CUDNN_PATH_FROM_LDCONFIG})"
+ echo "build --action_env CUDNN_INSTALL_PATH=\"$CUDNN_INSTALL_PATH\"" >>.bazelrc
break
fi
fi
@@ -469,6 +525,7 @@ EOF
fi
else
export TF_CUDA_COMPUTE_CAPABILITIES
+ echo "build --action_env TF_CUDA_COMPUTE_CAPABILITIES=$TF_CUDA_COMPUTE_CAPABILITIES" >>.bazelrc
break
fi
TF_CUDA_COMPUTE_CAPABILITIES=""
@@ -572,6 +629,6 @@ done
# end of if "$TF_NEED_OPENCL" == "1"
fi
-bazel_clean_and_fetch
+bazel_fetch
echo "Configuration finished"
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index a13142fe48..f8dfd21f84 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -50,17 +50,54 @@ def _temp_workaround_http_archive_impl(repo_ctx):
}, False)
repo_ctx.download_and_extract(repo_ctx.attr.urls, "", repo_ctx.attr.sha256,
"", repo_ctx.attr.strip_prefix)
+ if repo_ctx.attr.patch_file != None:
+ _apply_patch(repo_ctx, repo_ctx.attr.patch_file)
temp_workaround_http_archive = repository_rule(
implementation=_temp_workaround_http_archive_impl,
attrs = {
"build_file": attr.label(),
"repository": attr.string(),
+ "patch_file": attr.label(default = None),
"urls": attr.string_list(default = []),
"sha256": attr.string(default = ""),
"strip_prefix": attr.string(default = ""),
})
+# Executes specified command with arguments and calls 'fail' if it exited with non-zero code
+def _execute_and_check_ret_code(repo_ctx, cmd_and_args):
+ result = repo_ctx.execute(cmd_and_args)
+ if result.return_code != 0:
+ fail(("Non-zero return code({1}) when executing '{0}':\n" +
+ "Stdout: {2}\n" +
+ "Stderr: {3}").format(" ".join(cmd_and_args),
+ result.return_code, result.stdout, result.stderr))
+
+# Apply a patch_file to the repository root directory
+# Runs 'patch -p1'
+def _apply_patch(repo_ctx, patch_file):
+ _execute_and_check_ret_code(repo_ctx, ["patch", "-p1",
+ "-d", repo_ctx.path("."),
+ "-i", repo_ctx.path(patch_file)])
+
+# Download the repository and apply a patch to its root
+def _patched_http_archive_impl(repo_ctx):
+ repo_ctx.download_and_extract(repo_ctx.attr.urls,
+ sha256 = repo_ctx.attr.sha256,
+ stripPrefix = repo_ctx.attr.strip_prefix)
+ _apply_patch(repo_ctx, repo_ctx.attr.patch_file)
+
+patched_http_archive = repository_rule(
+ implementation = _patched_http_archive_impl,
+ attrs = {
+ "patch_file": attr.label(),
+ "build_file": attr.label(),
+ "repository": attr.string(),
+ "urls": attr.string_list(default = []),
+ "sha256": attr.string(default = ""),
+ "strip_prefix": attr.string(default = ""),
+ })
+
# If TensorFlow is linked as a submodule.
# path_prefix and tf_repo_name are no longer used.
def tf_workspace(path_prefix = "", tf_repo_name = ""):
@@ -78,11 +115,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
native.new_http_archive(
name = "eigen_archive",
urls = [
- "http://bazel-mirror.storage.googleapis.com/bitbucket.org/eigen/eigen/get/9c6361787292.tar.gz",
- "https://bitbucket.org/eigen/eigen/get/9c6361787292.tar.gz",
+ "http://bazel-mirror.storage.googleapis.com/bitbucket.org/eigen/eigen/get/deff8b280204.tar.gz",
+ "https://bitbucket.org/eigen/eigen/get/deff8b280204.tar.gz",
],
- sha256 = "e6ec2502a5d82dd5df0b9b16e7697f5fccb81c322d0be8e3492969eecb66badd",
- strip_prefix = "eigen-eigen-9c6361787292",
+ sha256 = "a39834683eb5bdb9a7434f0ab3621d2cbc3b07e8002db6de101e45ec536723eb",
+ strip_prefix = "eigen-eigen-deff8b280204",
build_file = str(Label("//third_party:eigen.BUILD")),
)
@@ -255,7 +292,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
actual = "@six_archive//:six",
)
- native.http_archive(
+ patched_http_archive(
name = "protobuf",
urls = [
"http://bazel-mirror.storage.googleapis.com/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
@@ -263,6 +300,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
],
sha256 = "e5d3d4e227a0f7afb8745df049bbd4d55474b158ca5aaa2a0e31099af24be1d0",
strip_prefix = "protobuf-2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a",
+ # TODO: remove patching when tensorflow stops linking same protos into
+ # multiple shared libraries loaded in runtime by python.
+ # This patch fixes a runtime crash when tensorflow is compiled
+ # with clang -O2 on Linux (see https://github.com/tensorflow/tensorflow/issues/8394)
+ patch_file = str(Label("//third_party/protobuf:add_noinlines.patch")),
)
native.new_http_archive(
@@ -452,7 +494,9 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
],
sha256 = "6787f0eed88d52ee8e32956fa4947d92c139da469f1d8e311c307f27d641118e",
strip_prefix = "nccl-024d1e267845f2ed06f3e2e42476d50f04a00ee6",
- build_file = str(Label("//third_party:nccl.BUILD")),
+ build_file = str(Label("//third_party/nccl:nccl.BUILD")),
+ # TODO: Remove patching after the fix is merged into nccl (see https://github.com/NVIDIA/nccl/pull/78)
+ patch_file = str(Label("//third_party/nccl:fix_clang_compilation.patch")),
repository = tf_repo_name,
)
diff --git a/third_party/gpus/crosstool/CROSSTOOL_clang.tpl b/third_party/gpus/crosstool/CROSSTOOL_clang.tpl
new file mode 100644
index 0000000000..e4363d6045
--- /dev/null
+++ b/third_party/gpus/crosstool/CROSSTOOL_clang.tpl
@@ -0,0 +1,292 @@
+major_version: "local"
+minor_version: ""
+default_target_cpu: "same_as_host"
+
+default_toolchain {
+ cpu: "k8"
+ toolchain_identifier: "local_linux"
+}
+default_toolchain {
+ cpu: "piii"
+ toolchain_identifier: "local_linux"
+}
+default_toolchain {
+ cpu: "arm"
+ toolchain_identifier: "local_linux"
+}
+default_toolchain {
+ cpu: "darwin"
+ toolchain_identifier: "local_darwin"
+}
+default_toolchain {
+ cpu: "ppc"
+ toolchain_identifier: "local_linux"
+}
+
+toolchain {
+ abi_version: "local"
+ abi_libc_version: "local"
+ compiler: "compiler"
+ host_system_name: "local"
+ needsPic: true
+ target_libc: "local"
+ target_cpu: "local"
+ target_system_name: "local"
+ toolchain_identifier: "local_linux"
+
+ feature {
+ name: "c++11"
+ flag_set {
+ action: "c++-compile"
+ flag_group {
+ flag: "-std=c++11"
+ }
+ }
+ }
+
+ feature {
+ name: "stdlib"
+ flag_set {
+ action: "c++-link-executable"
+ action: "c++-link-dynamic-library"
+ flag_group {
+ flag: "-lstdc++"
+ }
+ }
+ }
+
+ feature {
+ name: "determinism"
+ flag_set {
+ action: "c-compile"
+ action: "c++-compile"
+ flag_group {
+ # Make C++ compilation deterministic. Use linkstamping instead of these
+ # compiler symbols.
+ flag: "-Wno-builtin-macro-redefined"
+ flag: "-D__DATE__=\"redacted\""
+ flag: "-D__TIMESTAMP__=\"redacted\""
+ flag: "-D__TIME__=\"redacted\""
+ }
+ }
+ }
+
+ feature {
+ name: "alwayslink"
+ flag_set {
+ action: "c++-link-dynamic-library"
+ action: "c++-link-executable"
+ flag_group {
+ flag: "-Wl,-no-as-needed"
+ }
+ }
+ }
+
+ # This feature will be enabled for builds that support pic by bazel.
+ feature {
+ name: "pic"
+ flag_set {
+ action: "c-compile"
+ action: "c++-compile"
+ flag_group {
+ expand_if_all_available: "pic"
+ flag: "-fPIC"
+ }
+ flag_group {
+ expand_if_none_available: "pic"
+ flag: "-fPIE"
+ }
+ }
+ }
+
+ # Security hardening on by default.
+ feature {
+ name: "hardening"
+ flag_set {
+ action: "c-compile"
+ action: "c++-compile"
+ flag_group {
+ # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+ # We need to undef it before redefining it as some distributions now
+ # have it enabled by default.
+ flag: "-U_FORTIFY_SOURCE"
+ flag: "-D_FORTIFY_SOURCE=1"
+ flag: "-fstack-protector"
+ }
+ }
+ flag_set {
+ action: "c++-link-dynamic-library"
+ flag_group {
+ flag: "-Wl,-z,relro,-z,now"
+ }
+ }
+ flag_set {
+ action: "c++-link-executable"
+ flag_group {
+ flag: "-pie"
+ flag: "-Wl,-z,relro,-z,now"
+ }
+ }
+ }
+
+ feature {
+ name: "warnings"
+ flag_set {
+ action: "c-compile"
+ action: "c++-compile"
+ flag_group {
+ # All warnings are enabled. Maybe enable -Werror as well?
+ flag: "-Wall"
+ # Some parts of the codebase set -Werror and hit this warning, so
+ # switch it off for now.
+ flag: "-Wno-invalid-partial-specialization"
+ }
+ }
+ }
+
+ # Keep stack frames for debugging, even in opt mode.
+ feature {
+ name: "frame-pointer"
+ flag_set {
+ action: "c-compile"
+ action: "c++-compile"
+ flag_group {
+ flag: "-fno-omit-frame-pointer"
+ }
+ }
+ }
+
+ feature {
+ name: "build-id"
+ flag_set {
+ action: "c++-link-executable"
+ action: "c++-link-dynamic-library"
+ flag_group {
+ # Stamp the binary with a unique identifier.
+ flag: "-Wl,--build-id=md5"
+ flag: "-Wl,--hash-style=gnu"
+ }
+ }
+ }
+
+ feature {
+ name: "no-canonical-prefixes"
+ flag_set {
+ action: "c-compile"
+ action: "c++-compile"
+ action: "c++-link-executable"
+ action: "c++-link-dynamic-library"
+ flag_group {
+ flag:"-no-canonical-prefixes"
+ }
+ }
+ }
+
+ feature {
+ name: "disable-assertions"
+ flag_set {
+ action: "c-compile"
+ action: "c++-compile"
+ flag_group {
+ flag: "-DNDEBUG"
+ }
+ }
+ }
+
+ feature {
+ name: "linker-bin-path"
+
+ flag_set {
+ action: "c++-link-executable"
+ action: "c++-link-dynamic-library"
+ flag_group {
+ flag: "-B/usr/bin/"
+ }
+ }
+ }
+
+ feature {
+ name: "common"
+ implies: "stdlib"
+ implies: "c++11"
+ implies: "determinism"
+ implies: "alwayslink"
+ implies: "hardening"
+ implies: "warnings"
+ implies: "frame-pointer"
+ implies: "build-id"
+ implies: "no-canonical-prefixes"
+ implies: "linker-bin-path"
+ }
+
+ feature {
+ name: "opt"
+ implies: "common"
+ implies: "disable-assertions"
+
+ flag_set {
+ action: "c-compile"
+ action: "c++-compile"
+ flag_group {
+ # No debug symbols.
+ # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt
+ # or even generally? However, that can't happen here, as it requires
+ # special handling in Bazel.
+ flag: "-g0"
+
+ # Conservative choice for -O
+ # -O3 can increase binary size and even slow down the resulting binaries.
+ # Profile first and / or use FDO if you need better performance than this.
+ flag: "-O2"
+
+ # Removal of unused code and data at link time (can this increase binary size in some cases?).
+ flag: "-ffunction-sections"
+ flag: "-fdata-sections"
+ }
+ }
+ flag_set {
+ action: "c++-link-dynamic-library"
+ action: "c++-link-executable"
+ flag_group {
+ flag: "-Wl,--gc-sections"
+ }
+ }
+ }
+
+ feature {
+ name: "fastbuild"
+ implies: "common"
+ }
+
+ feature {
+ name: "dbg"
+ implies: "common"
+ flag_set {
+ action: "c-compile"
+ action: "c++-compile"
+ flag_group {
+ flag: "-g"
+ }
+ }
+ }
+
+ # Set clang as a C/C++ compiler.
+ tool_path { name: "gcc" path: "%{clang_path}" }
+
+ # Use the default system toolchain for everything else.
+ tool_path { name: "ar" path: "/usr/bin/ar" }
+ tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+ tool_path { name: "cpp" path: "/usr/bin/cpp" }
+ tool_path { name: "dwp" path: "/usr/bin/dwp" }
+ tool_path { name: "gcov" path: "/usr/bin/gcov" }
+ tool_path { name: "ld" path: "/usr/bin/ld" }
+ tool_path { name: "nm" path: "/usr/bin/nm" }
+ tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+ tool_path { name: "objdump" path: "/usr/bin/objdump" }
+ tool_path { name: "strip" path: "/usr/bin/strip" }
+
+ # Enable dynamic linking.
+ linking_mode_flags { mode: DYNAMIC }
+
+%{host_compiler_includes}
+}
diff --git a/third_party/gpus/crosstool/CROSSTOOL_nvcc.tpl b/third_party/gpus/crosstool/CROSSTOOL_nvcc.tpl
new file mode 100644
index 0000000000..116f67cbae
--- /dev/null
+++ b/third_party/gpus/crosstool/CROSSTOOL_nvcc.tpl
@@ -0,0 +1,249 @@
+major_version: "local"
+minor_version: ""
+default_target_cpu: "same_as_host"
+
+default_toolchain {
+ cpu: "k8"
+ toolchain_identifier: "local_linux"
+}
+default_toolchain {
+ cpu: "piii"
+ toolchain_identifier: "local_linux"
+}
+default_toolchain {
+ cpu: "arm"
+ toolchain_identifier: "local_linux"
+}
+default_toolchain {
+ cpu: "darwin"
+ toolchain_identifier: "local_darwin"
+}
+default_toolchain {
+ cpu: "ppc"
+ toolchain_identifier: "local_linux"
+}
+
+toolchain {
+ abi_version: "local"
+ abi_libc_version: "local"
+ builtin_sysroot: ""
+ compiler: "compiler"
+ host_system_name: "local"
+ needsPic: true
+ supports_gold_linker: false
+ supports_incremental_linker: false
+ supports_fission: false
+ supports_interface_shared_objects: false
+ supports_normalizing_ar: false
+ supports_start_end_lib: false
+ supports_thin_archives: false
+ target_libc: "local"
+ target_cpu: "local"
+ target_system_name: "local"
+ toolchain_identifier: "local_linux"
+
+ tool_path { name: "ar" path: "/usr/bin/ar" }
+ tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+ tool_path { name: "cpp" path: "/usr/bin/cpp" }
+ tool_path { name: "dwp" path: "/usr/bin/dwp" }
+ # As part of the TensorFlow release, we place some cuda-related compilation
+ # files in @local_config_cuda//crosstool/clang/bin, and this relative
+ # path, combined with the rest of our Bazel configuration causes our
+ # compilation to use those files.
+ tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" }
+ # Use "-std=c++11" for nvcc. For consistency, force both the host compiler
+ # and the device compiler to use "-std=c++11".
+ cxx_flag: "-std=c++11"
+ linker_flag: "-Wl,-no-as-needed"
+ linker_flag: "-lstdc++"
+ linker_flag: "-B/usr/bin/"
+
+%{host_compiler_includes}
+ tool_path { name: "gcov" path: "/usr/bin/gcov" }
+
+ # C(++) compiles invoke the compiler (as that is the one knowing where
+ # to find libraries), but we provide LD so other rules can invoke the linker.
+ tool_path { name: "ld" path: "/usr/bin/ld" }
+
+ tool_path { name: "nm" path: "/usr/bin/nm" }
+ tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+ objcopy_embed_flag: "-I"
+ objcopy_embed_flag: "binary"
+ tool_path { name: "objdump" path: "/usr/bin/objdump" }
+ tool_path { name: "strip" path: "/usr/bin/strip" }
+
+ # Anticipated future default.
+ unfiltered_cxx_flag: "-no-canonical-prefixes"
+
+ # Make C++ compilation deterministic. Use linkstamping instead of these
+ # compiler symbols.
+ unfiltered_cxx_flag: "-Wno-builtin-macro-redefined"
+ unfiltered_cxx_flag: "-D__DATE__=\"redacted\""
+ unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\""
+ unfiltered_cxx_flag: "-D__TIME__=\"redacted\""
+
+ # Security hardening on by default.
+ # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+ # We need to undef it before redefining it as some distributions now have
+ # it enabled by default.
+ compiler_flag: "-U_FORTIFY_SOURCE"
+ compiler_flag: "-D_FORTIFY_SOURCE=1"
+ compiler_flag: "-fstack-protector"
+ compiler_flag: "-fPIE"
+ linker_flag: "-pie"
+ linker_flag: "-Wl,-z,relro,-z,now"
+
+ # Enable coloring even if there's no attached terminal. Bazel removes the
+ # escape sequences if --nocolor is specified. This isn't supported by gcc
+ # on Ubuntu 14.04.
+ # compiler_flag: "-fcolor-diagnostics"
+
+ # All warnings are enabled. Maybe enable -Werror as well?
+ compiler_flag: "-Wall"
+ # Enable a few more warnings that aren't part of -Wall.
+ compiler_flag: "-Wunused-but-set-parameter"
+ # But disable some that are problematic.
+ compiler_flag: "-Wno-free-nonheap-object" # has false positives
+
+ # Keep stack frames for debugging, even in opt mode.
+ compiler_flag: "-fno-omit-frame-pointer"
+
+ # Anticipated future default.
+ linker_flag: "-no-canonical-prefixes"
+ unfiltered_cxx_flag: "-fno-canonical-system-headers"
+ # Have gcc return the exit code from ld.
+ linker_flag: "-pass-exit-codes"
+ # Stamp the binary with a unique identifier.
+ linker_flag: "-Wl,--build-id=md5"
+ linker_flag: "-Wl,--hash-style=gnu"
+ # Gold linker only? Can we enable this by default?
+ # linker_flag: "-Wl,--warn-execstack"
+ # linker_flag: "-Wl,--detect-odr-violations"
+
+ # Include directory for cuda headers.
+ cxx_builtin_include_directory: "%{cuda_include_path}"
+
+ compilation_mode_flags {
+ mode: DBG
+ # Enable debug symbols.
+ compiler_flag: "-g"
+ }
+ compilation_mode_flags {
+ mode: OPT
+
+ # No debug symbols.
+ # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt or
+ # even generally? However, that can't happen here, as it requires special
+ # handling in Bazel.
+ compiler_flag: "-g0"
+
+ # Conservative choice for -O
+ # -O3 can increase binary size and even slow down the resulting binaries.
+ # Profile first and / or use FDO if you need better performance than this.
+ compiler_flag: "-O2"
+
+ # Disable assertions
+ compiler_flag: "-DNDEBUG"
+
+ # Removal of unused code and data at link time (can this increase binary size in some cases?).
+ compiler_flag: "-ffunction-sections"
+ compiler_flag: "-fdata-sections"
+ linker_flag: "-Wl,--gc-sections"
+ }
+ linking_mode_flags { mode: DYNAMIC }
+}
+
+toolchain {
+ abi_version: "local"
+ abi_libc_version: "local"
+ builtin_sysroot: ""
+ compiler: "compiler"
+ host_system_name: "local"
+ needsPic: true
+ target_libc: "macosx"
+ target_cpu: "darwin"
+ target_system_name: "local"
+ toolchain_identifier: "local_darwin"
+
+ tool_path { name: "ar" path: "/usr/bin/libtool" }
+ tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+ tool_path { name: "cpp" path: "/usr/bin/cpp" }
+ tool_path { name: "dwp" path: "/usr/bin/dwp" }
+ tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" }
+ cxx_flag: "-std=c++11"
+ ar_flag: "-static"
+ ar_flag: "-s"
+ ar_flag: "-o"
+ linker_flag: "-lc++"
+ linker_flag: "-undefined"
+ linker_flag: "dynamic_lookup"
+ # TODO(ulfjack): This is wrong on so many levels. Figure out a way to auto-detect the proper
+ # setting from the local compiler, and also how to make incremental builds correct.
+ cxx_builtin_include_directory: "/"
+ tool_path { name: "gcov" path: "/usr/bin/gcov" }
+ tool_path { name: "ld" path: "/usr/bin/ld" }
+ tool_path { name: "nm" path: "/usr/bin/nm" }
+ tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+ objcopy_embed_flag: "-I"
+ objcopy_embed_flag: "binary"
+ tool_path { name: "objdump" path: "/usr/bin/objdump" }
+ tool_path { name: "strip" path: "/usr/bin/strip" }
+
+ # Anticipated future default.
+ unfiltered_cxx_flag: "-no-canonical-prefixes"
+ # Make C++ compilation deterministic. Use linkstamping instead of these
+ # compiler symbols.
+ unfiltered_cxx_flag: "-Wno-builtin-macro-redefined"
+ unfiltered_cxx_flag: "-D__DATE__=\"redacted\""
+ unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\""
+ unfiltered_cxx_flag: "-D__TIME__=\"redacted\""
+
+ # Security hardening on by default.
+ # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+ compiler_flag: "-D_FORTIFY_SOURCE=1"
+ compiler_flag: "-fstack-protector"
+
+ # Enable coloring even if there's no attached terminal. Bazel removes the
+ # escape sequences if --nocolor is specified.
+ compiler_flag: "-fcolor-diagnostics"
+
+ # All warnings are enabled. Maybe enable -Werror as well?
+ compiler_flag: "-Wall"
+ # Enable a few more warnings that aren't part of -Wall.
+ compiler_flag: "-Wthread-safety"
+ compiler_flag: "-Wself-assign"
+
+ # Keep stack frames for debugging, even in opt mode.
+ compiler_flag: "-fno-omit-frame-pointer"
+
+ # Anticipated future default.
+ linker_flag: "-no-canonical-prefixes"
+
+ # Include directory for cuda headers.
+ cxx_builtin_include_directory: "%{cuda_include_path}"
+
+ compilation_mode_flags {
+ mode: DBG
+ # Enable debug symbols.
+ compiler_flag: "-g"
+ }
+ compilation_mode_flags {
+ mode: OPT
+ # No debug symbols.
+ # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt or even generally?
+ # However, that can't happen here, as it requires special handling in Bazel.
+ compiler_flag: "-g0"
+
+ # Conservative choice for -O
+ # -O3 can increase binary size and even slow down the resulting binaries.
+ # Profile first and / or use FDO if you need better performance than this.
+ compiler_flag: "-O2"
+
+ # Disable assertions
+ compiler_flag: "-DNDEBUG"
+
+ # Removal of unused code and data at link time (can this increase binary size in some cases?).
+ compiler_flag: "-ffunction-sections"
+ compiler_flag: "-fdata-sections"
+ }
+}
diff --git a/third_party/gpus/cuda/build_defs.bzl.tpl b/third_party/gpus/cuda/build_defs.bzl.tpl
index a497ed98f0..ca8bbc1ee2 100644
--- a/third_party/gpus/cuda/build_defs.bzl.tpl
+++ b/third_party/gpus/cuda/build_defs.bzl.tpl
@@ -8,12 +8,14 @@ def if_cuda(if_true, if_false = []):
"""
return select({
"@local_config_cuda//cuda:using_nvcc": if_true,
+ "@local_config_cuda//cuda:using_clang": if_true,
"//conditions:default": if_false
})
+
def cuda_default_copts():
"""Default options for all CUDA compilations."""
- return if_cuda(["-x", "cuda", "-DGOOGLE_CUDA=1"])
+ return if_cuda(["-x", "cuda", "-DGOOGLE_CUDA=1"] + %{cuda_extra_copts})
def cuda_is_configured():
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index a2b3e7d79e..bbe0442eaf 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -5,6 +5,9 @@
* `TF_NEED_CUDA`: Whether to enable building with CUDA.
* `GCC_HOST_COMPILER_PATH`: The GCC host compiler path
+ * `TF_CUDA_CLANG`: Whether to use clang as a cuda compiler.
+ * `CLANG_CUDA_COMPILER_PATH`: The clang compiler path that will be used for
+ both host and device code compilation if TF_CUDA_CLANG is 1.
* `CUDA_TOOLKIT_PATH`: The path to the CUDA toolkit. Default is
`/usr/local/cuda`.
* `TF_CUDA_VERSION`: The version of the CUDA toolkit. If this is blank, then
@@ -17,6 +20,7 @@
"""
_GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH"
+_CLANG_CUDA_COMPILER_PATH = "CLANG_CUDA_COMPILER_PATH"
_CUDA_TOOLKIT_PATH = "CUDA_TOOLKIT_PATH"
_TF_CUDA_VERSION = "TF_CUDA_VERSION"
_TF_CUDNN_VERSION = "TF_CUDNN_VERSION"
@@ -35,19 +39,25 @@ _DEFAULT_CUDA_COMPUTE_CAPABILITIES = ["3.5", "5.2"]
# BEGIN cc_configure common functions.
def find_cc(repository_ctx):
"""Find the C++ compiler."""
- cc_name = "gcc"
- if _GCC_HOST_COMPILER_PATH in repository_ctx.os.environ:
- cc_name = repository_ctx.os.environ[_GCC_HOST_COMPILER_PATH].strip()
- if not cc_name:
- cc_name = "gcc"
+ if _use_cuda_clang(repository_ctx):
+ target_cc_name = "clang"
+ cc_path_envvar = _CLANG_CUDA_COMPILER_PATH
+ else:
+ target_cc_name = "gcc"
+ cc_path_envvar = _GCC_HOST_COMPILER_PATH
+ cc_name = target_cc_name
+
+ if cc_path_envvar in repository_ctx.os.environ:
+ cc_name_from_env = repository_ctx.os.environ[cc_path_envvar].strip()
+ if cc_name_from_env:
+ cc_name = cc_name_from_env
if cc_name.startswith("/"):
# Absolute path, maybe we should make this suported by our which function.
return cc_name
cc = repository_ctx.which(cc_name)
if cc == None:
- fail(
- "Cannot find gcc, either correct your path or set the CC" +
- " environment variable")
+ fail(("Cannot find {}, either correct your path or set the {}" +
+ " environment variable").format(target_cc_name, cc_path_envvar))
return cc
@@ -64,10 +74,17 @@ def _cxx_inc_convert(path):
path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip()
return path
-
-def get_cxx_inc_directories(repository_ctx, cc):
- """Compute the list of default C++ include directories."""
- result = repository_ctx.execute([cc, "-E", "-xc++", "-", "-v"])
+def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp):
+ """Compute the list of default C or C++ include directories."""
+ if lang_is_cpp:
+ lang = "c++"
+ else:
+ lang = "c"
+ # TODO: We pass -no-canonical-prefixes here to match the compiler flags,
+ # but in cuda_clang CROSSTOOL file that is a `feature` and we should
+ # handle the case when it's disabled and no flag is passed
+ result = repository_ctx.execute([cc, "-no-canonical-prefixes",
+ "-E", "-x" + lang, "-", "-v"])
index1 = result.stderr.find(_INC_DIR_MARKER_BEGIN)
if index1 == -1:
return []
@@ -86,6 +103,19 @@ def get_cxx_inc_directories(repository_ctx, cc):
return [repository_ctx.path(_cxx_inc_convert(p))
for p in inc_dirs.split("\n")]
+def get_cxx_inc_directories(repository_ctx, cc):
+ """Compute the list of default C and C++ include directories."""
+ # For some reason `clang -xc` sometimes returns include paths that are
+ # different from the ones from `clang -xc++`. (Symlink and a dir)
+ # So we run the compiler with both `-xc` and `-xc++` and merge resulting lists
+ includes_cpp = _get_cxx_inc_directories_impl(repository_ctx, cc, True)
+ includes_c = _get_cxx_inc_directories_impl(repository_ctx, cc, False)
+
+ includes_cpp_set = set(includes_cpp)
+ return includes_cpp + [inc for inc in includes_c
+ if inc not in includes_cpp_set]
+
+
def auto_configure_fail(msg):
"""Output failure message when auto configuration fails."""
red = "\033[0;31m"
@@ -94,7 +124,7 @@ def auto_configure_fail(msg):
# END cc_configure common functions (see TODO above).
-def _gcc_host_compiler_includes(repository_ctx, cc):
+def _host_compiler_includes(repository_ctx, cc):
"""Generates the cxx_builtin_include_directory entries for gcc inc dirs.
Args:
@@ -645,7 +675,8 @@ def _create_dummy_repository(repository_ctx):
# Set up BUILD file for cuda/.
_tpl(repository_ctx, "cuda:build_defs.bzl",
{
- "%{cuda_is_configured}": "False"
+ "%{cuda_is_configured}": "False",
+ "%{cuda_extra_copts}": "[]"
})
_tpl(repository_ctx, "cuda:BUILD",
{
@@ -730,6 +761,19 @@ def _symlink_dir(repository_ctx, src_dir, dest_dir):
for src_file in files:
repository_ctx.symlink(src_file, dest_dir + "/" + src_file.basename)
+def _use_cuda_clang(repository_ctx):
+ if "TF_CUDA_CLANG" in repository_ctx.os.environ:
+ enable_cuda = repository_ctx.os.environ["TF_CUDA_CLANG"].strip()
+ return enable_cuda == "1"
+ return False
+
+def _compute_cuda_extra_copts(repository_ctx, cuda_config):
+ if _use_cuda_clang(repository_ctx):
+ capability_flags = ["--cuda-gpu-arch=sm_" + cap.replace(".", "") for cap in cuda_config.compute_capabilities]
+ else:
+ # Capabilities are handled in the "crosstool_wrapper_driver_is_not_gcc" for nvcc
+ capability_flags = []
+ return str(capability_flags)
def _create_cuda_repository(repository_ctx):
"""Creates the repository containing files set up to build with CUDA."""
@@ -761,7 +805,9 @@ def _create_cuda_repository(repository_ctx):
# Set up BUILD file for cuda/
_tpl(repository_ctx, "cuda:build_defs.bzl",
{
- "%{cuda_is_configured}": "True"
+ "%{cuda_is_configured}": "True",
+ "%{cuda_extra_copts}": _compute_cuda_extra_copts(repository_ctx, cuda_config),
+
})
_tpl(repository_ctx, "cuda:BUILD",
{
@@ -787,21 +833,25 @@ def _create_cuda_repository(repository_ctx):
# Set up crosstool/
_file(repository_ctx, "crosstool:BUILD")
cc = find_cc(repository_ctx)
- gcc_host_compiler_includes = _gcc_host_compiler_includes(repository_ctx, cc)
- _tpl(repository_ctx, "crosstool:CROSSTOOL",
- {
+ host_compiler_includes = _host_compiler_includes(repository_ctx, cc)
+ cuda_defines = {
"%{cuda_include_path}": cuda_config.cuda_toolkit_path + '/include',
- "%{gcc_host_compiler_includes}": gcc_host_compiler_includes,
- })
- _tpl(repository_ctx,
- "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc",
- {
- "%{cpu_compiler}": str(cc),
- "%{cuda_version}": cuda_config.cuda_version,
- "%{gcc_host_compiler_path}": str(cc),
- "%{cuda_compute_capabilities}": ", ".join(
- ["\"%s\"" % c for c in cuda_config.compute_capabilities]),
- })
+ "%{host_compiler_includes}": host_compiler_includes,
+ }
+ if _use_cuda_clang(repository_ctx):
+ cuda_defines["%{clang_path}"] = cc
+ _tpl(repository_ctx, "crosstool:CROSSTOOL_clang", cuda_defines, out="crosstool/CROSSTOOL")
+ else:
+ _tpl(repository_ctx, "crosstool:CROSSTOOL_nvcc", cuda_defines, out="crosstool/CROSSTOOL")
+ _tpl(repository_ctx,
+ "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc",
+ {
+ "%{cpu_compiler}": str(cc),
+ "%{cuda_version}": cuda_config.cuda_version,
+ "%{gcc_host_compiler_path}": str(cc),
+ "%{cuda_compute_capabilities}": ", ".join(
+ ["\"%s\"" % c for c in cuda_config.compute_capabilities]),
+ })
# Set up cuda_config.h, which is used by
# tensorflow/stream_executor/dso_loader.cc.
diff --git a/third_party/nccl/BUILD b/third_party/nccl/BUILD
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/third_party/nccl/BUILD
diff --git a/third_party/nccl/fix_clang_compilation.patch b/third_party/nccl/fix_clang_compilation.patch
new file mode 100644
index 0000000000..e8d2a7dc9f
--- /dev/null
+++ b/third_party/nccl/fix_clang_compilation.patch
@@ -0,0 +1,85 @@
+From 8241cd7b6ed1425eeb88fd380090575978e358f4 Mon Sep 17 00:00:00 2001
+From: Ilya Biryukov <ibiryukov@google.com>
+Date: Thu, 16 Mar 2017 12:01:11 +0100
+Subject: [PATCH 1/1] Fix compilation error when compiling with 'clang -x
+ cuda'.
+
+Functions vFetch and vStore are not found by ADL with clang,
+so they need to be declared before usage in ReduceCopy.
+---
+ src/common_kernel.h | 52 ++++++++++++++++++++++++++--------------------------
+ 1 file changed, 26 insertions(+), 26 deletions(-)
+
+diff --git a/src/common_kernel.h b/src/common_kernel.h
+index 28fbc85..cc71f8a 100644
+--- a/src/common_kernel.h
++++ b/src/common_kernel.h
+@@ -30,6 +30,32 @@
+ #define BAR(type, barid, nthreads) \
+ BAR_EXPAND(type, barid, ROUNDUP(nthreads, WARP_SIZE))
+
++template<typename T> inline __device__
++T vFetch(const volatile T* ptr) {
++ return *ptr;
++}
++
++#ifdef CUDA_HAS_HALF
++template<> inline __device__
++half vFetch<half>(const volatile half* ptr) {
++ half r;
++ r.x = ptr->x;
++ return r;
++}
++#endif
++
++template<typename T> inline __device__
++void vStore(volatile T* ptr, const T val) {
++ *ptr = val;
++}
++
++#ifdef CUDA_HAS_HALF
++template<> inline __device__
++void vStore<half>(volatile half* ptr, const half val) {
++ ptr->x = val.x;
++}
++#endif
++
+ __device__ unsigned int spinct;
+
+ // Spin wait until func evaluates to true
+@@ -225,32 +251,6 @@ __device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
+ return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
+ }
+
+-template<typename T> inline __device__
+-T vFetch(const volatile T* ptr) {
+- return *ptr;
+-}
+-
+-#ifdef CUDA_HAS_HALF
+-template<> inline __device__
+-half vFetch<half>(const volatile half* ptr) {
+- half r;
+- r.x = ptr->x;
+- return r;
+-}
+-#endif
+-
+-template<typename T> inline __device__
+-void vStore(volatile T* ptr, const T val) {
+- *ptr = val;
+-}
+-
+-#ifdef CUDA_HAS_HALF
+-template<> inline __device__
+-void vStore<half>(volatile half* ptr, const half val) {
+- ptr->x = val.x;
+-}
+-#endif
+-
+ // Assumptions:
+ // - there is exactly 1 block
+ // - THREADS is the number of producer threads
+--
+2.12.0.367.g23dc2f6d3c-goog
+
diff --git a/third_party/nccl/nccl.BUILD b/third_party/nccl/nccl.BUILD
new file mode 100644
index 0000000000..06b9b8ff68
--- /dev/null
+++ b/third_party/nccl/nccl.BUILD
@@ -0,0 +1,66 @@
+# NVIDIA nccl
+# A package of optimized primitives for collective multi-GPU communication.
+
+licenses(["notice"]) # BSD
+
+exports_files(["LICENSE.txt"])
+
+load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "if_cuda")
+
+SRCS = [
+ "src/all_gather.cu",
+ "src/all_reduce.cu",
+ "src/broadcast.cu",
+ "src/core.cu",
+ "src/libwrap.cu",
+ "src/reduce.cu",
+ "src/reduce_scatter.cu",
+]
+
+# Copy .cu to .cu.cc so they can be in srcs of cc_library.
+[
+ genrule(
+ name = "gen_" + src,
+ srcs = [src],
+ outs = [src + ".cc"],
+ cmd = "cp $(location " + src + ") $(location " + src + ".cc)",
+ )
+ for src in SRCS
+]
+
+SRCS_CU_CC = [src + ".cc" for src in SRCS]
+
+cc_library(
+ name = "nccl",
+ srcs = if_cuda(SRCS_CU_CC + glob(["src/*.h"])),
+ hdrs = if_cuda(["src/nccl.h"]),
+ copts = [
+ "-DCUDA_MAJOR=0",
+ "-DCUDA_MINOR=0",
+ "-DNCCL_MAJOR=0",
+ "-DNCCL_MINOR=0",
+ "-DNCCL_PATCH=0",
+ "-Iexternal/nccl_archive/src",
+ "-O3",
+ ] + cuda_default_copts(),
+ linkopts = select({
+ "@%ws%//tensorflow:android": [
+ "-pie",
+ ],
+ "@%ws%//tensorflow:darwin": [
+ "-Wl,-framework",
+ "-Wl,CoreFoundation",
+ "-Wl,-framework",
+ "-Wl,Security",
+ ],
+ "@%ws%//tensorflow:ios": [],
+ "@%ws%//tensorflow:windows": [
+ "ws2_32.lib",
+ ],
+ "//conditions:default": [
+ "-lrt",
+ ],
+ }),
+ visibility = ["//visibility:public"],
+ deps = ["@local_config_cuda//cuda:cuda_headers"],
+)
diff --git a/third_party/protobuf/BUILD b/third_party/protobuf/BUILD
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/third_party/protobuf/BUILD
diff --git a/third_party/protobuf/add_noinlines.patch b/third_party/protobuf/add_noinlines.patch
new file mode 100644
index 0000000000..af74798f06
--- /dev/null
+++ b/third_party/protobuf/add_noinlines.patch
@@ -0,0 +1,30 @@
+diff -u -r a/src/google/protobuf/compiler/cpp/cpp_file.cc b/src/google/protobuf/compiler/cpp/cpp_file.cc
+--- a/src/google/protobuf/compiler/cpp/cpp_file.cc 2017-02-10 23:55:34.000000000 +0100
++++ b/src/google/protobuf/compiler/cpp/cpp_file.cc 2017-03-21 13:41:46.931979154 +0100
+@@ -557,7 +557,7 @@
+ " $metadata$, $enum_descriptors$, $service_descriptors$);\n"
+ "}\n"
+ "\n"
+- "void protobuf_AssignDescriptorsOnce() {\n"
++ "GOOGLE_ATTRIBUTE_NOINLINE void protobuf_AssignDescriptorsOnce() {\n"
+ " static GOOGLE_PROTOBUF_DECLARE_ONCE(once);\n"
+ " ::google::protobuf::GoogleOnceInit(&once, &protobuf_AssignDescriptors);\n"
+ "}\n"
+@@ -656,7 +656,7 @@
+ printer->Print(
+ "}\n"
+ "\n"
+- "void InitDefaults() {\n"
++ "GOOGLE_ATTRIBUTE_NOINLINE void InitDefaults() {\n"
+ " static GOOGLE_PROTOBUF_DECLARE_ONCE(once);\n"
+ " ::google::protobuf::GoogleOnceInit(&once, &TableStruct::InitDefaultsImpl);\n"
+ "}\n");
+@@ -737,7 +737,7 @@
+ printer->Print(
+ "}\n"
+ "\n"
+- "void AddDescriptors() {\n"
++ "GOOGLE_ATTRIBUTE_NOINLINE void AddDescriptors() {\n"
+ " static GOOGLE_PROTOBUF_DECLARE_ONCE(once);\n"
+ " ::google::protobuf::GoogleOnceInit(&once, &AddDescriptorsImpl);\n"
+ "}\n");
diff --git a/tools/bazel.rc.template b/tools/bazel.rc.template
index 3622b9423c..097ff7b9d0 100644
--- a/tools/bazel.rc.template
+++ b/tools/bazel.rc.template
@@ -1,6 +1,11 @@
build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true
+
+build:cuda_clang --crosstool_top=@local_config_cuda//crosstool:toolchain
+build:cuda_clang --define=using_cuda=true --define=using_cuda_clang=true
+
build:win-cuda --define=using_cuda=true --define=using_cuda_nvcc=true
+
build:mkl --define=using_mkl=true
build:sycl --crosstool_top=@local_config_sycl//crosstool:toolchain