1 files changed, 327 insertions, 1 deletions
diff --git a/tensorflow/core/kernels/xsmm_conv2d_test.cc b/tensorflow/core/kernels/xsmm_conv2d_test.cc
index d81368314c..f4ab6896ae 100644
--- a/tensorflow/core/kernels/xsmm_conv2d_test.cc
+++ b/tensorflow/core/kernels/xsmm_conv2d_test.cc
@@ -15,13 +15,339 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/conv_ops.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "include/libxsmm.h"
+#include "tensorflow/core/framework/fake_input.h"
 
 namespace tensorflow {
 namespace {
 
+
+typedef struct {
+  int nImg;
+  int nIfm;
+  int nOfm;
+  int ifhp;
+  int ifwp;
+  int ifh;
+  int ifw;
+  int ofhp;
+  int ofwp;
+  int ofh;
+  int ofw;
+  int pad_h;
+  int pad_w;
+  int pad_h_in;
+  int pad_w_in;
+  int pad_h_out;
+  int pad_w_out;
+  int kh;
+  int kw;
+  int stride_h;
+  int stride_w;
+} naive_conv_t;
+
+
+LIBXSMM_INLINE void naive_copy_NCHW_to_NHWC(const float* nchw, Tensor &nhwc, int N, int H, int W, int C)
+{
+  LIBXSMM_VLA_DECL(4, const float,  input, nchw, C, H, W);
+  int n, h, w, c;
+  auto output =  nhwc.flat<float>();
+  for ( n = 0; n < N; n++ ) {
+    for ( h = 0; h < H; h++ ) {
+      for ( w = 0; w < W; w++ ) {
+        for ( c = 0; c < C; c++ ) {
+          output(n*H*W*C + h*W*C +w*C + c)  =
+          LIBXSMM_VLA_ACCESS(4,  input, n, c, h, w, C, H, W);
+        }
+      }
+    }
+  }
+}
+
+
+LIBXSMM_INLINE void naive_copy_KCRS_to_RSCK(const float* kcrs, Tensor  &rsck, int R, int S, int C, int K)
+{
+  LIBXSMM_VLA_DECL(4, const float,  input, kcrs, C, R, S);
+  int r, s, c, k;
+  auto output =  rsck.flat<float>();
+ 
+  for ( r = 0; r < R; r++ ) {
+    for ( s = 0; s < S; s++ ) {
+      for ( c = 0; c < C; c++ ) {
+        for ( k = 0; k < K; k++ ) {
+          output(r*S*C*K + s*C*K + c*K + k) =
+          LIBXSMM_VLA_ACCESS(4,  input, k, c, r, s, C, R, S);
+        }
+      }
+    }
+  }
+}
+
+
+
+LIBXSMM_INLINE void zero_buf(float* buf, long size) {
+  int i;
+  for (i = 0; i < size; ++i) {
+    buf[i] = 0.0f;
+  }
+}
+ 
+LIBXSMM_INLINE void copy_buf(Tensor &dst,float *src,long size) {
+  long  i;
+  auto output =  dst.flat<float>();
+  for (i = 0; i < size; ++i) 
+          output(i) = src[i];
+}
+ 
+LIBXSMM_INLINE void init_buf(float* buf, long size, int initPos, int initOne)
+{
+  int i;
+  zero_buf(buf, size);
+  for (i = 0; i < size; ++i) {
+    buf[i] = (float)((initOne != 0) ? 1.0 : ((initPos != 0) ? drand48() : (0.05 - drand48()/10.0)));
+  }
+}
+ 
+
+
+LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float* output, const float* filter)
+{
+  int nImg      = param->nImg;
+  int nIfm      = param->nIfm;
+  int nOfm      = param->nOfm;
+  int ifhp      = param->ifhp;
+  int ifwp      = param->ifwp;
+  int ofhp      = param->ofhp;
+  int ofwp      = param->ofwp;
+  int ifh       = param->ifh;
+  int ifw       = param->ifw;
+  int ofh       = param->ofh;
+  int ofw       = param->ofw;
+  int pad_h     = param->pad_h;
+  int pad_w     = param->pad_w;
+  int pad_h_in  = param->pad_h_in;
+  int pad_w_in  = param->pad_w_in;
+  int pad_h_out = param->pad_h_out;
+  int pad_w_out = param->pad_w_out;
+  int kh        = param->kh;
+  int kw        = param->kw;
+  int stride_h  = param->stride_h;
+  int stride_w  = param->stride_w;
+  /* loop counters */
+  int img, ofm, ifm, oj, oi, ij, ii, kj, ki;
+ 
+  LIBXSMM_VLA_DECL(4,       float, output_t, output + (pad_w_out * ofwp + pad_h_out), nOfm, ofhp, ofwp);
+  LIBXSMM_VLA_DECL(4, const float,  input_t,  input + (pad_w_in * ifwp + pad_h_in), nIfm, ifhp, ifwp);
+  LIBXSMM_VLA_DECL(4, const float, filter_t, filter, nIfm, kh, kw);
+ 
+  for (img = 0; img < nImg; ++img) {
+    for (ofm = 0; ofm < nOfm; ++ofm) {
+      for (ifm = 0; ifm < nIfm; ++ifm) {
+        for (oj = 0; oj < ofh; ++oj) {
+          ij = oj * stride_h - pad_h;
+          for (oi = 0; oi < ofw; ++oi) {
+            ii = oi * stride_w - pad_w;
+            for (kj = 0; kj < kh; ++kj) {
+              if(ij+kj < 0 || ij+kj >= ifh) continue;
+              for (ki = 0; ki < kw; ++ki) {
+                if(ii+ki < 0 || ii+ki >= ifw) continue;
+                LIBXSMM_VLA_ACCESS(  4, output_t, img, ofm, oj, oi, nOfm, ofhp, ofwp) +=
+                  LIBXSMM_VLA_ACCESS(4,  input_t, img, ifm, ij + kj, ii + ki, nIfm, ifhp, ifwp)
+                * LIBXSMM_VLA_ACCESS(4, filter_t, ofm, ifm, kj, ki, nIfm, kh, kw);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
 void RunXsmmVsGeneric() {}
 
-TEST(XsmmConv2DTest, Basic) {}
 
+class XsmmConv2DTest : public OpsTestBase {
+ protected:
+  void MakeOp(int stride) {
+  
+    TF_CHECK_OK(NodeDefBuilder("xsmm", "Conv2D")
+                      .Input(FakeInput(DT_FLOAT))
+                      .Input(FakeInput(DT_FLOAT))
+                      .Attr("strides", {1, stride,stride, 1})
+                      .Attr("padding", "VALID" )
+                      .Finalize(node_def()));
+
+
+    TF_ASSERT_OK(InitOp());
+  }
+};
+ 
+TEST_F(XsmmConv2DTest, Basic) {
+     MakeOp(1);
+
+
+     int ifw = 14;           /* input width, "W" */
+     int ifh = 14;           /* input height, "H" */
+     int nImg = 32;          /* mini-batch size, "N" */
+     int nIfm = 64;         /* number of input feature maps, "C" */
+     int nOfm = 64;         /* number of output feature maps, "K" */
+     int kh = 3;             /* filter height, "R" */
+     int kw = 3;             /* filter width, "S" */
+     int pad = 0;            /* padding in output */
+     int stride = 1;         /* stride when accessing inputs */
+
+
+     int stride_w = stride;
+     int stride_h = stride;
+     int pad_h = pad;
+     int pad_w = pad;
+ 
+     int pad_h_in = pad_h;
+     int pad_w_in = pad_w;
+ 
+     int pad_h_out = 0;
+     int pad_w_out = 0;
+ 
+  /* deriving some values for naive code */
+     int ofh = (ifh + 2 * pad_h - kh) / stride_h + 1;
+     int ofw = (ifw + 2 * pad_w - kw) / stride_w + 1;
+     int ifhp = ifh + 2 * pad_h_in;
+     int ifwp = ifw + 2 * pad_w_in;
+     int ofhp = ofh + 2 * pad_h_out;
+     int ofwp = ofw + 2 * pad_w_out;
+
+
+    //Initialization of Filter and Image
+    
+    /* allocate data */
+     float *naive_input           = (float*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152);
+     float *naive_output          = (float*)libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152);
+     float *naive_filter          = (float*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw*    sizeof(float), 2097152);
+     /* initialize data */
+     init_buf(naive_input,          nImg*nIfm*ifhp*ifwp, 0, 0);
+     zero_buf(naive_output,         nImg*nOfm*ofhp*ofwp);
+     init_buf(naive_filter,         nOfm*nIfm*kh*kw, 0, 0);
+        
+
+     Tensor image(DT_FLOAT,
+                 {nImg, ifhp, ifwp, nIfm});
+ 
+ 
+     Tensor filter(DT_FLOAT, {kh,kw,nIfm,nOfm});
+ 
+
+     naive_copy_NCHW_to_NHWC(naive_input, image, nImg, ifhp, ifwp, nIfm);
+     naive_copy_KCRS_to_RSCK(naive_filter, filter, kh, kw, nIfm, nOfm); 
+
+
+    //Run naive convolution
+    
+     naive_conv_t naive_param;
+
+     naive_param.nImg = nImg;
+     naive_param.nIfm = nIfm;
+     naive_param.nOfm = nOfm;
+     naive_param.ifhp = ifhp;
+     naive_param.ifwp = ifwp;
+     naive_param.ofhp = ofhp;
+     naive_param.ofwp = ofwp;
+     naive_param.ifh = ifh;
+     naive_param.ifw = ifw;
+     naive_param.ofh = ofh;
+     naive_param.ofw = ofw;
+     naive_param.pad_h = pad_h;
+     naive_param.pad_w = pad_w;
+     naive_param.pad_h_in = pad_h_in;
+     naive_param.pad_w_in = pad_w_in;
+     naive_param.pad_h_out = pad_h_out;
+     naive_param.pad_w_out = pad_w_out;
+     naive_param.kh = kh;
+     naive_param.kw = kw;
+     naive_param.stride_h = stride_h;
+     naive_param.stride_w = stride_w;
+
+
+     naive_conv_fp(&naive_param, naive_input, naive_output, naive_filter);
+
+ 
+ 
+     AddInputFromArray<float>(image.shape(), image.flat<float>());
+     AddInputFromArray<float>(filter.shape(), filter.flat<float>());
+
+
+
+     //Run Op (TF)
+     TF_ASSERT_OK(RunOpKernel());
+ 
+     // Check the output.
+     Tensor expected(DT_FLOAT, {nImg,ofhp,ofwp, nOfm});
+     naive_copy_NCHW_to_NHWC(naive_output, expected, nImg, ofhp, ofwp, nOfm);
+
+
+     test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
+     libxsmm_free(naive_input);
+     libxsmm_free(naive_output);
+     libxsmm_free(naive_filter);
+
+
+
+}
+
+/*
+
+
+TEST(XsmmConv2DTest, Basic) {
+
+    auto num_threads =
+        ctx->device()->tensorflow_cpu_worker_threads()->num_threads;
+    // See libxsmm_dnn.h for this struct definition.
+    libxsmm_dnn_conv_desc desc;
+    desc.N = batch;
+    desc.C = in_depth;
+    desc.H = input_rows;
+    desc.W = input_cols;
+    desc.K = out_depth;
+    desc.R = filter_rows;
+    desc.S = filter_cols;
+    desc.u = stride_rows;
+    desc.v = stride_cols;
+    desc.pad_h = pad_rows;
+    desc.pad_w = pad_cols;
+    desc.pad_h_in = pad_rows;  // libxsmm supports only physical padding for now
+    desc.pad_w_in = pad_cols;  // libxsmm supports only physical padding for now
+    desc.pad_h_out = 0;
+    desc.pad_w_out = 0;
+    desc.threads = num_threads;
+    desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT;
+    desc.buffer_format = LIBXSMM_DNN_CONV_FORMAT_NHWC;
+    desc.filter_format = LIBXSMM_DNN_CONV_FORMAT_LIBXSMM;//LIBXSMM_DNN_CONV_FORMAT_RSCK;
+    desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
+    desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
+    desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32;
+    desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32;
+ 
+    if (!CanUseXsmmConv2D(desc, data_format)) {
+      return false;
+    }
+ 
+    auto input_ptr = input.template flat<float>().data();
+    auto filter_ptr = filter.template flat<float>().data();
+    auto output_ptr = output->template flat<float>().data();
+ 
+    bool success = functor::XsmmFwdConv2D<CPUDevice, float>()(
+        ctx, desc, input_ptr, filter_ptr, output_ptr);
+    return success;
+
+
+
+
+
+
+
+}
+*/
 }  // namespace
 }  // namespace tensorflow