td2014
diff --git a/‎tensorflow/core/BUILD‎
Lines changed: 2 additions & 2 deletions b/‎tensorflow/core/BUILD‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎tensorflow/core/kernels/BUILD‎
Lines changed: 10 additions & 3 deletions b/‎tensorflow/core/kernels/BUILD‎
Lines changed: 10 additions & 3 deletions
diff --git a/‎tensorflow/core/kernels/mkl_transpose_op.cc‎
Lines changed: 67 additions & 0 deletions b/‎tensorflow/core/kernels/mkl_transpose_op.cc‎
Lines changed: 67 additions & 0 deletions
diff --git a/‎tensorflow/core/kernels/transpose_op.cc‎
Lines changed: 15 additions & 0 deletions b/‎tensorflow/core/kernels/transpose_op.cc‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎tensorflow/core/kernels/transpose_op.h‎
Lines changed: 11 additions & 0 deletions b/‎tensorflow/core/kernels/transpose_op.h‎
Lines changed: 11 additions & 0 deletions
@@ -702,8 +702,8 @@ cc_library(
         "//tensorflow/core/kernels:math_not_windows",
         "//tensorflow/core/kernels:quantized_ops",
     ]) + if_mkl([
-        "//tensorflow/core/kernels:mkl_ops",
         "//tensorflow/core/kernels:mkl_conv_op",
+        "//tensorflow/core/kernels:mkl_matmul_op",
         "//tensorflow/core/kernels:mkl_tfconv_op",
     ]),
 )
@@ -2040,7 +2040,7 @@ if_mkl(
             "//tensorflow/cc:scope",
             "//tensorflow/cc:sendrecv_ops",
             "//tensorflow/core/kernels:mkl_conv_op",
-            "//tensorflow/core/kernels:mkl_ops",
+            "//tensorflow/core/kernels:mkl_matmul_op",
             "//tensorflow/core/kernels:mkl_tfconv_op",
             "//tensorflow/core/kernels:ops_util",
             "//third_party/eigen3",
 
@@ -690,8 +690,15 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "transpose_op",
-    prefix = "transpose_op",
-    deps = ARRAY_DEPS,
+    srcs = [
+        "transpose_op.cc",
+    ] + if_mkl([
+        "mkl_transpose_op.cc",  # Whole file body is guarded by #ifdef INTEL_MKL.
+    ]),
+    hdrs = ["transpose_op.h"],
+    deps = ARRAY_DEPS + if_mkl([
+        "//third_party/mkl:intel_binary_blob",  # NOTE(review): presumably supplies mkl_trans.h; confirm.
+    ]),
 )
 
 tf_kernel_library(
@@ -4365,7 +4372,7 @@ tf_cc_test(
 
 if_mkl(
     tf_kernel_library(
-        name = "mkl_ops",
+        name = "mkl_matmul_op",
         prefix = "mkl_matmul",
         deps = [
             ":math",
 
@@ -0,0 +1,67 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/array_ops.cc.
+
+#ifdef INTEL_MKL
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/transpose_op.h"
+#include "tensorflow/core/kernels/transpose_functor.h"
+#include "third_party/mkl/include/mkl_trans.h"
+
+namespace tensorflow {
+
+// MklTransposeCpuOp::DoTranspose shuffles the dimensions of `in` (any type T,
+// rank N) according to the permutation `perm` and writes the result to `out`.
+//
+// Specifically, `out` meets the following conditions:
+// 1) out.dims() == in.dims();
+// 2) out.dim_size(i) == in.dim_size(perm[i]);
+// 3) out.tensor<T, N>(i_0, i_1, ..., i_N-1) ==
+//      in.tensor<T, N>(j_0, j_1, ..., j_N-1),
+//    where i_s == j_{perm[s]}
+// Rank-2 float inputs with perm == {1, 0} are handled by MKL's out-of-place
+// mkl_somatcopy; every other case is forwarded to the generic DoTranspose.
+// REQUIRES: perm is a vector of int32 and a permutation of 0, 1, ..., N-1.
+// REQUIRES: in.dims() == perm.size().
+// REQUIRES: out is already allocated (it is written through out->flat()).
+
+Status MklTransposeCpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
+                                      gtl::ArraySlice<int32> perm,
+                                      Tensor* out) {
+  // mkl_somatcopy('T') can only swap the two axes, so besides rank and dtype
+  // the permutation itself must be {1, 0}; anything else uses the fallback.
+  const bool swaps_axes = perm.size() == 2 && perm[0] == 1 && perm[1] == 0;
+  if (swaps_axes && in.dims() == 2 && in.dtype() == DT_FLOAT) {
+    const float* src = in.flat<float>().data();
+    float* dst = out->flat<float>().data();
+    // Documentation here: https://software.intel.com/en-us/node/520863
+    // Parameters: (ordering:row-major, operation:transpose, num_rows, num_cols,
+    //              alpha (scale), src, src leading dim, dst, dst leading dim)
+    mkl_somatcopy('R', 'T', in.dim_size(0), in.dim_size(1), 1.0f,
+                  src, in.dim_size(1),
+                  dst, in.dim_size(0));
+    return Status::OK();
+  }
+
+  // Fallback to eigen if transpose parameters not supported by MKL
+  typedef Eigen::ThreadPoolDevice CPUDevice;
+  return ::tensorflow::DoTranspose(ctx->eigen_device<CPUDevice>(), in, perm,
+                                   out);
+}  // MklTransposeCpuOp::DoTranspose
+} // namespace tensorflow
+
+#endif  // INTEL_MKL
@@ -180,6 +180,20 @@ Status TransposeCpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
                                    out);
 }
 
+#ifdef INTEL_MKL
+#define REGISTER(T)                                           \
+  REGISTER_KERNEL_BUILDER(Name("Transpose")                   \
+                              .Device(DEVICE_CPU)             \
+                              .TypeConstraint<T>("T")         \
+                              .TypeConstraint<int32>("Tperm") \
+                              .HostMemory("perm"),            \
+                          MklTransposeCpuOp);
+TF_CALL_ALL_TYPES(REGISTER)  // No ';': each REGISTER expansion supplies one.
+REGISTER(bfloat16);  // Registered explicitly in addition to TF_CALL_ALL_TYPES.
+#undef REGISTER
+
+#else  // INTEL_MKL
+
 #define REGISTER(T)                                           \
   REGISTER_KERNEL_BUILDER(Name("Transpose")                   \
                               .Device(DEVICE_CPU)             \
@@ -190,6 +204,7 @@ Status TransposeCpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
 TF_CALL_ALL_TYPES(REGISTER)
 REGISTER(bfloat16);
 #undef REGISTER
+#endif  // INTEL_MKL
 
 #if GOOGLE_CUDA
 Status TransposeGpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
 
@@ -41,6 +41,17 @@ class TransposeCpuOp : public TransposeOp {
                      gtl::ArraySlice<int32> perm, Tensor* out) override;
 };
 
+#ifdef INTEL_MKL
+class MklTransposeCpuOp : public TransposeOp {  // CPU transpose using MKL.
+ public:
+  explicit MklTransposeCpuOp(OpKernelConstruction* ctx) : TransposeOp(ctx) {}
+  // DoTranspose lives in mkl_transpose_op.cc: MKL when supported, else Eigen.
+ protected:
+  Status DoTranspose(OpKernelContext* ctx, const Tensor& in,
+                     gtl::ArraySlice<int32> perm, Tensor* out) override;
+};
+#endif  // INTEL_MKL
+
 class TransposeGpuOp : public TransposeOp {
  public:
   explicit TransposeGpuOp(OpKernelConstruction* ctx) : TransposeOp(ctx) {}