tensorflow
diff --git a/‎src/backends/cpu/backend_cpu.ts
Lines changed: 46 additions & 15 deletions b/‎src/backends/cpu/backend_cpu.ts
Lines changed: 46 additions & 15 deletions
diff --git a/‎src/backends/webgl/backend_webgl.ts
Lines changed: 25 additions & 15 deletions b/‎src/backends/webgl/backend_webgl.ts
Lines changed: 25 additions & 15 deletions
diff --git a/‎src/backends/webgl/conv_backprop_gpu.ts
Lines changed: 29 additions & 8 deletions b/‎src/backends/webgl/conv_backprop_gpu.ts
Lines changed: 29 additions & 8 deletions
@@ -1558,40 +1558,52 @@ export class MathBackendCPU implements KernelBackend {
     const dilationWidth = convInfo.dilationWidth;
     const padLeft = convInfo.padInfo.left;
     const padTop = convInfo.padInfo.top;
+    const isChannelsLast = convInfo.dataFormat === 'channelsLast';
+
     const y = ops.buffer(convInfo.outShape, x.dtype as 'float32');
 
+    const xBatchStride = x.strides[0];
+    const xRowStride = isChannelsLast ? x.strides[1] : x.strides[2];
+    const xColStride = isChannelsLast ? x.strides[2] : 1;
+    const xChannelStride = isChannelsLast ? 1 : x.strides[1];
+    const yBatchStride = y.strides[0];
+    const yRowStride = isChannelsLast ? y.strides[1] : y.strides[2];
+    const yColStride = isChannelsLast ? y.strides[2] : 1;
+    const yChannelStride = isChannelsLast ? 1 : y.strides[1];
+
     const xVals = this.readSync(x.dataId) as TypedArray;
     const wVals = this.readSync(filter.dataId) as TypedArray;
     const yVals = y.values;
 
     for (let b = 0; b < convInfo.batchSize; ++b) {
-      const xOffset1 = b * x.strides[0];
-      const yOffset1 = b * y.strides[0];
+      const xOffset1 = b * xBatchStride;
+      const yOffset1 = b * yBatchStride;
       for (let yR = 0; yR < convInfo.outHeight; ++yR) {
-        const yOffset2 = yOffset1 + yR * y.strides[1];
+        const yOffset2 = yOffset1 + yR * yRowStride;
         const xRCorner = yR * convInfo.strideHeight - padTop;
         for (let wR = 0; wR < filterHeight; wR++) {
           const xR = xRCorner + wR * dilationHeight;
           if (xR < 0 || xR >= convInfo.inHeight) {
             continue;
           }
           const wOffset1 = wR * filter.strides[0];
-          const xOffset2 = xOffset1 + xR * x.strides[1];
+          const xOffset2 = xOffset1 + xR * xRowStride;
           for (let yC = 0; yC < convInfo.outWidth; ++yC) {
-            const yOffset3 = yOffset2 + yC * convInfo.outChannels;
+            const yOffset3 = yOffset2 + yC * yColStride;
             const xCCorner = yC * convInfo.strideWidth - padLeft;
             for (let wC = 0; wC < filterWidth; wC++) {
               const xC = xCCorner + wC * dilationWidth;
               if (xC < 0 || xC >= convInfo.inWidth) {
                 continue;
               }
               const wOffset2 = wOffset1 + wC * filter.strides[1];
-              const xOffset3 = xOffset2 + xC * convInfo.inChannels;
+              const xOffset3 = xOffset2 + xC * xColStride;
               let wOffset3 = wOffset2;
               for (let d1 = 0; d1 < convInfo.inChannels; ++d1) {
-                const xVal = xVals[xOffset3 + d1];
+                const xVal = xVals[xOffset3 + d1 * xChannelStride];
                 for (let d2 = 0; d2 < convInfo.outChannels; ++d2) {
-                  yVals[yOffset3 + d2] += xVal * wVals[wOffset3 + d2];
+                  yVals[yOffset3 + d2 * yChannelStride] +=
+                      xVal * wVals[wOffset3 + d2];
                 }
                 wOffset3 += convInfo.outChannels;
               }
@@ -1677,9 +1689,7 @@ export class MathBackendCPU implements KernelBackend {
 
     const dx = ops.buffer<Rank.R4>(convInfo.inShape, 'float32');
     const dxValues = dx.values;
-    const [dxS0, dxS1, dxS2] = dx.strides;
     const dyValues = this.readSync(dy.dataId) as TypedArray;
-    const [dyS0, dyS1, dyS2] = dy.strides;
     const fltValues = this.readSync(filter.dataId) as TypedArray;
     const [fltS0, fltS1, fltS2] = filter.strides;
     const {
@@ -1693,11 +1703,22 @@ export class MathBackendCPU implements KernelBackend {
       outHeight,
       outWidth,
       strideHeight,
-      strideWidth
+      strideWidth,
+      dataFormat
     } = convInfo;
     const topPad = filterHeight - 1 - convInfo.padInfo.top;
     const leftPad = filterWidth - 1 - convInfo.padInfo.left;
 
+    const isChannelsLast = dataFormat === 'channelsLast';
+    const xBatchStride = dx.strides[0];
+    const xRowStride = isChannelsLast ? dx.strides[1] : dx.strides[2];
+    const xColStride = isChannelsLast ? dx.strides[2] : 1;
+    const xChannelStride = isChannelsLast ? 1 : dx.strides[1];
+    const yBatchStride = dy.strides[0];
+    const yRowStride = isChannelsLast ? dy.strides[1] : dy.strides[2];
+    const yColStride = isChannelsLast ? dy.strides[2] : 1;
+    const yChannelStride = isChannelsLast ? 1 : dy.strides[1];
+
     for (let b = 0; b < batchSize; ++b) {
       for (let d1 = 0; d1 < inChannels; ++d1) {
         for (let xR = 0; xR < inHeight; ++xR) {
@@ -1718,18 +1739,21 @@ export class MathBackendCPU implements KernelBackend {
 
               for (let yC = xCMin; yC < yCMax; ++yC) {
                 const wC = yC * strideWidth - xCCorner;
-                const dyOffset = dyS0 * b + dyS1 * yR + dyS2 * yC;
+                const dyOffset =
+                    yBatchStride * b + yRowStride * yR + yColStride * yC;
                 const fltOffset = fltS0 * (filterHeight - 1 - wR) +
                     fltS1 * (filterWidth - 1 - wC) + fltS2 * d1;
 
                 for (let d2 = 0; d2 < outChannels; ++d2) {
-                  const pixel = dyValues[dyOffset + d2];
+                  const pixel = dyValues[dyOffset + yChannelStride * d2];
                   const weight = fltValues[fltOffset + d2];
                   dotProd += pixel * weight;
                 }
               }
             }
-            dxValues[dxS0 * b + dxS1 * xR + dxS2 * xC + d1] = dotProd;
+            const dxOffset = xBatchStride * b + xRowStride * xR +
+                xColStride * xC + xChannelStride * d1;
+            dxValues[dxOffset] = dotProd;
           }
         }
       }
@@ -1829,6 +1853,7 @@ export class MathBackendCPU implements KernelBackend {
     const strideWidth = convInfo.strideWidth;
     const filterHeight = convInfo.filterHeight;
     const filterWidth = convInfo.filterWidth;
+    const isChannelsLast = convInfo.dataFormat === 'channelsLast';
     const dW = ops.buffer<Rank.R4>(convInfo.filterShape, 'float32');
 
     const leftPad = convInfo.padInfo.left;
@@ -1854,7 +1879,13 @@ export class MathBackendCPU implements KernelBackend {
                 const xR = wR + yR * strideHeight - topPad;
                 for (let yC = yCMin; yC < yCMax; ++yC) {
                   const xC = wC + yC * strideWidth - leftPad;
-                  dotProd += xBuf.get(b, xR, xC, d1) * dyBuf.get(b, yR, yC, d2);
+                  if (isChannelsLast) {
+                    dotProd +=
+                        xBuf.get(b, xR, xC, d1) * dyBuf.get(b, yR, yC, d2);
+                  } else {
+                    dotProd +=
+                        xBuf.get(b, d1, xR, xC) * dyBuf.get(b, d2, yR, yC);
+                  }
                 }
               }
             }
 
@@ -1843,6 +1843,7 @@ export class MathBackendWebGL implements KernelBackend {
     const sharedMatMulDim = convInfo.inChannels;
     const outerShapeX = xShape[0] * xShape[1] * xShape[2];
     const outerShapeFilter = convInfo.outChannels;
+    const isChannelsLast = convInfo.dataFormat === 'channelsLast';
     const transposeA = false;
     const transposeB = false;
 
@@ -1856,10 +1857,10 @@ export class MathBackendWebGL implements KernelBackend {
     if (batchMatMulWillBeUnpacked || !ENV.getBool('WEBGL_LAZILY_UNPACK') ||
         !ENV.getBool('WEBGL_PACK_BINARY_OPERATIONS') ||
         !reshapeWillBeExpensive) {
+      const targetShape = isChannelsLast ? xShape[0] * xShape[1] * xShape[2] :
+                                           xShape[0] * xShape[2] * xShape[3];
       const xReshaped =
-          this.reshape(
-              x, [1, xShape[0] * xShape[1] * xShape[2], convInfo.inChannels]) as
-          Tensor3D;
+          this.reshape(x, [1, targetShape, convInfo.inChannels]) as Tensor3D;
       const filterReshaped =
           this.reshape(
               filter, [1, convInfo.inChannels, convInfo.outChannels]) as
@@ -1879,17 +1880,19 @@ export class MathBackendWebGL implements KernelBackend {
     }
 
     // Following optimization is specific to packed |x| with odd row count
-    // ('row count' refers to x.shape[2]): we avoid expensive packed 2x2
-    // reshape by padding row count to next, even number. When x.shape[2] is
-    // odd, the result of packed batchMatMul is the same (has the same texture
-    // layout and and values in the texture) as it is for even x.shape[2] + 1.
-    // We make the odd-rows tensor to look like even-rows tensor before the
-    // operation and, after the batchMatMul, fix the even-rows result to have
-    // odd number of rows.
-    const xReshaped =
-        Tensor.make(
-            [1, xShape[0] * xShape[1] * (xShape[2] + 1), convInfo.inChannels],
-            {dataId: x.dataId}, x.dtype, this) as Tensor3D;
+    // (For example, in channelsLast mode, 'row count' refers to x.shape[2]):
+    // we avoid expensive packed 2x2 reshape by padding row count to the next
+    // even number. When x.shape[2] is odd, the result of packed batchMatMul is
+    // the same (has the same texture layout and values in the texture) as
+    // it is for even x.shape[2] + 1. We make the odd-rows tensor look like an
+    // even-rows tensor before the operation and, after the batchMatMul,
+    // fix the even-rows result to have an odd number of rows.
+    const targetShape = isChannelsLast ?
+        xShape[0] * xShape[1] * (xShape[2] + 1) :
+        xShape[0] * xShape[2] * (xShape[3] + 1);
+    const xReshaped = Tensor.make(
+                          [1, targetShape, convInfo.inChannels],
+                          {dataId: x.dataId}, x.dtype, this) as Tensor3D;
 
     // xTexData.shape gets referenced from GPGPUBinary.inShapeInfos.
     // Decrementing row count, after batchMatMul->...->compileProgram leads to
@@ -1948,8 +1951,11 @@ export class MathBackendWebGL implements KernelBackend {
       inChannels,
       outWidth,
       outHeight,
+      dataFormat
     } = convInfo;
 
+    const isChannelsLast = dataFormat === 'channelsLast';
+
     const sharedDim = filterWidth * filterHeight * inChannels;
     const numCols = outHeight * outWidth;
     const x2ColShape = [sharedDim, numCols];
@@ -1982,7 +1988,11 @@ export class MathBackendWebGL implements KernelBackend {
     }
     const product = this.compileAndRun<Tensor4D>(matmulProgram, inputs);
 
-    return product.reshape([1, outHeight, outWidth, convInfo.outChannels]);
+    if (isChannelsLast) {
+      return product.reshape([1, outHeight, outWidth, convInfo.outChannels]);
+    } else {
+      return product.reshape([1, convInfo.outChannels, outHeight, outWidth]);
+    }
   }
 
   fusedConv2d(
 
@@ -30,6 +30,7 @@ export class Conv2DDerFilterProgram implements GPGPUProgram {
     const strideWidth = convInfo.strideWidth;
     const padTop = convInfo.padInfo.top;
     const padLeft = convInfo.padInfo.left;
+    const isChannelsLast = convInfo.dataFormat === 'channelsLast';
 
     this.userCode = `
       void main() {
@@ -58,9 +59,16 @@ export class Conv2DDerFilterProgram implements GPGPUProgram {
                 continue;
               }
 
-              float dyValue = getDy(b, yR, yC, d2);
-              float xValue = getX(b, xR, xC, d1);
-              dotProd += (xValue * dyValue);
+              if (${isChannelsLast}) {
+                float dyValue = getDy(b, yR, yC, d2);
+                float xValue = getX(b, xR, xC, d1);
+                dotProd += (xValue * dyValue);
+              } else {
+                float dyValue = getDy(b, d2, yR, yC);
+                float xValue = getX(b, d1, xR, xC);
+                dotProd += (xValue * dyValue);
+              }
+
             }
           }
         }
@@ -82,19 +90,24 @@ export class Conv2DDerInputProgram implements GPGPUProgram {
     const filterWidth = convInfo.filterWidth;
     const strideHeight = convInfo.strideHeight;
     const strideWidth = convInfo.strideWidth;
+    const isChannelsLast = convInfo.dataFormat === 'channelsLast';
 
     const padTop = filterHeight - 1 - convInfo.padInfo.top;
     const padLeft = filterWidth - 1 - convInfo.padInfo.left;
 
+    const rowDim = isChannelsLast ? 1 : 2;
+    const colDim = isChannelsLast ? 2 : 3;
+    const channelDim = isChannelsLast ? 3 : 1;
+
     this.userCode = `
       const ivec2 pads = ivec2(${padTop}, ${padLeft});
 
       void main() {
         ivec4 coords = getOutputCoords();
         int batch = coords[0];
-        int d1 = coords[3];
+        int d1 = coords[${channelDim}];
 
-        ivec2 dyCorner = coords.yz - pads;
+        ivec2 dyCorner = ivec2(coords[${rowDim}], coords[${colDim}]) - pads;
         int dyRCorner = dyCorner.x;
         int dyCCorner = dyCorner.y;
 
@@ -123,9 +136,17 @@ export class Conv2DDerInputProgram implements GPGPUProgram {
             int wCPerm = ${filterWidth} - 1 - wC;
 
             for (int d2 = 0; d2 < ${convInfo.outChannels}; d2++) {
-              float xValue = getDy(batch, idyR, idyC, d2);
-              float wValue = getW(wRPerm, wCPerm, d1, d2);
-              dotProd += xValue * wValue;
+
+              if (${isChannelsLast}) {
+                float xValue = getDy(batch, idyR, idyC, d2);
+                float wValue = getW(wRPerm, wCPerm, d1, d2);
+                dotProd += xValue * wValue;
+              } else {
+                float xValue = getDy(batch, d2, idyR, idyC);
+                float wValue = getW(wRPerm, wCPerm, d1, d2);
+                dotProd += xValue * wValue;
+              }
+
             }
           }
         }