
Commit 0aa9e04

Add partial support for broadcasting of bias in fusedMatMul. (tensorflow#1502)
PERF
1 parent 017a3bf commit 0aa9e04

6 files changed: 58 additions & 35 deletions
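In user-facing terms, the change lets the bias passed to tf.fused.matMul be any tensor that broadcasts to the output shape (e.g. a rank-1 per-column bias), instead of requiring a bias of the full output shape. A quick usage sketch, mirroring the new test in src/ops/fused_test.ts (the standard @tensorflow/tfjs entry point is assumed):

import * as tf from '@tensorflow/tfjs';

const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]);
const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]);
const bias = tf.tensor1d([1, 1]);  // shape [2]; broadcasts across rows

// relu(a · b + bias), computed as a single fused kernel:
const d = tf.fused.matMul(a, b, false, false, bias, 'relu');
d.print();  // [[1, 9], [0, 21]]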

src/kernels/backend.ts

Lines changed: 2 additions & 2 deletions
@@ -16,7 +16,7 @@
  */

 import {Conv2DInfo, Conv3DInfo} from '../ops/conv_util';
-import {FusableActivation} from '../ops/fused_util';
+import {Activation} from '../ops/fused_util';
 import {DataId, Scalar, Tensor, Tensor1D, Tensor2D, Tensor3D, Tensor4D, Tensor5D} from '../tensor';
 import {DataType, DataValues, Rank, ShapeMap} from '../types';

@@ -124,7 +124,7 @@ export class KernelBackend implements TensorStorage, BackendTimer {

   fusedBatchMatMul(
       a: Tensor3D, b: Tensor3D, transposeA: boolean, transposeB: boolean,
-      bias?: Tensor3D, activation?: FusableActivation): Tensor3D {
+      bias?: Tensor, activation?: Activation): Tensor3D {
     throw new Error('Not yet implemented');
   }


src/kernels/backend_cpu.ts

Lines changed: 3 additions & 3 deletions
@@ -25,7 +25,7 @@ import * as broadcast_util from '../ops/broadcast_util';
 import * as concat_util from '../ops/concat_util';
 import {Conv2DInfo, Conv3DInfo} from '../ops/conv_util';
 import * as erf_util from '../ops/erf_util';
-import {FusableActivation} from '../ops/fused_util';
+import {Activation} from '../ops/fused_util';
 import * as gather_nd_util from '../ops/gather_nd_util';
 import * as ops from '../ops/ops';
 import {buffer, scalar, tensor, tensor3d, tensor4d} from '../ops/ops';
@@ -46,7 +46,7 @@ import {topkImpl} from './topk_impl';
 import {whereImpl} from './where_impl';

 function mapActivation(
-    backend: MathBackendCPU, activation: FusableActivation, x: Tensor): Tensor {
+    backend: MathBackendCPU, activation: Activation, x: Tensor): Tensor {
   if (activation === 'linear') {
     return backend.linear(x);
   } else if (activation === 'relu') {
@@ -484,7 +484,7 @@ export class MathBackendCPU implements KernelBackend {

   fusedBatchMatMul(
       a: Tensor3D, b: Tensor3D, transposeA: boolean, transposeB: boolean,
-      bias?: Tensor3D, activation?: FusableActivation): Tensor3D {
+      bias?: Tensor, activation?: Activation): Tensor3D {
     let result = this.batchMatMul(a, b, transposeA, transposeB);
     if (bias) {
       result = this.add(result, bias) as Tensor3D;
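The CPU backend needs nothing beyond the signature change because add() already performs NumPy-style broadcasting, so a rank-1 bias expands across the rank-3 matmul output for free. A minimal sketch of that behavior (shapes chosen to match the kernel's [batch, rows, cols] output; not the backend's own code):

import * as tf from '@tensorflow/tfjs';

const matmulOut = tf.tensor3d([0, 8, -3, 20], [1, 2, 2]);  // batchMatMul result
const bias = tf.tensor1d([1, 1]);                          // shape [2]

// add() broadcasts the bias over the two leading dimensions:
matmulOut.add(bias).print();  // [[[1, 9], [-2, 21]]]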

src/kernels/backend_webgl.ts

Lines changed: 12 additions & 12 deletions
@@ -25,7 +25,7 @@ import * as axis_util from '../ops/axis_util';
 import * as broadcast_util from '../ops/broadcast_util';
 import {computeOutShape} from '../ops/concat_util';
 import {Conv2DInfo, Conv3DInfo} from '../ops/conv_util';
-import {FusableActivation} from '../ops/fused_util';
+import {Activation} from '../ops/fused_util';
 import * as gather_nd_util from '../ops/gather_nd_util';
 import * as reduce_util from '../ops/reduce_util';
 import * as scatter_nd_util from '../ops/scatter_nd_util';
@@ -132,7 +132,7 @@ export interface WebGLTimingInfo extends TimingInfo {
 }

 function mapActivationToShaderProgram(
-    activation: FusableActivation, packed = false): string {
+    activation: Activation, packed = false): string {
   if (activation === 'linear') {
     if (packed) {
       return unary_packed_op.LINEAR;
@@ -789,7 +789,7 @@ export class MathBackendWebGL implements KernelBackend {

   fusedBatchMatMul(
       a: Tensor3D, b: Tensor3D, transposeA: boolean, transposeB: boolean,
-      bias?: Tensor3D, activation?: FusableActivation): Tensor3D {
+      bias?: Tensor, activation?: Activation): Tensor3D {
     const outerShapeA = transposeA ? a.shape[2] : a.shape[1];
     const outerShapeB = transposeB ? b.shape[1] : b.shape[2];
     const [batch, , ] = a.shape;
@@ -807,17 +807,17 @@ export class MathBackendWebGL implements KernelBackend {
           activation ? mapActivationToShaderProgram(activation, true) : null);
       const output =
           this.makePackedTensor(program.outputShape, dtype) as Tensor2D;
-      const inputs = [aSqueezed, bSqueezed];
+      const inputs: TensorHandle[] = [aSqueezed, bSqueezed];
       if (bias) {
-        inputs.push(bias.as2D(bias.shape[1], bias.shape[2]));
+        inputs.push(bias);
       }
       const result = this.compileAndRun<Tensor2D>(program, inputs, output);
       return result.reshape([1, result.shape[0], result.shape[1]]);
     } else {
       const program = new MatMulProgram(
           a.shape, b.shape, transposeA, transposeB, !!bias,
           activation ? mapActivationToShaderProgram(activation) : null);
-      const inputs = [a, b];
+      const inputs: TensorHandle[] = [a, b];
       if (bias) {
         inputs.push(bias);
       }
@@ -1441,8 +1441,8 @@ export class MathBackendWebGL implements KernelBackend {
   }

   exp<T extends Tensor>(x: T): T {
-    let program: UnaryOpProgram | UnaryOpPackedProgram;
-    if(ENV.get('WEBGL_PACK')) {
+    let program: UnaryOpProgram|UnaryOpPackedProgram;
+    if (ENV.get('WEBGL_PACK')) {
       program = new UnaryOpPackedProgram(x.shape, unary_op.EXP);
     } else {
       program = new UnaryOpProgram(x.shape, unary_op.EXP);
@@ -1456,8 +1456,8 @@ export class MathBackendWebGL implements KernelBackend {
   }

   log<T extends Tensor>(x: T): T {
-    let program: UnaryOpProgram | UnaryOpPackedProgram;
-    if(ENV.get('WEBGL_PACK')) {
+    let program: UnaryOpProgram|UnaryOpPackedProgram;
+    if (ENV.get('WEBGL_PACK')) {
       program = new UnaryOpPackedProgram(x.shape, unary_packed_op.LOG);
     } else {
       program = new UnaryOpProgram(x.shape, unary_op.LOG);
@@ -1492,8 +1492,8 @@ export class MathBackendWebGL implements KernelBackend {
   }

   relu<T extends Tensor>(x: T): T {
-    let program: UnaryOpProgram | UnaryOpPackedProgram;
-    if(ENV.get('WEBGL_PACK')) {
+    let program: UnaryOpProgram|UnaryOpPackedProgram;
+    if (ENV.get('WEBGL_PACK')) {
       program = new UnaryOpPackedProgram(x.shape, unary_packed_op.RELU);
     } else {
       program = new UnaryOpProgram(x.shape, unary_op.RELU);
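Two notes on this file. First, the packed (batch === 1) path no longer reshapes the bias with as2D(...) before feeding it to the shader; the bias tensor is passed through as-is, with the broadcast presumably handled where the program samples it. Second, the inputs arrays gained an explicit TensorHandle[] annotation: TypeScript infers Tensor2D[] from the array literal, which would reject pushing the now rank-generic bias. The exp/log/relu hunks above are formatting-only cleanups. A type-level sketch of the inference issue, using hypothetical stand-in types rather than the real classes:

// Stand-in types for illustration only:
type Tensor = {shape: number[]};
type Tensor2D = Tensor & {rank: 2};
type TensorHandle = Tensor;

declare const aSqueezed: Tensor2D;
declare const bSqueezed: Tensor2D;
declare const bias: Tensor;

const inferred = [aSqueezed, bSqueezed];  // inferred element type: Tensor2D
// inferred.push(bias);                   // would not compile: Tensor lacks 'rank'

const inputs: TensorHandle[] = [aSqueezed, bSqueezed];
inputs.push(bias);                        // fine with the widened element type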

src/ops/fused_ops.ts

Lines changed: 28 additions & 17 deletions
@@ -22,7 +22,9 @@ import {makeTypesMatch} from '../tensor_util';
 import {convertToTensor} from '../tensor_util_env';
 import {TensorLike} from '../types';
 import * as util from '../util';
-import {FusableActivation} from './fused_util';
+
+import * as broadcast_util from './broadcast_util';
+import {Activation} from './fused_util';

 /**
  * Computes the dot product of two matrices with optional activation and bias.
@@ -45,7 +47,7 @@ import {FusableActivation} from './fused_util';
 /** @doc {heading: 'Operations', subheading: 'Matrices', namespace: 'fused'} */
 function matMul_<T extends Tensor>(
     a: T|TensorLike, b: T|TensorLike, transposeA = false, transposeB = false,
-    bias?: T|TensorLike, activation: FusableActivation = 'linear'): T {
+    bias?: Tensor|TensorLike, activation: Activation = 'linear'): T {
   let $a = convertToTensor(a, 'a', 'fused matMul');
   let $b = convertToTensor(b, 'b', 'fused matMul');
   [$a, $b] = makeTypesMatch($a, $b);
@@ -89,21 +91,15 @@ function matMul_<T extends Tensor>(
           $a.as3D(batchDimA, outerShapeA, innerShapeA);
   const b3D = transposeB ? $b.as3D(batchDimB, outerShapeB, innerShapeB) :
                            $b.as3D(batchDimB, innerShapeB, outerShapeB);
-  let bias3D: Tensor3D;
+
+  let $bias: Tensor;
   if (bias != null) {
-    let $bias = convertToTensor(bias, 'bias', 'fused matMul');
+    $bias = convertToTensor(bias, 'bias', 'fused matMul');
     [$bias] = makeTypesMatch($bias, $a);

-    const rowsBias = $bias.shape[$bias.rank - 2];
-    const colsBias = $bias.shape[$bias.rank - 1];
-
     util.assert(
-        outerShapeA === rowsBias && outerShapeB === colsBias,
-        `Error in fused matMul: inner dimensions of bias shape ${
-            $bias.shape} must match outer shapes (${outerShapeA}) and (${
-            outerShapeB}) of Tensors with shapes ${$a.shape} and ${$b.shape}`);
-
-    bias3D = $bias.as3D(batchDimA, rowsBias, colsBias);
+        broadcast_util.getBroadcastDims(outShape, $bias.shape).length === 0,
+        `Error in fused matMul: broadcasting is not supported for bias add.`);
   }

   const grad = (dy: Tensor3D, saved: Tensor[]) => {
@@ -120,7 +116,20 @@ function matMul_<T extends Tensor>(
           `implemented yet.`);
     }

-    const biasGradient = bias != null ? {$bias: () => dyActivation} : {};
+    let biasGradient = {};
+    if (bias != null) {
+      biasGradient = {
+        $bias: () => {
+          let res = dyActivation;
+          const reduceAxes =
+              broadcast_util.getReductionAxes($bias.shape, outShape);
+          if (reduceAxes.length > 0) {
+            res = res.sum(reduceAxes);
+          }
+          return res.reshape($bias.shape);
+        }
+      };
+    }

     if (!transposeA && !transposeB) {
       return Object.assign(
@@ -155,14 +164,16 @@ function matMul_<T extends Tensor>(

   const inputs: {$a: Tensor, $b: Tensor, $bias?: Tensor} = {$a: a3D, $b: b3D};
   if (bias != null) {
-    inputs.$bias = bias3D;
+    inputs.$bias = $bias;
   }

   const res = ENV.engine.runKernel(
       (backend, save) => save(backend.fusedBatchMatMul(
-          a3D, b3D, transposeA, transposeB, bias3D, activation)),
+          a3D, b3D, transposeA, transposeB, $bias, activation)),
       inputs, grad);
   return res.reshape(outShape) as T;
 }

-export const matMul = op({matMul_});
+export const matMul = op({matMul_});
+
+export {Activation};
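Two broadcast_util helpers carry the new behavior. The assertion uses getBroadcastDims(outShape, $bias.shape) to list the axes along which the output itself would have to be broadcast to match the bias; requiring that list to be empty means the bias may broadcast up to the output but may never enlarge it, which is the "partial" in the commit title. In the gradient, getReductionAxes($bias.shape, outShape) returns the axes the forward pass broadcast along, and dy must be summed over exactly those axes before being reshaped back to the bias shape. An illustrative re-implementation of that axis computation (a sketch of the semantics, not the library's broadcast_util):

// Axes of outShape along which inShape was broadcast (shapes align from
// the right; missing leading dimensions count as broadcast).
function getReductionAxes(inShape: number[], outShape: number[]): number[] {
  const axes: number[] = [];
  for (let i = 0; i < outShape.length; i++) {
    const inIdx = i - (outShape.length - inShape.length);
    if (inIdx < 0 || (inShape[inIdx] === 1 && outShape[i] > 1)) {
      axes.push(i);
    }
  }
  return axes;
}

// Bias of shape [2] broadcast into the 3D kernel output [1, 2, 2]:
console.log(getReductionAxes([2], [1, 2, 2]));  // [0, 1]
// so the bias gradient is dy.sum([0, 1]).reshape([2]).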

src/ops/fused_test.ts

Lines changed: 12 additions & 0 deletions
@@ -61,6 +61,18 @@ describeWithFlags('fused matmul', ALL_ENVS, () => {
     expectArraysClose(d, [1, 9, 0, 21]);
   });

+  it('A x B with relu and broadcasted bias', () => {
+    const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]);
+    const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]);
+    const c = tf.tensor1d([1, 1]);
+    const act: tf.fused.Activation = 'relu';
+
+    const d = tf.fused.matMul(a, b, false, false, c, act);
+
+    expect(d.shape).toEqual([2, 2]);
+    expectArraysClose(d, [1, 9, 0, 21]);
+  });
+
   it('A x B with bias only', () => {
     const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]);
     const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]);
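For reference, the expected values in the broadcast test: a · b = [[0, 8], [-3, 20]]; adding the broadcast bias [1, 1] gives [[1, 9], [-2, 21]], and relu clamps the -2 to 0, which yields the flattened expectation [1, 9, 0, 21].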

src/ops/fused_util.ts

Lines changed: 1 addition & 1 deletion
@@ -15,4 +15,4 @@
  * =============================================================================
  */

-export type FusableActivation = 'linear'|'relu';
+export type Activation = 'linear'|'relu';
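The renamed union is re-exported from fused_ops (see the export {Activation} above), which is what makes the tf.fused.Activation annotation in the new test resolve. A one-line sketch of its use (assuming the usual tfjs entry point):

import * as tf from '@tensorflow/tfjs';

const act: tf.fused.Activation = 'relu';  // only 'linear' and 'relu' are fusable today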
