
Commit e56c722

astojilj authored and annxingyuan committed
Packed batchMatMul. tensorflow#693 (tensorflow#1530)
PERF tensorflow/tfjs#1058

Always use the packed matmul program in batchMatMul and fusedMatMul, and remove the non-packed matmul implementation. A modified [matmul benchmark](https://gist.github.com/astojilj/b6cc855e708bb2d77c7f892ef8489137) shows a performance improvement for matmuls of tensors with shapes [3, 400, 400] and [3, 1000, 1000]:

master:
  "N=400": { "averageTimeMs": 25.751999999629334 },
  "N=1000": { "averageTimeMs": 140.3969999976107 }

with this patch:
  "N=400": { "averageTimeMs": 9.409999999043066 },
  "N=1000": { "averageTimeMs": 54.652999998943415 }

Benchmark run on the discrete GPU of an [ASUS ROG-GL702VM](https://www.asus.com/Laptops/ROG-GL702VM-7th-Gen-Intel-Core/).
1 parent 5a3c8ec commit e56c722
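
For context, numbers of this kind can be reproduced with a harness along the following lines. This is a minimal sketch only: the linked gist differs in detail, and `runMatmulBenchmark`, the trial count, and the warm-up step are illustrative assumptions, not the gist's code.

```ts
import * as tf from '@tensorflow/tfjs';

// Times a batched matmul of two [3, N, N] tensors on the current backend.
// Illustrative only; the linked gist's harness differs in detail.
async function runMatmulBenchmark(n: number, trials = 20): Promise<number> {
  const a = tf.randomNormal([3, n, n]);
  const b = tf.randomNormal([3, n, n]);

  // Warm up so shader compilation is not counted.
  await tf.matMul(a, b).data();

  const start = performance.now();
  for (let i = 0; i < trials; i++) {
    const c = tf.matMul(a, b);
    await c.data();  // Forces the GPU work to complete before stopping the clock.
    c.dispose();
  }
  const averageTimeMs = (performance.now() - start) / trials;

  a.dispose();
  b.dispose();
  return averageTimeMs;
}

runMatmulBenchmark(400).then(ms => console.log(`N=400: ${ms.toFixed(1)} ms`));
```

Awaiting `data()` is what makes the wall-clock average meaningful, since WebGL work is otherwise scheduled asynchronously.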

File tree

3 files changed: +27 -184 lines changed

- src/kernels/backend_webgl.ts
- src/kernels/webgl/mulmat_gpu.ts (deleted)
- src/kernels/webgl/mulmat_packed_gpu.ts


src/kernels/backend_webgl.ts

Lines changed: 18 additions & 52 deletions
@@ -77,7 +77,6 @@ import {Im2ColProgram} from './webgl/im2col_gpu';
 import {LRNProgram} from './webgl/lrn_gpu';
 import {LRNGradProgram} from './webgl/lrn_grad_gpu';
 import {MaxPool2DBackpropProgram} from './webgl/max_pool_backprop_gpu';
-import {MatMulProgram} from './webgl/mulmat_gpu';
 import {MatMulPackedProgram} from './webgl/mulmat_packed_gpu';
 import {MultinomialProgram} from './webgl/multinomial_gpu';
 import {OneHotProgram} from './webgl/onehot_gpu';
@@ -769,26 +768,11 @@ export class MathBackendWebGL implements KernelBackend {

     const dtype = upcastType(a.dtype, b.dtype);

-    // TODO(https://github.com/tensorflow/tfjs/issues/693): Support 3D tensors
-    if (batch === 1) {
-      const aSqueezed = a.as2D(a.shape[1], a.shape[2]);
-      const bSqueezed = b.as2D(b.shape[1], b.shape[2]);
-
-      const program = new MatMulPackedProgram(
-          aSqueezed.shape, bSqueezed.shape, [outerShapeA, outerShapeB],
-          transposeA, transposeB);
-      const output =
-          this.makePackedTensor(program.outputShape, dtype) as Tensor2D;
-      const result =
-          this.compileAndRun<Tensor2D>(program, [aSqueezed, bSqueezed], output);
-      return result.reshape([1, result.shape[0], result.shape[1]]);
-    } else {
-      const program =
-          new MatMulProgram(a.shape, b.shape, transposeA, transposeB);
-      const output =
-          this.makeOutputArray(program.outputShape, dtype) as Tensor3D;
-      return this.compileAndRun(program, [a, b], output);
-    }
+    const program = new MatMulPackedProgram(a.shape,
+        [batch, outerShapeA, outerShapeB], transposeA, transposeB);
+    const output =
+        this.makePackedTensor(program.outputShape, dtype) as Tensor3D;
+    return this.compileAndRun<Tensor3D>(program, [a, b], output);
   }

   fusedBatchMatMul(
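
With this hunk, batchMatMul always builds a MatMulPackedProgram over the full 3D shapes and produces a packed [batch, outerShapeA, outerShapeB] tensor; the batch === 1 squeeze/reshape special case and the fallback to the deleted MatMulProgram are gone. At the API level this is the path any rank-3 matmul takes, for example (a minimal sketch, shapes chosen arbitrarily):

```ts
import * as tf from '@tensorflow/tfjs';

// [batch, M, K] x [batch, K, N] -> [batch, M, N]; batch > 1 now also runs
// through the packed program rather than the deleted MatMulProgram.
const a = tf.tensor3d([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]);  // shape [2, 2, 2]
const b = tf.tensor3d([[[1, 0], [0, 1]], [[2, 0], [0, 2]]]);  // shape [2, 2, 2]
const c = tf.matMul(a, b);                                     // shape [2, 2, 2]
c.print();
```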
@@ -800,35 +784,16 @@ export class MathBackendWebGL implements KernelBackend {

     const dtype = upcastType(a.dtype, b.dtype);

-    // TODO(https://github.com/tensorflow/tfjs/issues/693): Support 3D tensors
-    if (batch === 1) {
-      const aSqueezed = a.as2D(a.shape[1], a.shape[2]);
-      const bSqueezed = b.as2D(b.shape[1], b.shape[2]);
-
-      const program = new MatMulPackedProgram(
-          aSqueezed.shape, bSqueezed.shape, [outerShapeA, outerShapeB],
-          transposeA, transposeB, !!bias,
-          activation ? mapActivationToShaderProgram(activation, true) : null);
-      const output =
-          this.makePackedTensor(program.outputShape, dtype) as Tensor2D;
-      const inputs: TensorHandle[] = [aSqueezed, bSqueezed];
-      if (bias) {
-        inputs.push(bias);
-      }
-      const result = this.compileAndRun<Tensor2D>(program, inputs, output);
-      return result.reshape([1, result.shape[0], result.shape[1]]);
-    } else {
-      const program = new MatMulProgram(
-          a.shape, b.shape, transposeA, transposeB, !!bias,
-          activation ? mapActivationToShaderProgram(activation) : null);
-      const inputs: TensorHandle[] = [a, b];
-      if (bias) {
-        inputs.push(bias);
-      }
-      const output =
-          this.makeOutputArray(program.outputShape, dtype) as Tensor3D;
-      return this.compileAndRun(program, inputs, output);
+    const program = new MatMulPackedProgram(a.shape,
+        [batch, outerShapeA, outerShapeB], transposeA, transposeB, !!bias,
+        activation ? mapActivationToShaderProgram(activation, true) : null);
+    const output =
+        this.makePackedTensor(program.outputShape, dtype) as Tensor3D;
+    const inputs: TensorHandle[] = [a, b];
+    if (bias) {
+      inputs.push(bias);
     }
+    return this.compileAndRun<Tensor3D>(program, inputs, output);
   }

   multiply(a: Tensor, b: Tensor): Tensor {
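
fusedBatchMatMul gets the same treatment: one packed program per call, with the optional bias and activation folded into the shader. As a reminder of the contract that program implements, here is a plain CPU sketch for a single batch element; the function name and the ReLU default are illustrative, not backend code:

```ts
// CPU reference for one batch element of the fused op:
// out[i][j] = activation(sum_k a[i][k] * b[k][j] + bias[j]).
function fusedMatMulRef(
    a: number[][], b: number[][], bias: number[],
    activation: (x: number) => number = x => Math.max(0, x)  // ReLU, as an example
): number[][] {
  const m = a.length;
  const k = a[0].length;
  const n = b[0].length;
  const out: number[][] = [];
  for (let i = 0; i < m; i++) {
    const row: number[] = [];
    for (let j = 0; j < n; j++) {
      let sum = 0;
      for (let p = 0; p < k; p++) {
        sum += a[i][p] * b[p][j];
      }
      row.push(activation(sum + bias[j]));
    }
    out.push(row);
  }
  return out;
}
```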
@@ -1711,14 +1676,15 @@ export class MathBackendWebGL implements KernelBackend {
     const x2ColShape = [sharedDim, numCols];

     const xSqueezed = x.squeeze([0]);
-    const w2Row = filter.reshape([sharedDim, -1]) as Tensor2D;
+    const w2Row = filter.reshape([1, sharedDim, -1]) as Tensor3D;

     const im2ColProgram =
         new Im2ColProgram(x2ColShape, xSqueezed.shape, convInfo);
-    const im2Col = this.compileAndRun<Tensor2D>(im2ColProgram, [xSqueezed]);
+    const im2Col = this.compileAndRun<Tensor2D>(im2ColProgram, [xSqueezed]).
+        reshape([1, x2ColShape[0], x2ColShape[1]]) as Tensor3D;

     const matmulProgram = new MatMulPackedProgram(
-        im2Col.shape, w2Row.shape, [numCols, convInfo.outChannels], true,
+        im2Col.shape, [1, numCols, convInfo.outChannels], true,
         false);
     const product =
         this.compileAndRun<Tensor4D>(matmulProgram, [im2Col, w2Row]);
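
The im2col convolution path now feeds the same 3D packed program with a batch of 1: the unrolled input is reshaped to [1, sharedDim, numCols], the filter to [1, sharedDim, outChannels], and the transposed product comes out as [1, numCols, outChannels]. A small sketch of that shape bookkeeping, assuming the usual tfjs convInfo fields (an illustrative helper, not part of this change):

```ts
// Shape bookkeeping for the im2col + packed-matmul convolution path.
// Field names mirror tfjs's convInfo, but this helper is illustrative only.
function im2ColMatMulShapes(convInfo: {
  filterHeight: number, filterWidth: number, inChannels: number,
  outHeight: number, outWidth: number, outChannels: number
}) {
  const sharedDim =
      convInfo.filterHeight * convInfo.filterWidth * convInfo.inChannels;
  const numCols = convInfo.outHeight * convInfo.outWidth;
  return {
    im2ColShape: [1, sharedDim, numCols],              // unrolled input patches
    w2RowShape: [1, sharedDim, convInfo.outChannels],  // reshaped filter
    // transposeA = true, so the product is [1, numCols, outChannels],
    // later reshaped to [1, outHeight, outWidth, outChannels].
    productShape: [1, numCols, convInfo.outChannels],
  };
}
```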

src/kernels/webgl/mulmat_gpu.ts

Lines changed: 0 additions & 123 deletions
This file was deleted.

src/kernels/webgl/mulmat_packed_gpu.ts

Lines changed: 9 additions & 9 deletions
@@ -24,16 +24,16 @@ export class MatMulPackedProgram implements GPGPUProgram {
   userCode: string;

   constructor(
-      aShape: [number, number], bShape: [number, number],
-      outputShape: [number, number], transposeA = false, transposeB = false,
+      aShape: [number, number, number], outputShape: [number, number, number],
+      transposeA = false, transposeB = false,
       addBias = false, activation: string = null) {
     this.outputShape = outputShape;

-    const sharedDim = transposeA ? aShape[0] : aShape[1];
+    const sharedDim = transposeA ? aShape[1] : aShape[2];
     const sharedDimensionPacked = Math.ceil(sharedDim / 2);

-    const aSample = transposeA ? 'i * 2, rc.x' : 'rc.x, i * 2';
-    const bSample = transposeB ? 'rc.y, i * 2' : 'i * 2, rc.y';
+    const aSample = transposeA ? 'i * 2, rc.y' : 'rc.y, i * 2';
+    const bSample = transposeB ? 'rc.z, i * 2' : 'i * 2, rc.z';
     const aSwizzle = transposeA ? ['a.xxyy', 'a.zzww'] : ['a.xxzz', 'a.yyww'];
     const bSwizzle = transposeB ? ['b.xzxz', 'b.ywyw'] : ['b.xyxy', 'b.zwzw'];
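
With the new signature the caller passes full 3D shapes and the batch dimension rides along in front; the shared dimension comes from aShape[1] or aShape[2] depending on transposeA. For example (shapes picked arbitrarily, using the import path shown in the backend diff above):

```ts
import {MatMulPackedProgram} from './webgl/mulmat_packed_gpu';

// A: [batch, M, K] = [3, 400, 512], B: [batch, K, N] = [3, 512, 400].
// With transposeA = false, sharedDim = aShape[2] = 512, and the shader packs
// two elements per texel along it: sharedDimensionPacked = ceil(512 / 2) = 256.
const program = new MatMulPackedProgram(
    [3, 400, 512],   // aShape
    [3, 400, 400],   // outputShape = [batch, outerShapeA, outerShapeB]
    false,           // transposeA
    false);          // transposeB
```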

@@ -56,11 +56,11 @@ export class MatMulPackedProgram implements GPGPUProgram {

       const float sharedDimension = ${sharedDimensionPacked}.0;

-      vec4 dot2x2ARowBCol(ivec2 rc) {
+      vec4 dot2x2ARowBCol(ivec3 rc) {
         vec4 result = vec4(0);
         for (int i = 0; i < ${sharedDimensionPacked}; i++) {
-          vec4 a = getMatrixA(${aSample});
-          vec4 b = getMatrixB(${bSample});
+          vec4 a = getMatrixA(rc.x, ${aSample});
+          vec4 b = getMatrixB(rc.x, ${bSample});

           result += (${aSwizzle[0]} * ${bSwizzle[0]}) + (${aSwizzle[1]} * ${
             bSwizzle[1]});
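
The shader operates on 2x2 tiles: each RGBA texel holds a 2x2 block of the matrix ([x y; z w]), rc.x is now the batch index passed to getMatrixA/getMatrixB, and each loop iteration multiplies a 2x2 tile of A by a 2x2 tile of B. A CPU sketch of one accumulation step for the non-transposed swizzles (a.xxzz * b.xyxy + a.yyww * b.zwzw), purely for illustration:

```ts
// One step of dot2x2ARowBCol on the CPU. Each vec4 [x, y, z, w] is a 2x2
// tile stored row-major:  [ x y ]
//                         [ z w ]
// Accumulating a.xxzz * b.xyxy + a.yyww * b.zwzw is exactly the 2x2 tile
// product of a and b added into result.
function accumulate2x2(result: number[], a: number[], b: number[]): void {
  const [ax, ay, az, aw] = a;
  const [bx, by, bz, bw] = b;
  result[0] += ax * bx + ay * bz;  // row 0, col 0
  result[1] += ax * by + ay * bw;  // row 0, col 1
  result[2] += az * bx + aw * bz;  // row 1, col 0
  result[3] += az * by + aw * bw;  // row 1, col 1
}
```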
@@ -69,7 +69,7 @@ export class MatMulPackedProgram implements GPGPUProgram {
       }

       void main() {
-        ivec2 rc = getOutputCoords();
+        ivec3 rc = getOutputCoords();
         vec4 result = dot2x2ARowBCol(rc);

         ${addBiasSnippet}
