Skip to content

Commit ed2032a

Browse files
authored
Implement packed depthwiseConv. (tensorflow#1418)
PERF - Add flag `WEBGL_PACK_DEPTHWISECONV` that sends depthwise convolutions to the packed code path under certain conditions. - Modify texture sizing logic for packed textures to guard against impossible physical shapes. ### Benchmark results on MacBook across 500 runs Parameters: `inDepth`: 3 `chMul`: 1 `fSize`: 3 `pad`: valid `stride`: 1 For `inSize` = 100: `master` averages 1.5ms, `depthwise_packed` averages 1.2ms *(1.25x)* For `inSize` = 224: `master` averages 3.8ms, `depthwise_packed` averages 2.2ms *(1.73x)*
1 parent 0eba6de commit ed2032a

File tree

6 files changed

+356
-14
lines changed

6 files changed

+356
-14
lines changed

src/environment.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,8 @@ export class Environment {
310310
return true;
311311
} else if (feature === 'WEBGL_PACK_BATCHNORMALIZATION') {
312312
return false;
313+
} else if (feature === 'WEBGL_PACK_DEPTHWISECONV') {
314+
return false;
313315
} else if (feature === 'WEBGL_LAZILY_UNPACK') {
314316
return false;
315317
} else if (feature === 'WEBGL_CONV_IM2COL') {

src/environment_util.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ export interface Features {
3030
'WEBGL_CPU_FORWARD'?: boolean;
3131
// Whether we will pack the batchnormalization op.
3232
'WEBGL_PACK_BATCHNORMALIZATION'?: boolean;
33+
// Whether we pack the depthwise convolution op.
34+
'WEBGL_PACK_DEPTHWISECONV'?: boolean;
3335
// Whether we will use the im2col algorithm to speed up convolutions.
3436
'WEBGL_CONV_IM2COL'?: boolean;
3537
// Whether we will perform memory paging.
@@ -90,6 +92,7 @@ export const URL_PROPERTIES: URLProperty[] = [
9092
{name: 'WEBGL_LAZILY_UNPACK', type: Type.BOOLEAN},
9193
{name: 'WEBGL_CPU_FORWARD', type: Type.BOOLEAN},
9294
{name: 'WEBGL_PACK_BATCHNORMALIZATION', type: Type.BOOLEAN},
95+
{name: 'WEBGL_PACK_DEPTHWISECONV', type: Type.BOOLEAN},
9396
{name: 'WEBGL_CONV_IM2COL', type: Type.BOOLEAN},
9497
{name: 'WEBGL_MAX_TEXTURE_SIZE', type: Type.NUMBER},
9598
{name: 'WEBGL_PAGING_ENABLED', type: Type.BOOLEAN},

src/kernels/backend_webgl.ts

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ import {Conv2DDerFilterProgram, Conv2DDerInputProgram} from './webgl/conv_backpr
5757
import {DepthwiseConv2DDerFilterProgram, DepthwiseConv2DDerInputProgram} from './webgl/conv_backprop_gpu_depthwise';
5858
import {Conv2DProgram} from './webgl/conv_gpu';
5959
import {DepthwiseConv2DProgram} from './webgl/conv_gpu_depthwise';
60+
import {DepthwiseConvPacked2DProgram} from './webgl/conv_packed_gpu_depthwise';
6061
import {CropAndResizeProgram} from './webgl/crop_and_resize_gpu';
6162
import {CumSumProgram} from './webgl/cumsum_gpu';
6263
import {DepthToSpaceProgram} from './webgl/depth_to_space_gpu';
@@ -1501,7 +1502,17 @@ export class MathBackendWebGL implements KernelBackend {
15011502

15021503
/**
 * Runs a depthwise 2D convolution of `x` with `filter` on the GPU, choosing
 * between the packed and the unpacked shader program.
 */
depthwiseConv2D(x: Tensor4D, filter: Tensor4D, convInfo: Conv2DInfo):
15031504
Tensor4D {
1504-
const program = new DepthwiseConv2DProgram(convInfo);
1505+
let program: DepthwiseConv2DProgram|DepthwiseConvPacked2DProgram;
1506+
// The packed program is used only when the feature flag is on AND dilation
// is 1 in both dimensions, left padding is at most 1, stride width is at
// most 2, and outChannels equals inChannels (channel multiplier of 1).
if (ENV.get('WEBGL_PACK_DEPTHWISECONV') && convInfo.dilationWidth === 1 &&
1507+
convInfo.dilationHeight === 1 && convInfo.padInfo.left <= 1 &&
1508+
convInfo.strideWidth <= 2 &&
1509+
convInfo.outChannels / convInfo.inChannels === 1) {
1510+
program = new DepthwiseConvPacked2DProgram(convInfo);
1511+
// The packed program writes into an output created by makePackedTensor.
return this.compileAndRun(
1512+
program, [x, filter], this.makePackedTensor(convInfo.outShape));
1513+
}
1514+
1515+
// Every other configuration falls back to the unpacked program.
program = new DepthwiseConv2DProgram(convInfo);
15051516
return this.compileAndRun(program, [x, filter]);
15061517
}
15071518

Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
/**
2+
* @license
3+
* Copyright 2018 Google LLC. All Rights Reserved.
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
* =============================================================================
16+
*/
17+
18+
import {Conv2DInfo} from '../../ops/conv_util';
19+
import {GPGPUProgram} from './gpgpu_math';
20+
21+
/**
 * GPGPU program for depthwise convolution that is flagged as using packed
 * textures. The constructor turns the given convolution parameters into GLSL
 * source (`userCode`): the per-filter-tap work is emitted statement by
 * statement by JavaScript loops rather than by loops inside the shader.
 */
export class DepthwiseConvPacked2DProgram implements GPGPUProgram {
22+
variableNames = ['x', 'W'];
23+
usesPackedTextures = true;
24+
outputShape: number[];
25+
userCode: string;
26+
27+
/**
 * Builds the shader source for the supplied convolution description.
 *
 * @param convInfo Supplies the output shape, input height/width, top/left
 *     padding, strides and filter height/width read below.
 */
constructor(convInfo: Conv2DInfo) {
28+
this.outputShape = convInfo.outShape;
29+
30+
const xNumRows = convInfo.inHeight;
31+
const xNumCols = convInfo.inWidth;
32+
const padTop = convInfo.padInfo.top;
33+
const padLeft = convInfo.padInfo.left;
34+
const strideHeight = convInfo.strideHeight;
35+
const strideWidth = convInfo.strideWidth;
36+
const filterHeight = convInfo.filterHeight;
37+
const filterWidth = convInfo.filterWidth;
38+
// Number of iterations of the per-row fetch loop below; each iteration
// advances two columns (col = c * 2) and emits one getX call.
const texelsAcross = Math.ceil((filterWidth + 1) / 2);
39+
40+
// GLSL statements accumulated here are spliced into main() of the shader.
let mainLoop = `int xR; int xC;`;
41+
42+
// Declare and zero-initialize the GLSL temporaries up front: the sampled
// texels (xTexel...) and the per-tap operands (xR..C.. and wR..C..).
for (let r = 0; r < filterHeight; r++) {
43+
for (let c = -padLeft; c < texelsAcross * 2; c++) {
44+
mainLoop += `vec4 ${xTexelName(r, c)} = vec4(0.);`;
45+
}
46+
47+
for (let c = 0; c < filterWidth; c++) {
48+
mainLoop += `
49+
vec4 wR${r}C${c} = vec4(0.);
50+
vec4 xR${r}C${c} = vec4(0.);`;
51+
}
52+
}
53+
54+
/**
55+
* This vectorized implementation of depthwiseConv works by gathering the
56+
* values needed for each output channel's dot product into vec4's and then
57+
* multiplying them all together (this happens in the final double for-loop
58+
* below). Most of the main loop consists of constructing these vec4's with
59+
* as few texture2D calls as possible, which entails logic
60+
* for making use of all four returned values from a texture2D call at once.
61+
*/
62+
for (let r = 0; r < filterHeight; r++) {
63+
for (let c = 0; c < texelsAcross; c++) {
64+
const col = c * 2;
65+
const left = c * 2 + padLeft;
66+
67+
mainLoop += `
68+
xR = xRCorner + ${r};
69+
xC = xCCorner + ${left};
70+
71+
if(xR >= 0 && xR < ${xNumRows} && xC >= 0 && xC < ${xNumCols}) {
72+
${xTexelName(r, left)} = getX(batch, xR, xC, d1);
73+
}`;
74+
75+
// Without left padding, the last iteration of the row also assigns
// xR..C{left}, fetching one extra texel two columns to the right first
// when strideWidth > 1. With left padding, the first iteration instead
// fetches a texel two columns to the left.
if (padLeft === 0) {
76+
if (col < filterWidth && c === texelsAcross - 1) {
77+
if (strideWidth > 1) {
78+
mainLoop += `
79+
vec4 ${xTexelName(r, left + 2)} = vec4(0.);
80+
81+
if(xR >= 0 && xR < ${xNumRows} && xC + 2 < ${xNumCols}) {
82+
${xTexelName(r, left + 2)} = getX(batch, xR, xC + 2, d1);
83+
}`;
84+
}
85+
86+
mainLoop += `
87+
xR${r}C${left} = ${constructTexel(r, left, strideWidth, padLeft)};
88+
`;
89+
}
90+
} else if (c === 0) {
91+
mainLoop += `
92+
if(xR >= 0 && xR < ${xNumRows} && xC - 2 >= 0) {
93+
${xTexelName(r, left - 2)} = getX(batch, xR, xC - 2, d1);
94+
}`;
95+
}
96+
97+
// Assign the per-tap x operands for columns left - 2 and left - 1 from
// the sampled texels.
if (col > 0) {
98+
mainLoop += `xR${r}C${left - 2} =
99+
${constructTexel(r, left - 2, strideWidth, padLeft)};`;
100+
}
101+
102+
if (left - 1 >= 0 && left - 1 < filterWidth) {
103+
mainLoop += `xR${r}C${left - 1} =
104+
${constructTexel(r, left - 1, strideWidth, padLeft)};`;
105+
}
106+
107+
// Read the filter values for columns col and col + 1.
// NOTE(review): only the .xz components of each getW result are kept and
// duplicated into a vec4 — presumably this follows from how the filter
// is laid out in its packed texture; confirm.
if (col < filterWidth) {
108+
mainLoop += `
109+
vec4 wTexel${r}C${col} = getW(${r}, ${col}, d1, q);
110+
wR${r}C${col} = vec4(wTexel${r}C${col}.xz, wTexel${r}C${col}.xz);
111+
`;
112+
113+
if (col + 1 < filterWidth) {
114+
mainLoop += `
115+
vec4 wTexelR${r}C${col + 1} = getW(${r}, ${col + 1}, d1, q);
116+
wR${r}C${col + 1} =
117+
vec4(wTexelR${r}C${col + 1}.xz, wTexelR${r}C${col + 1}.xz);`;
118+
}
119+
}
120+
}
121+
}
122+
123+
// Emit one multiply-accumulate into `result` per filter tap.
for (let r = 0; r < filterHeight; r++) {
124+
for (let c = 0; c < filterWidth; c++) {
125+
mainLoop += `result += xR${r}C${c} * wR${r}C${c};`;
126+
}
127+
}
128+
129+
// Wrap the generated statements in the shader's main().
this.userCode = `
130+
const ivec2 strides = ivec2(${strideHeight}, ${strideWidth});
131+
const ivec2 pads = ivec2(${padTop}, ${padLeft});
132+
133+
void main() {
134+
ivec4 coords = getOutputCoords();
135+
int batch = coords.x;
136+
ivec2 xRCCorner = coords.yz * strides - pads;
137+
int d2 = coords.w;
138+
int d1 = d2;
139+
int q = 0;
140+
int xRCorner = xRCCorner.x;
141+
int xCCorner = xRCCorner.y;
142+
143+
vec4 result = vec4(0.);
144+
145+
${mainLoop}
146+
147+
setOutput(result);
148+
}
149+
`;
150+
}
151+
}
152+
153+
/**
 * Returns the name of the GLSL variable that holds the texel sampled for
 * filter row `r` at column offset `c`.
 */
function xTexelName(r: number, c: number): string {
154+
// A negative offset is spelled "minus<N>" instead of "-<N>", so the result
// contains no '-' character when used as a GLSL variable name.
return `xTexelR${r}C${c < 0 ? 'minus' + Math.abs(c).toString() : c}`;
155+
}
156+
157+
/**
158+
* Given a 2x2 filter, we want to multiply xR0C0, wR0C0, xR0C1, wR0C1, xR1C0,
159+
* wR1C0, xR1C1, wR1C1. The xRC's are constructed out of xTexelRC's, which are
160+
* the vec4's returned from sampling calls. Sometimes this means mixing channels
161+
* from adjacent samples, which constructTexel handles.
162+
*/
163+
// Returns a GLSL expression (as a string) for the x operand at filter row `r`,
// column `c`, built from the sampled texel variables named by xTexelName.
// Which texels and which components are used depends on `stride` and on
// whether `c` has the same parity as `padLeft`.
function constructTexel(
164+
r: number, c: number, stride: number, padLeft: number): string {
165+
if (stride === 1) {
166+
// Same parity as padLeft: the sampled texel at column c is used as is.
if (padLeft % 2 === c % 2) {
167+
return xTexelName(r, c);
168+
}
169+
// Otherwise combine the .zw of the texel at c - 1 with the .xy of the
// texel at c + 1.
return `vec4(${xTexelName(r, c - 1)}.zw, ${xTexelName(r, c + 1)}.xy)`;
170+
}
171+
172+
// Any other stride: same parity takes the .xy halves of the texels at c and
// c + 2; different parity takes the .zw halves of the texels at c - 1 and
// c + 1.
if (padLeft % 2 === c % 2) {
173+
return `vec4(${xTexelName(r, c)}.xy, ${xTexelName(r, c + 2)}.xy)`;
174+
}
175+
return `vec4(${xTexelName(r, c - 1)}.zw, ${xTexelName(r, c + 1)}.zw)`;
176+
}

src/kernels/webgl/tex_util.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,9 @@ export function decodeMatrixFromUnpackedColorRGBAArray(
126126

127127
/**
 * Returns the [width, height] of the packed texture for a matrix with the
 * given number of rows and columns: half of each dimension, rounded up.
 */
export function getPackedMatrixTextureShapeWidthHeight(
128128
rows: number, columns: number): [number, number] {
129-
return [Math.ceil(columns / 2), Math.ceil(rows / 2)];
129+
// Clamp both dimensions to at least 1 so that a zero row or column count
// does not produce a zero-sized texture shape.
return [
130+
Math.max(1, Math.ceil(columns / 2)), Math.max(1, Math.ceil(rows / 2))
131+
];
130132
}
131133

132134
export function getPackedRGBAArraySizeFromMatrixShape(

0 commit comments

Comments
 (0)