1
+ // //////////////////////////////////////////////////////////////////////////
2
+ //
3
+ // Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
4
+ //
5
+ // Please refer to the NVIDIA end user license agreement (EULA) associated
6
+ // with this source code for terms and conditions that govern your use of
7
+ // this software. Any use, reproduction, disclosure, or distribution of
8
+ // this software and related documentation outside the terms of the EULA
9
+ // is strictly prohibited.
10
+ //
11
+ // //////////////////////////////////////////////////////////////////////////
12
+
13
+ //
14
+ // Matrix multiplication: C = A * B.
15
+ // Host code.
16
+ //
17
+ // This sample implements matrix multiplication as described in Chapter 3
18
+ // of the programming guide and uses the CUBLAS library to demonstrate
19
+ // the best performance.
20
+
21
+ // SOME PRECAUTIONS:
22
+ // IF WE WANT TO CALCULATE ROW-MAJOR MATRIX MULTIPLY C = A * B,
23
+ // WE JUST NEED CALL CUBLAS API IN A REVERSE ORDER: cublasSegemm(B, A)!
24
+ // The reason is explained as follows:
25
+
26
+ // CUBLAS library uses column-major storage, but C/C++ use row-major storage.
27
+ // When passing the matrix pointer to CUBLAS, the memory layout alters from
28
+ // row-major to column-major, which is equivalent to an implicit transpose.
29
+
30
+ // In the case of row-major C/C++ matrix A, B, and a simple matrix multiplication
31
+ // C = A * B, we can't use the input order like cublasSgemm(A, B) because of
32
+ // implicit transpose. The actual result of cublasSegemm(A, B) is A(T) * B(T).
33
+ // If col(A(T)) != row(B(T)), equal to row(A) != col(B), A(T) and B(T) are not
34
+ // multipliable. Moreover, even if A(T) and B(T) are multipliable, the result C
35
+ // is a column-based cublas matrix, which means C(T) in C/C++, we need extra
36
+ // transpose code to convert it to a row-based C/C++ matrix.
37
+
38
+ // To solve the problem, let's consider our desired result C, a row-major matrix.
39
+ // In cublas format, it is C(T) actually (because of the implicit transpose).
40
+ // C = A * B, so C(T) = (A * B) (T) = B(T) * A(T). Cublas matrice B(T) and A(T)
41
+ // happen to be C/C++ matrice B and A (still because of the implicit transpose)!
42
+ // We don't need extra transpose code, we only need alter the input order!
43
+ //
44
+ // CUBLAS provides high-performance matrix multiplication.
45
+ // See also:
46
+ // V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
47
+ // in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
48
+ // Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
49
+ //
50
+
51
+ #include " matrixMulCUBLAS.h"
52
+
53
+ #ifndef min
54
+ #define min (a,b ) ((a < b) ? a : b)
55
+ #endif
56
+ #ifndef max
57
+ #define max (a,b ) ((a > b) ? a : b)
58
+ #endif
59
+
60
+ typedef struct _matrixSize // Optional Command-line multiplier for matrix sizes
61
+ {
62
+ unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
63
+ } sMatrixSize ;
64
+
65
+ // //////////////////////////////////////////////////////////////////////////////
66
+ // ! Compute reference data set matrix multiply on CPU
67
+ // ! C = A * B
68
+ // ! @param C reference data, computed but preallocated
69
+ // ! @param A matrix A as provided to device
70
+ // ! @param B matrix B as provided to device
71
+ // ! @param hA height of matrix A
72
+ // ! @param wB width of matrix B
73
+ // //////////////////////////////////////////////////////////////////////////////
74
+ void
75
+ matrixMulCPU (float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
76
+ {
77
+ for (unsigned int i = 0 ; i < hA; ++i)
78
+ for (unsigned int j = 0 ; j < wB; ++j)
79
+ {
80
+ double sum = 0 ;
81
+
82
+ for (unsigned int k = 0 ; k < wA; ++k)
83
+ {
84
+ double a = A[i * wA + k];
85
+ double b = B[k * wB + j];
86
+ sum += a * b;
87
+ }
88
+
89
+ C[i * wB + j] = (float )sum;
90
+ }
91
+ }
92
+
93
+ // Allocates a matrix with random float entries.
94
+ void randomInit (float *data, int size)
95
+ {
96
+ for (int i = 0 ; i < size; ++i)
97
+ data[i] = rand () / (float )RAND_MAX;
98
+ }
99
+ void bytomInit (double *data, int size,int8_t aaaa[][256 ])
100
+ {
101
+ // for (int i = 0; i < size; ++i)
102
+ // data[i] = rand() / (float)RAND_MAX;
103
+ // for (int i = 0; i < 256; i++) {
104
+ for (int j = 0 ; j < size; j++) {
105
+ data[i*256 +j] = (double )(aaaa[i][j]);
106
+ // mb[i*256+j] = (double)(b.d[i][j]);
107
+ }
108
+ // }
109
+ }
110
+ void printDiff (float *data1, float *data2, int width, int height, int iListLength, float fListTol )
111
+ {
112
+ printf (" Listing first %d Differences > %.6f...\n " , iListLength, fListTol );
113
+ int i,j,k;
114
+ int error_count=0 ;
115
+
116
+ for (j = 0 ; j < height; j++)
117
+ {
118
+ if (error_count < iListLength)
119
+ {
120
+ printf (" \n Row %d:\n " , j);
121
+ }
122
+
123
+ for (i = 0 ; i < width; i++)
124
+ {
125
+ k = j * width + i;
126
+ float fDiff = fabs (data1[k] - data2[k]);
127
+
128
+ if (fDiff > fListTol )
129
+ {
130
+ if (error_count < iListLength)
131
+ {
132
+ printf (" Loc(%d,%d)\t CPU=%.5f\t GPU=%.5f\t Diff=%.6f\n " , i, j, data1[k], data2[k], fDiff );
133
+ }
134
+
135
+ error_count++;
136
+ }
137
+ }
138
+ }
139
+
140
+ printf (" \n Total Errors = %d\n " , error_count);
141
+ }
142
+
143
+ void initializeCUDA (int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
144
+ {
145
+ // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
146
+ cudaError_t error;
147
+ devID = 0 ;
148
+
149
+ devID = findCudaDevice (argc, (const char **)argv);
150
+
151
+ if (checkCmdLineFlag (argc, (const char **)argv, " sizemult" ))
152
+ {
153
+ iSizeMultiple = getCmdLineArgumentInt (argc, (const char **)argv, " sizemult" );
154
+ }
155
+
156
+ iSizeMultiple = min (iSizeMultiple, 10 );
157
+ iSizeMultiple = max (iSizeMultiple, 1 );
158
+
159
+ cudaDeviceProp deviceProp;
160
+
161
+ error = cudaGetDeviceProperties (&deviceProp, devID);
162
+
163
+ if (error != cudaSuccess)
164
+ {
165
+ printf (" cudaGetDeviceProperties returned error code %d, line(%d)\n " , error, __LINE__);
166
+ exit (EXIT_FAILURE);
167
+ }
168
+
169
+ printf (" GPU Device %d: \" %s\" with compute capability %d.%d\n\n " , devID, deviceProp.name , deviceProp.major , deviceProp.minor );
170
+
171
+ int block_size = 32 ;
172
+
173
+ matrix_size.uiWA = 3 * block_size * iSizeMultiple;
174
+ matrix_size.uiHA = 4 * block_size * iSizeMultiple;
175
+ matrix_size.uiWB = 2 * block_size * iSizeMultiple;
176
+ matrix_size.uiHB = 3 * block_size * iSizeMultiple;
177
+ matrix_size.uiWC = 2 * block_size * iSizeMultiple;
178
+ matrix_size.uiHC = 4 * block_size * iSizeMultiple;
179
+
180
+ printf (" MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n " ,
181
+ matrix_size.uiHA , matrix_size.uiWA ,
182
+ matrix_size.uiHB , matrix_size.uiWB ,
183
+ matrix_size.uiHC , matrix_size.uiWC );
184
+
185
+ if ( matrix_size.uiWA != matrix_size.uiHB ||
186
+ matrix_size.uiHA != matrix_size.uiHC ||
187
+ matrix_size.uiWB != matrix_size.uiWC )
188
+ {
189
+ printf (" ERROR: Matrix sizes do not match!\n " );
190
+ exit (-1 );
191
+ }
192
+ }
193
+
194
+ // //////////////////////////////////////////////////////////////////////////////
195
+ // ! Run a simple test matrix multiply using CUBLAS
196
+ // //////////////////////////////////////////////////////////////////////////////
197
+ int matrixMultiply (int argc, char **argv, int devID, sMatrixSize &matrix_size,int8_t aaaa[][256 ],int8_t bbbb[][256 ])
198
+ {
199
+ cudaDeviceProp deviceProp;
200
+
201
+ checkCudaErrors (cudaGetDeviceProperties (&deviceProp, devID));
202
+
203
+ int block_size = 32 ;
204
+
205
+ // set seed for rand()
206
+ // srand(2006);
207
+
208
+ // allocate host memory for matrices A and B
209
+ unsigned int size_A = 256 * 256 ;
210
+ unsigned int mem_size_A = sizeof (double ) * size_A;
211
+ double *h_A = (double *)malloc (mem_size_A);
212
+ unsigned int size_B = 256 * 256 ;
213
+ unsigned int mem_size_B = sizeof (double ) * size_B;
214
+ double *h_B = (double *)malloc (mem_size_B);
215
+
216
+ // set seed for rand()
217
+ // srand(2006);
218
+
219
+ // // initialize host memory
220
+ bytomInit (h_A, size_A,aaaa);
221
+ bytomInit (h_B, size_B,aaaa);
222
+
223
+ // allocate device memory
224
+ double *d_A, *d_B, *d_C;
225
+ unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC ;
226
+ unsigned int mem_size_C = sizeof (double ) * size_C;
227
+
228
+ // allocate host memory for the result
229
+ double *h_C = (double *) malloc (mem_size_C);
230
+ double *h_CUBLAS = (double *) malloc (mem_size_C);
231
+
232
+ checkCudaErrors (cudaMalloc ((void **) &d_A, mem_size_A));
233
+ checkCudaErrors (cudaMalloc ((void **) &d_B, mem_size_B));
234
+ checkCudaErrors (cudaMemcpy (d_A, h_A, mem_size_A, cudaMemcpyHostToDevice));
235
+ checkCudaErrors (cudaMemcpy (d_B, h_B, mem_size_B, cudaMemcpyHostToDevice));
236
+ checkCudaErrors (cudaMalloc ((void **) &d_C, mem_size_C));
237
+
238
+ // setup execution parameters
239
+ dim3 threads (block_size, block_size);
240
+ dim3 grid (matrix_size.uiWC / threads.x , matrix_size.uiHC / threads.y );
241
+
242
+ // create and start timer
243
+ printf (" Computing result using CUBLAS..." );
244
+
245
+ // execute the kernel
246
+ int nIter = 30 ;
247
+
248
+ // CUBLAS version 2.0
249
+ {
250
+ const double alpha = 1 .0f ;
251
+ const double beta = 0 .0f ;
252
+ cublasHandle_t handle;
253
+ cudaEvent_t start, stop;
254
+
255
+ checkCudaErrors (cublasCreate (&handle));
256
+
257
+ // Perform warmup operation with cublas
258
+ // checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB));
259
+ // cublasStatus_t cublasSgemm(cublasHandle_t handle,cublasOperation_t transa, cublasOperation_t transb,int m, int n, int k,
260
+ // const float *alpha,
261
+ // const float *A, int lda,
262
+ // const float *B, int ldb,
263
+ // const float *beta,
264
+ // float *C, int ldc)
265
+ // cublasStatus_t cublasGemmEx(cublasHandle_t handle,cublasOperation_t transa,cublasOperation_t transb,int m,int n,int k,const void *alpha,
266
+ // const void *A,cudaDataType_t Atype,int lda,
267
+ // const void *B,cudaDataType_t Btype,int ldb,const void *beta,
268
+ // void *C,cudaDataType_t Ctype,int ldc,
269
+ // cudaDataType_t computeType,
270
+ // cublasGemmAlgo_t algo)
271
+
272
+ // checkCudaErrors(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, \
273
+ // d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB));
274
+ // cublasStatus_t cublasDgemm(cublasHandle_t handle,cublasOperation_t transa, cublasOperation_t transb,int m, int n, int k,const double *alpha,const double *A, int lda,const double *B, int ldb,const double *beta,double *C, int ldc)
275
+ checkCudaErrors (cublasDgemm (handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB , matrix_size.uiHA , matrix_size.uiWA , &alpha, \
276
+ d_A, matrix_size.uiWA , d_B, matrix_size.uiWB , &beta, d_C, matrix_size.uiWB ));
277
+ // Allocate CUDA events that we'll use for timing
278
+ checkCudaErrors (cudaEventCreate (&start));
279
+ checkCudaErrors (cudaEventCreate (&stop));
280
+
281
+ // Record the start event
282
+ checkCudaErrors (cudaEventRecord (start, NULL ));
283
+
284
+ for (int j = 0 ; j < nIter; j++)
285
+ {
286
+ // note cublas is column primary!
287
+ // need to transpose the order
288
+ // checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB));
289
+ checkCudaErrors (cublasDgemm (handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB , matrix_size.uiHA , matrix_size.uiWA , &alpha, d_A, matrix_size.uiWA , d_B, matrix_size.uiWB , &beta, d_C, matrix_size.uiWB ));
290
+
291
+ }
292
+
293
+ printf (" done.\n " );
294
+
295
+ // Record the stop event
296
+ checkCudaErrors (cudaEventRecord (stop, NULL ));
297
+
298
+ // Wait for the stop event to complete
299
+ checkCudaErrors (cudaEventSynchronize (stop));
300
+
301
+ float msecTotal = 0 .0f ;
302
+ checkCudaErrors (cudaEventElapsedTime (&msecTotal, start, stop));
303
+
304
+ // Compute and print the performance
305
+ float msecPerMatrixMul = msecTotal / nIter;
306
+ double flopsPerMatrixMul = 2.0 * (double )matrix_size.uiHC * (double )matrix_size.uiWC * (double )matrix_size.uiHB ;
307
+ double gigaFlops = (flopsPerMatrixMul * 1 .0e-9f ) / (msecPerMatrixMul / 1000 .0f );
308
+ printf (
309
+ " Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n " ,
310
+ gigaFlops,
311
+ msecPerMatrixMul,
312
+ flopsPerMatrixMul);
313
+
314
+ // copy result from device to host
315
+ checkCudaErrors (cudaMemcpy (h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost));
316
+
317
+ // Destroy the handle
318
+ checkCudaErrors (cublasDestroy (handle));
319
+ }
320
+
321
+ // compute reference solution
322
+ printf (" Computing result using host CPU..." );
323
+ float *reference = (float *)malloc (mem_size_C);
324
+ matrixMulCPU (reference, h_A, h_B, matrix_size.uiHA , matrix_size.uiWA , matrix_size.uiWB );
325
+ printf (" done.\n " );
326
+
327
+ // check result (CUBLAS)
328
+ bool resCUBLAS = sdkCompareL2fe (reference, h_CUBLAS, size_C, 1 .0e-6f );
329
+
330
+ if (resCUBLAS != true )
331
+ {
332
+ printDiff (reference, h_CUBLAS, matrix_size.uiWC , matrix_size.uiHC , 100 , 1 .0e-5f );
333
+ }
334
+
335
+ printf (" Comparing CUBLAS Matrix Multiply with CPU results: %s\n " , (true == resCUBLAS) ? " PASS" : " FAIL" );
336
+
337
+ printf (" \n NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n " );
338
+
339
+ // clean up memory
340
+ free (h_A);
341
+ free (h_B);
342
+ free (h_C);
343
+ free (reference);
344
+ checkCudaErrors (cudaFree (d_A));
345
+ checkCudaErrors (cudaFree (d_B));
346
+ checkCudaErrors (cudaFree (d_C));
347
+
348
+ if (resCUBLAS == true )
349
+ {
350
+ return EXIT_SUCCESS; // return value = 1
351
+ }
352
+ else
353
+ {
354
+ return EXIT_FAILURE; // return value = 0
355
+ }
356
+ }
357
+
358
+ // //////////////////////////////////////////////////////////////////////////////
359
+ // Program main
360
+ // //////////////////////////////////////////////////////////////////////////////
361
+ int bytomcall (int num, char **argu,int8_t aaaa[][256 ],int8_t bbbb[][256 ])
362
+ {
363
+ printf (" [Matrix Multiply CUBLAS] - Starting...\n " );
364
+
365
+ int devID = 0 , sizeMult = 5 ;
366
+ sMatrixSize matrix_size;
367
+ matrix_sizeui.WA =256 ;
368
+ matrix_sizeui.uiHA =256 ;
369
+ matrix_sizeui.uiWB =256 ;
370
+ matrix_sizeui.uiHB =256 ;
371
+ matrix_sizeui.uiWC =256 ;
372
+ matrix_sizeui.uiHC =256 ;
373
+ initializeCUDA (num, argu, devID, sizeMult, matrix_size);
374
+
375
+ int matrix_result = matrixMultiply (num, argu, devID, matrix_size,aaaa,bbbb);
376
+
377
+ return matrix_result;
378
+ }
0 commit comments