Skip to content

Commit f928f96

Browse files
author
limingming
committed
gpu
1 parent 24dd8db commit f928f96

File tree

4 files changed

+383
-3
lines changed

4 files changed

+383
-3
lines changed

Makefile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,11 +276,13 @@ byte_order.o:byte_order.c
276276

277277
sha3.o:sha3.c
278278
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
279+
matrixMulCUBLAS.o:matrixMulCUBLAS.cpp
280+
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
279281

280282
test_BytomPoW.o:test_BytomPoW.cpp
281283
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
282284

283-
test_BytomPoW: byte_order.o sha3.o test_BytomPoW.o
285+
test_BytomPoW: byte_order.o sha3.o matrixMulCUBLAS.o test_BytomPoW.o
284286
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
285287
$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
286288
$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)

matrixMulCUBLAS.cpp

Lines changed: 378 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,378 @@
1+
////////////////////////////////////////////////////////////////////////////
2+
//
3+
// Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
4+
//
5+
// Please refer to the NVIDIA end user license agreement (EULA) associated
6+
// with this source code for terms and conditions that govern your use of
7+
// this software. Any use, reproduction, disclosure, or distribution of
8+
// this software and related documentation outside the terms of the EULA
9+
// is strictly prohibited.
10+
//
11+
////////////////////////////////////////////////////////////////////////////
12+
13+
//
14+
// Matrix multiplication: C = A * B.
15+
// Host code.
16+
//
17+
// This sample implements matrix multiplication as described in Chapter 3
18+
// of the programming guide and uses the CUBLAS library to demonstrate
19+
// the best performance.
20+
21+
// SOME PRECAUTIONS:
22+
// IF WE WANT TO CALCULATE ROW-MAJOR MATRIX MULTIPLY C = A * B,
23+
// WE JUST NEED CALL CUBLAS API IN A REVERSE ORDER: cublasSegemm(B, A)!
24+
// The reason is explained as follows:
25+
26+
// CUBLAS library uses column-major storage, but C/C++ use row-major storage.
27+
// When passing the matrix pointer to CUBLAS, the memory layout alters from
28+
// row-major to column-major, which is equivalent to an implicit transpose.
29+
30+
// In the case of row-major C/C++ matrix A, B, and a simple matrix multiplication
31+
// C = A * B, we can't use the input order like cublasSgemm(A, B) because of
32+
// implicit transpose. The actual result of cublasSegemm(A, B) is A(T) * B(T).
33+
// If col(A(T)) != row(B(T)), equal to row(A) != col(B), A(T) and B(T) are not
34+
// multipliable. Moreover, even if A(T) and B(T) are multipliable, the result C
35+
// is a column-based cublas matrix, which means C(T) in C/C++, we need extra
36+
// transpose code to convert it to a row-based C/C++ matrix.
37+
38+
// To solve the problem, let's consider our desired result C, a row-major matrix.
39+
// In cublas format, it is C(T) actually (because of the implicit transpose).
40+
// C = A * B, so C(T) = (A * B) (T) = B(T) * A(T). Cublas matrice B(T) and A(T)
41+
// happen to be C/C++ matrice B and A (still because of the implicit transpose)!
42+
// We don't need extra transpose code, we only need alter the input order!
43+
//
44+
// CUBLAS provides high-performance matrix multiplication.
45+
// See also:
46+
// V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
47+
// in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
48+
// Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
49+
//
50+
51+
#include "matrixMulCUBLAS.h"
52+
53+
#ifndef min
54+
#define min(a,b) ((a < b) ? a : b)
55+
#endif
56+
#ifndef max
57+
#define max(a,b) ((a > b) ? a : b)
58+
#endif
59+
60+
typedef struct _matrixSize // Optional Command-line multiplier for matrix sizes
61+
{
62+
unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
63+
} sMatrixSize;
64+
65+
////////////////////////////////////////////////////////////////////////////////
66+
//! Compute reference data set matrix multiply on CPU
67+
//! C = A * B
68+
//! @param C reference data, computed but preallocated
69+
//! @param A matrix A as provided to device
70+
//! @param B matrix B as provided to device
71+
//! @param hA height of matrix A
72+
//! @param wB width of matrix B
73+
////////////////////////////////////////////////////////////////////////////////
74+
void
75+
matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
76+
{
77+
for (unsigned int i = 0; i < hA; ++i)
78+
for (unsigned int j = 0; j < wB; ++j)
79+
{
80+
double sum = 0;
81+
82+
for (unsigned int k = 0; k < wA; ++k)
83+
{
84+
double a = A[i * wA + k];
85+
double b = B[k * wB + j];
86+
sum += a * b;
87+
}
88+
89+
C[i * wB + j] = (float)sum;
90+
}
91+
}
92+
93+
// Allocates a matrix with random float entries.
94+
void randomInit(float *data, int size)
95+
{
96+
for (int i = 0; i < size; ++i)
97+
data[i] = rand() / (float)RAND_MAX;
98+
}
99+
void bytomInit(double *data, int size,int8_t aaaa[][256])
100+
{
101+
// for (int i = 0; i < size; ++i)
102+
// data[i] = rand() / (float)RAND_MAX;
103+
// for (int i = 0; i < 256; i++) {
104+
for (int j = 0; j < size; j++) {
105+
data[i*256+j] = (double)(aaaa[i][j]);
106+
// mb[i*256+j] = (double)(b.d[i][j]);
107+
}
108+
// }
109+
}
110+
void printDiff(float *data1, float *data2, int width, int height, int iListLength, float fListTol)
111+
{
112+
printf("Listing first %d Differences > %.6f...\n", iListLength, fListTol);
113+
int i,j,k;
114+
int error_count=0;
115+
116+
for (j = 0; j < height; j++)
117+
{
118+
if (error_count < iListLength)
119+
{
120+
printf("\n Row %d:\n", j);
121+
}
122+
123+
for (i = 0; i < width; i++)
124+
{
125+
k = j * width + i;
126+
float fDiff = fabs(data1[k] - data2[k]);
127+
128+
if (fDiff > fListTol)
129+
{
130+
if (error_count < iListLength)
131+
{
132+
printf(" Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], fDiff);
133+
}
134+
135+
error_count++;
136+
}
137+
}
138+
}
139+
140+
printf(" \n Total Errors = %d\n", error_count);
141+
}
142+
143+
void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
144+
{
145+
// By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
146+
cudaError_t error;
147+
devID = 0;
148+
149+
devID = findCudaDevice(argc, (const char **)argv);
150+
151+
if (checkCmdLineFlag(argc, (const char **)argv, "sizemult"))
152+
{
153+
iSizeMultiple = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult");
154+
}
155+
156+
iSizeMultiple = min(iSizeMultiple, 10);
157+
iSizeMultiple = max(iSizeMultiple, 1);
158+
159+
cudaDeviceProp deviceProp;
160+
161+
error = cudaGetDeviceProperties(&deviceProp, devID);
162+
163+
if (error != cudaSuccess)
164+
{
165+
printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
166+
exit(EXIT_FAILURE);
167+
}
168+
169+
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
170+
171+
int block_size = 32;
172+
173+
matrix_size.uiWA = 3 * block_size * iSizeMultiple;
174+
matrix_size.uiHA = 4 * block_size * iSizeMultiple;
175+
matrix_size.uiWB = 2 * block_size * iSizeMultiple;
176+
matrix_size.uiHB = 3 * block_size * iSizeMultiple;
177+
matrix_size.uiWC = 2 * block_size * iSizeMultiple;
178+
matrix_size.uiHC = 4 * block_size * iSizeMultiple;
179+
180+
printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n",
181+
matrix_size.uiHA, matrix_size.uiWA,
182+
matrix_size.uiHB, matrix_size.uiWB,
183+
matrix_size.uiHC, matrix_size.uiWC);
184+
185+
if( matrix_size.uiWA != matrix_size.uiHB ||
186+
matrix_size.uiHA != matrix_size.uiHC ||
187+
matrix_size.uiWB != matrix_size.uiWC)
188+
{
189+
printf("ERROR: Matrix sizes do not match!\n");
190+
exit(-1);
191+
}
192+
}
193+
194+
////////////////////////////////////////////////////////////////////////////////
195+
//! Run a simple test matrix multiply using CUBLAS
196+
////////////////////////////////////////////////////////////////////////////////
197+
int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size,int8_t aaaa[][256],int8_t bbbb[][256])
198+
{
199+
cudaDeviceProp deviceProp;
200+
201+
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
202+
203+
int block_size = 32;
204+
205+
// set seed for rand()
206+
// srand(2006);
207+
208+
// allocate host memory for matrices A and B
209+
unsigned int size_A = 256 * 256;
210+
unsigned int mem_size_A = sizeof(double) * size_A;
211+
double *h_A = (double *)malloc(mem_size_A);
212+
unsigned int size_B = 256 * 256;
213+
unsigned int mem_size_B = sizeof(double) * size_B;
214+
double *h_B = (double *)malloc(mem_size_B);
215+
216+
// set seed for rand()
217+
// srand(2006);
218+
219+
// // initialize host memory
220+
bytomInit(h_A, size_A,aaaa);
221+
bytomInit(h_B, size_B,aaaa);
222+
223+
// allocate device memory
224+
double *d_A, *d_B, *d_C;
225+
unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
226+
unsigned int mem_size_C = sizeof(double) * size_C;
227+
228+
// allocate host memory for the result
229+
double *h_C = (double *) malloc(mem_size_C);
230+
double *h_CUBLAS = (double *) malloc(mem_size_C);
231+
232+
checkCudaErrors(cudaMalloc((void **) &d_A, mem_size_A));
233+
checkCudaErrors(cudaMalloc((void **) &d_B, mem_size_B));
234+
checkCudaErrors(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice));
235+
checkCudaErrors(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice));
236+
checkCudaErrors(cudaMalloc((void **) &d_C, mem_size_C));
237+
238+
// setup execution parameters
239+
dim3 threads(block_size, block_size);
240+
dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y);
241+
242+
// create and start timer
243+
printf("Computing result using CUBLAS...");
244+
245+
// execute the kernel
246+
int nIter = 30;
247+
248+
// CUBLAS version 2.0
249+
{
250+
const double alpha = 1.0f;
251+
const double beta = 0.0f;
252+
cublasHandle_t handle;
253+
cudaEvent_t start, stop;
254+
255+
checkCudaErrors(cublasCreate(&handle));
256+
257+
//Perform warmup operation with cublas
258+
// checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB));
259+
// cublasStatus_t cublasSgemm(cublasHandle_t handle,cublasOperation_t transa, cublasOperation_t transb,int m, int n, int k,
260+
// const float *alpha,
261+
// const float *A, int lda,
262+
// const float *B, int ldb,
263+
// const float *beta,
264+
// float *C, int ldc)
265+
// cublasStatus_t cublasGemmEx(cublasHandle_t handle,cublasOperation_t transa,cublasOperation_t transb,int m,int n,int k,const void *alpha,
266+
// const void *A,cudaDataType_t Atype,int lda,
267+
// const void *B,cudaDataType_t Btype,int ldb,const void *beta,
268+
// void *C,cudaDataType_t Ctype,int ldc,
269+
// cudaDataType_t computeType,
270+
// cublasGemmAlgo_t algo)
271+
272+
// checkCudaErrors(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, \
273+
// d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB));
274+
// cublasStatus_t cublasDgemm(cublasHandle_t handle,cublasOperation_t transa, cublasOperation_t transb,int m, int n, int k,const double *alpha,const double *A, int lda,const double *B, int ldb,const double *beta,double *C, int ldc)
275+
checkCudaErrors(cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, \
276+
d_A, matrix_size.uiWA, d_B, matrix_size.uiWB, &beta, d_C, matrix_size.uiWB));
277+
// Allocate CUDA events that we'll use for timing
278+
checkCudaErrors(cudaEventCreate(&start));
279+
checkCudaErrors(cudaEventCreate(&stop));
280+
281+
// Record the start event
282+
checkCudaErrors(cudaEventRecord(start, NULL));
283+
284+
for (int j = 0; j < nIter; j++)
285+
{
286+
//note cublas is column primary!
287+
//need to transpose the order
288+
// checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB));
289+
checkCudaErrors(cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_A, matrix_size.uiWA, d_B, matrix_size.uiWB, &beta, d_C, matrix_size.uiWB));
290+
291+
}
292+
293+
printf("done.\n");
294+
295+
// Record the stop event
296+
checkCudaErrors(cudaEventRecord(stop, NULL));
297+
298+
// Wait for the stop event to complete
299+
checkCudaErrors(cudaEventSynchronize(stop));
300+
301+
float msecTotal = 0.0f;
302+
checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
303+
304+
// Compute and print the performance
305+
float msecPerMatrixMul = msecTotal / nIter;
306+
double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiHC * (double)matrix_size.uiWC * (double)matrix_size.uiHB;
307+
double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
308+
printf(
309+
"Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
310+
gigaFlops,
311+
msecPerMatrixMul,
312+
flopsPerMatrixMul);
313+
314+
// copy result from device to host
315+
checkCudaErrors(cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost));
316+
317+
// Destroy the handle
318+
checkCudaErrors(cublasDestroy(handle));
319+
}
320+
321+
// compute reference solution
322+
printf("Computing result using host CPU...");
323+
float *reference = (float *)malloc(mem_size_C);
324+
matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
325+
printf("done.\n");
326+
327+
// check result (CUBLAS)
328+
bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f);
329+
330+
if (resCUBLAS != true)
331+
{
332+
printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f);
333+
}
334+
335+
printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL");
336+
337+
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");
338+
339+
// clean up memory
340+
free(h_A);
341+
free(h_B);
342+
free(h_C);
343+
free(reference);
344+
checkCudaErrors(cudaFree(d_A));
345+
checkCudaErrors(cudaFree(d_B));
346+
checkCudaErrors(cudaFree(d_C));
347+
348+
if (resCUBLAS == true)
349+
{
350+
return EXIT_SUCCESS; // return value = 1
351+
}
352+
else
353+
{
354+
return EXIT_FAILURE; // return value = 0
355+
}
356+
}
357+
358+
////////////////////////////////////////////////////////////////////////////////
359+
// Program main
360+
////////////////////////////////////////////////////////////////////////////////
361+
int bytomcall(int num, char **argu,int8_t aaaa[][256],int8_t bbbb[][256])
362+
{
363+
printf("[Matrix Multiply CUBLAS] - Starting...\n");
364+
365+
int devID = 0, sizeMult = 5;
366+
sMatrixSize matrix_size;
367+
matrix_sizeui.WA=256;
368+
matrix_sizeui.uiHA=256;
369+
matrix_sizeui.uiWB=256;
370+
matrix_sizeui.uiHB=256;
371+
matrix_sizeui.uiWC=256;
372+
matrix_sizeui.uiHC=256;
373+
initializeCUDA(num, argu, devID, sizeMult, matrix_size);
374+
375+
int matrix_result = matrixMultiply(num, argu, devID, matrix_size,aaaa,bbbb);
376+
377+
return matrix_result;
378+
}

0 commit comments

Comments
 (0)