BytomDAO
diff --git a/‎Makefile
Lines changed: 3 additions & 1 deletion b/‎Makefile
Lines changed: 3 additions & 1 deletion
diff --git a/‎matrixMulCUBLAS.cpp
Lines changed: 378 additions & 0 deletions b/‎matrixMulCUBLAS.cpp
Lines changed: 378 additions & 0 deletions
@@ -276,11 +276,13 @@ byte_order.o:byte_order.c
 
 sha3.o:sha3.c
 	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+matrixMulCUBLAS.o:matrixMulCUBLAS.cpp
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
 
 test_BytomPoW.o:test_BytomPoW.cpp
 	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
 
-test_BytomPoW: byte_order.o sha3.o test_BytomPoW.o
+test_BytomPoW: byte_order.o sha3.o matrixMulCUBLAS.o test_BytomPoW.o
 	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
 	$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
 	$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
 
@@ -0,0 +1,378 @@
+////////////////////////////////////////////////////////////////////////////
+//
+// Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+//
+// Please refer to the NVIDIA end user license agreement (EULA) associated
+// with this source code for terms and conditions that govern your use of
+// this software. Any use, reproduction, disclosure, or distribution of
+// this software and related documentation outside the terms of the EULA
+// is strictly prohibited.
+//
+////////////////////////////////////////////////////////////////////////////
+
+//
+// Matrix multiplication: C = A * B.
+// Host code.
+//
+// This sample implements matrix multiplication as described in Chapter 3
+// of the programming guide and uses the CUBLAS library to demonstrate
+// the best performance.
+
+// SOME PRECAUTIONS:
+// IF WE WANT TO CALCULATE ROW-MAJOR MATRIX MULTIPLY C = A * B,
+// WE JUST NEED TO CALL THE CUBLAS API IN REVERSE ORDER: cublasSgemm(B, A)!
+// The reason is explained as follows:
+
+// CUBLAS library uses column-major storage, but C/C++ use row-major storage.
+// When passing the matrix pointer to CUBLAS, the memory layout alters from
+// row-major to column-major, which is equivalent to an implicit transpose.
+
+// In the case of row-major C/C++ matrices A and B, and a simple matrix multiplication
+// C = A * B, we can't use the input order cublasSgemm(A, B) because of the
+// implicit transpose. The actual result of cublasSgemm(A, B) is A(T) * B(T).
+// If col(A(T)) != row(B(T)), which is equivalent to row(A) != col(B), A(T) and B(T)
+// cannot be multiplied. Moreover, even if A(T) and B(T) can be multiplied, the result C
+// is a column-major cublas matrix, which means C(T) in C/C++, so we would need extra
+// transpose code to convert it to a row-major C/C++ matrix.
+
+// To solve the problem, let's consider our desired result C, a row-major matrix.
+// In cublas format, it is actually C(T) (because of the implicit transpose).
+// C = A * B, so C(T) = (A * B)(T) = B(T) * A(T). The cublas matrices B(T) and A(T)
+// happen to be the C/C++ matrices B and A (again because of the implicit transpose)!
+// We don't need extra transpose code; we only need to alter the input order!
+//
+// CUBLAS provides high-performance matrix multiplication.
+// See also:
+// V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
+// in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
+// Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
+//
+
+#include "matrixMulCUBLAS.h"
+
+// Fallback min/max for toolchains that do not already provide them.
+// Every argument use and the whole result are parenthesized so that compound
+// expressions (e.g. min(x & mask, y + 1)) expand with the intended precedence.
+// Each argument may still be evaluated twice, so avoid passing expressions
+// with side effects.
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+#ifndef max
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+#endif
+
+// Dimensions of the three matrices taking part in C = A * B.
+// A "uiW" field is a width (number of columns), a "uiH" field a height
+// (number of rows).
+typedef struct _matrixSize
+{
+    unsigned int uiWA;  // width of A
+    unsigned int uiHA;  // height of A
+    unsigned int uiWB;  // width of B
+    unsigned int uiHB;  // height of B
+    unsigned int uiWC;  // width of C
+    unsigned int uiHC;  // height of C
+} sMatrixSize;
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reference matrix multiply on the CPU: C = A * B.
+//! Element (r, c) of a matrix of width w is read/written at index r * w + c.
+//! @param C   output matrix (hA x wB), preallocated by the caller
+//! @param A   left operand (hA x wA)
+//! @param B   right operand (wA x wB)
+//! @param hA  height of matrix A
+//! @param wA  width of matrix A (also the height of matrix B)
+//! @param wB  width of matrix B
+////////////////////////////////////////////////////////////////////////////////
+void
+matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
+{
+    for (unsigned int row = 0; row < hA; ++row)
+    {
+        for (unsigned int col = 0; col < wB; ++col)
+        {
+            // Accumulate the dot product in double, narrowing only once at the end.
+            double acc = 0;
+            unsigned int k = 0;
+
+            while (k < wA)
+            {
+                const double lhs = A[row * wA + k];
+                const double rhs = B[k * wB + col];
+                acc += lhs * rhs;
+                ++k;
+            }
+
+            C[row * wB + col] = (float)acc;
+        }
+    }
+}
+
+// Fills data[0 .. size-1] with pseudo-random values computed as
+// rand() / RAND_MAX. The buffer must already be allocated by the caller;
+// nothing is written when size is zero or negative.
+void randomInit(float *data, int size)
+{
+    const float denom = (float)RAND_MAX;
+    int idx = 0;
+
+    while (idx < size)
+    {
+        data[idx] = rand() / denom;
+        ++idx;
+    }
+}
+// Converts a table of int8_t values with 256 columns into a flat buffer of
+// doubles, so that data[r * 256 + c] == aaaa[r][c].
+//
+// @param data  destination buffer holding at least `size` doubles
+// @param size  total number of elements to convert (rows * 256); the caller
+//              in this file passes 256 * 256. Nothing is written when size is
+//              zero or negative.
+// @param aaaa  source table; must contain at least ceil(size / 256) rows
+//
+// The previous version kept only the inner loop of a commented-out 2D loop,
+// leaving the row index undeclared; the row and column are now both derived
+// from the flat element index.
+void bytomInit(double *data, int size,int8_t aaaa[][256])
+{
+    const int cols = 256;
+
+    for (int idx = 0; idx < size; ++idx)
+    {
+        data[idx] = (double)(aaaa[idx / cols][idx % cols]);
+    }
+}
+// Compares two width x height float arrays element by element and prints the
+// positions whose absolute difference exceeds fListTol.
+// At most iListLength individual differences are listed (a row header is
+// printed for every row visited while that limit has not been reached), but
+// all differences are counted and the total is printed at the end.
+// data1 is labelled "CPU" and data2 "GPU" in the output.
+void printDiff(float *data1, float *data2, int width, int height, int iListLength, float fListTol)
+{
+    printf("Listing first %d Differences > %.6f...\n", iListLength, fListTol);
+
+    int mismatches = 0;
+
+    for (int row = 0; row < height; row++)
+    {
+        // Keep emitting row headers only while there is listing budget left.
+        if (mismatches < iListLength)
+        {
+            printf("\n  Row %d:\n", row);
+        }
+
+        for (int col = 0; col < width; col++)
+        {
+            const int idx = row * width + col;
+            const float fDiff = fabs(data1[idx] - data2[idx]);
+
+            // Within tolerance (or not comparable): nothing to report.
+            if (!(fDiff > fListTol))
+            {
+                continue;
+            }
+
+            if (mismatches < iListLength)
+            {
+                printf("    Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", col, row, data1[idx], data2[idx], fDiff);
+            }
+
+            mismatches++;
+        }
+    }
+
+    printf(" \n  Total Errors = %d\n", mismatches);
+}
+
+// Picks the CUDA device, clamps the size multiplier and fills in matrix_size.
+//
+// @param argc, argv     command line, forwarded to the device-selection and
+//                       argument-parsing helpers (defined outside this file)
+// @param devID          out: device ID returned by findCudaDevice
+// @param iSizeMultiple  in/out: replaced by the "sizemult" command-line value
+//                       when that flag is present, then clamped to [1, 10]
+// @param matrix_size    out: A is (4 x 3), B is (3 x 2) and C is (4 x 2) in
+//                       units of 32 * iSizeMultiple elements (height x width)
+//
+// Exits the process if the device properties cannot be queried or if the
+// resulting dimensions are inconsistent.
+void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
+{
+    const char **args = (const char **)argv;
+
+    // Device 0 is the default; findCudaDevice supplies the ID actually used.
+    devID = 0;
+    devID = findCudaDevice(argc, args);
+
+    if (checkCmdLineFlag(argc, args, "sizemult"))
+    {
+        iSizeMultiple = getCmdLineArgumentInt(argc, args, "sizemult");
+    }
+
+    // Clamp the multiplier to the range [1, 10].
+    if (iSizeMultiple > 10)
+    {
+        iSizeMultiple = 10;
+    }
+
+    if (iSizeMultiple < 1)
+    {
+        iSizeMultiple = 1;
+    }
+
+    cudaDeviceProp deviceProp;
+    const cudaError_t status = cudaGetDeviceProperties(&deviceProp, devID);
+
+    if (status != cudaSuccess)
+    {
+        printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", status, __LINE__);
+        exit(EXIT_FAILURE);
+    }
+
+    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
+
+    // All dimensions are multiples of one scaled block edge.
+    const int block_size = 32;
+    const int unit = block_size * iSizeMultiple;
+
+    matrix_size.uiHA = 4 * unit;
+    matrix_size.uiWA = 3 * unit;
+    matrix_size.uiHB = 3 * unit;
+    matrix_size.uiWB = 2 * unit;
+    matrix_size.uiHC = 4 * unit;
+    matrix_size.uiWC = 2 * unit;
+
+    printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n",
+           matrix_size.uiHA, matrix_size.uiWA,
+           matrix_size.uiHB, matrix_size.uiWB,
+           matrix_size.uiHC, matrix_size.uiWC);
+
+    // C = A * B requires: width(A) == height(B), height(C) == height(A),
+    // width(C) == width(B).
+    const bool innerMatches = (matrix_size.uiWA == matrix_size.uiHB);
+    const bool rowsMatch    = (matrix_size.uiHA == matrix_size.uiHC);
+    const bool colsMatch    = (matrix_size.uiWB == matrix_size.uiWC);
+
+    if (!(innerMatches && rowsMatch && colsMatch))
+    {
+        printf("ERROR: Matrix sizes do not match!\n");
+        exit(-1);
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Multiply two 256x256 int8 tables with cuBLAS (C = A * B, row-major) and
+//! verify the result against a CPU reference.
+//!
+//! @param argc, argv   unused; kept so existing callers keep compiling
+//! @param devID        CUDA device whose properties are queried
+//! @param matrix_size  matrix dimensions; every field must be 256 because the
+//!                     inputs are fixed 256-column tables
+//! @param aaaa         matrix A, at least 256 rows of 256 int8 values
+//! @param bbbb         matrix B, at least 256 rows of 256 int8 values
+//! @return EXIT_SUCCESS when the cuBLAS result matches the CPU reference,
+//!         EXIT_FAILURE on mismatch, unsupported dimensions or failed host
+//!         allocation. CUDA/cuBLAS errors are handled by checkCudaErrors.
+////////////////////////////////////////////////////////////////////////////////
+int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size,int8_t aaaa[][256],int8_t bbbb[][256])
+{
+    (void)argc;
+    (void)argv;
+
+    // The host/device buffers below are sized for 256x256 inputs. Reject any
+    // other dimension up front, otherwise the GEMM would read and write past
+    // the end of the device allocations.
+    const unsigned int dim = 256;
+
+    if (matrix_size.uiWA != dim || matrix_size.uiHA != dim ||
+        matrix_size.uiWB != dim || matrix_size.uiHB != dim ||
+        matrix_size.uiWC != dim || matrix_size.uiHC != dim)
+    {
+        printf("ERROR: matrixMultiply requires %ux%u matrices!\n", dim, dim);
+        return EXIT_FAILURE;
+    }
+
+    // Queried only to validate devID; the properties themselves are not used.
+    cudaDeviceProp deviceProp;
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
+
+    const unsigned int size_A = dim * dim;
+    const unsigned int size_B = dim * dim;
+    const unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
+    const size_t mem_size_A = sizeof(double) * size_A;
+    const size_t mem_size_B = sizeof(double) * size_B;
+    const size_t mem_size_C = sizeof(double) * size_C;
+
+    // Host buffers: double precision for cuBLAS, single precision copies for
+    // the float-based CPU reference and comparison helpers.
+    double *h_A      = (double *)malloc(mem_size_A);
+    double *h_B      = (double *)malloc(mem_size_B);
+    double *h_CUBLAS = (double *)malloc(mem_size_C);
+    float *ref_A      = (float *)malloc(sizeof(float) * size_A);
+    float *ref_B      = (float *)malloc(sizeof(float) * size_B);
+    float *reference  = (float *)malloc(sizeof(float) * size_C);
+    float *gpu_result = (float *)malloc(sizeof(float) * size_C);
+
+    if (h_A == NULL || h_B == NULL || h_CUBLAS == NULL || ref_A == NULL ||
+        ref_B == NULL || reference == NULL || gpu_result == NULL)
+    {
+        printf("ERROR: host memory allocation failed!\n");
+        free(h_A);
+        free(h_B);
+        free(h_CUBLAS);
+        free(ref_A);
+        free(ref_B);
+        free(reference);
+        free(gpu_result);
+        return EXIT_FAILURE;
+    }
+
+    // initialize host memory: A from aaaa, B from bbbb
+    bytomInit(h_A, (int)size_A, aaaa);
+    bytomInit(h_B, (int)size_B, bbbb);
+
+    // int8 values are exactly representable in float, so these copies are lossless.
+    for (unsigned int idx = 0; idx < size_A; ++idx)
+    {
+        ref_A[idx] = (float)h_A[idx];
+    }
+
+    for (unsigned int idx = 0; idx < size_B; ++idx)
+    {
+        ref_B[idx] = (float)h_B[idx];
+    }
+
+    // allocate device memory and upload the operands
+    double *d_A, *d_B, *d_C;
+    checkCudaErrors(cudaMalloc((void **) &d_A, mem_size_A));
+    checkCudaErrors(cudaMalloc((void **) &d_B, mem_size_B));
+    checkCudaErrors(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMalloc((void **) &d_C, mem_size_C));
+
+    printf("Computing result using CUBLAS...");
+
+    // number of timed GEMM calls
+    const int nIter = 30;
+
+    // CUBLAS version 2.0
+    {
+        const double alpha = 1.0;
+        const double beta  = 0.0;
+        cublasHandle_t handle;
+        cudaEvent_t start, stop;
+
+        checkCudaErrors(cublasCreate(&handle));
+
+        // cuBLAS is column-major, so the row-major product C = A * B is
+        // obtained by asking for C(T) = B(T) * A(T): B is passed as the first
+        // operand and A as the second, with no explicit transposes.
+
+        // Perform warmup operation with cublas
+        checkCudaErrors(cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB));
+
+        // Allocate CUDA events that we'll use for timing
+        checkCudaErrors(cudaEventCreate(&start));
+        checkCudaErrors(cudaEventCreate(&stop));
+
+        // Record the start event
+        checkCudaErrors(cudaEventRecord(start, NULL));
+
+        for (int j = 0; j < nIter; j++)
+        {
+            checkCudaErrors(cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB));
+        }
+
+        printf("done.\n");
+
+        // Record the stop event
+        checkCudaErrors(cudaEventRecord(stop, NULL));
+
+        // Wait for the stop event to complete
+        checkCudaErrors(cudaEventSynchronize(stop));
+
+        float msecTotal = 0.0f;
+        checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
+
+        // Compute and print the performance
+        float msecPerMatrixMul = msecTotal / nIter;
+        double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiHC * (double)matrix_size.uiWC * (double)matrix_size.uiHB;
+        double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
+        printf(
+            "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
+            gigaFlops,
+            msecPerMatrixMul,
+            flopsPerMatrixMul);
+
+        // copy result from device to host
+        checkCudaErrors(cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost));
+
+        // Release the timing events and the cuBLAS handle
+        checkCudaErrors(cudaEventDestroy(start));
+        checkCudaErrors(cudaEventDestroy(stop));
+        checkCudaErrors(cublasDestroy(handle));
+    }
+
+    // compute reference solution
+    printf("Computing result using host CPU...");
+    matrixMulCPU(reference, ref_A, ref_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
+    printf("done.\n");
+
+    // The comparison helpers work on float. Each output element is a sum of
+    // 256 products of int8 values, i.e. at most 256 * 128 * 128 = 2^22 in
+    // magnitude, which float represents exactly, so narrowing loses nothing.
+    for (unsigned int idx = 0; idx < size_C; ++idx)
+    {
+        gpu_result[idx] = (float)h_CUBLAS[idx];
+    }
+
+    // check result (CUBLAS)
+    bool resCUBLAS = sdkCompareL2fe(reference, gpu_result, size_C, 1.0e-6f);
+
+    if (resCUBLAS != true)
+    {
+        printDiff(reference, gpu_result, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f);
+    }
+
+    printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL");
+
+    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");
+
+    // clean up memory
+    free(h_A);
+    free(h_B);
+    free(h_CUBLAS);
+    free(ref_A);
+    free(ref_B);
+    free(reference);
+    free(gpu_result);
+    checkCudaErrors(cudaFree(d_A));
+    checkCudaErrors(cudaFree(d_B));
+    checkCudaErrors(cudaFree(d_C));
+
+    // EXIT_SUCCESS (0) when the results match, EXIT_FAILURE otherwise.
+    return resCUBLAS ? EXIT_SUCCESS : EXIT_FAILURE;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Entry point for callers: multiplies the two 256-column int8 tables with
+//! cuBLAS and checks the result against the CPU.
+//!
+//! @param num, argu  command-line style arguments, forwarded to initializeCUDA
+//!                   and matrixMultiply
+//! @param aaaa       matrix A
+//! @param bbbb       matrix B
+//! @return the value returned by matrixMultiply
+////////////////////////////////////////////////////////////////////////////////
+int bytomcall(int num, char **argu,int8_t aaaa[][256],int8_t bbbb[][256])
+{
+    printf("[Matrix Multiply CUBLAS] - Starting...\n");
+
+    int devID = 0, sizeMult = 5;
+    sMatrixSize matrix_size;
+
+    // initializeCUDA selects the device but also fills matrix_size with its
+    // own sample dimensions, so it has to run before the real dimensions are
+    // set; otherwise the 256x256 values below would be overwritten.
+    initializeCUDA(num, argu, devID, sizeMult, matrix_size);
+
+    matrix_size.uiWA = 256;
+    matrix_size.uiHA = 256;
+    matrix_size.uiWB = 256;
+    matrix_size.uiHB = 256;
+    matrix_size.uiWC = 256;
+    matrix_size.uiHC = 256;
+
+    int matrix_result = matrixMultiply(num, argu, devID, matrix_size,aaaa,bbbb);
+
+    return matrix_result;
+}