From 372393d4c4c77e2c09e4810a51a44206855883cb Mon Sep 17 00:00:00 2001 From: Andrew C Date: Wed, 26 Apr 2023 20:08:03 +1000 Subject: [PATCH 1/3] Add initial rocm support for running on AMD GPUs --- CMakeLists.txt | 45 ++++++++- ggml-rocm.cpp | 256 +++++++++++++++++++++++++++++++++++++++++++++++++ ggml-rocm.h | 41 ++++++++ 3 files changed, 341 insertions(+), 1 deletion(-) create mode 100644 ggml-rocm.cpp create mode 100644 ggml-rocm.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 11ebe9eb66fae..8480624917dd9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,6 +67,7 @@ endif() option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF) option(LLAMA_CUBLAS "llama: use cuBLAS" OFF) +option(LLAMA_ROCBLAS "llama: use rocBLAS" ON) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) @@ -168,6 +169,47 @@ if (LLAMA_CUBLAS) endif() endif() +macro(find_package_and_print_version PACKAGE_NAME) + find_package("${PACKAGE_NAME}" ${ARGN}) + message("${PACKAGE_NAME} VERSION: ${${PACKAGE_NAME}_VERSION}") +endmacro() + +if (LLAMA_ROCBLAS) + + cmake_minimum_required(VERSION 3.17) + + if(true) + message(STATUS "AMD HIP found") + + set(hip_library_name amdhip64) + message("HIP library name: ${hip_library_name}") + + set(GGML_ROCM_SOURCES ggml-rocm.cpp ggml-rocm.h) + set(ROCM_PATH /opt/rocm) + set(ROCM_INCLUDE_DIRS ${ROCM_PATH}/include) + set(HIP_PATH ${ROCM_PATH}/hip) + set(HCC_PATH ${ROCM_PATH}/hcc) + set(HIPCC_PATH ${ROCM_PATH}/hip/bin/hipcc) + + set(CMAKE_MODULE_PATH ${HIP_PATH}/cmake ${CMAKE_MODULE_PATH}) + + set(CMAKE_HCC_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) + set(CMAKE_HCC_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) + + find_package(hip REQUIRED) + find_package(rocblas REQUIRED) + find_package(rocfft REQUIRED) + find_package(AMDDeviceLibs REQUIRED CONFIG) + + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::host hip::device roc::rocblas roc::rocfft) + set(CMAKE_CXX_COMPILER=${ROCM_PATH}/llvm/bin/clang++) + set(CMAKE_C_COMPILER=${ROCM_PATH}/llvm/bin/clang) + + else() + message(WARNING "rocBLAS not found") + endif() +endif() + if (LLAMA_ALL_WARNINGS) if (NOT MSVC) set(c_flags @@ -307,7 +349,8 @@ endif() add_library(ggml OBJECT ggml.c ggml.h - ${GGML_CUDA_SOURCES}) + ${GGML_CUDA_SOURCES} + ${GGML_ROCM_SOURCES}) target_include_directories(ggml PUBLIC .) target_compile_features(ggml PUBLIC c_std_11) # don't bump diff --git a/ggml-rocm.cpp b/ggml-rocm.cpp new file mode 100644 index 0000000000000..6ab85b15ff0d0 --- /dev/null +++ b/ggml-rocm.cpp @@ -0,0 +1,256 @@ +#include +#include +#include +#include +#include +#include +#include "ggml-rocm.h" + +typedef uint16_t ggml_fp16_t; +static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size"); + +#define QK4_0 32 +typedef struct { + float d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; +static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding"); + +#define QK4_1 32 +typedef struct { + float d; // delta + float m; // min + uint8_t qs[QK4_1 / 2]; // nibbles / quants +} block_q4_1; +static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding"); + +#define QK4_2 16 +typedef struct { + __half d; // delta + uint8_t qs[QK4_2 / 2]; // nibbles / quants +} block_q4_2; +static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding"); + +#define QK4_3 16 +typedef struct { + __half d; // delta + __half m; // min + uint8_t qs[QK4_3 / 2]; // nibbles / quants +} block_q4_3; +static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding"); + +#define QK8_0 32 +typedef struct { + float d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; +static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding"); + +__global__ void dequantize_block_q4_0(const void * vx, float * y) { + const block_q4_0 * x = (const block_q4_0 *) vx; + + const int i = hipBlockIdx_x; + + const float d = x[i].d; + + const uint8_t * pp = x[i].qs; + + for (int l = 0; l < QK4_0; l += 2) { + const uint8_t vi = pp[l/2]; + + const int8_t vi0 = vi & 0xf; + const int8_t vi1 = vi >> 4; + + const float v0 = (vi0 - 8)*d; + const float v1 = (vi1 - 8)*d; + + y[i*QK4_0 + l + 0] = v0; + y[i*QK4_0 + l + 1] = v1; + } +} + +__global__ void dequantize_block_q4_1(const void * vx, float * y) { + const block_q4_1 * x = (const block_q4_1 *) vx; + + const int i = hipBlockIdx_x; + + const float d = x[i].d; + const float m = x[i].m; + + const uint8_t * pp = x[i].qs; + + for (int l = 0; l < QK4_1; l += 2) { + const uint8_t vi = pp[l/2]; + + const int8_t vi0 = vi & 0xf; + const int8_t vi1 = vi >> 4; + + const float v0 = vi0*d + m; + const float v1 = vi1*d + m; + + y[i*QK4_1 + l + 0] = v0; + y[i*QK4_1 + l + 1] = v1; + } +} + +__global__ void dequantize_block_q4_2(const void * vx, float * y) { + const block_q4_2 * x = (const block_q4_2 *) vx; + + const int i = hipBlockIdx_x; + + const float d = x[i].d; + + const uint8_t * pp = x[i].qs; + + for (int l = 0; l < QK4_2; l += 2) { + const uint8_t vi = pp[l/2]; + + const int8_t vi0 = vi & 0xf; + const int8_t vi1 = vi >> 4; + + const float v0 = (vi0 - 8)*d; + const float v1 = (vi1 - 8)*d; + + y[i*QK4_2 + l + 0] = v0; + y[i*QK4_2 + l + 1] = v1; + } +} + +__global__ void dequantize_block_q4_3(const void * vx, float * y) { + const block_q4_3 * x = (const block_q4_3 *) vx; + + const int i = hipBlockIdx_x; + + const float d = x[i].d; + const float m = x[i].m; + + const uint8_t * pp = x[i].qs; + + for (int l = 0; l < QK4_3; l += 2) { + const uint8_t vi = pp[l/2]; + + const int8_t vi0 = vi & 0xf; + const int8_t vi1 = vi >> 4; + + const float v0 = vi0*d + m; + const float v1 = vi1*d + m; + + y[i*QK4_3 + l + 0] = v0; + y[i*QK4_3 + l + 1] = v1; + } +} + +__global__ void dequantize_block_q8_0(const void * vx, float * y) { + const block_q8_0 * x = (const block_q8_0 *) vx; + + const int i = hipBlockIdx_x; + + const float d = x[i].d; + + const int8_t * pp = x[i].qs; + + for (int l = 0; l < QK8_0; l++) { + const int8_t vi = pp[l]; + + y[i*QK8_0 + l] = vi*d; + } +} + +void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, hipStream_t stream) { + const int nb = k / QK4_0; + hipLaunchKernelGGL(dequantize_block_q4_0, dim3(nb), dim3(1), 0, stream, vx, y); +} + +void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, hipStream_t stream) { + const int nb = k / QK4_1; + hipLaunchKernelGGL(dequantize_block_q4_1, dim3(nb), dim3(1), 0, stream, vx, y); +} + +void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, hipStream_t stream) { + const int nb = k / QK4_2; + hipLaunchKernelGGL(dequantize_block_q4_2, dim3(nb), dim3(1), 0, stream, vx, y); +} + +void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, hipStream_t stream) { + const int nb = k / QK4_3; + hipLaunchKernelGGL(dequantize_block_q4_3, dim3(nb), dim3(1), 0, stream, vx, y); +} + +void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, hipStream_t stream) { + const int nb = k / QK8_0; + hipLaunchKernelGGL(dequantize_block_q8_0, dim3(nb), dim3(1), 0, stream, vx, y); +} + +// buffer pool for cuda +#define MAX_CUDA_BUFFERS 16 + +struct scoped_spin_lock { + std::atomic_flag& lock; + while (lock.test_and_set(std::memory_order_acquire)) { + ; // spin + } + ~scoped_spin_lock() { + lock.clear(std::memory_order_release); + } + scoped_spin_lock(const scoped_spin_lock&) = delete; + scoped_spin_lock& operator=(const scoped_spin_lock&) = delete; +}; + +struct cuda_buffer { + void * ptr = nullptr; + size_t size = 0; +}; + +static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS]; +static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT; + +void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { + scoped_spin_lock lock(g_cuda_pool_lock); + + for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { + cuda_buffer& b = g_cuda_buffer_pool[i]; + if (b.size >= size && b.ptr != nullptr) { + void * ptr = b.ptr; + *actual_size = b.size; + b.ptr = nullptr; + b.size = 0; + return ptr; + } + } + void * ptr; + CUDA_CHECK(hipMalloc((void **) &ptr, size)); + *actual_size = size; + return ptr; +} + +void ggml_cuda_pool_free(void * ptr, size_t size) { + scoped_spin_lock lock(g_cuda_pool_lock); + + for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { + cuda_buffer& b = g_cuda_buffer_pool[i]; + if (b.ptr == nullptr) { + b.ptr = ptr; + b.size = size; + return; + } + } + fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n"); + CUDA_CHECK(hipFree(ptr)); +} + +hipblasHandle_t g_cublasH = NULL; +hipStream_t g_cudaStream = NULL; + +void ggml_init_cublas(void) { + if (g_cublasH == NULL) { + // create cublas handle, bind a stream + CUBLAS_CHECK(hipblasCreate(&g_cublasH)); + + CUDA_CHECK(hipStreamCreateWithFlags(&g_cudaStream, hipStreamNonBlocking)); + + CUBLAS_CHECK(hipblasSetStream(g_cublasH, g_cudaStream)); + + // configure logging to stdout + // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL)); + } +} diff --git a/ggml-rocm.h b/ggml-rocm.h new file mode 100644 index 0000000000000..220f39c51fcc0 --- /dev/null +++ b/ggml-rocm.h @@ -0,0 +1,41 @@ + + +#ifdef __cplusplus +extern "C" { +#endif + +#define CUDA_CHECK(err) \ + do { \ + hipError_t err_ = (err); \ + if (err_ != hipSuccess) { \ + fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \ + hipGetErrorString(err_)); \ + exit(1); \ + } \ + } while (0) + +#define CUBLAS_CHECK(err) \ + do { \ + hipblasStatus_t err_ = (err); \ + if (err_ != HIPBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \ + exit(1); \ + } \ + } while (0) + +extern hipblasHandle_t g_cublasH; +extern hipStream_t g_cudaStream; + +void ggml_init_cublas(void); +void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size); +void ggml_cuda_pool_free(void * ptr, size_t size); + +void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, hipStream_t stream); +void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, hipStream_t stream); +void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, hipStream_t stream); +void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, hipStream_t stream); +void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, hipStream_t stream); + +#ifdef __cplusplus +} +#endif From 1c0e87a5e93df2b734202cc2ceb6275d458abb1f Mon Sep 17 00:00:00 2001 From: Andrew C Date: Wed, 26 Apr 2023 20:35:58 +1000 Subject: [PATCH 2/3] ROCM now compiling. --- ggml-rocm.cpp | 91 ++++++++++++++++++++++++++------------------------- ggml-rocm.h | 41 ++++++++++++----------- 2 files changed, 68 insertions(+), 64 deletions(-) diff --git a/ggml-rocm.cpp b/ggml-rocm.cpp index 6ab85b15ff0d0..98fe2ba458f09 100644 --- a/ggml-rocm.cpp +++ b/ggml-rocm.cpp @@ -1,13 +1,13 @@ -#include -#include #include #include #include #include #include "ggml-rocm.h" +#define hipHalf __fp16 + typedef uint16_t ggml_fp16_t; -static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size"); +static_assert(sizeof(hipHalf) == sizeof(ggml_fp16_t), "wrong fp16 size"); #define QK4_0 32 typedef struct { @@ -26,16 +26,16 @@ static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 b #define QK4_2 16 typedef struct { - __half d; // delta - uint8_t qs[QK4_2 / 2]; // nibbles / quants + hipHalf d; // delta + uint8_t qs[QK4_2 / 2]; // nibbles / quants } block_q4_2; static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding"); #define QK4_3 16 typedef struct { - __half d; // delta - __half m; // min - uint8_t qs[QK4_3 / 2]; // nibbles / quants + hipHalf d; // delta + hipHalf m; // min + uint8_t qs[QK4_3 / 2]; // nibbles / quants } block_q4_3; static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding"); @@ -46,7 +46,7 @@ typedef struct { } block_q8_0; static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding"); -__global__ void dequantize_block_q4_0(const void * vx, float * y) { +static __global__ void dequantize_block_q4_0(const void * vx, float * y) { const block_q4_0 * x = (const block_q4_0 *) vx; const int i = hipBlockIdx_x; @@ -69,7 +69,7 @@ __global__ void dequantize_block_q4_0(const void * vx, float * y) { } } -__global__ void dequantize_block_q4_1(const void * vx, float * y) { +static __global__ void dequantize_block_q4_1(const void * vx, float * y) { const block_q4_1 * x = (const block_q4_1 *) vx; const int i = hipBlockIdx_x; @@ -93,7 +93,7 @@ __global__ void dequantize_block_q4_1(const void * vx, float * y) { } } -__global__ void dequantize_block_q4_2(const void * vx, float * y) { +static __global__ void dequantize_block_q4_2(const void * vx, float * y) { const block_q4_2 * x = (const block_q4_2 *) vx; const int i = hipBlockIdx_x; @@ -116,7 +116,7 @@ __global__ void dequantize_block_q4_2(const void * vx, float * y) { } } -__global__ void dequantize_block_q4_3(const void * vx, float * y) { +static __global__ void dequantize_block_q4_3(const void * vx, float * y) { const block_q4_3 * x = (const block_q4_3 *) vx; const int i = hipBlockIdx_x; @@ -140,7 +140,7 @@ __global__ void dequantize_block_q4_3(const void * vx, float * y) { } } -__global__ void dequantize_block_q8_0(const void * vx, float * y) { +static __global__ void dequantize_block_q8_0(const void * vx, float * y) { const block_q8_0 * x = (const block_q8_0 *) vx; const int i = hipBlockIdx_x; @@ -156,38 +156,40 @@ __global__ void dequantize_block_q8_0(const void * vx, float * y) { } } -void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, hipStream_t stream) { +void dequantize_row_q4_0_hip(const void * vx, float * y, int k, hipStream_t stream) { const int nb = k / QK4_0; hipLaunchKernelGGL(dequantize_block_q4_0, dim3(nb), dim3(1), 0, stream, vx, y); } -void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, hipStream_t stream) { +void dequantize_row_q4_1_hip(const void * vx, float * y, int k, hipStream_t stream) { const int nb = k / QK4_1; hipLaunchKernelGGL(dequantize_block_q4_1, dim3(nb), dim3(1), 0, stream, vx, y); } -void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, hipStream_t stream) { +void dequantize_row_q4_2_hip(const void * vx, float * y, int k, hipStream_t stream) { const int nb = k / QK4_2; hipLaunchKernelGGL(dequantize_block_q4_2, dim3(nb), dim3(1), 0, stream, vx, y); } -void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, hipStream_t stream) { +void dequantize_row_q4_3_hip(const void * vx, float * y, int k, hipStream_t stream) { const int nb = k / QK4_3; hipLaunchKernelGGL(dequantize_block_q4_3, dim3(nb), dim3(1), 0, stream, vx, y); } -void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, hipStream_t stream) { +void dequantize_row_q8_0_hip(const void * vx, float * y, int k, hipStream_t stream) { const int nb = k / QK8_0; hipLaunchKernelGGL(dequantize_block_q8_0, dim3(nb), dim3(1), 0, stream, vx, y); } -// buffer pool for cuda -#define MAX_CUDA_BUFFERS 16 +// buffer pool for HIP +#define MAX_HIP_BUFFERS 16 struct scoped_spin_lock { std::atomic_flag& lock; - while (lock.test_and_set(std::memory_order_acquire)) { + scoped_spin_lock(std::atomic_flag& lock) : lock(lock) { + while (lock.test_and_set(std::memory_order_acquire)) { ; // spin + } } ~scoped_spin_lock() { lock.clear(std::memory_order_release); @@ -196,19 +198,19 @@ struct scoped_spin_lock { scoped_spin_lock& operator=(const scoped_spin_lock&) = delete; }; -struct cuda_buffer { +struct hip_buffer { void * ptr = nullptr; size_t size = 0; }; -static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS]; -static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT; +static hip_buffer g_hip_buffer_pool[MAX_HIP_BUFFERS]; +static std::atomic_flag g_hip_pool_lock = ATOMIC_FLAG_INIT; -void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { - scoped_spin_lock lock(g_cuda_pool_lock); +void * ggml_hip_pool_malloc(size_t size, size_t * actual_size) { + scoped_spin_lock lock(g_hip_pool_lock); - for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { - cuda_buffer& b = g_cuda_buffer_pool[i]; + for (int i = 0; i < MAX_HIP_BUFFERS; ++i) { + hip_buffer& b = g_hip_buffer_pool[i]; if (b.size >= size && b.ptr != nullptr) { void * ptr = b.ptr; *actual_size = b.size; @@ -218,39 +220,40 @@ void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { } } void * ptr; - CUDA_CHECK(hipMalloc((void **) &ptr, size)); + HIP_CHECK(hipMalloc((void **) &ptr, size)); *actual_size = size; return ptr; } -void ggml_cuda_pool_free(void * ptr, size_t size) { - scoped_spin_lock lock(g_cuda_pool_lock); +void ggml_hip_pool_free(void * ptr, size_t size) { + scoped_spin_lock lock(g_hip_pool_lock); - for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { - cuda_buffer& b = g_cuda_buffer_pool[i]; + for (int i = 0; i < MAX_HIP_BUFFERS; ++i) { + hip_buffer& b = g_hip_buffer_pool[i]; if (b.ptr == nullptr) { b.ptr = ptr; b.size = size; return; } } - fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n"); - CUDA_CHECK(hipFree(ptr)); + fprintf(stderr, "WARNING: hip buffer pool full, increase MAX_HIP_BUFFERS\n"); + HIP_CHECK(hipFree(ptr)); } -hipblasHandle_t g_cublasH = NULL; -hipStream_t g_cudaStream = NULL; +hipblasHandle_t g_hipblasH = NULL; +hipStream_t g_hipStream = NULL; -void ggml_init_cublas(void) { - if (g_cublasH == NULL) { - // create cublas handle, bind a stream - CUBLAS_CHECK(hipblasCreate(&g_cublasH)); +void ggml_init_hipblas(void) { + if (g_hipblasH == NULL) { + // create hipblas handle, bind a stream + HIPBLAS_CHECK(hipblasCreate(&g_hipblasH)); - CUDA_CHECK(hipStreamCreateWithFlags(&g_cudaStream, hipStreamNonBlocking)); + HIP_CHECK(hipStreamCreateWithFlags(&g_hipStream, hipStreamNonBlocking)); - CUBLAS_CHECK(hipblasSetStream(g_cublasH, g_cudaStream)); + HIPBLAS_CHECK(hipblasSetStream(g_hipblasH, g_hipStream)); // configure logging to stdout - // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL)); + // HIPBLAS_CHECK(hipblasLoggerConfigure(1, 1, 0, NULL)); } } + diff --git a/ggml-rocm.h b/ggml-rocm.h index 220f39c51fcc0..99190922fdb41 100644 --- a/ggml-rocm.h +++ b/ggml-rocm.h @@ -1,40 +1,41 @@ - +#include +#include #ifdef __cplusplus extern "C" { #endif -#define CUDA_CHECK(err) \ +#define HIP_CHECK(err) \ do { \ - hipError_t err_ = (err); \ - if (err_ != hipSuccess) { \ - fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \ - hipGetErrorString(err_)); \ + hipError_t err_ = (err); \ + if (err_ != hipSuccess) { \ + fprintf(stderr, "HIP error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \ + hipGetErrorString(err_)); \ exit(1); \ } \ } while (0) -#define CUBLAS_CHECK(err) \ +#define HIPBLAS_CHECK(err) \ do { \ - hipblasStatus_t err_ = (err); \ - if (err_ != HIPBLAS_STATUS_SUCCESS) { \ - fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \ + hipblasStatus_t err_ = (err); \ + if (err_ != HIPBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, "hipBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \ exit(1); \ } \ } while (0) -extern hipblasHandle_t g_cublasH; -extern hipStream_t g_cudaStream; +extern hipblasHandle_t g_hipblasH; +extern hipStream_t g_hipStream; -void ggml_init_cublas(void); -void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size); -void ggml_cuda_pool_free(void * ptr, size_t size); +void ggml_init_hipblas(void); +void * ggml_hip_pool_malloc(size_t size, size_t * actual_size); +void ggml_hip_pool_free(void * ptr, size_t size); -void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, hipStream_t stream); -void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, hipStream_t stream); -void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, hipStream_t stream); -void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, hipStream_t stream); -void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, hipStream_t stream); +void dequantize_row_q4_0_hip(const void * vx, float * y, int k, hipStream_t stream); +void dequantize_row_q4_1_hip(const void * vx, float * y, int k, hipStream_t stream); +void dequantize_row_q4_2_hip(const void * vx, float * y, int k, hipStream_t stream); +void dequantize_row_q4_3_hip(const void * vx, float * y, int k, hipStream_t stream); +void dequantize_row_q8_0_hip(const void * vx, float * y, int k, hipStream_t stream); #ifdef __cplusplus } From 8b72d9eee20490bf51d8002a98aac039df87a629 Mon Sep 17 00:00:00 2001 From: Andrew C Date: Wed, 26 Apr 2023 21:40:04 +1000 Subject: [PATCH 3/3] More ROCM support work. Now reports ROCM status in system_info --- CMakeLists.txt | 20 +- Makefile | 1002 ++++++++++++++++++++++++++++++++++++++---------- ggml-rocm.h | 1 + ggml.c | 38 +- ggml.h | 1 + llama.cpp | 1 + 6 files changed, 857 insertions(+), 206 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8480624917dd9..1b5fc8d584935 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -169,11 +169,6 @@ if (LLAMA_CUBLAS) endif() endif() -macro(find_package_and_print_version PACKAGE_NAME) - find_package("${PACKAGE_NAME}" ${ARGN}) - message("${PACKAGE_NAME} VERSION: ${${PACKAGE_NAME}_VERSION}") -endmacro() - if (LLAMA_ROCBLAS) cmake_minimum_required(VERSION 3.17) @@ -197,13 +192,18 @@ if (LLAMA_ROCBLAS) set(CMAKE_HCC_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) find_package(hip REQUIRED) - find_package(rocblas REQUIRED) - find_package(rocfft REQUIRED) + find_package(hipblas REQUIRED) + find_package(hipfft REQUIRED) find_package(AMDDeviceLibs REQUIRED CONFIG) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::host hip::device roc::rocblas roc::rocfft) - set(CMAKE_CXX_COMPILER=${ROCM_PATH}/llvm/bin/clang++) - set(CMAKE_C_COMPILER=${ROCM_PATH}/llvm/bin/clang) + set(CMAKE_CXX_STANDARD 17) + set(CMAKE_CXX_STANDARD_REQUIRED ON) + set(CMAKE_CXX_EXTENSIONS OFF) + + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::host hip::device roc::hipblas hip::hipfft) + set(CMAKE_CXX_COMPILER ${ROCM_PATH}/llvm/bin/clang++) + set(CMAKE_C_COMPILER ${ROCM_PATH}/llvm/bin/clang) + add_compile_definitions(GGML_USE_ROCMBLAS) else() message(WARNING "rocBLAS not found") diff --git a/Makefile b/Makefile index 8fbb19c46cc10..96e405526d790 100644 --- a/Makefile +++ b/Makefile @@ -1,206 +1,820 @@ -# Define the default target now so that it is always the first target -default: main quantize quantize-stats perplexity embedding vdot - -ifndef UNAME_S -UNAME_S := $(shell uname -s) -endif - -ifndef UNAME_P -UNAME_P := $(shell uname -p) -endif - -ifndef UNAME_M -UNAME_M := $(shell uname -m) -endif - -CCV := $(shell $(CC) --version | head -n 1) -CXXV := $(shell $(CXX) --version | head -n 1) - -# Mac OS + Arm can report x86_64 -# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 -ifeq ($(UNAME_S),Darwin) - ifneq ($(UNAME_P),arm) - SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null) - ifeq ($(SYSCTL_M),1) - # UNAME_P := arm - # UNAME_M := arm64 - warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789) - endif - endif -endif - -# -# Compile flags -# - -# keep standard at C11 and C++11 -CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC -CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -LDFLAGS = - -# warnings -CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar - -# OS specific -# TODO: support Windows -ifeq ($(UNAME_S),Linux) - CFLAGS += -pthread - CXXFLAGS += -pthread -endif -ifeq ($(UNAME_S),Darwin) - CFLAGS += -pthread - CXXFLAGS += -pthread -endif -ifeq ($(UNAME_S),FreeBSD) - CFLAGS += -pthread - CXXFLAGS += -pthread -endif -ifeq ($(UNAME_S),NetBSD) - CFLAGS += -pthread - CXXFLAGS += -pthread -endif -ifeq ($(UNAME_S),OpenBSD) - CFLAGS += -pthread - CXXFLAGS += -pthread -endif -ifeq ($(UNAME_S),Haiku) - CFLAGS += -pthread - CXXFLAGS += -pthread -endif - -# Architecture specific -# TODO: probably these flags need to be tweaked on some architectures -# feel free to update the Makefile for your architecture and send a pull request or issue -ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686)) - # Use all CPU extensions that are available: - CFLAGS += -march=native -mtune=native - CXXFLAGS += -march=native -mtune=native - - # Usage AVX-only - #CFLAGS += -mfma -mf16c -mavx - #CXXFLAGS += -mfma -mf16c -mavx -endif -ifneq ($(filter ppc64%,$(UNAME_M)),) - POWER9_M := $(shell grep "POWER9" /proc/cpuinfo) - ifneq (,$(findstring POWER9,$(POWER9_M))) - CFLAGS += -mcpu=power9 - CXXFLAGS += -mcpu=power9 - endif - # Require c++23's std::byteswap for big-endian support. - ifeq ($(UNAME_M),ppc64) - CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN - endif -endif -ifndef LLAMA_NO_ACCELERATE - # Mac M1 - include Accelerate framework. - # `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time). - ifeq ($(UNAME_S),Darwin) - CFLAGS += -DGGML_USE_ACCELERATE - LDFLAGS += -framework Accelerate - endif -endif -ifdef LLAMA_OPENBLAS - CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas - LDFLAGS += -lopenblas -endif -ifdef LLAMA_CUBLAS - CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include - LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 - OBJS += ggml-cuda.o - NVCC = nvcc - NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native -ggml-cuda.o: ggml-cuda.cu ggml-cuda.h - $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@ -endif -ifdef LLAMA_GPROF - CFLAGS += -pg - CXXFLAGS += -pg -endif -ifdef LLAMA_PERF - CFLAGS += -DGGML_PERF - CXXFLAGS += -DGGML_PERF -endif -ifneq ($(filter aarch64%,$(UNAME_M)),) - CFLAGS += -mcpu=native - CXXFLAGS += -mcpu=native -endif -ifneq ($(filter armv6%,$(UNAME_M)),) - # Raspberry Pi 1, 2, 3 - CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -endif -ifneq ($(filter armv7%,$(UNAME_M)),) - # Raspberry Pi 4 - CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations -endif -ifneq ($(filter armv8%,$(UNAME_M)),) - # Raspberry Pi 4 - CFLAGS += -mfp16-format=ieee -mno-unaligned-access -endif - -# -# Print build information -# - -$(info I llama.cpp build info: ) -$(info I UNAME_S: $(UNAME_S)) -$(info I UNAME_P: $(UNAME_P)) -$(info I UNAME_M: $(UNAME_M)) -$(info I CFLAGS: $(CFLAGS)) -$(info I CXXFLAGS: $(CXXFLAGS)) -$(info I LDFLAGS: $(LDFLAGS)) -$(info I CC: $(CCV)) -$(info I CXX: $(CXXV)) -$(info ) - -# -# Build library -# - -ggml.o: ggml.c ggml.h - $(CC) $(CFLAGS) -c $< -o $@ - -llama.o: llama.cpp ggml.h llama.h llama_util.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -common.o: examples/common.cpp examples/common.h - $(CXX) $(CXXFLAGS) -c $< -o $@ +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 3.24 +# Default target executed when no arguments are given to make. +default_target: all +.PHONY : default_target + +# Allow only one "make -f Makefile2" at a time, but pass parallelism. +.NOTPARALLEL: + +#============================================================================= +# Special targets provided by cmake. + +# Disable implicit rules so canonical targets will work. +.SUFFIXES: + +# Disable VCS-based implicit rules. +% : %,v + +# Disable VCS-based implicit rules. +% : RCS/% + +# Disable VCS-based implicit rules. +% : RCS/%,v + +# Disable VCS-based implicit rules. +% : SCCS/s.% + +# Disable VCS-based implicit rules. +% : s.% + +.SUFFIXES: .hpux_make_needs_suffix_list + +# Command-line flag to silence nested $(MAKE). +$(VERBOSE)MAKESILENT = -s + +#Suppress display of executed commands. +$(VERBOSE).SILENT: + +# A target that is always out of date. +cmake_force: +.PHONY : cmake_force + +#============================================================================= +# Set environment variables for the build. + +# The shell in which to execute make rules. +SHELL = /bin/sh + +# The CMake executable. +CMAKE_COMMAND = /usr/bin/cmake + +# The command to remove a file. +RM = /usr/bin/cmake -E rm -f + +# Escaping for special characters. +EQUALS = = + +# The top-level source directory on which CMake was run. +CMAKE_SOURCE_DIR = /home/andrewc/Development/llama.andrewjc + +# The top-level build directory on which CMake was run. +CMAKE_BINARY_DIR = /home/andrewc/Development/llama.andrewjc + +#============================================================================= +# Targets provided globally by CMake. + +# Special rule for the target test +test: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running tests..." + /usr/bin/ctest --force-new-ctest-process $(ARGS) +.PHONY : test + +# Special rule for the target test +test/fast: test +.PHONY : test/fast + +# Special rule for the target edit_cache +edit_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..." + /usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available. +.PHONY : edit_cache + +# Special rule for the target edit_cache +edit_cache/fast: edit_cache +.PHONY : edit_cache/fast + +# Special rule for the target rebuild_cache +rebuild_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..." + /usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : rebuild_cache + +# Special rule for the target rebuild_cache +rebuild_cache/fast: rebuild_cache +.PHONY : rebuild_cache/fast + +# The main all target +all: cmake_check_build_system + $(CMAKE_COMMAND) -E cmake_progress_start /home/andrewc/Development/llama.andrewjc/CMakeFiles /home/andrewc/Development/llama.andrewjc//CMakeFiles/progress.marks + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 all + $(CMAKE_COMMAND) -E cmake_progress_start /home/andrewc/Development/llama.andrewjc/CMakeFiles 0 +.PHONY : all + +# The main clean target clean: - rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 clean +.PHONY : clean + +# The main clean target +clean/fast: clean +.PHONY : clean/fast + +# Prepare targets for installation. +preinstall: all + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 preinstall +.PHONY : preinstall + +# Prepare targets for installation. +preinstall/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 preinstall +.PHONY : preinstall/fast + +# clear depends +depend: + $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1 +.PHONY : depend + +#============================================================================= +# Target rules for targets named ggml + +# Build rule for target. +ggml: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ggml +.PHONY : ggml + +# fast build rule for target. +ggml/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ggml.dir/build.make CMakeFiles/ggml.dir/build +.PHONY : ggml/fast + +#============================================================================= +# Target rules for targets named llama + +# Build rule for target. +llama: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llama +.PHONY : llama + +# fast build rule for target. +llama/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/llama.dir/build.make CMakeFiles/llama.dir/build +.PHONY : llama/fast + +#============================================================================= +# Target rules for targets named Experimental + +# Build rule for target. +Experimental: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 Experimental +.PHONY : Experimental + +# fast build rule for target. +Experimental/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/Experimental.dir/build.make CMakeFiles/Experimental.dir/build +.PHONY : Experimental/fast + +#============================================================================= +# Target rules for targets named Nightly + +# Build rule for target. +Nightly: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 Nightly +.PHONY : Nightly + +# fast build rule for target. +Nightly/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/Nightly.dir/build.make CMakeFiles/Nightly.dir/build +.PHONY : Nightly/fast + +#============================================================================= +# Target rules for targets named Continuous + +# Build rule for target. +Continuous: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 Continuous +.PHONY : Continuous + +# fast build rule for target. +Continuous/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/Continuous.dir/build.make CMakeFiles/Continuous.dir/build +.PHONY : Continuous/fast + +#============================================================================= +# Target rules for targets named NightlyMemoryCheck + +# Build rule for target. +NightlyMemoryCheck: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 NightlyMemoryCheck +.PHONY : NightlyMemoryCheck + +# fast build rule for target. +NightlyMemoryCheck/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/NightlyMemoryCheck.dir/build.make CMakeFiles/NightlyMemoryCheck.dir/build +.PHONY : NightlyMemoryCheck/fast + +#============================================================================= +# Target rules for targets named NightlyStart + +# Build rule for target. +NightlyStart: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 NightlyStart +.PHONY : NightlyStart + +# fast build rule for target. +NightlyStart/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/NightlyStart.dir/build.make CMakeFiles/NightlyStart.dir/build +.PHONY : NightlyStart/fast + +#============================================================================= +# Target rules for targets named NightlyUpdate + +# Build rule for target. +NightlyUpdate: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 NightlyUpdate +.PHONY : NightlyUpdate + +# fast build rule for target. +NightlyUpdate/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/NightlyUpdate.dir/build.make CMakeFiles/NightlyUpdate.dir/build +.PHONY : NightlyUpdate/fast + +#============================================================================= +# Target rules for targets named NightlyConfigure + +# Build rule for target. +NightlyConfigure: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 NightlyConfigure +.PHONY : NightlyConfigure + +# fast build rule for target. +NightlyConfigure/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/NightlyConfigure.dir/build.make CMakeFiles/NightlyConfigure.dir/build +.PHONY : NightlyConfigure/fast + +#============================================================================= +# Target rules for targets named NightlyBuild + +# Build rule for target. +NightlyBuild: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 NightlyBuild +.PHONY : NightlyBuild + +# fast build rule for target. +NightlyBuild/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/NightlyBuild.dir/build.make CMakeFiles/NightlyBuild.dir/build +.PHONY : NightlyBuild/fast + +#============================================================================= +# Target rules for targets named NightlyTest + +# Build rule for target. +NightlyTest: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 NightlyTest +.PHONY : NightlyTest + +# fast build rule for target. +NightlyTest/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/NightlyTest.dir/build.make CMakeFiles/NightlyTest.dir/build +.PHONY : NightlyTest/fast + +#============================================================================= +# Target rules for targets named NightlyCoverage + +# Build rule for target. +NightlyCoverage: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 NightlyCoverage +.PHONY : NightlyCoverage + +# fast build rule for target. +NightlyCoverage/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/NightlyCoverage.dir/build.make CMakeFiles/NightlyCoverage.dir/build +.PHONY : NightlyCoverage/fast + +#============================================================================= +# Target rules for targets named NightlyMemCheck + +# Build rule for target. +NightlyMemCheck: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 NightlyMemCheck +.PHONY : NightlyMemCheck + +# fast build rule for target. +NightlyMemCheck/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/NightlyMemCheck.dir/build.make CMakeFiles/NightlyMemCheck.dir/build +.PHONY : NightlyMemCheck/fast + +#============================================================================= +# Target rules for targets named NightlySubmit + +# Build rule for target. +NightlySubmit: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 NightlySubmit +.PHONY : NightlySubmit + +# fast build rule for target. +NightlySubmit/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/NightlySubmit.dir/build.make CMakeFiles/NightlySubmit.dir/build +.PHONY : NightlySubmit/fast + +#============================================================================= +# Target rules for targets named ExperimentalStart + +# Build rule for target. +ExperimentalStart: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ExperimentalStart +.PHONY : ExperimentalStart + +# fast build rule for target. +ExperimentalStart/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ExperimentalStart.dir/build.make CMakeFiles/ExperimentalStart.dir/build +.PHONY : ExperimentalStart/fast + +#============================================================================= +# Target rules for targets named ExperimentalUpdate + +# Build rule for target. +ExperimentalUpdate: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ExperimentalUpdate +.PHONY : ExperimentalUpdate + +# fast build rule for target. +ExperimentalUpdate/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ExperimentalUpdate.dir/build.make CMakeFiles/ExperimentalUpdate.dir/build +.PHONY : ExperimentalUpdate/fast + +#============================================================================= +# Target rules for targets named ExperimentalConfigure + +# Build rule for target. +ExperimentalConfigure: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ExperimentalConfigure +.PHONY : ExperimentalConfigure + +# fast build rule for target. +ExperimentalConfigure/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ExperimentalConfigure.dir/build.make CMakeFiles/ExperimentalConfigure.dir/build +.PHONY : ExperimentalConfigure/fast + +#============================================================================= +# Target rules for targets named ExperimentalBuild + +# Build rule for target. +ExperimentalBuild: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ExperimentalBuild +.PHONY : ExperimentalBuild + +# fast build rule for target. +ExperimentalBuild/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ExperimentalBuild.dir/build.make CMakeFiles/ExperimentalBuild.dir/build +.PHONY : ExperimentalBuild/fast + +#============================================================================= +# Target rules for targets named ExperimentalTest + +# Build rule for target. +ExperimentalTest: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ExperimentalTest +.PHONY : ExperimentalTest + +# fast build rule for target. +ExperimentalTest/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ExperimentalTest.dir/build.make CMakeFiles/ExperimentalTest.dir/build +.PHONY : ExperimentalTest/fast + +#============================================================================= +# Target rules for targets named ExperimentalCoverage + +# Build rule for target. +ExperimentalCoverage: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ExperimentalCoverage +.PHONY : ExperimentalCoverage + +# fast build rule for target. +ExperimentalCoverage/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ExperimentalCoverage.dir/build.make CMakeFiles/ExperimentalCoverage.dir/build +.PHONY : ExperimentalCoverage/fast + +#============================================================================= +# Target rules for targets named ExperimentalMemCheck + +# Build rule for target. +ExperimentalMemCheck: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ExperimentalMemCheck +.PHONY : ExperimentalMemCheck + +# fast build rule for target. +ExperimentalMemCheck/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ExperimentalMemCheck.dir/build.make CMakeFiles/ExperimentalMemCheck.dir/build +.PHONY : ExperimentalMemCheck/fast + +#============================================================================= +# Target rules for targets named ExperimentalSubmit + +# Build rule for target. +ExperimentalSubmit: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ExperimentalSubmit +.PHONY : ExperimentalSubmit + +# fast build rule for target. +ExperimentalSubmit/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ExperimentalSubmit.dir/build.make CMakeFiles/ExperimentalSubmit.dir/build +.PHONY : ExperimentalSubmit/fast + +#============================================================================= +# Target rules for targets named ContinuousStart + +# Build rule for target. +ContinuousStart: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ContinuousStart +.PHONY : ContinuousStart + +# fast build rule for target. +ContinuousStart/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ContinuousStart.dir/build.make CMakeFiles/ContinuousStart.dir/build +.PHONY : ContinuousStart/fast + +#============================================================================= +# Target rules for targets named ContinuousUpdate + +# Build rule for target. +ContinuousUpdate: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ContinuousUpdate +.PHONY : ContinuousUpdate + +# fast build rule for target. +ContinuousUpdate/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ContinuousUpdate.dir/build.make CMakeFiles/ContinuousUpdate.dir/build +.PHONY : ContinuousUpdate/fast + +#============================================================================= +# Target rules for targets named ContinuousConfigure + +# Build rule for target. +ContinuousConfigure: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ContinuousConfigure +.PHONY : ContinuousConfigure + +# fast build rule for target. +ContinuousConfigure/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ContinuousConfigure.dir/build.make CMakeFiles/ContinuousConfigure.dir/build +.PHONY : ContinuousConfigure/fast + +#============================================================================= +# Target rules for targets named ContinuousBuild + +# Build rule for target. +ContinuousBuild: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ContinuousBuild +.PHONY : ContinuousBuild + +# fast build rule for target. +ContinuousBuild/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ContinuousBuild.dir/build.make CMakeFiles/ContinuousBuild.dir/build +.PHONY : ContinuousBuild/fast + +#============================================================================= +# Target rules for targets named ContinuousTest + +# Build rule for target. +ContinuousTest: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ContinuousTest +.PHONY : ContinuousTest + +# fast build rule for target. +ContinuousTest/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ContinuousTest.dir/build.make CMakeFiles/ContinuousTest.dir/build +.PHONY : ContinuousTest/fast + +#============================================================================= +# Target rules for targets named ContinuousCoverage + +# Build rule for target. +ContinuousCoverage: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ContinuousCoverage +.PHONY : ContinuousCoverage + +# fast build rule for target. +ContinuousCoverage/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ContinuousCoverage.dir/build.make CMakeFiles/ContinuousCoverage.dir/build +.PHONY : ContinuousCoverage/fast + +#============================================================================= +# Target rules for targets named ContinuousMemCheck + +# Build rule for target. +ContinuousMemCheck: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ContinuousMemCheck +.PHONY : ContinuousMemCheck + +# fast build rule for target. +ContinuousMemCheck/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ContinuousMemCheck.dir/build.make CMakeFiles/ContinuousMemCheck.dir/build +.PHONY : ContinuousMemCheck/fast + +#============================================================================= +# Target rules for targets named ContinuousSubmit + +# Build rule for target. +ContinuousSubmit: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ContinuousSubmit +.PHONY : ContinuousSubmit + +# fast build rule for target. +ContinuousSubmit/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ContinuousSubmit.dir/build.make CMakeFiles/ContinuousSubmit.dir/build +.PHONY : ContinuousSubmit/fast + +#============================================================================= +# Target rules for targets named test-quantize-fns + +# Build rule for target. +test-quantize-fns: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 test-quantize-fns +.PHONY : test-quantize-fns + +# fast build rule for target. +test-quantize-fns/fast: + $(MAKE) $(MAKESILENT) -f tests/CMakeFiles/test-quantize-fns.dir/build.make tests/CMakeFiles/test-quantize-fns.dir/build +.PHONY : test-quantize-fns/fast + +#============================================================================= +# Target rules for targets named test-quantize-perf + +# Build rule for target. +test-quantize-perf: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 test-quantize-perf +.PHONY : test-quantize-perf + +# fast build rule for target. +test-quantize-perf/fast: + $(MAKE) $(MAKESILENT) -f tests/CMakeFiles/test-quantize-perf.dir/build.make tests/CMakeFiles/test-quantize-perf.dir/build +.PHONY : test-quantize-perf/fast + +#============================================================================= +# Target rules for targets named test-tokenizer-0 + +# Build rule for target. +test-tokenizer-0: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 test-tokenizer-0 +.PHONY : test-tokenizer-0 + +# fast build rule for target. +test-tokenizer-0/fast: + $(MAKE) $(MAKESILENT) -f tests/CMakeFiles/test-tokenizer-0.dir/build.make tests/CMakeFiles/test-tokenizer-0.dir/build +.PHONY : test-tokenizer-0/fast + +#============================================================================= +# Target rules for targets named common + +# Build rule for target. +common: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 common +.PHONY : common + +# fast build rule for target. +common/fast: + $(MAKE) $(MAKESILENT) -f examples/CMakeFiles/common.dir/build.make examples/CMakeFiles/common.dir/build +.PHONY : common/fast + +#============================================================================= +# Target rules for targets named main + +# Build rule for target. +main: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 main +.PHONY : main + +# fast build rule for target. +main/fast: + $(MAKE) $(MAKESILENT) -f examples/main/CMakeFiles/main.dir/build.make examples/main/CMakeFiles/main.dir/build +.PHONY : main/fast + +#============================================================================= +# Target rules for targets named quantize + +# Build rule for target. +quantize: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize +.PHONY : quantize + +# fast build rule for target. +quantize/fast: + $(MAKE) $(MAKESILENT) -f examples/quantize/CMakeFiles/quantize.dir/build.make examples/quantize/CMakeFiles/quantize.dir/build +.PHONY : quantize/fast + +#============================================================================= +# Target rules for targets named quantize-stats + +# Build rule for target. +quantize-stats: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize-stats +.PHONY : quantize-stats + +# fast build rule for target. +quantize-stats/fast: + $(MAKE) $(MAKESILENT) -f examples/quantize-stats/CMakeFiles/quantize-stats.dir/build.make examples/quantize-stats/CMakeFiles/quantize-stats.dir/build +.PHONY : quantize-stats/fast + +#============================================================================= +# Target rules for targets named perplexity + +# Build rule for target. +perplexity: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 perplexity +.PHONY : perplexity + +# fast build rule for target. +perplexity/fast: + $(MAKE) $(MAKESILENT) -f examples/perplexity/CMakeFiles/perplexity.dir/build.make examples/perplexity/CMakeFiles/perplexity.dir/build +.PHONY : perplexity/fast + +#============================================================================= +# Target rules for targets named embedding + +# Build rule for target. +embedding: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 embedding +.PHONY : embedding + +# fast build rule for target. +embedding/fast: + $(MAKE) $(MAKESILENT) -f examples/embedding/CMakeFiles/embedding.dir/build.make examples/embedding/CMakeFiles/embedding.dir/build +.PHONY : embedding/fast + +#============================================================================= +# Target rules for targets named save-load-state + +# Build rule for target. +save-load-state: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 save-load-state +.PHONY : save-load-state + +# fast build rule for target. +save-load-state/fast: + $(MAKE) $(MAKESILENT) -f examples/save-load-state/CMakeFiles/save-load-state.dir/build.make examples/save-load-state/CMakeFiles/save-load-state.dir/build +.PHONY : save-load-state/fast + +#============================================================================= +# Target rules for targets named vdot + +# Build rule for target. +vdot: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 vdot +.PHONY : vdot + +# fast build rule for target. +vdot/fast: + $(MAKE) $(MAKESILENT) -f pocs/vdot/CMakeFiles/vdot.dir/build.make pocs/vdot/CMakeFiles/vdot.dir/build +.PHONY : vdot/fast + +#============================================================================= +# Target rules for targets named q8dot + +# Build rule for target. +q8dot: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 q8dot +.PHONY : q8dot + +# fast build rule for target. +q8dot/fast: + $(MAKE) $(MAKESILENT) -f pocs/vdot/CMakeFiles/q8dot.dir/build.make pocs/vdot/CMakeFiles/q8dot.dir/build +.PHONY : q8dot/fast + +ggml-rocm.o: ggml-rocm.cpp.o +.PHONY : ggml-rocm.o + +# target to build an object file +ggml-rocm.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ggml.dir/build.make CMakeFiles/ggml.dir/ggml-rocm.cpp.o +.PHONY : ggml-rocm.cpp.o + +ggml-rocm.i: ggml-rocm.cpp.i +.PHONY : ggml-rocm.i + +# target to preprocess a source file +ggml-rocm.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ggml.dir/build.make CMakeFiles/ggml.dir/ggml-rocm.cpp.i +.PHONY : ggml-rocm.cpp.i + +ggml-rocm.s: ggml-rocm.cpp.s +.PHONY : ggml-rocm.s + +# target to generate assembly for a file +ggml-rocm.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ggml.dir/build.make CMakeFiles/ggml.dir/ggml-rocm.cpp.s +.PHONY : ggml-rocm.cpp.s + +ggml.o: ggml.c.o +.PHONY : ggml.o + +# target to build an object file +ggml.c.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ggml.dir/build.make CMakeFiles/ggml.dir/ggml.c.o +.PHONY : ggml.c.o + +ggml.i: ggml.c.i +.PHONY : ggml.i + +# target to preprocess a source file +ggml.c.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ggml.dir/build.make CMakeFiles/ggml.dir/ggml.c.i +.PHONY : ggml.c.i + +ggml.s: ggml.c.s +.PHONY : ggml.s + +# target to generate assembly for a file +ggml.c.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/ggml.dir/build.make CMakeFiles/ggml.dir/ggml.c.s +.PHONY : ggml.c.s + +llama.o: llama.cpp.o +.PHONY : llama.o + +# target to build an object file +llama.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/llama.dir/build.make CMakeFiles/llama.dir/llama.cpp.o +.PHONY : llama.cpp.o -main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) - @echo - @echo '==== Run ./main -h for help. ====' - @echo +llama.i: llama.cpp.i +.PHONY : llama.i -quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +# target to preprocess a source file +llama.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/llama.dir/build.make CMakeFiles/llama.dir/llama.cpp.i +.PHONY : llama.cpp.i -quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +llama.s: llama.cpp.s +.PHONY : llama.s -perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +# target to generate assembly for a file +llama.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/llama.dir/build.make CMakeFiles/llama.dir/llama.cpp.s +.PHONY : llama.cpp.s -embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +# Help Target +help: + @echo "The following are some of the valid targets for this Makefile:" + @echo "... all (the default if no target is provided)" + @echo "... clean" + @echo "... depend" + @echo "... edit_cache" + @echo "... rebuild_cache" + @echo "... test" + @echo "... Continuous" + @echo "... ContinuousBuild" + @echo "... ContinuousConfigure" + @echo "... ContinuousCoverage" + @echo "... ContinuousMemCheck" + @echo "... ContinuousStart" + @echo "... ContinuousSubmit" + @echo "... ContinuousTest" + @echo "... ContinuousUpdate" + @echo "... Experimental" + @echo "... ExperimentalBuild" + @echo "... ExperimentalConfigure" + @echo "... ExperimentalCoverage" + @echo "... ExperimentalMemCheck" + @echo "... ExperimentalStart" + @echo "... ExperimentalSubmit" + @echo "... ExperimentalTest" + @echo "... ExperimentalUpdate" + @echo "... Nightly" + @echo "... NightlyBuild" + @echo "... NightlyConfigure" + @echo "... NightlyCoverage" + @echo "... NightlyMemCheck" + @echo "... NightlyMemoryCheck" + @echo "... NightlyStart" + @echo "... NightlySubmit" + @echo "... NightlyTest" + @echo "... NightlyUpdate" + @echo "... common" + @echo "... embedding" + @echo "... ggml" + @echo "... llama" + @echo "... main" + @echo "... perplexity" + @echo "... q8dot" + @echo "... quantize" + @echo "... quantize-stats" + @echo "... save-load-state" + @echo "... test-quantize-fns" + @echo "... test-quantize-perf" + @echo "... test-tokenizer-0" + @echo "... vdot" + @echo "... ggml-rocm.o" + @echo "... ggml-rocm.i" + @echo "... ggml-rocm.s" + @echo "... ggml.o" + @echo "... ggml.i" + @echo "... ggml.s" + @echo "... llama.o" + @echo "... llama.i" + @echo "... llama.s" +.PHONY : help -vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -libllama.so: llama.o ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) -# -# Tests -# +#============================================================================= +# Special targets to cleanup operation of make. -benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS) - ./benchmark-q4_0-matmult +# Special rule to run CMake to check the build system integrity. +# No rule that depends on this can have commands that come from listfiles +# because they might be regenerated. +cmake_check_build_system: + $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0 +.PHONY : cmake_check_build_system -.PHONY: tests -tests: - bash ./tests/run-tests.sh diff --git a/ggml-rocm.h b/ggml-rocm.h index 99190922fdb41..16df5cfbe38f5 100644 --- a/ggml-rocm.h +++ b/ggml-rocm.h @@ -1,4 +1,5 @@ #include +#include #include #ifdef __cplusplus diff --git a/ggml.c b/ggml.c index 064510edaa798..f5bf2df3b5e4c 100644 --- a/ggml.c +++ b/ggml.c @@ -149,6 +149,8 @@ inline static void* ggml_aligned_malloc(size_t size) { #include #elif defined(GGML_USE_CUBLAS) #include "ggml-cuda.h" +#elif defined(GGML_USE_ROCMBLAS) +#include "ggml-rocm.h" #endif #undef MIN @@ -3842,6 +3844,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { ggml_init_cublas(); #endif + // initialize rocmBLAS + #if defined(GGML_USE_ROCMBLAS) + ggml_init_hipblas(); + #endif + is_first_call = false; } @@ -7703,6 +7710,18 @@ static void ggml_compute_forward_mul_mat_f32( float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size); #endif +#if defined(GGML_USE_ROCMBLAS) + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne10; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + size_t x_size, y_size, d_size; + float *d_X = ggml_rocm_pool_malloc(sizeof(float) * x_ne, &x_size); + float *d_Y = ggml_rocm_pool_malloc(sizeof(float) * x_ne, &x_size); + float *d_D = ggml_rocm_pool_malloc(sizeof(float) * x_ne, &x_size); +#endif + for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); @@ -7740,6 +7759,13 @@ static void ggml_compute_forward_mul_mat_f32( ggml_cuda_pool_free(d_X, x_size); ggml_cuda_pool_free(d_Y, y_size); ggml_cuda_pool_free(d_D, d_size); +#endif +#if defined(GGML_USE_ROCMBLAS) + ROCM_CHECK(hipStreamSynchronize(g_hipStream)); + ggml_hip_pool_free(d_X, x_size); + ggml_hip_pool_free(d_X, x_size); + ggml_hip_pool_free(d_X, x_size); + #endif //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); @@ -8121,7 +8147,7 @@ static void ggml_compute_forward_mul_mat_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_ROCM_BLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { if (params->ith != 0) { return; @@ -12493,7 +12519,7 @@ int ggml_cpu_has_wasm_simd(void) { } int ggml_cpu_has_blas(void) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_ROCMBLAS) return 1; #else return 0; @@ -12508,6 +12534,14 @@ int ggml_cpu_has_cublas(void) { #endif } +int ggml_cpu_has_rocmblas(void) { +#if defined(GGML_USE_ROCMBLAS) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_sse3(void) { #if defined(__SSE3__) return 1; diff --git a/ggml.h b/ggml.h index 8300a0c62db9b..a16670441c871 100644 --- a/ggml.h +++ b/ggml.h @@ -854,6 +854,7 @@ extern "C" { GGML_API int ggml_cpu_has_wasm_simd (void); GGML_API int ggml_cpu_has_blas (void); GGML_API int ggml_cpu_has_cublas (void); + GGML_API int ggml_cpu_has_rocmblas (void); GGML_API int ggml_cpu_has_sse3 (void); GGML_API int ggml_cpu_has_vsx (void); diff --git a/llama.cpp b/llama.cpp index 25203c9e90b28..9a71abd3a39b2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2405,6 +2405,7 @@ const char * llama_print_system_info(void) { s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; + s += "ROCM = " + std::to_string(ggml_cpu_has_rocmblas()) + " | "; s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";