From 372393d4c4c77e2c09e4810a51a44206855883cb Mon Sep 17 00:00:00 2001
From: Andrew C <andrewc@squanch.local>
Date: Wed, 26 Apr 2023 20:08:03 +1000
Subject: [PATCH 1/3] Add initial rocm support for running on AMD GPUs

---
 CMakeLists.txt |  45 ++++++++-
 ggml-rocm.cpp  | 256 +++++++++++++++++++++++++++++++++++++++++++++++++
 ggml-rocm.h    |  41 ++++++++
 3 files changed, 341 insertions(+), 1 deletion(-)
 create mode 100644 ggml-rocm.cpp
 create mode 100644 ggml-rocm.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 11ebe9eb66fae..8480624917dd9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -67,6 +67,7 @@ endif()
 option(LLAMA_ACCELERATE             "llama: enable Accelerate framework"                    ON)
 option(LLAMA_OPENBLAS               "llama: use OpenBLAS"                                   OFF)
 option(LLAMA_CUBLAS                 "llama: use cuBLAS"                                     OFF)
+option(LLAMA_ROCBLAS                "llama: use rocBLAS"                                    ON)
 
 option(LLAMA_BUILD_TESTS            "llama: build tests"    ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES         "llama: build examples" ${LLAMA_STANDALONE})
@@ -168,6 +169,47 @@ if (LLAMA_CUBLAS)
     endif()
 endif()
 
+macro(find_package_and_print_version PACKAGE_NAME)
+    find_package("${PACKAGE_NAME}" ${ARGN})
+    message("${PACKAGE_NAME} VERSION: ${${PACKAGE_NAME}_VERSION}")
+endmacro()
+
+if (LLAMA_ROCBLAS)
+
+    cmake_minimum_required(VERSION 3.17)
+
+    if(true)
+        message(STATUS "AMD HIP found")
+
+        set(hip_library_name amdhip64)
+        message("HIP library name: ${hip_library_name}")
+
+        set(GGML_ROCM_SOURCES ggml-rocm.cpp ggml-rocm.h)
+        set(ROCM_PATH /opt/rocm)
+        set(ROCM_INCLUDE_DIRS ${ROCM_PATH}/include)
+        set(HIP_PATH ${ROCM_PATH}/hip)
+        set(HCC_PATH ${ROCM_PATH}/hcc)
+        set(HIPCC_PATH ${ROCM_PATH}/hip/bin/hipcc)
+
+        set(CMAKE_MODULE_PATH ${HIP_PATH}/cmake ${CMAKE_MODULE_PATH})
+
+        set(CMAKE_HCC_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
+        set(CMAKE_HCC_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
+
+        find_package(hip REQUIRED)
+        find_package(rocblas REQUIRED)
+        find_package(rocfft REQUIRED)
+        find_package(AMDDeviceLibs REQUIRED CONFIG)
+
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::host hip::device roc::rocblas roc::rocfft)
+        set(CMAKE_CXX_COMPILER=${ROCM_PATH}/llvm/bin/clang++)
+        set(CMAKE_C_COMPILER=${ROCM_PATH}/llvm/bin/clang)
+
+    else()
+        message(WARNING "rocBLAS not found")
+    endif()
+endif()
+
 if (LLAMA_ALL_WARNINGS)
     if (NOT MSVC)
         set(c_flags
@@ -307,7 +349,8 @@ endif()
 add_library(ggml OBJECT
             ggml.c
             ggml.h
-            ${GGML_CUDA_SOURCES})
+            ${GGML_CUDA_SOURCES}
+            ${GGML_ROCM_SOURCES})
 
 target_include_directories(ggml PUBLIC .)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
diff --git a/ggml-rocm.cpp b/ggml-rocm.cpp
new file mode 100644
index 0000000000000..6ab85b15ff0d0
--- /dev/null
+++ b/ggml-rocm.cpp
@@ -0,0 +1,256 @@
+#include <hipblas/hipblas.h>
+#include <hip/hip_runtime.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <hip/hip_fp16.h>
+#include <atomic>
+#include "ggml-rocm.h"
+
+typedef uint16_t ggml_fp16_t;
+static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size");
+
+#define QK4_0 32
+typedef struct {
+    float   d;              // delta
+    uint8_t qs[QK4_0 / 2];  // nibbles / quants
+} block_q4_0;
+static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
+
+#define QK4_1 32
+typedef struct {
+    float   d;              // delta
+    float   m;              // min
+    uint8_t qs[QK4_1 / 2];  // nibbles / quants
+} block_q4_1;
+static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
+
+#define QK4_2 16
+typedef struct {
+    __half  d;              // delta
+    uint8_t qs[QK4_2 / 2];  // nibbles / quants
+} block_q4_2;
+static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
+
+#define QK4_3 16
+typedef struct {
+    __half  d;              // delta
+    __half  m;              // min
+    uint8_t qs[QK4_3 / 2];  // nibbles / quants
+} block_q4_3;
+static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
+
+#define QK8_0 32
+typedef struct {
+    float   d;              // delta
+    int8_t  qs[QK8_0];      // quants
+} block_q8_0;
+static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
+
+__global__ void dequantize_block_q4_0(const void * vx, float * y) {
+    const block_q4_0 * x = (const block_q4_0 *) vx;
+
+    const int i = hipBlockIdx_x;
+
+    const float d = x[i].d;
+
+    const uint8_t * pp = x[i].qs;
+
+    for (int l = 0; l < QK4_0; l += 2) {
+        const uint8_t vi = pp[l/2];
+
+        const int8_t vi0 = vi & 0xf;
+        const int8_t vi1 = vi >> 4;
+
+        const float v0 = (vi0 - 8)*d;
+        const float v1 = (vi1 - 8)*d;
+
+        y[i*QK4_0 + l + 0] = v0;
+        y[i*QK4_0 + l + 1] = v1;
+    }
+}
+
+__global__ void dequantize_block_q4_1(const void * vx, float * y) {
+    const block_q4_1 * x = (const block_q4_1 *) vx;
+
+    const int i = hipBlockIdx_x;
+
+    const float d = x[i].d;
+    const float m = x[i].m;
+
+    const uint8_t * pp = x[i].qs;
+
+    for (int l = 0; l < QK4_1; l += 2) {
+        const uint8_t vi = pp[l/2];
+
+        const int8_t vi0 = vi & 0xf;
+        const int8_t vi1 = vi >> 4;
+
+        const float v0 = vi0*d + m;
+        const float v1 = vi1*d + m;
+
+        y[i*QK4_1 + l + 0] = v0;
+        y[i*QK4_1 + l + 1] = v1;
+    }
+}
+
+__global__ void dequantize_block_q4_2(const void * vx, float * y) {
+    const block_q4_2 * x = (const block_q4_2 *) vx;
+
+    const int i = hipBlockIdx_x;
+
+    const float d = x[i].d;
+
+    const uint8_t * pp = x[i].qs;
+
+    for (int l = 0; l < QK4_2; l += 2) {
+        const uint8_t vi = pp[l/2];
+
+        const int8_t vi0 = vi & 0xf;
+        const int8_t vi1 = vi >> 4;
+
+        const float v0 = (vi0 - 8)*d;
+        const float v1 = (vi1 - 8)*d;
+
+        y[i*QK4_2 + l + 0] = v0;
+        y[i*QK4_2 + l + 1] = v1;
+    }
+}
+
+__global__ void dequantize_block_q4_3(const void * vx, float * y) {
+    const block_q4_3 * x = (const block_q4_3 *) vx;
+
+    const int i = hipBlockIdx_x;
+
+    const float d = x[i].d;
+    const float m = x[i].m;
+
+    const uint8_t * pp = x[i].qs;
+
+    for (int l = 0; l < QK4_3; l += 2) {
+        const uint8_t vi = pp[l/2];
+
+        const int8_t vi0 = vi & 0xf;
+        const int8_t vi1 = vi >> 4;
+
+        const float v0 = vi0*d + m;
+        const float v1 = vi1*d + m;
+
+        y[i*QK4_3 + l + 0] = v0;
+        y[i*QK4_3 + l + 1] = v1;
+    }
+}
+
+__global__ void dequantize_block_q8_0(const void * vx, float * y) {
+    const block_q8_0 * x = (const block_q8_0 *) vx;
+
+    const int i = hipBlockIdx_x;
+
+    const float d = x[i].d;
+
+    const int8_t * pp = x[i].qs;
+
+    for (int l = 0; l < QK8_0; l++) {
+        const int8_t vi = pp[l];
+
+        y[i*QK8_0 + l] = vi*d;
+    }
+}
+
+void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, hipStream_t stream) {
+    const int nb = k / QK4_0;
+    hipLaunchKernelGGL(dequantize_block_q4_0, dim3(nb), dim3(1), 0, stream, vx, y);
+}
+
+void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, hipStream_t stream) {
+    const int nb = k / QK4_1;
+    hipLaunchKernelGGL(dequantize_block_q4_1, dim3(nb), dim3(1), 0, stream, vx, y);
+}
+
+void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, hipStream_t stream) {
+    const int nb = k / QK4_2;
+    hipLaunchKernelGGL(dequantize_block_q4_2, dim3(nb), dim3(1), 0, stream, vx, y);
+}
+
+void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, hipStream_t stream) {
+    const int nb = k / QK4_3;
+    hipLaunchKernelGGL(dequantize_block_q4_3, dim3(nb), dim3(1), 0, stream, vx, y);
+}
+
+void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, hipStream_t stream) {
+    const int nb = k / QK8_0;
+    hipLaunchKernelGGL(dequantize_block_q8_0, dim3(nb), dim3(1), 0, stream, vx, y);
+}
+
+// buffer pool for cuda
+#define MAX_CUDA_BUFFERS 16
+
+struct scoped_spin_lock {
+    std::atomic_flag& lock;
+    while (lock.test_and_set(std::memory_order_acquire)) {
+            ; // spin
+    }
+    ~scoped_spin_lock() {
+        lock.clear(std::memory_order_release);
+    }
+    scoped_spin_lock(const scoped_spin_lock&) = delete;
+    scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
+};
+
+struct cuda_buffer {
+    void * ptr = nullptr;
+    size_t size = 0;
+};
+
+static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
+static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
+
+void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+
+    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
+        cuda_buffer& b = g_cuda_buffer_pool[i];
+        if (b.size >= size && b.ptr != nullptr) {
+            void * ptr = b.ptr;
+            *actual_size = b.size;
+            b.ptr = nullptr;
+            b.size = 0;
+            return ptr;
+        }
+    }
+    void * ptr;
+    CUDA_CHECK(hipMalloc((void **) &ptr, size));
+    *actual_size = size;
+    return ptr;
+}
+
+void ggml_cuda_pool_free(void * ptr, size_t size) {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+
+    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
+        cuda_buffer& b = g_cuda_buffer_pool[i];
+        if (b.ptr == nullptr) {
+            b.ptr = ptr;
+            b.size = size;
+            return;
+        }
+    }
+    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+    CUDA_CHECK(hipFree(ptr));
+}
+
+hipblasHandle_t g_cublasH = NULL;
+hipStream_t g_cudaStream = NULL;
+
+void ggml_init_cublas(void) {
+    if (g_cublasH == NULL) {
+        // create cublas handle, bind a stream
+        CUBLAS_CHECK(hipblasCreate(&g_cublasH));
+
+        CUDA_CHECK(hipStreamCreateWithFlags(&g_cudaStream, hipStreamNonBlocking));
+
+        CUBLAS_CHECK(hipblasSetStream(g_cublasH, g_cudaStream));
+
+        // configure logging to stdout
+        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
+    }
+}
diff --git a/ggml-rocm.h b/ggml-rocm.h
new file mode 100644
index 0000000000000..220f39c51fcc0
--- /dev/null
+++ b/ggml-rocm.h
@@ -0,0 +1,41 @@
+
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define CUDA_CHECK(err)                                                                 \
+    do {                                                                                \
+        hipError_t err_ = (err);                                                       \
+        if (err_ != hipSuccess) {                                                      \
+            fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__,   \
+                hipGetErrorString(err_));                                              \
+            exit(1);                                                                    \
+        }                                                                               \
+    } while (0)
+
+#define CUBLAS_CHECK(err)                                                               \
+    do {                                                                                \
+        hipblasStatus_t err_ = (err);                                                    \
+        if (err_ != HIPBLAS_STATUS_SUCCESS) {                                            \
+            fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__);    \
+            exit(1);                                                                    \
+        }                                                                               \
+    } while (0)
+
+extern hipblasHandle_t g_cublasH;
+extern hipStream_t   g_cudaStream;
+
+void   ggml_init_cublas(void);
+void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
+void   ggml_cuda_pool_free(void * ptr, size_t size);
+
+void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, hipStream_t stream);
+void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, hipStream_t stream);
+void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, hipStream_t stream);
+void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, hipStream_t stream);
+void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, hipStream_t stream);
+
+#ifdef  __cplusplus
+}
+#endif

From 1c0e87a5e93df2b734202cc2ceb6275d458abb1f Mon Sep 17 00:00:00 2001
From: Andrew C <andrewc@squanch.local>
Date: Wed, 26 Apr 2023 20:35:58 +1000
Subject: [PATCH 2/3] ROCM now compiling.

---
 ggml-rocm.cpp | 91 ++++++++++++++++++++++++++-------------------------
 ggml-rocm.h   | 41 ++++++++++++-----------
 2 files changed, 68 insertions(+), 64 deletions(-)

diff --git a/ggml-rocm.cpp b/ggml-rocm.cpp
index 6ab85b15ff0d0..98fe2ba458f09 100644
--- a/ggml-rocm.cpp
+++ b/ggml-rocm.cpp
@@ -1,13 +1,13 @@
-#include <hipblas/hipblas.h>
-#include <hip/hip_runtime.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <hip/hip_fp16.h>
 #include <atomic>
 #include "ggml-rocm.h"
 
+#define hipHalf __fp16
+
 typedef uint16_t ggml_fp16_t;
-static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size");
+static_assert(sizeof(hipHalf) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
 #define QK4_0 32
 typedef struct {
@@ -26,16 +26,16 @@ static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 b
 
 #define QK4_2 16
 typedef struct {
-    __half  d;              // delta
-    uint8_t qs[QK4_2 / 2];  // nibbles / quants
+    hipHalf  d;              // delta
+    uint8_t qs[QK4_2 / 2];   // nibbles / quants
 } block_q4_2;
 static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
 
 #define QK4_3 16
 typedef struct {
-    __half  d;              // delta
-    __half  m;              // min
-    uint8_t qs[QK4_3 / 2];  // nibbles / quants
+    hipHalf  d;              // delta
+    hipHalf  m;              // min
+    uint8_t qs[QK4_3 / 2];   // nibbles / quants
 } block_q4_3;
 static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
 
@@ -46,7 +46,7 @@ typedef struct {
 } block_q8_0;
 static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
 
-__global__ void dequantize_block_q4_0(const void * vx, float * y) {
+static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
     const block_q4_0 * x = (const block_q4_0 *) vx;
 
     const int i = hipBlockIdx_x;
@@ -69,7 +69,7 @@ __global__ void dequantize_block_q4_0(const void * vx, float * y) {
     }
 }
 
-__global__ void dequantize_block_q4_1(const void * vx, float * y) {
+static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
     const block_q4_1 * x = (const block_q4_1 *) vx;
 
     const int i = hipBlockIdx_x;
@@ -93,7 +93,7 @@ __global__ void dequantize_block_q4_1(const void * vx, float * y) {
     }
 }
 
-__global__ void dequantize_block_q4_2(const void * vx, float * y) {
+static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
     const block_q4_2 * x = (const block_q4_2 *) vx;
 
     const int i = hipBlockIdx_x;
@@ -116,7 +116,7 @@ __global__ void dequantize_block_q4_2(const void * vx, float * y) {
     }
 }
 
-__global__ void dequantize_block_q4_3(const void * vx, float * y) {
+static __global__ void dequantize_block_q4_3(const void * vx, float * y) {
     const block_q4_3 * x = (const block_q4_3 *) vx;
 
     const int i = hipBlockIdx_x;
@@ -140,7 +140,7 @@ __global__ void dequantize_block_q4_3(const void * vx, float * y) {
     }
 }
 
-__global__ void dequantize_block_q8_0(const void * vx, float * y) {
+static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
     const block_q8_0 * x = (const block_q8_0 *) vx;
 
     const int i = hipBlockIdx_x;
@@ -156,38 +156,40 @@ __global__ void dequantize_block_q8_0(const void * vx, float * y) {
     }
 }
 
-void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, hipStream_t stream) {
+void dequantize_row_q4_0_hip(const void * vx, float * y, int k, hipStream_t stream) {
     const int nb = k / QK4_0;
     hipLaunchKernelGGL(dequantize_block_q4_0, dim3(nb), dim3(1), 0, stream, vx, y);
 }
 
-void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, hipStream_t stream) {
+void dequantize_row_q4_1_hip(const void * vx, float * y, int k, hipStream_t stream) {
     const int nb = k / QK4_1;
     hipLaunchKernelGGL(dequantize_block_q4_1, dim3(nb), dim3(1), 0, stream, vx, y);
 }
 
-void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, hipStream_t stream) {
+void dequantize_row_q4_2_hip(const void * vx, float * y, int k, hipStream_t stream) {
     const int nb = k / QK4_2;
     hipLaunchKernelGGL(dequantize_block_q4_2, dim3(nb), dim3(1), 0, stream, vx, y);
 }
 
-void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, hipStream_t stream) {
+void dequantize_row_q4_3_hip(const void * vx, float * y, int k, hipStream_t stream) {
     const int nb = k / QK4_3;
     hipLaunchKernelGGL(dequantize_block_q4_3, dim3(nb), dim3(1), 0, stream, vx, y);
 }
 
-void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, hipStream_t stream) {
+void dequantize_row_q8_0_hip(const void * vx, float * y, int k, hipStream_t stream) {
     const int nb = k / QK8_0;
     hipLaunchKernelGGL(dequantize_block_q8_0, dim3(nb), dim3(1), 0, stream, vx, y);
 }
 
-// buffer pool for cuda
-#define MAX_CUDA_BUFFERS 16
+// buffer pool for HIP
+#define MAX_HIP_BUFFERS 16
 
 struct scoped_spin_lock {
     std::atomic_flag& lock;
-    while (lock.test_and_set(std::memory_order_acquire)) {
+    scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
+        while (lock.test_and_set(std::memory_order_acquire)) {
             ; // spin
+        }
     }
     ~scoped_spin_lock() {
         lock.clear(std::memory_order_release);
@@ -196,19 +198,19 @@ struct scoped_spin_lock {
     scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
 };
 
-struct cuda_buffer {
+struct hip_buffer {
     void * ptr = nullptr;
     size_t size = 0;
 };
 
-static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
-static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
+static hip_buffer g_hip_buffer_pool[MAX_HIP_BUFFERS];
+static std::atomic_flag g_hip_pool_lock = ATOMIC_FLAG_INIT;
 
-void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
-    scoped_spin_lock lock(g_cuda_pool_lock);
+void * ggml_hip_pool_malloc(size_t size, size_t * actual_size) {
+    scoped_spin_lock lock(g_hip_pool_lock);
 
-    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
-        cuda_buffer& b = g_cuda_buffer_pool[i];
+    for (int i = 0; i < MAX_HIP_BUFFERS; ++i) {
+        hip_buffer& b = g_hip_buffer_pool[i];
         if (b.size >= size && b.ptr != nullptr) {
             void * ptr = b.ptr;
             *actual_size = b.size;
@@ -218,39 +220,40 @@ void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
         }
     }
     void * ptr;
-    CUDA_CHECK(hipMalloc((void **) &ptr, size));
+    HIP_CHECK(hipMalloc((void **) &ptr, size));
     *actual_size = size;
     return ptr;
 }
 
-void ggml_cuda_pool_free(void * ptr, size_t size) {
-    scoped_spin_lock lock(g_cuda_pool_lock);
+void ggml_hip_pool_free(void * ptr, size_t size) {
+    scoped_spin_lock lock(g_hip_pool_lock);
 
-    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
-        cuda_buffer& b = g_cuda_buffer_pool[i];
+    for (int i = 0; i < MAX_HIP_BUFFERS; ++i) {
+        hip_buffer& b = g_hip_buffer_pool[i];
         if (b.ptr == nullptr) {
             b.ptr = ptr;
             b.size = size;
             return;
         }
     }
-    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
-    CUDA_CHECK(hipFree(ptr));
+    fprintf(stderr, "WARNING: hip buffer pool full, increase MAX_HIP_BUFFERS\n");
+    HIP_CHECK(hipFree(ptr));
 }
 
-hipblasHandle_t g_cublasH = NULL;
-hipStream_t g_cudaStream = NULL;
+hipblasHandle_t g_hipblasH = NULL;
+hipStream_t g_hipStream = NULL;
 
-void ggml_init_cublas(void) {
-    if (g_cublasH == NULL) {
-        // create cublas handle, bind a stream
-        CUBLAS_CHECK(hipblasCreate(&g_cublasH));
+void ggml_init_hipblas(void) {
+    if (g_hipblasH == NULL) {
+        // create hipblas handle, bind a stream
+        HIPBLAS_CHECK(hipblasCreate(&g_hipblasH));
 
-        CUDA_CHECK(hipStreamCreateWithFlags(&g_cudaStream, hipStreamNonBlocking));
+        HIP_CHECK(hipStreamCreateWithFlags(&g_hipStream, hipStreamNonBlocking));
 
-        CUBLAS_CHECK(hipblasSetStream(g_cublasH, g_cudaStream));
+        HIPBLAS_CHECK(hipblasSetStream(g_hipblasH, g_hipStream));
 
         // configure logging to stdout
-        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
+        // HIPBLAS_CHECK(hipblasLoggerConfigure(1, 1, 0, NULL));
     }
 }
+
diff --git a/ggml-rocm.h b/ggml-rocm.h
index 220f39c51fcc0..99190922fdb41 100644
--- a/ggml-rocm.h
+++ b/ggml-rocm.h
@@ -1,40 +1,41 @@
-
+#include <hipblas.h>
+#include <hip/hip_runtime.h>
 
 #ifdef  __cplusplus
 extern "C" {
 #endif
 
-#define CUDA_CHECK(err)                                                                 \
+#define HIP_CHECK(err)                                                                 \
     do {                                                                                \
-        hipError_t err_ = (err);                                                       \
-        if (err_ != hipSuccess) {                                                      \
-            fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__,   \
-                hipGetErrorString(err_));                                              \
+        hipError_t err_ = (err);                                                        \
+        if (err_ != hipSuccess) {                                                       \
+            fprintf(stderr, "HIP error %d at %s:%d: %s\n", err_, __FILE__, __LINE__,   \
+                hipGetErrorString(err_));                                               \
             exit(1);                                                                    \
         }                                                                               \
     } while (0)
 
-#define CUBLAS_CHECK(err)                                                               \
+#define HIPBLAS_CHECK(err)                                                              \
     do {                                                                                \
-        hipblasStatus_t err_ = (err);                                                    \
-        if (err_ != HIPBLAS_STATUS_SUCCESS) {                                            \
-            fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__);    \
+        hipblasStatus_t err_ = (err);                                                   \
+        if (err_ != HIPBLAS_STATUS_SUCCESS) {                                           \
+            fprintf(stderr, "hipBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__);   \
             exit(1);                                                                    \
         }                                                                               \
     } while (0)
 
-extern hipblasHandle_t g_cublasH;
-extern hipStream_t   g_cudaStream;
+extern hipblasHandle_t g_hipblasH;
+extern hipStream_t   g_hipStream;
 
-void   ggml_init_cublas(void);
-void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
-void   ggml_cuda_pool_free(void * ptr, size_t size);
+void   ggml_init_hipblas(void);
+void * ggml_hip_pool_malloc(size_t size, size_t * actual_size);
+void   ggml_hip_pool_free(void * ptr, size_t size);
 
-void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, hipStream_t stream);
-void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, hipStream_t stream);
-void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, hipStream_t stream);
-void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, hipStream_t stream);
-void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, hipStream_t stream);
+void dequantize_row_q4_0_hip(const void * vx, float * y, int k, hipStream_t stream);
+void dequantize_row_q4_1_hip(const void * vx, float * y, int k, hipStream_t stream);
+void dequantize_row_q4_2_hip(const void * vx, float * y, int k, hipStream_t stream);
+void dequantize_row_q4_3_hip(const void * vx, float * y, int k, hipStream_t stream);
+void dequantize_row_q8_0_hip(const void * vx, float * y, int k, hipStream_t stream);
 
 #ifdef  __cplusplus
 }

From 8b72d9eee20490bf51d8002a98aac039df87a629 Mon Sep 17 00:00:00 2001
From: Andrew C <andrewc@squanch.local>
Date: Wed, 26 Apr 2023 21:40:04 +1000
Subject: [PATCH 3/3] More ROCM support work. Now reports ROCM status in
 system_info

---
 CMakeLists.txt |   20 +-
 Makefile       | 1002 ++++++++++++++++++++++++++++++++++++++----------
 ggml-rocm.h    |    1 +
 ggml.c         |   38 +-
 ggml.h         |    1 +
 llama.cpp      |    1 +
 6 files changed, 857 insertions(+), 206 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8480624917dd9..1b5fc8d584935 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -169,11 +169,6 @@ if (LLAMA_CUBLAS)
     endif()
 endif()
 
-macro(find_package_and_print_version PACKAGE_NAME)
-    find_package("${PACKAGE_NAME}" ${ARGN})
-    message("${PACKAGE_NAME} VERSION: ${${PACKAGE_NAME}_VERSION}")
-endmacro()
-
 if (LLAMA_ROCBLAS)
 
     cmake_minimum_required(VERSION 3.17)
@@ -197,13 +192,18 @@ if (LLAMA_ROCBLAS)
         set(CMAKE_HCC_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
 
         find_package(hip REQUIRED)
-        find_package(rocblas REQUIRED)
-        find_package(rocfft REQUIRED)
+        find_package(hipblas REQUIRED)
+        find_package(hipfft REQUIRED)
         find_package(AMDDeviceLibs REQUIRED CONFIG)
 
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::host hip::device roc::rocblas roc::rocfft)
-        set(CMAKE_CXX_COMPILER=${ROCM_PATH}/llvm/bin/clang++)
-        set(CMAKE_C_COMPILER=${ROCM_PATH}/llvm/bin/clang)
+        set(CMAKE_CXX_STANDARD 17)
+        set(CMAKE_CXX_STANDARD_REQUIRED ON)
+        set(CMAKE_CXX_EXTENSIONS OFF)
+
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::host hip::device roc::hipblas hip::hipfft)
+        set(CMAKE_CXX_COMPILER ${ROCM_PATH}/llvm/bin/clang++)
+        set(CMAKE_C_COMPILER ${ROCM_PATH}/llvm/bin/clang)
+        add_compile_definitions(GGML_USE_ROCMBLAS)
 
     else()
         message(WARNING "rocBLAS not found")
diff --git a/Makefile b/Makefile
index 8fbb19c46cc10..96e405526d790 100644
--- a/Makefile
+++ b/Makefile
@@ -1,206 +1,820 @@
-# Define the default target now so that it is always the first target
-default: main quantize quantize-stats perplexity embedding vdot
-
-ifndef UNAME_S
-UNAME_S := $(shell uname -s)
-endif
-
-ifndef UNAME_P
-UNAME_P := $(shell uname -p)
-endif
-
-ifndef UNAME_M
-UNAME_M := $(shell uname -m)
-endif
-
-CCV := $(shell $(CC) --version | head -n 1)
-CXXV := $(shell $(CXX) --version | head -n 1)
-
-# Mac OS + Arm can report x86_64
-# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
-ifeq ($(UNAME_S),Darwin)
-	ifneq ($(UNAME_P),arm)
-		SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
-		ifeq ($(SYSCTL_M),1)
-			# UNAME_P := arm
-			# UNAME_M := arm64
-			warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
-		endif
-	endif
-endif
-
-#
-# Compile flags
-#
-
-# keep standard at C11 and C++11
-CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
-LDFLAGS  =
-
-# warnings
-CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
-CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
-
-# OS specific
-# TODO: support Windows
-ifeq ($(UNAME_S),Linux)
-	CFLAGS   += -pthread
-	CXXFLAGS += -pthread
-endif
-ifeq ($(UNAME_S),Darwin)
-	CFLAGS   += -pthread
-	CXXFLAGS += -pthread
-endif
-ifeq ($(UNAME_S),FreeBSD)
-	CFLAGS   += -pthread
-	CXXFLAGS += -pthread
-endif
-ifeq ($(UNAME_S),NetBSD)
-	CFLAGS   += -pthread
-	CXXFLAGS += -pthread
-endif
-ifeq ($(UNAME_S),OpenBSD)
-	CFLAGS   += -pthread
-	CXXFLAGS += -pthread
-endif
-ifeq ($(UNAME_S),Haiku)
-	CFLAGS   += -pthread
-	CXXFLAGS += -pthread
-endif
-
-# Architecture specific
-# TODO: probably these flags need to be tweaked on some architectures
-#       feel free to update the Makefile for your architecture and send a pull request or issue
-ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
-	# Use all CPU extensions that are available:
-	CFLAGS   += -march=native -mtune=native
-	CXXFLAGS += -march=native -mtune=native
-
-	# Usage AVX-only
-	#CFLAGS   += -mfma -mf16c -mavx
-	#CXXFLAGS += -mfma -mf16c -mavx
-endif
-ifneq ($(filter ppc64%,$(UNAME_M)),)
-	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
-	ifneq (,$(findstring POWER9,$(POWER9_M)))
-		CFLAGS   += -mcpu=power9
-		CXXFLAGS += -mcpu=power9
-	endif
-	# Require c++23's std::byteswap for big-endian support.
-	ifeq ($(UNAME_M),ppc64)
-		CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
-	endif
-endif
-ifndef LLAMA_NO_ACCELERATE
-	# Mac M1 - include Accelerate framework.
-	# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
-	ifeq ($(UNAME_S),Darwin)
-		CFLAGS  += -DGGML_USE_ACCELERATE
-		LDFLAGS += -framework Accelerate
-	endif
-endif
-ifdef LLAMA_OPENBLAS
-	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
-	LDFLAGS += -lopenblas
-endif
-ifdef LLAMA_CUBLAS
-	CFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include
-	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
-	OBJS      += ggml-cuda.o
-	NVCC      = nvcc
-	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
-endif
-ifdef LLAMA_GPROF
-	CFLAGS   += -pg
-	CXXFLAGS += -pg
-endif
-ifdef LLAMA_PERF
-	CFLAGS   += -DGGML_PERF
-	CXXFLAGS += -DGGML_PERF
-endif
-ifneq ($(filter aarch64%,$(UNAME_M)),)
-	CFLAGS   += -mcpu=native
-	CXXFLAGS += -mcpu=native
-endif
-ifneq ($(filter armv6%,$(UNAME_M)),)
-	# Raspberry Pi 1, 2, 3
-	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
-endif
-ifneq ($(filter armv7%,$(UNAME_M)),)
-	# Raspberry Pi 4
-	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
-endif
-ifneq ($(filter armv8%,$(UNAME_M)),)
-	# Raspberry Pi 4
-	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
-endif
-
-#
-# Print build information
-#
-
-$(info I llama.cpp build info: )
-$(info I UNAME_S:  $(UNAME_S))
-$(info I UNAME_P:  $(UNAME_P))
-$(info I UNAME_M:  $(UNAME_M))
-$(info I CFLAGS:   $(CFLAGS))
-$(info I CXXFLAGS: $(CXXFLAGS))
-$(info I LDFLAGS:  $(LDFLAGS))
-$(info I CC:       $(CCV))
-$(info I CXX:      $(CXXV))
-$(info )
-
-#
-# Build library
-#
-
-ggml.o: ggml.c ggml.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
-
-llama.o: llama.cpp ggml.h llama.h llama_util.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common.o: examples/common.cpp examples/common.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.24
 
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/andrewc/Development/llama.andrewjc
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/andrewc/Development/llama.andrewjc
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target test
+test:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running tests..."
+	/usr/bin/ctest --force-new-ctest-process $(ARGS)
+.PHONY : test
+
+# Special rule for the target test
+test/fast: test
+.PHONY : test/fast
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# The main all target
+all: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/andrewc/Development/llama.andrewjc/CMakeFiles /home/andrewc/Development/llama.andrewjc//CMakeFiles/progress.marks
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/andrewc/Development/llama.andrewjc/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
 clean:
-	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	$(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+#=============================================================================
+# Target rules for targets named ggml
+
+# Build rule for target.
+ggml: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ggml
+.PHONY : ggml
+
+# fast build rule for target.
+ggml/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ggml.dir/build.make CMakeFiles/ggml.dir/build
+.PHONY : ggml/fast
+
+#=============================================================================
+# Target rules for targets named llama
+
+# Build rule for target.
+llama: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llama
+.PHONY : llama
+
+# fast build rule for target.
+llama/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/llama.dir/build.make CMakeFiles/llama.dir/build
+.PHONY : llama/fast
+
+#=============================================================================
+# Target rules for targets named Experimental
+
+# Build rule for target.
+Experimental: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 Experimental
+.PHONY : Experimental
+
+# fast build rule for target.
+Experimental/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Experimental.dir/build.make CMakeFiles/Experimental.dir/build
+.PHONY : Experimental/fast
+
+#=============================================================================
+# Target rules for targets named Nightly
+
+# Build rule for target.
+Nightly: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 Nightly
+.PHONY : Nightly
+
+# fast build rule for target.
+Nightly/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Nightly.dir/build.make CMakeFiles/Nightly.dir/build
+.PHONY : Nightly/fast
+
+#=============================================================================
+# Target rules for targets named Continuous
+
+# Build rule for target.
+Continuous: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 Continuous
+.PHONY : Continuous
+
+# fast build rule for target.
+Continuous/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Continuous.dir/build.make CMakeFiles/Continuous.dir/build
+.PHONY : Continuous/fast
+
+#=============================================================================
+# Target rules for targets named NightlyMemoryCheck
+
+# Build rule for target.
+NightlyMemoryCheck: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 NightlyMemoryCheck
+.PHONY : NightlyMemoryCheck
+
+# fast build rule for target.
+NightlyMemoryCheck/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/NightlyMemoryCheck.dir/build.make CMakeFiles/NightlyMemoryCheck.dir/build
+.PHONY : NightlyMemoryCheck/fast
+
+#=============================================================================
+# Target rules for targets named NightlyStart
+
+# Build rule for target.
+NightlyStart: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 NightlyStart
+.PHONY : NightlyStart
+
+# fast build rule for target.
+NightlyStart/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/NightlyStart.dir/build.make CMakeFiles/NightlyStart.dir/build
+.PHONY : NightlyStart/fast
+
+#=============================================================================
+# Target rules for targets named NightlyUpdate
+
+# Build rule for target.
+NightlyUpdate: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 NightlyUpdate
+.PHONY : NightlyUpdate
+
+# fast build rule for target.
+NightlyUpdate/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/NightlyUpdate.dir/build.make CMakeFiles/NightlyUpdate.dir/build
+.PHONY : NightlyUpdate/fast
+
+#=============================================================================
+# Target rules for targets named NightlyConfigure
+
+# Build rule for target.
+NightlyConfigure: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 NightlyConfigure
+.PHONY : NightlyConfigure
+
+# fast build rule for target.
+NightlyConfigure/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/NightlyConfigure.dir/build.make CMakeFiles/NightlyConfigure.dir/build
+.PHONY : NightlyConfigure/fast
+
+#=============================================================================
+# Target rules for targets named NightlyBuild
+
+# Build rule for target.
+NightlyBuild: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 NightlyBuild
+.PHONY : NightlyBuild
+
+# fast build rule for target.
+NightlyBuild/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/NightlyBuild.dir/build.make CMakeFiles/NightlyBuild.dir/build
+.PHONY : NightlyBuild/fast
+
+#=============================================================================
+# Target rules for targets named NightlyTest
+
+# Build rule for target.
+NightlyTest: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 NightlyTest
+.PHONY : NightlyTest
+
+# fast build rule for target.
+NightlyTest/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/NightlyTest.dir/build.make CMakeFiles/NightlyTest.dir/build
+.PHONY : NightlyTest/fast
+
+#=============================================================================
+# Target rules for targets named NightlyCoverage
+
+# Build rule for target.
+NightlyCoverage: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 NightlyCoverage
+.PHONY : NightlyCoverage
+
+# fast build rule for target.
+NightlyCoverage/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/NightlyCoverage.dir/build.make CMakeFiles/NightlyCoverage.dir/build
+.PHONY : NightlyCoverage/fast
+
+#=============================================================================
+# Target rules for targets named NightlyMemCheck
+
+# Build rule for target.
+NightlyMemCheck: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 NightlyMemCheck
+.PHONY : NightlyMemCheck
+
+# fast build rule for target.
+NightlyMemCheck/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/NightlyMemCheck.dir/build.make CMakeFiles/NightlyMemCheck.dir/build
+.PHONY : NightlyMemCheck/fast
+
+#=============================================================================
+# Target rules for targets named NightlySubmit
+
+# Build rule for target.
+NightlySubmit: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 NightlySubmit
+.PHONY : NightlySubmit
+
+# fast build rule for target.
+NightlySubmit/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/NightlySubmit.dir/build.make CMakeFiles/NightlySubmit.dir/build
+.PHONY : NightlySubmit/fast
+
+#=============================================================================
+# Target rules for targets named ExperimentalStart
+
+# Build rule for target.
+ExperimentalStart: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ExperimentalStart
+.PHONY : ExperimentalStart
+
+# fast build rule for target.
+ExperimentalStart/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ExperimentalStart.dir/build.make CMakeFiles/ExperimentalStart.dir/build
+.PHONY : ExperimentalStart/fast
+
+#=============================================================================
+# Target rules for targets named ExperimentalUpdate
+
+# Build rule for target.
+ExperimentalUpdate: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ExperimentalUpdate
+.PHONY : ExperimentalUpdate
+
+# fast build rule for target.
+ExperimentalUpdate/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ExperimentalUpdate.dir/build.make CMakeFiles/ExperimentalUpdate.dir/build
+.PHONY : ExperimentalUpdate/fast
+
+#=============================================================================
+# Target rules for targets named ExperimentalConfigure
+
+# Build rule for target.
+ExperimentalConfigure: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ExperimentalConfigure
+.PHONY : ExperimentalConfigure
+
+# fast build rule for target.
+ExperimentalConfigure/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ExperimentalConfigure.dir/build.make CMakeFiles/ExperimentalConfigure.dir/build
+.PHONY : ExperimentalConfigure/fast
+
+#=============================================================================
+# Target rules for targets named ExperimentalBuild
+
+# Build rule for target.
+ExperimentalBuild: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ExperimentalBuild
+.PHONY : ExperimentalBuild
+
+# fast build rule for target.
+ExperimentalBuild/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ExperimentalBuild.dir/build.make CMakeFiles/ExperimentalBuild.dir/build
+.PHONY : ExperimentalBuild/fast
+
+#=============================================================================
+# Target rules for targets named ExperimentalTest
+
+# Build rule for target.
+ExperimentalTest: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ExperimentalTest
+.PHONY : ExperimentalTest
+
+# fast build rule for target.
+ExperimentalTest/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ExperimentalTest.dir/build.make CMakeFiles/ExperimentalTest.dir/build
+.PHONY : ExperimentalTest/fast
+
+#=============================================================================
+# Target rules for targets named ExperimentalCoverage
+
+# Build rule for target.
+ExperimentalCoverage: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ExperimentalCoverage
+.PHONY : ExperimentalCoverage
+
+# fast build rule for target.
+ExperimentalCoverage/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ExperimentalCoverage.dir/build.make CMakeFiles/ExperimentalCoverage.dir/build
+.PHONY : ExperimentalCoverage/fast
+
+#=============================================================================
+# Target rules for targets named ExperimentalMemCheck
+
+# Build rule for target.
+ExperimentalMemCheck: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ExperimentalMemCheck
+.PHONY : ExperimentalMemCheck
+
+# fast build rule for target.
+ExperimentalMemCheck/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ExperimentalMemCheck.dir/build.make CMakeFiles/ExperimentalMemCheck.dir/build
+.PHONY : ExperimentalMemCheck/fast
+
+#=============================================================================
+# Target rules for targets named ExperimentalSubmit
+
+# Build rule for target.
+ExperimentalSubmit: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ExperimentalSubmit
+.PHONY : ExperimentalSubmit
+
+# fast build rule for target.
+ExperimentalSubmit/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ExperimentalSubmit.dir/build.make CMakeFiles/ExperimentalSubmit.dir/build
+.PHONY : ExperimentalSubmit/fast
+
+#=============================================================================
+# Target rules for targets named ContinuousStart
+
+# Build rule for target.
+ContinuousStart: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ContinuousStart
+.PHONY : ContinuousStart
+
+# fast build rule for target.
+ContinuousStart/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ContinuousStart.dir/build.make CMakeFiles/ContinuousStart.dir/build
+.PHONY : ContinuousStart/fast
+
+#=============================================================================
+# Target rules for targets named ContinuousUpdate
+
+# Build rule for target.
+ContinuousUpdate: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ContinuousUpdate
+.PHONY : ContinuousUpdate
+
+# fast build rule for target.
+ContinuousUpdate/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ContinuousUpdate.dir/build.make CMakeFiles/ContinuousUpdate.dir/build
+.PHONY : ContinuousUpdate/fast
+
+#=============================================================================
+# Target rules for targets named ContinuousConfigure
+
+# Build rule for target.
+ContinuousConfigure: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ContinuousConfigure
+.PHONY : ContinuousConfigure
+
+# fast build rule for target.
+ContinuousConfigure/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ContinuousConfigure.dir/build.make CMakeFiles/ContinuousConfigure.dir/build
+.PHONY : ContinuousConfigure/fast
+
+#=============================================================================
+# Target rules for targets named ContinuousBuild
+
+# Build rule for target.
+ContinuousBuild: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ContinuousBuild
+.PHONY : ContinuousBuild
+
+# fast build rule for target.
+ContinuousBuild/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ContinuousBuild.dir/build.make CMakeFiles/ContinuousBuild.dir/build
+.PHONY : ContinuousBuild/fast
+
+#=============================================================================
+# Target rules for targets named ContinuousTest
+
+# Build rule for target.
+ContinuousTest: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ContinuousTest
+.PHONY : ContinuousTest
+
+# fast build rule for target.
+ContinuousTest/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ContinuousTest.dir/build.make CMakeFiles/ContinuousTest.dir/build
+.PHONY : ContinuousTest/fast
+
+#=============================================================================
+# Target rules for targets named ContinuousCoverage
+
+# Build rule for target.
+ContinuousCoverage: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ContinuousCoverage
+.PHONY : ContinuousCoverage
+
+# fast build rule for target.
+ContinuousCoverage/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ContinuousCoverage.dir/build.make CMakeFiles/ContinuousCoverage.dir/build
+.PHONY : ContinuousCoverage/fast
+
+#=============================================================================
+# Target rules for targets named ContinuousMemCheck
+
+# Build rule for target.
+ContinuousMemCheck: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ContinuousMemCheck
+.PHONY : ContinuousMemCheck
+
+# fast build rule for target.
+ContinuousMemCheck/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ContinuousMemCheck.dir/build.make CMakeFiles/ContinuousMemCheck.dir/build
+.PHONY : ContinuousMemCheck/fast
+
+#=============================================================================
+# Target rules for targets named ContinuousSubmit
+
+# Build rule for target.
+ContinuousSubmit: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ContinuousSubmit
+.PHONY : ContinuousSubmit
+
+# fast build rule for target.
+ContinuousSubmit/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ContinuousSubmit.dir/build.make CMakeFiles/ContinuousSubmit.dir/build
+.PHONY : ContinuousSubmit/fast
+
+#=============================================================================
+# Target rules for targets named test-quantize-fns
+
+# Build rule for target.
+test-quantize-fns: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 test-quantize-fns
+.PHONY : test-quantize-fns
+
+# fast build rule for target.
+test-quantize-fns/fast:
+	$(MAKE) $(MAKESILENT) -f tests/CMakeFiles/test-quantize-fns.dir/build.make tests/CMakeFiles/test-quantize-fns.dir/build
+.PHONY : test-quantize-fns/fast
+
+#=============================================================================
+# Target rules for targets named test-quantize-perf
+
+# Build rule for target.
+test-quantize-perf: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 test-quantize-perf
+.PHONY : test-quantize-perf
+
+# fast build rule for target.
+test-quantize-perf/fast:
+	$(MAKE) $(MAKESILENT) -f tests/CMakeFiles/test-quantize-perf.dir/build.make tests/CMakeFiles/test-quantize-perf.dir/build
+.PHONY : test-quantize-perf/fast
+
+#=============================================================================
+# Target rules for targets named test-tokenizer-0
+
+# Build rule for target.
+test-tokenizer-0: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 test-tokenizer-0
+.PHONY : test-tokenizer-0
+
+# fast build rule for target.
+test-tokenizer-0/fast:
+	$(MAKE) $(MAKESILENT) -f tests/CMakeFiles/test-tokenizer-0.dir/build.make tests/CMakeFiles/test-tokenizer-0.dir/build
+.PHONY : test-tokenizer-0/fast
+
+#=============================================================================
+# Target rules for targets named common
+
+# Build rule for target.
+common: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 common
+.PHONY : common
+
+# fast build rule for target.
+common/fast:
+	$(MAKE) $(MAKESILENT) -f examples/CMakeFiles/common.dir/build.make examples/CMakeFiles/common.dir/build
+.PHONY : common/fast
+
+#=============================================================================
+# Target rules for targets named main
+
+# Build rule for target.
+main: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 main
+.PHONY : main
+
+# fast build rule for target.
+main/fast:
+	$(MAKE) $(MAKESILENT) -f examples/main/CMakeFiles/main.dir/build.make examples/main/CMakeFiles/main.dir/build
+.PHONY : main/fast
+
+#=============================================================================
+# Target rules for targets named quantize
+
+# Build rule for target.
+quantize: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize
+.PHONY : quantize
+
+# fast build rule for target.
+quantize/fast:
+	$(MAKE) $(MAKESILENT) -f examples/quantize/CMakeFiles/quantize.dir/build.make examples/quantize/CMakeFiles/quantize.dir/build
+.PHONY : quantize/fast
+
+#=============================================================================
+# Target rules for targets named quantize-stats
+
+# Build rule for target.
+quantize-stats: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize-stats
+.PHONY : quantize-stats
+
+# fast build rule for target.
+quantize-stats/fast:
+	$(MAKE) $(MAKESILENT) -f examples/quantize-stats/CMakeFiles/quantize-stats.dir/build.make examples/quantize-stats/CMakeFiles/quantize-stats.dir/build
+.PHONY : quantize-stats/fast
+
+#=============================================================================
+# Target rules for targets named perplexity
+
+# Build rule for target.
+perplexity: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 perplexity
+.PHONY : perplexity
+
+# fast build rule for target.
+perplexity/fast:
+	$(MAKE) $(MAKESILENT) -f examples/perplexity/CMakeFiles/perplexity.dir/build.make examples/perplexity/CMakeFiles/perplexity.dir/build
+.PHONY : perplexity/fast
+
+#=============================================================================
+# Target rules for targets named embedding
+
+# Build rule for target.
+embedding: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 embedding
+.PHONY : embedding
+
+# fast build rule for target.
+embedding/fast:
+	$(MAKE) $(MAKESILENT) -f examples/embedding/CMakeFiles/embedding.dir/build.make examples/embedding/CMakeFiles/embedding.dir/build
+.PHONY : embedding/fast
+
+#=============================================================================
+# Target rules for targets named save-load-state
+
+# Build rule for target.
+save-load-state: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 save-load-state
+.PHONY : save-load-state
+
+# fast build rule for target.
+save-load-state/fast:
+	$(MAKE) $(MAKESILENT) -f examples/save-load-state/CMakeFiles/save-load-state.dir/build.make examples/save-load-state/CMakeFiles/save-load-state.dir/build
+.PHONY : save-load-state/fast
+
+#=============================================================================
+# Target rules for targets named vdot
+
+# Build rule for target.
+vdot: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 vdot
+.PHONY : vdot
+
+# fast build rule for target.
+vdot/fast:
+	$(MAKE) $(MAKESILENT) -f pocs/vdot/CMakeFiles/vdot.dir/build.make pocs/vdot/CMakeFiles/vdot.dir/build
+.PHONY : vdot/fast
+
+#=============================================================================
+# Target rules for targets named q8dot
+
+# Build rule for target.
+q8dot: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 q8dot
+.PHONY : q8dot
+
+# fast build rule for target.
+q8dot/fast:
+	$(MAKE) $(MAKESILENT) -f pocs/vdot/CMakeFiles/q8dot.dir/build.make pocs/vdot/CMakeFiles/q8dot.dir/build
+.PHONY : q8dot/fast
+
+ggml-rocm.o: ggml-rocm.cpp.o
+.PHONY : ggml-rocm.o
+
+# target to build an object file
+ggml-rocm.cpp.o:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ggml.dir/build.make CMakeFiles/ggml.dir/ggml-rocm.cpp.o
+.PHONY : ggml-rocm.cpp.o
+
+ggml-rocm.i: ggml-rocm.cpp.i
+.PHONY : ggml-rocm.i
+
+# target to preprocess a source file
+ggml-rocm.cpp.i:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ggml.dir/build.make CMakeFiles/ggml.dir/ggml-rocm.cpp.i
+.PHONY : ggml-rocm.cpp.i
+
+ggml-rocm.s: ggml-rocm.cpp.s
+.PHONY : ggml-rocm.s
+
+# target to generate assembly for a file
+ggml-rocm.cpp.s:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ggml.dir/build.make CMakeFiles/ggml.dir/ggml-rocm.cpp.s
+.PHONY : ggml-rocm.cpp.s
+
+ggml.o: ggml.c.o
+.PHONY : ggml.o
+
+# target to build an object file
+ggml.c.o:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ggml.dir/build.make CMakeFiles/ggml.dir/ggml.c.o
+.PHONY : ggml.c.o
+
+ggml.i: ggml.c.i
+.PHONY : ggml.i
+
+# target to preprocess a source file
+ggml.c.i:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ggml.dir/build.make CMakeFiles/ggml.dir/ggml.c.i
+.PHONY : ggml.c.i
+
+ggml.s: ggml.c.s
+.PHONY : ggml.s
+
+# target to generate assembly for a file
+ggml.c.s:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/ggml.dir/build.make CMakeFiles/ggml.dir/ggml.c.s
+.PHONY : ggml.c.s
+
+llama.o: llama.cpp.o
+.PHONY : llama.o
+
+# target to build an object file
+llama.cpp.o:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/llama.dir/build.make CMakeFiles/llama.dir/llama.cpp.o
+.PHONY : llama.cpp.o
 
-main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-	@echo
-	@echo '====  Run ./main -h for help.  ===='
-	@echo
+llama.i: llama.cpp.i
+.PHONY : llama.i
 
-quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+# target to preprocess a source file
+llama.cpp.i:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/llama.dir/build.make CMakeFiles/llama.dir/llama.cpp.i
+.PHONY : llama.cpp.i
 
-quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+llama.s: llama.cpp.s
+.PHONY : llama.s
 
-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+# target to generate assembly for a file
+llama.cpp.s:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/llama.dir/build.make CMakeFiles/llama.dir/llama.cpp.s
+.PHONY : llama.cpp.s
 
-embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... rebuild_cache"
+	@echo "... test"
+	@echo "... Continuous"
+	@echo "... ContinuousBuild"
+	@echo "... ContinuousConfigure"
+	@echo "... ContinuousCoverage"
+	@echo "... ContinuousMemCheck"
+	@echo "... ContinuousStart"
+	@echo "... ContinuousSubmit"
+	@echo "... ContinuousTest"
+	@echo "... ContinuousUpdate"
+	@echo "... Experimental"
+	@echo "... ExperimentalBuild"
+	@echo "... ExperimentalConfigure"
+	@echo "... ExperimentalCoverage"
+	@echo "... ExperimentalMemCheck"
+	@echo "... ExperimentalStart"
+	@echo "... ExperimentalSubmit"
+	@echo "... ExperimentalTest"
+	@echo "... ExperimentalUpdate"
+	@echo "... Nightly"
+	@echo "... NightlyBuild"
+	@echo "... NightlyConfigure"
+	@echo "... NightlyCoverage"
+	@echo "... NightlyMemCheck"
+	@echo "... NightlyMemoryCheck"
+	@echo "... NightlyStart"
+	@echo "... NightlySubmit"
+	@echo "... NightlyTest"
+	@echo "... NightlyUpdate"
+	@echo "... common"
+	@echo "... embedding"
+	@echo "... ggml"
+	@echo "... llama"
+	@echo "... main"
+	@echo "... perplexity"
+	@echo "... q8dot"
+	@echo "... quantize"
+	@echo "... quantize-stats"
+	@echo "... save-load-state"
+	@echo "... test-quantize-fns"
+	@echo "... test-quantize-perf"
+	@echo "... test-tokenizer-0"
+	@echo "... vdot"
+	@echo "... ggml-rocm.o"
+	@echo "... ggml-rocm.i"
+	@echo "... ggml-rocm.s"
+	@echo "... ggml.o"
+	@echo "... ggml.i"
+	@echo "... ggml.s"
+	@echo "... llama.o"
+	@echo "... llama.i"
+	@echo "... llama.s"
+.PHONY : help
 
-vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-libllama.so: llama.o ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
-#
-# Tests
-#
+#=============================================================================
+# Special targets to cleanup operation of make.
 
-benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
-	./benchmark-q4_0-matmult
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	$(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
 
-.PHONY: tests
-tests:
-	bash ./tests/run-tests.sh
diff --git a/ggml-rocm.h b/ggml-rocm.h
index 99190922fdb41..16df5cfbe38f5 100644
--- a/ggml-rocm.h
+++ b/ggml-rocm.h
@@ -1,4 +1,5 @@
 #include <hipblas.h>
+#include <rocblas.h>
 #include <hip/hip_runtime.h>
 
 #ifdef  __cplusplus
diff --git a/ggml.c b/ggml.c
index 064510edaa798..f5bf2df3b5e4c 100644
--- a/ggml.c
+++ b/ggml.c
@@ -149,6 +149,8 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #include <cblas.h>
 #elif defined(GGML_USE_CUBLAS)
 #include "ggml-cuda.h"
+#elif defined(GGML_USE_ROCMBLAS)
+#include "ggml-rocm.h"
 #endif
 
 #undef MIN
@@ -3842,6 +3844,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         ggml_init_cublas();
         #endif
 
+        // initialize rocmBLAS
+        #if defined(GGML_USE_ROCMBLAS)
+        ggml_init_hipblas();
+        #endif
+
         is_first_call = false;
     }
 
@@ -7703,6 +7710,18 @@ static void ggml_compute_forward_mul_mat_f32(
         float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
 #endif
 
+#if defined(GGML_USE_ROCMBLAS)
+        const float alpha = 1.0f;
+        const float beta = 0.0f;
+        const int x_ne = ne01 * ne10;
+        const int y_ne = ne11 * ne10;
+        const int d_ne = ne11 * ne01;
+        size_t x_size, y_size, d_size;
+        float *d_X = ggml_rocm_pool_malloc(sizeof(float) * x_ne, &x_size);
+        float *d_Y = ggml_rocm_pool_malloc(sizeof(float) * x_ne, &x_size);
+        float *d_D = ggml_rocm_pool_malloc(sizeof(float) * x_ne, &x_size);
+#endif
+
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
                 const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
@@ -7740,6 +7759,13 @@ static void ggml_compute_forward_mul_mat_f32(
         ggml_cuda_pool_free(d_X, x_size);
         ggml_cuda_pool_free(d_Y, y_size);
         ggml_cuda_pool_free(d_D, d_size);
+#endif
+#if defined(GGML_USE_ROCMBLAS)
+        ROCM_CHECK(hipStreamSynchronize(g_hipStream));
+        ggml_hip_pool_free(d_X, x_size);
+        ggml_hip_pool_free(d_X, x_size);
+        ggml_hip_pool_free(d_X, x_size);
+
 #endif
         //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
 
@@ -8121,7 +8147,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_ROCM_BLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         if (params->ith != 0) {
             return;
@@ -12493,7 +12519,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 
 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_ROCMBLAS)
     return 1;
 #else
     return 0;
@@ -12508,6 +12534,14 @@ int ggml_cpu_has_cublas(void) {
 #endif
 }
 
+int ggml_cpu_has_rocmblas(void) {
+#if defined(GGML_USE_ROCMBLAS)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_sse3(void) {
 #if defined(__SSE3__)
     return 1;
diff --git a/ggml.h b/ggml.h
index 8300a0c62db9b..a16670441c871 100644
--- a/ggml.h
+++ b/ggml.h
@@ -854,6 +854,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_wasm_simd  (void);
     GGML_API int ggml_cpu_has_blas       (void);
     GGML_API int ggml_cpu_has_cublas     (void);
+    GGML_API int ggml_cpu_has_rocmblas   (void);
     GGML_API int ggml_cpu_has_sse3       (void);
     GGML_API int ggml_cpu_has_vsx        (void);
 
diff --git a/llama.cpp b/llama.cpp
index 25203c9e90b28..9a71abd3a39b2 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2405,6 +2405,7 @@ const char * llama_print_system_info(void) {
     s += "FP16_VA = "     + std::to_string(ggml_cpu_has_fp16_va())     + " | ";
     s += "WASM_SIMD = "   + std::to_string(ggml_cpu_has_wasm_simd())   + " | ";
     s += "BLAS = "        + std::to_string(ggml_cpu_has_blas())        + " | ";
+    s += "ROCM = "        + std::to_string(ggml_cpu_has_rocmblas())        + " | ";
     s += "SSE3 = "        + std::to_string(ggml_cpu_has_sse3())        + " | ";
     s += "VSX = "         + std::to_string(ggml_cpu_has_vsx())         + " | ";