diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..817892c
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Yavor Ivanov
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
index 8fa2788..8687d71 100644
--- a/README.md
+++ b/README.md
@@ -16,13 +16,28 @@ Note: you need to download the model checkpoint below (`sam_vit_b_01ec64.pth`) first
 # Convert PTH model to ggml. Requires python3, torch and numpy
 python convert-pth-to-ggml.py checkpoints/sam_vit_b_01ec64.pth . 1

-# Build sam.cpp. Might require cmake and SDL2 to be installed
+# You need CMake and SDL2
+# SDL2 is used for the GUI window & input: https://www.libsdl.org
+
+# [Ubuntu]
+$ sudo apt install libsdl2-dev
+
+# [Mac OS with brew]
+$ brew install sdl2
+
+# [MSYS2]
+$ pacman -S git cmake make mingw-w64-x86_64-dlfcn mingw-w64-x86_64-gcc mingw-w64-x86_64-SDL2
+
+# Build sam.cpp.
 mkdir build && cd build
 cmake .. && make -j4

 # run inference
 ./bin/sam -t 16 -i ../img.jpg -m ../checkpoints/ggml-model-f16.bin
 ```
+Note: The optimal value of the threads parameter (`-t`) depends on the machine running inference and should be selected manually.
+
+Note: If you have problems with the Windows build, see [this issue](https://github.com/YavorGIvanov/sam.cpp/issues/8) for more details.

 ## Downloading and converting the model checkpoints

@@ -95,12 +110,12 @@ Output mask (mask_out_2.png in build folder):

 - [X] Reduce memory usage by utilizing the new ggml-alloc
 - [X] Remove redundant graph nodes
-- [ ] Make inference faster
 - [X] Fix the difference in output masks compared to the PyTorch implementation
 - [X] Filter masks based on stability score
 - [X] Add support for point user input
+- [X] Support bigger model checkpoints
+- [ ] Make inference faster
 - [ ] Support F16 for heavy F32 ops
 - [ ] Test quantization
-- [X] Support bigger model checkpoints
+- [ ] Add support for mask and box input (#14)
 - [ ] GPU support
-- [ ] Add support for mask and box input
diff --git a/examples/main.cpp b/examples/main.cpp
index b72dfb0..6871472 100644
--- a/examples/main.cpp
+++ b/examples/main.cpp
@@ -10,11 +10,101 @@
 #define SDL_DISABLE_ARM_NEON_H 1
 #include <SDL.h>
 #include <SDL_opengl.h>
+#include <algorithm>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+/**
+ * Get the size of the screen the SDL window runs on.
+ *
+ * SDL_Window* window may be NULL, in which case we get the screen size of the default display (index 0).
+ * If window is not NULL, then we get the screen size of the display the window runs on.
+ */
+static bool get_screen_size(SDL_DisplayMode & dm, SDL_Window * window) {
+    int displayIndex = 0;
+    if (window != NULL) {
+        displayIndex = SDL_GetWindowDisplayIndex(window);
+    }
+    if (displayIndex < 0) {
+        return false;
+    }
+    if (SDL_GetCurrentDisplayMode(displayIndex, &dm) != 0) {
+        return false;
+    }
+
+    fprintf(stderr, "%s: screen size (%d x %d) \n", __func__, dm.w, dm.h);
+    return true;
+}
+
+// downscale image with nearest-neighbor interpolation
+static sam_image_u8 downscale_img(sam_image_u8 & img, float scale) {
+    sam_image_u8 new_img;
+
+    int width  = img.nx;
+    int height = img.ny;
+
+    int new_width  = img.nx / scale + 0.5f;
+    int new_height = img.ny / scale + 0.5f;
+
+    new_img.nx = new_width;
+    new_img.ny = new_height;
+    new_img.data.resize(new_img.nx*new_img.ny*3);
+
+    fprintf(stderr, "%s: scale: %f\n", __func__, scale);
+    fprintf(stderr, "%s: resize image from (%d x %d) to (%d x %d)\n", __func__, img.nx, img.ny, new_img.nx, new_img.ny);
+
+    for (int y = 0; y < new_height; ++y) {
+        for (int x = 0; x < new_width; ++x) {
+            // map the center of destination pixel (x, y) back to the nearest source pixel
+            int src_x = (x + 0.5f) * scale - 0.5f;
+            int src_y = (y + 0.5f) * scale - 0.5f;

+            int src_index  = (src_y * width + src_x) * 3;
+            int dest_index = (y * new_width + x) * 3;
+
+            for (int c = 0; c < 3; ++c) {
+                new_img.data[dest_index + c] = img.data[src_index + c];
+            }
+        }
+    }
+
+    return new_img;
+}
+
+static bool downscale_img_to_screen(sam_image_u8 & img, SDL_Window * window) {
+    SDL_DisplayMode dm = {};
+    if (!get_screen_size(dm, window)) {
+        fprintf(stderr, "%s: failed to get screen size of the display.\n", __func__);
+        return false;
+    }
+    fprintf(stderr, "%s: screen size (%d x %d) \n", __func__, dm.w, dm.h);
+    if (dm.h == 0 || dm.w == 0) {
+        // No usable display mode, e.g. the window is running on another display.
+        return false;
+    }
+
+    // Add 5% margin between screen and window
+    const float margin = 0.05f;
+    const int max_width  = dm.w - margin * dm.w;
+    const int max_height = dm.h - margin * dm.h;
+
+    fprintf(stderr, "%s: img size (%d x %d) \n", __func__, img.nx, img.ny);
+
+    if (img.ny > max_height || img.nx > max_width) {
+        fprintf(stderr, "%s: img size (%d x %d) exceeds maximum allowed size (%d x %d) \n", __func__, img.nx, img.ny, max_width, max_height);
+        const float scale_y = (float)img.ny / max_height;
+        const float scale_x = (float)img.nx / max_width;
+        const float scale   = std::max(scale_x, scale_y);
+
+        img = downscale_img(img, scale);
+    }
+
+    return true;
+}
+
 static bool load_image_from_file(const std::string & fname, sam_image_u8 & img) {
     int nx, ny, nc;
     auto data = stbi_load(fname.c_str(), &nx, &ny, &nc, 3);
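For intuition about the two pieces of arithmetic in `downscale_img` and `downscale_img_to_screen` above, here is a minimal standalone sketch with hypothetical sizes (not part of the patch): a 4000x3000 image on a 1920x1080 screen gets a 5% margin budget of 1824x1026, so scale = max(4000/1824, 3000/1026) ≈ 2.924 and the image is resized to 1368x1026; each destination pixel center then maps back to the nearest source pixel.

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    // hypothetical sizes: 1920x1080 screen, 4000x3000 input image
    const int dm_w = 1920, dm_h = 1080;
    const int nx = 4000, ny = 3000;

    // same margin/fit math as downscale_img_to_screen
    const float margin = 0.05f;
    const int max_width  = dm_w - margin * dm_w;  // 1824
    const int max_height = dm_h - margin * dm_h;  // 1026

    const float scale = std::max((float)nx / max_width, (float)ny / max_height);

    // same rounding as downscale_img
    const int new_nx = nx / scale + 0.5f;
    const int new_ny = ny / scale + 0.5f;
    printf("scale = %.3f -> (%d x %d)\n", scale, new_nx, new_ny); // ~2.924 -> (1368 x 1026)

    // same source-pixel mapping as the inner loop of downscale_img:
    // shift to the destination pixel center, scale back, truncate
    for (int x = 0; x < 3; ++x) {
        const int src_x = (x + 0.5f) * scale - 0.5f;
        printf("dst x = %d -> src x = %d\n", x, src_x);
    }
    return 0;
}
```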
@@ -117,6 +207,8 @@ GLuint createGLTexture(const sam_image_u8 & img, GLint format) {
 #if defined(GL_UNPACK_ROW_LENGTH) && !defined(__EMSCRIPTEN__)
     glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
 #endif
+    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+
     glTexImage2D(GL_TEXTURE_2D, 0, format, img.nx, img.ny, 0, format, GL_UNSIGNED_BYTE, img.data.data());

     return tex;
@@ -132,16 +224,13 @@ void disable_blending(const ImDrawList*, const ImDrawCmd*) {
 }

 int main_loop(sam_image_u8 img, const sam_params & params, sam_state & state) {
-    if (SDL_Init(SDL_INIT_VIDEO) != 0) {
-        fprintf(stderr, "Error: %s\n", SDL_GetError());
-        return -1;
-    }
-
     ImGui_PreInit();

     const char * title = "SAM.cpp";
     SDL_WindowFlags window_flags = (SDL_WindowFlags)(SDL_WINDOW_OPENGL | SDL_WINDOW_ALLOW_HIGHDPI);
+
     SDL_Window * window = SDL_CreateWindow(title, SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, img.nx, img.ny, window_flags);
+
     if (!window) {
         fprintf(stderr, "Error: %s\n", SDL_GetError());
         return -1;
@@ -163,14 +252,18 @@ int main_loop(sam_image_u8 img, const sam_params & params, sam_state & state) {
     ImGui_EndFrame(window);

     bool done = false;
-    float x = 0.f, y = 0.f;
+    float x = 0.f;
+    float y = 0.f;
+    float xLast = 0.f;
+    float yLast = 0.f;
     std::vector<sam_image_u8> masks;
     std::vector<GLuint> maskTextures;
-    bool segmentOnHover = false;
+    bool segmentOnMove = false;
     bool outputMultipleMasks = false;
     while (!done) {
-        bool computeMasks = segmentOnHover;
+        bool computeMasks = false;
+
         SDL_Event event;
         while (SDL_PollEvent(&event)) {
             ImGui_ProcessEvent(&event);
@@ -187,7 +280,7 @@ int main_loop(sam_image_u8 img, const sam_params & params, sam_state & state) {
                     y = event.button.y;
                 }
             }
-            if (segmentOnMove && event.type == SDL_MOUSEMOTION) {
+            if (segmentOnMove && event.type == SDL_MOUSEMOTION) {
                 x = event.motion.x;
                 y = event.motion.y;
             }
@@ -198,19 +291,29 @@ int main_loop(sam_image_u8 img, const sam_params & params, sam_state & state) {
                 }
                 else {
                     SDL_SetWindowTitle(window, "Encoding new img...");
+                    downscale_img_to_screen(new_img, window);
                     if (!sam_compute_embd_img(new_img, params.n_threads, state)) {
                         printf("failed to compute encoded image\n");
                     }
                     printf("t_compute_img_ms = %d ms\n", state.t_compute_img_ms);
-                    img = std::move(new_img);
-                    tex = createGLTexture(img, GL_RGB);
-                    SDL_SetWindowSize(window, img.nx, img.ny);
+
+                    tex = createGLTexture(new_img, GL_RGB);
+
+                    SDL_SetWindowSize(window, new_img.nx, new_img.ny);
                     SDL_SetWindowTitle(window, title);
+                    img = std::move(new_img);
                     computeMasks = true;
                 }
             }
         }
+
+        if (segmentOnMove && (x != xLast || y != yLast)) {
+            computeMasks = true;
+        }
+
+        xLast = x;
+        yLast = y;
+
         if (computeMasks) {
             sam_point pt { x, y};
             printf("pt = (%f, %f)\n", pt.x, pt.y);
@@ -221,6 +324,7 @@ int main_loop(sam_image_u8 img, const sam_params & params, sam_state & state) {
             glDeleteTextures(maskTextures.size(), maskTextures.data());
             maskTextures.clear();
         }
+
        for (auto& mask : masks) {
            sam_image_u8 mask_rgb = { mask.nx, mask.ny, };
            mask_rgb.data.resize(3*mask.nx*mask.ny);
@@ -244,7 +348,7 @@ int main_loop(sam_image_u8 img, const sam_params & params, sam_state & state) {
         draw_list->AddImage((void*)(intptr_t)tex, ImVec2(0,0), ImVec2(img.nx, img.ny));

         ImGui::PushStyleColor(ImGuiCol_Text, IM_COL32(0, 0, 0, 255));
-        ImGui::Checkbox("Segment on hover", &segmentOnHover);
+        ImGui::Checkbox("Segment on hover", &segmentOnMove);
         ImGui::Checkbox("Output multiple masks", &outputMultipleMasks);
         ImGui::PopStyleColor();
@@ -261,7 +365,7 @@ int main_loop(sam_image_u8 img, const sam_params & params, sam_state & state) {
             }
         }
         else if (!maskTextures.empty()) {
-            draw_list->AddImage((void*)(intptr_t)maskTextures[0], ImVec2(0,0), ImVec2(img.nx, img.ny), ImVec2(0,0), ImVec2(1,1), IM_COL32(0, 0, 255, 128));
+            draw_list->AddImage((void*)(intptr_t)maskTextures[0], ImVec2(0,0), ImVec2(img.nx,img.ny), ImVec2(0,0), ImVec2(1,1), IM_COL32(0, 0, 255, 128));
         }

         draw_list->AddCallback(disable_blending, {});
@@ -296,6 +400,15 @@ int main(int argc, char ** argv) {
     }
     fprintf(stderr, "%s: loaded image '%s' (%d x %d)\n", __func__, params.fname_inp.c_str(), img0.nx, img0.ny);

+    // init SDL video subsystem to get the screen size
+    if (SDL_Init(SDL_INIT_VIDEO) != 0) {
+        fprintf(stderr, "Error: %s\n", SDL_GetError());
+        return -1;
+    }
+
+    // resize img when it exceeds the screen
+    downscale_img_to_screen(img0, NULL);
+
     std::shared_ptr<sam_state> state = sam_load_model(params);
     if (!state) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
diff --git a/ggml b/ggml
index 69bf842..dd92cfd 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit 69bf842b39e6a53218eda35852a91d395d357e18
+Subproject commit dd92cfd4b188e9202dddb7b85eb8bc1e51cf8288
diff --git a/sam.cpp b/sam.cpp
index 555e51e..d7d6789 100644
--- a/sam.cpp
+++ b/sam.cpp
@@ -3,6 +3,7 @@

 #include "ggml.h"
 #include "ggml-alloc.h"
+#include "ggml-backend.h"

 #include
 #include
@@ -15,17 +16,9 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static const size_t tensor_alignment = 32;
-
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
+static void ggml_graph_compute_helper(ggml_backend_t backend, ggml_cgraph * graph, int n_threads) {
+    ggml_backend_cpu_set_n_threads(backend, n_threads);
+    ggml_backend_graph_compute(backend, graph);
 }

 // RGB float32 image
@@ -250,6 +243,10 @@ struct sam_ggml_model {
     sam_encoder_prompt enc_prompt;
     sam_decoder_mask   dec;

+    ggml_backend_t backend = {};
+    ggml_backend_buffer_t buffer = {};
+
     struct ggml_context * ctx;
     std::map<std::string, struct ggml_tensor *> tensors;
@@ -265,16 +262,6 @@ struct sam_ggml_state {

     //struct ggml_tensor * tmp_save = {};
-
-    // buffer for `ggml_graph_plan.work_data`
-    std::vector<uint8_t> work_buffer;
-    // buffers to evaluate the model
-    std::vector<uint8_t> buf_alloc_img_enc;
-    std::vector<uint8_t> buf_compute_img_enc;
-
-    std::vector<uint8_t> buf_alloc_fast;
-    std::vector<uint8_t> buf_compute_fast;

     struct ggml_allocr * allocr = {};
 };
@@ -483,8 +470,8 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {

     auto & ctx = model.ctx;

-    const size_t ctx_size = [&]() {
-        size_t ctx_size = 0;
+    const size_t buf_size = [&]() {
+        size_t buf_size = 0;

         const auto & hparams = model.hparams;
@@ -503,59 +490,59 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {

         // image encoder
         {
-            ctx_size += n_enc_state*n_img_embd*n_img_embd*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_state*n_img_embd*n_img_embd*ggml_type_sizef(GGML_TYPE_F32);

-            ctx_size += n_enc_state*3*n_patch_size*n_patch_size*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_state*3*n_patch_size*n_patch_size*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_state*ggml_type_sizef(GGML_TYPE_F32);

-            ctx_size += n_enc_state*n_enc_out_chans*1*1*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_out_chans*n_enc_out_chans*3*3*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_state*n_enc_out_chans*1*1*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_out_chans*n_enc_out_chans*3*3*ggml_type_sizef(GGML_TYPE_F16);

-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);

-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
         }

         // image encoder layers
         {
-            ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);

-            ctx_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_sizef(GGML_TYPE_F16);

-            ctx_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_sizef(GGML_TYPE_F16);

-            ctx_size += n_enc_layer*3*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer*3*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*3*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer*3*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);

-            ctx_size += n_enc_layer*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);

-            ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);

-            ctx_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer*4*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer*4*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);

-            ctx_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer*4*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer*4*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
         }

-        ctx_size += (8 + 14*n_enc_layer)*ggml_tensor_overhead();
+        buf_size += (8 + 14*n_enc_layer)*ggml_tensor_overhead();

         // prompt encoder
         {
-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16); // 2*(n_enc_out_chans/2)
+            buf_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16); // 2*(n_enc_out_chans/2)

-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
         }

-        ctx_size += (2 + n_pt_embd)*ggml_tensor_overhead();
+        buf_size += (2 + n_pt_embd)*ggml_tensor_overhead();

         // mask decoder
         {
@@ -567,75 +554,75 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {
             const int n_hypernet_mpls_count = 4;

             // self_attn
-            ctx_size += tfm_layers_count*qkv_count*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += tfm_layers_count*qkv_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += tfm_layers_count*qkv_count*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += tfm_layers_count*qkv_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);

             // all norms
-            ctx_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);

             // cross_attn_token_to_img
-            ctx_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += tfm_layers_count*qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += tfm_layers_count*qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);

             // mlp
-            ctx_size += tfm_layers_count*8*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += tfm_layers_count*8*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += tfm_layers_count*n_enc_out_chans*8*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += tfm_layers_count*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += tfm_layers_count*8*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += tfm_layers_count*8*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += tfm_layers_count*n_enc_out_chans*8*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += tfm_layers_count*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);

             // cross_attn_img_to_token
-            ctx_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += tfm_layers_count*qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += tfm_layers_count*qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);

             // transformer_final_attn_token_to_img
-            ctx_size += qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_state* ggml_type_sizef(GGML_TYPE_F32);

             // transformer_norm_final
-            ctx_size += norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);

             // output_upscaling
-            ctx_size += n_enc_out_chans*n_img_embd*2*2*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += 3*n_img_embd* ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_enc_out_chans*n_img_embd*(n_img_embd/2)*2*2*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += (n_img_embd/2)* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_out_chans*n_img_embd*2*2*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += 3*n_img_embd* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_out_chans*n_img_embd*(n_img_embd/2)*2*2*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += (n_img_embd/2)* ggml_type_sizef(GGML_TYPE_F32);

             // output_hypernetworks_mlps
-            ctx_size += n_hypernet_mpls_count*2*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_hypernet_mpls_count*2*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_hypernet_mpls_count*n_enc_out_chans*(n_img_embd/2)*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_hypernet_mpls_count*(n_img_embd/2)* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_hypernet_mpls_count*2*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_hypernet_mpls_count*2*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_hypernet_mpls_count*n_enc_out_chans*(n_img_embd/2)*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_hypernet_mpls_count*(n_img_embd/2)* ggml_type_sizef(GGML_TYPE_F32);

             // iou_prediction_head
-            ctx_size += 2*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += 2*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_pt_embd* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += 2*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += 2*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_pt_embd* ggml_type_sizef(GGML_TYPE_F32);

             // iou_token_w
-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);

             // mask_tokens_w
-            ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
         }
     }

-        fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+        fprintf(stderr, "%s: ggml buffer size = %6.2f MB\n", __func__, buf_size/(1024.0*1024.0));

-        return ctx_size;
+        return buf_size;
     }();

     // create the ggml context
     {
         struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
+            /*.mem_size   =*/ ggml_tensor_overhead() * GGML_MAX_NODES,
             /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
+            /*.no_alloc   =*/ true,
         };

         ctx = ggml_init(params);
@@ -645,6 +632,20 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {
         }
     }

+    // initialize backend & allocate buffers
+    {
+        if (!model.backend) {
+            printf("Using CPU backend\n");
+            model.backend = ggml_backend_cpu_init();
+            if (!model.backend) {
+                fprintf(stderr, "%s: ggml_backend_cpu_init() failed\n", __func__);
+                return false;
+            }
+        }
+
+        model.buffer = ggml_backend_alloc_buffer(model.backend, buf_size);
+    }
+
     // prepare memory for the weights
     {
         const auto & hparams = model.hparams;
@@ -947,6 +948,8 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {

     // load weights
     {
+        ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer);
+
         int n_tensors = 0;
         size_t total_size = 0;
@@ -977,28 +980,29 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {
             std::string name(length, 0);
             fin.read(&name[0], length);

-            if (model.tensors.find(name.data()) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
+            if (model.tensors.find(name) == model.tensors.end()) {
+                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
                 return false;
             }

-            auto tensor = model.tensors[name.data()];
-            //printf("ne0 = %jd, ne1 = %jd, ne2 = %jd, ne3 = %jd\n", ne[0], ne[1], ne[2], ne[3]);
+            auto tensor = model.tensors[name];
+            ggml_set_name(tensor, name.c_str());
+
             if (ggml_nelements(tensor) != nelements) {
                 fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %d, expected %d\n",
-                        __func__, name.data(), (int) nelements, (int) ggml_nelements(tensor));
+                        __func__, name.c_str(), (int) nelements, (int) ggml_nelements(tensor));
                 return false;
             }

             if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2] || tensor->ne[3] != ne[3]) {
                 fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d, %d], expected [%d, %d, %d, %d]\n",
-                        __func__, name.data(),
+                        __func__, name.c_str(),
                         (int) ne[0], (int) ne[1], (int) ne[2], (int) ne[3],
                         (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], (int) tensor->ne[3]);
                 return false;
             }
+
             size_t bpe = 0;

             switch (ftype) {
@@ -1019,6 +1023,7 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {
                 return false;
             }

+            ggml_allocr_alloc(alloc, tensor);
             fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));

             total_size += ggml_nbytes(tensor);
@@ -1036,6 +1041,8 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {

         fprintf(stderr, " done\n");
         fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
+
+        ggml_allocr_free(alloc);
     }

     fin.close();
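The new loading scheme above separates tensor metadata from tensor storage: the ggml context is created with `no_alloc = true` and sized only for tensor/graph structs, while the actual weight data lives in a single backend buffer that a ggml-alloc allocator fills tensor by tensor. A minimal sketch of the same pattern, using only calls that appear in this patch (`load_weights_sketch`, `n_tensors_max`, and the single 256-element tensor are hypothetical placeholders):

```cpp
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Condensed version of the pattern used in sam_ggml_model_load: a no_alloc
// context owns only tensor metadata; the tensor data lives in one backend buffer.
static bool load_weights_sketch(size_t buf_size, int n_tensors_max) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * n_tensors_max,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // metadata only, no tensor data
    };
    struct ggml_context * ctx = ggml_init(params);

    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, buf_size);

    // place each tensor inside the backend buffer, then read its data into it
    ggml_allocr * alloc = ggml_allocr_new_from_buffer(buffer);
    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 256);
    ggml_allocr_alloc(alloc, t);
    // ... fin.read(reinterpret_cast<char *>(t->data), ggml_nbytes(t)); ...
    ggml_allocr_free(alloc);

    // ctx, buffer, and backend stay alive for the model's lifetime
    // (in the patch they are freed in sam_deinit)
    return true;
}
```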
@@ -1131,9 +1138,13 @@ struct ggml_cgraph  * sam_encode_image(
     const int32_t n_img_size    = hparams.n_img_size();
     const int32_t n_window_size = hparams.n_window_size();

+    // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
+    static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead();
+    static std::vector<uint8_t> buf(buf_size);
+
     struct ggml_init_params ggml_params = {
-        /*.mem_size   =*/ state.buf_compute_img_enc.size(),
-        /*.mem_buffer =*/ state.buf_compute_img_enc.data(),
+        /*.mem_size   =*/ buf.size(),
+        /*.mem_buffer =*/ buf.data(),
         /*.no_alloc   =*/ true, // skip allocating as we use ggml_alloc to allocate exact memory requirements
     };
@@ -1613,7 +1624,7 @@ bool sam_decode_mask(
             struct ggml_tensor * q_0 = ggml_add(ctx0, queries, tokens);

             struct ggml_tensor * self_attn = sam_decode_mask_transformer_attn(tfm_layer.self_attn, q_0, q_0, queries, ctx0, model);
-            queries = ggml_add_inplace(ctx0, queries, self_attn);
+            queries = ggml_add(ctx0, queries, self_attn);
         }

         queries = ggml_norm(ctx0, queries, hparams.eps_decoder_transformer);
@@ -1690,11 +1701,11 @@ bool sam_decode_mask(
     // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/mask_decoder.py#L136
     keys = ggml_cont(ctx0, ggml_transpose(ctx0, keys));
     keys = ggml_view_4d(ctx0, keys, srcNE[0], srcNE[1], srcNE[2], srcNE[3], srcNE[0]*keys->nb[0], keys->nb[1], keys->nb[2], 0);
+    // ggml_build_forward_expand(gf, keys);

     struct ggml_tensor * upscaled_embedding = {};
     {
         // ConvTranspose2d
         keys = ggml_conv_transpose_2d_p0(ctx0, dec.output_upscaling_0_w, keys, 2);
-        ggml_allocr_alloc(state.allocr, keys); // TODO: This alloc shouldn't be needed
         keys = ggml_add_inplace(ctx0, keys, ggml_repeat(ctx0, ggml_reshape_3d(ctx0, dec.output_upscaling_0_b, 1, 1, dec.output_upscaling_0_b->ne[0]), keys));
@@ -1706,7 +1717,6 @@ bool sam_decode_mask(

         // ConvTranspose2d
         keys = ggml_conv_transpose_2d_p0(ctx0, dec.output_upscaling_3_w, keys, 2);
-        ggml_allocr_alloc(state.allocr, keys); // TODO: This alloc shouldn't be needed
         keys = ggml_add_inplace(ctx0, ggml_repeat(ctx0, ggml_reshape_3d(ctx0, dec.output_upscaling_3_b, 1, 1, dec.output_upscaling_3_b->ne[0]), keys), keys);
@@ -1921,9 +1931,13 @@ struct ggml_cgraph * sam_build_fast_graph(
     int   ny,
     sam_point point) {

+    // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
+    static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead();
+    static std::vector<uint8_t> buf(buf_size);
+
     struct ggml_init_params ggml_params = {
-        /*.mem_size   =*/ state.buf_compute_fast.size(),
-        /*.mem_buffer =*/ state.buf_compute_fast.data(),
+        /*.mem_size   =*/ buf.size(),
+        /*.mem_buffer =*/ buf.data(),
         /*.no_alloc   =*/ true, // skip allocating as we use ggml_alloc to allocate exact memory requirements
     };
@@ -1953,6 +1967,7 @@ struct ggml_cgraph * sam_build_fast_graph(
 }

 std::shared_ptr<sam_state> sam_load_model(const sam_params & params) {
+    ggml_time_init();
     const int64_t t_start_ms = ggml_time_ms();

     sam_state state;
@@ -2006,20 +2021,21 @@ bool sam_compute_embd_img(const sam_image_u8 & img, int n_threads, sam_state & s
             model.hparams.n_img_embd(), model.hparams.n_img_embd(), model.hparams.n_enc_out_chans);

     // Encode the image
-    st.buf_compute_img_enc.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
-    st.allocr = ggml_allocr_new_measure(tensor_alignment);
+    const size_t alignment = ggml_backend_get_alignment(model.backend);
+    st.allocr = ggml_allocr_new_measure(alignment);
+
     struct ggml_cgraph * gf_measure = sam_encode_image(model, st, img1);
     if (!gf_measure) {
         fprintf(stderr, "%s: failed to encode image\n", __func__);
         return false;
     }

-    size_t alloc_size = ggml_allocr_alloc_graph(st.allocr, gf_measure) + tensor_alignment;
+    size_t alloc_size = ggml_allocr_alloc_graph(st.allocr, gf_measure);
     ggml_allocr_free(st.allocr);

     // recreate allocator with exact memory requirements
-    st.buf_alloc_img_enc.resize(alloc_size);
-    st.allocr = ggml_allocr_new(st.buf_alloc_img_enc.data(), st.buf_alloc_img_enc.size(), tensor_alignment);
+    ggml_backend_buffer_t buf_compute = ggml_backend_alloc_buffer(model.backend, alloc_size);
+    st.allocr = ggml_allocr_new_from_buffer(buf_compute);

     // compute the graph with the measured exact memory requirements from above
     ggml_allocr_reset(st.allocr);
@@ -2032,11 +2048,12 @@ bool sam_compute_embd_img(const sam_image_u8 & img, int n_threads, sam_state & s

     ggml_allocr_alloc_graph(st.allocr, gf);

-    ggml_graph_compute_helper(st.work_buffer, gf, n_threads);
+    ggml_graph_compute_helper(model.backend, gf, n_threads);

     ggml_allocr_free(st.allocr);
-    st.allocr = NULL;
-    st.work_buffer.clear();
+    ggml_backend_buffer_free(buf_compute);
+
+    st.allocr = {};

     state.t_compute_img_ms = ggml_time_ms() - t_start_ms;
@@ -2075,8 +2092,8 @@ std::vector<sam_image_u8> sam_compute_masks(

     st.iou_predictions = ggml_new_tensor_1d(st.ctx_masks, GGML_TYPE_F32, 3);

-    st.buf_compute_fast.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
-    st.allocr = ggml_allocr_new_measure(tensor_alignment);
+    const size_t alignment = ggml_backend_get_alignment(model.backend);
+    st.allocr = ggml_allocr_new_measure(alignment);

     // measure memory requirements for the graph
     struct ggml_cgraph * gf_measure = sam_build_fast_graph(model, st, img.nx, img.ny, pt);
@@ -2085,12 +2102,12 @@ std::vector<sam_image_u8> sam_compute_masks(
         return {};
     }

-    size_t alloc_size = ggml_allocr_alloc_graph(st.allocr, gf_measure) + tensor_alignment;
+    size_t alloc_size = ggml_allocr_alloc_graph(st.allocr, gf_measure);
     ggml_allocr_free(st.allocr);

     // recreate allocator with exact memory requirements
-    st.buf_alloc_fast.resize(alloc_size);
-    st.allocr = ggml_allocr_new(st.buf_alloc_fast.data(), st.buf_alloc_fast.size(), tensor_alignment);
+    ggml_backend_buffer_t buf_compute = ggml_backend_alloc_buffer(model.backend, alloc_size);
+    st.allocr = ggml_allocr_new_from_buffer(buf_compute);

     // compute the graph with the measured exact memory requirements from above
     ggml_allocr_reset(st.allocr);
@@ -2103,18 +2120,18 @@ std::vector<sam_image_u8> sam_compute_masks(

     ggml_allocr_alloc_graph(st.allocr, gf);

-    ggml_graph_compute_helper(st.work_buffer, gf, n_threads);
+    ggml_graph_compute_helper(model.backend, gf, n_threads);

     //print_t_f32("iou_predictions", st.iou_predictions);
     //print_t_f32("low_res_masks", st.low_res_masks);

-    ggml_allocr_free(st.allocr);
-    st.allocr = {};
-    st.buf_compute_fast.clear();
-    st.buf_alloc_fast.clear();

     std::vector<sam_image_u8> masks = sam_postprocess_masks(model.hparams, img.nx, img.ny, st, mask_on_val, mask_off_val);

+    ggml_allocr_free(st.allocr);
     ggml_free(st.ctx_masks);
+    ggml_backend_buffer_free(buf_compute);
+
+    st.allocr = {};
     st.ctx_masks = {};
     st.low_res_masks = {};
     st.iou_predictions = {};
@@ -2132,4 +2149,10 @@ void sam_deinit(sam_state & state) {
+        // free the backend resources before dropping the model
+        if (state.model->backend) {
+            ggml_backend_buffer_free(state.model->buffer);
+            ggml_backend_free(state.model->backend);
+        }
+
         state.model.reset();
         state.state.reset();
     }
 }
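With these changes, `sam_compute_embd_img` and `sam_compute_masks` share one measure/allocate/compute lifecycle. A condensed sketch of that flow, again restricted to calls that appear in this patch (the `build` callback is a stand-in for `sam_encode_image` / `sam_build_fast_graph`):

```cpp
#include <functional>

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// The measure -> allocate -> compute pattern used by sam_compute_embd_img and
// sam_compute_masks. `build` is expected to create the graph in a no_alloc
// context, as sam_encode_image and sam_build_fast_graph do.
static void compute_with_exact_memory(
        ggml_backend_t backend,
        ggml_allocr *& allocr,
        const std::function<ggml_cgraph *()> & build,
        int n_threads) {
    // 1) measure: build the graph once under a measuring allocator
    allocr = ggml_allocr_new_measure(ggml_backend_get_alignment(backend));
    ggml_cgraph * gf_measure = build();
    size_t alloc_size = ggml_allocr_alloc_graph(allocr, gf_measure);
    ggml_allocr_free(allocr);

    // 2) allocate: a backend buffer with the exact measured size
    ggml_backend_buffer_t buf_compute = ggml_backend_alloc_buffer(backend, alloc_size);
    allocr = ggml_allocr_new_from_buffer(buf_compute);

    // 3) compute: rebuild the graph, place its tensors in the buffer, run it
    ggml_allocr_reset(allocr);
    ggml_cgraph * gf = build();
    ggml_allocr_alloc_graph(allocr, gf);
    ggml_backend_cpu_set_n_threads(backend, n_threads); // CPU backend only
    ggml_backend_graph_compute(backend, gf);

    ggml_allocr_free(allocr);
    ggml_backend_buffer_free(buf_compute);
    allocr = {};
}
```

The graph is built twice on purpose: once under the measuring allocator to learn the exact allocation size, and once more under the real allocator backed by an exact-size backend buffer.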