diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..817892c
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Yavor Ivanov
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
index 8fa2788..8687d71 100644
--- a/README.md
+++ b/README.md
@@ -16,13 +16,28 @@ Note: you need to download the model checkpoint below (`sam_vit_b_01ec64.pth`) first
 # Convert PTH model to ggml. Requires python3, torch and numpy
 python convert-pth-to-ggml.py checkpoints/sam_vit_b_01ec64.pth . 1

-# Build sam.cpp. Might require cmake and SDL2 to be installed
+# You need CMake and SDL2
+# SDL2 is used for the GUI window & input: https://www.libsdl.org
+
+# [Ubuntu]
+$ sudo apt install libsdl2-dev
+
+# [Mac OS with brew]
+$ brew install sdl2
+
+# [MSYS2]
+$ pacman -S git cmake make mingw-w64-x86_64-dlfcn mingw-w64-x86_64-gcc mingw-w64-x86_64-SDL2
+
+# Build sam.cpp.
 mkdir build && cd build
 cmake .. && make -j4

 # run inference
 ./bin/sam -t 16 -i ../img.jpg -m ../checkpoints/ggml-model-f16.bin
 ```
+Note: The optimal value of the threads parameter (`-t`) depends on the machine running inference and should be selected manually.
+
+Note: If you have problems with the Windows build, see [this issue](https://github.com/YavorGIvanov/sam.cpp/issues/8) for more details.

 ## Downloading and converting the model checkpoints

@@ -95,12 +110,12 @@ Output mask (mask_out_2.png in build folder):

 - [X] Reduce memory usage by utilizing the new ggml-alloc
 - [X] Remove redundant graph nodes
-- [ ] Make inference faster
 - [X] Fix the difference in output masks compared to the PyTorch implementation
 - [X] Filter masks based on stability score
 - [X] Add support for point user input
+- [X] Support bigger model checkpoints
+- [ ] Make inference faster
 - [ ] Support F16 for heavy F32 ops
 - [ ] Test quantization
-- [X] Support bigger model checkpoints
+- [ ] Add support for mask and box input (#14)
 - [ ] GPU support
-- [ ] Add support for mask and box input
diff --git a/examples/main.cpp b/examples/main.cpp
index b72dfb0..6871472 100644
--- a/examples/main.cpp
+++ b/examples/main.cpp
@@ -10,11 +10,101 @@
 #define SDL_DISABLE_ARM_NEON_H 1
 #include <SDL.h>
 #include <SDL_opengl.h>
+#include <algorithm>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+/**
+ * Get the size of the screen the SDL window runs on.
+ *
+ * SDL_Window* window may be NULL, in which case we get the screen size of the default display (index 0).
+ * If window is not NULL, then we get the screen size of the display the window runs on.
+ */
+static bool get_screen_size(SDL_DisplayMode & dm, SDL_Window * window) {
+    int displayIndex = 0;
+    if (window != NULL) {
+        displayIndex = SDL_GetWindowDisplayIndex(window);
+    }
+    if (displayIndex < 0) {
+        return false;
+    }
+    if (SDL_GetCurrentDisplayMode(displayIndex, &dm) != 0) {
+        return false;
+    }
+
+    fprintf(stderr, "%s: screen size (%d x %d) \n", __func__, dm.w, dm.h);
+    return true;
+}
+
+// downscale image with nearest-neighbor interpolation
+static sam_image_u8 downscale_img(sam_image_u8 & img, float scale) {
+    sam_image_u8 new_img;
+
+    int width  = img.nx;
+    int height = img.ny;
+
+    int new_width  = img.nx / scale + 0.5f;
+    int new_height = img.ny / scale + 0.5f;
+
+    new_img.nx = new_width;
+    new_img.ny = new_height;
+    new_img.data.resize(new_img.nx*new_img.ny*3);
+
+    fprintf(stderr, "%s: scale: %f\n", __func__, scale);
+    fprintf(stderr, "%s: resize image from (%d x %d) to (%d x %d)\n", __func__, img.nx, img.ny, new_img.nx, new_img.ny);
+
+    for (int y = 0; y < new_height; ++y) {
+        for (int x = 0; x < new_width; ++x) {
+            // map the center of destination pixel (x, y) back to the nearest source pixel
+            int src_x = (x + 0.5f) * scale - 0.5f;
+            int src_y = (y + 0.5f) * scale - 0.5f;

+            int src_index  = (src_y * width + src_x) * 3;
+            int dest_index = (y * new_width + x) * 3;
+
+            for (int c = 0; c < 3; ++c) {
+                new_img.data[dest_index + c] = img.data[src_index + c];
+            }
+        }
+    }
+
+    return new_img;
+}
+
+static bool downscale_img_to_screen(sam_image_u8 & img, SDL_Window * window) {
+    SDL_DisplayMode dm = {};
+    if (!get_screen_size(dm, window)) {
+        fprintf(stderr, "%s: failed to get screen size of the display.\n", __func__);
+        return false;
+    }
+    fprintf(stderr, "%s: screen size (%d x %d) \n", __func__, dm.w, dm.h);
+    if (dm.h == 0 || dm.w == 0) {
+        // No usable display mode, e.g. the window is running on another display.
+        return false;
+    }
+
+    // Add 5% margin between screen and window
+    const float margin = 0.05f;
+    const int max_width  = dm.w - margin * dm.w;
+    const int max_height = dm.h - margin * dm.h;
+
+    fprintf(stderr, "%s: img size (%d x %d) \n", __func__, img.nx, img.ny);
+
+    if (img.ny > max_height || img.nx > max_width) {
+        fprintf(stderr, "%s: img size (%d x %d) exceeds maximum allowed size (%d x %d) \n", __func__, img.nx, img.ny, max_width, max_height);
+        const float scale_y = (float)img.ny / max_height;
+        const float scale_x = (float)img.nx / max_width;
+        const float scale   = std::max(scale_x, scale_y);
+
+        img = downscale_img(img, scale);
+    }
+
+    return true;
+}
+
 static bool load_image_from_file(const std::string & fname, sam_image_u8 & img) {
     int nx, ny, nc;
     auto data = stbi_load(fname.c_str(), &nx, &ny, &nc, 3);
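For intuition about the two pieces of arithmetic in `downscale_img` and `downscale_img_to_screen` above, here is a minimal standalone sketch with hypothetical sizes (not part of the patch): a 4000x3000 image on a 1920x1080 screen gets a 5% margin budget of 1824x1026, so scale = max(4000/1824, 3000/1026) ≈ 2.924 and the image is resized to 1368x1026; each destination pixel center then maps back to the nearest source pixel.

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    // hypothetical sizes: 1920x1080 screen, 4000x3000 input image
    const int dm_w = 1920, dm_h = 1080;
    const int nx = 4000, ny = 3000;

    // same margin/fit math as downscale_img_to_screen
    const float margin = 0.05f;
    const int max_width  = dm_w - margin * dm_w;  // 1824
    const int max_height = dm_h - margin * dm_h;  // 1026

    const float scale = std::max((float)nx / max_width, (float)ny / max_height);

    // same rounding as downscale_img
    const int new_nx = nx / scale + 0.5f;
    const int new_ny = ny / scale + 0.5f;
    printf("scale = %.3f -> (%d x %d)\n", scale, new_nx, new_ny); // ~2.924 -> (1368 x 1026)

    // same source-pixel mapping as the inner loop of downscale_img:
    // shift to the destination pixel center, scale back, truncate
    for (int x = 0; x < 3; ++x) {
        const int src_x = (x + 0.5f) * scale - 0.5f;
        printf("dst x = %d -> src x = %d\n", x, src_x);
    }
    return 0;
}
```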
@@ -117,6 +207,8 @@ GLuint createGLTexture(const sam_image_u8 & img, GLint format) {
 #if defined(GL_UNPACK_ROW_LENGTH) && !defined(__EMSCRIPTEN__)
     glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
 #endif
+    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+
     glTexImage2D(GL_TEXTURE_2D, 0, format, img.nx, img.ny, 0, format, GL_UNSIGNED_BYTE, img.data.data());

     return tex;
@@ -132,16 +224,13 @@ void disable_blending(const ImDrawList*, const ImDrawCmd*) {
 }

 int main_loop(sam_image_u8 img, const sam_params & params, sam_state & state) {
-    if (SDL_Init(SDL_INIT_VIDEO) != 0) {
-        fprintf(stderr, "Error: %s\n", SDL_GetError());
-        return -1;
-    }
-
     ImGui_PreInit();

     const char * title = "SAM.cpp";
     SDL_WindowFlags window_flags = (SDL_WindowFlags)(SDL_WINDOW_OPENGL | SDL_WINDOW_ALLOW_HIGHDPI);
+
     SDL_Window * window = SDL_CreateWindow(title, SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, img.nx, img.ny, window_flags);
+
     if (!window) {
         fprintf(stderr, "Error: %s\n", SDL_GetError());
         return -1;
@@ -163,14 +252,18 @@ int main_loop(sam_image_u8 img, const sam_params & params, sam_state & state) {
     ImGui_EndFrame(window);

     bool done = false;
-    float x = 0.f, y = 0.f;
+    float x = 0.f;
+    float y = 0.f;
+    float xLast = 0.f;
+    float yLast = 0.f;
     std::vector<sam_image_u8> masks;
     std::vector<GLuint> maskTextures;
-    bool segmentOnHover = false;
+    bool segmentOnMove = false;
     bool outputMultipleMasks = false;
     while (!done) {
-        bool computeMasks = segmentOnHover;
+        bool computeMasks = false;
+
         SDL_Event event;
         while (SDL_PollEvent(&event)) {
             ImGui_ProcessEvent(&event);
@@ -187,7 +280,7 @@ int main_loop(sam_image_u8 img, const sam_params & params, sam_state & state) {
                     y = event.button.y;
                 }
             }
-            if (segmentOnMove && event.type == SDL_MOUSEMOTION) {
+            if (segmentOnMove && event.type == SDL_MOUSEMOTION) {
                 x = event.motion.x;
                 y = event.motion.y;
             }
@@ -198,19 +291,29 @@ int main_loop(sam_image_u8 img, const sam_params & params, sam_state & state) {
                 }
                 else {
                     SDL_SetWindowTitle(window, "Encoding new img...");
+                    downscale_img_to_screen(new_img, window);
                     if (!sam_compute_embd_img(new_img, params.n_threads, state)) {
                         printf("failed to compute encoded image\n");
                     }
                     printf("t_compute_img_ms = %d ms\n", state.t_compute_img_ms);
-                    img = std::move(new_img);
-                    tex = createGLTexture(img, GL_RGB);
-                    SDL_SetWindowSize(window, img.nx, img.ny);
+
+                    tex = createGLTexture(new_img, GL_RGB);
+
+                    SDL_SetWindowSize(window, new_img.nx, new_img.ny);
                     SDL_SetWindowTitle(window, title);
+                    img = std::move(new_img);
                     computeMasks = true;
                 }
             }
         }
+
+        if (segmentOnMove && (x != xLast || y != yLast)) {
+            computeMasks = true;
+        }
+
+        xLast = x;
+        yLast = y;
+
         if (computeMasks) {
             sam_point pt { x, y};
             printf("pt = (%f, %f)\n", pt.x, pt.y);
@@ -221,6 +324,7 @@ int main_loop(sam_image_u8 img, const sam_params & params, sam_state & state) {
             glDeleteTextures(maskTextures.size(), maskTextures.data());
             maskTextures.clear();
         }
+
        for (auto& mask : masks) {
            sam_image_u8 mask_rgb = { mask.nx, mask.ny, };
            mask_rgb.data.resize(3*mask.nx*mask.ny);
@@ -244,7 +348,7 @@ int main_loop(sam_image_u8 img, const sam_params & params, sam_state & state) {
         draw_list->AddImage((void*)(intptr_t)tex, ImVec2(0,0), ImVec2(img.nx, img.ny));

         ImGui::PushStyleColor(ImGuiCol_Text, IM_COL32(0, 0, 0, 255));
-        ImGui::Checkbox("Segment on hover", &segmentOnHover);
+        ImGui::Checkbox("Segment on hover", &segmentOnMove);
         ImGui::Checkbox("Output multiple masks", &outputMultipleMasks);
         ImGui::PopStyleColor();
@@ -261,7 +365,7 @@ int main_loop(sam_image_u8 img, const sam_params & params, sam_state & state) {
             }
         }
         else if (!maskTextures.empty()) {
-            draw_list->AddImage((void*)(intptr_t)maskTextures[0], ImVec2(0,0), ImVec2(img.nx, img.ny), ImVec2(0,0), ImVec2(1,1), IM_COL32(0, 0, 255, 128));
+            draw_list->AddImage((void*)(intptr_t)maskTextures[0], ImVec2(0,0), ImVec2(img.nx,img.ny), ImVec2(0,0), ImVec2(1,1), IM_COL32(0, 0, 255, 128));
         }

         draw_list->AddCallback(disable_blending, {});
@@ -296,6 +400,15 @@ int main(int argc, char ** argv) {
     }
     fprintf(stderr, "%s: loaded image '%s' (%d x %d)\n", __func__, params.fname_inp.c_str(), img0.nx, img0.ny);

+    // init SDL video subsystem to get the screen size
+    if (SDL_Init(SDL_INIT_VIDEO) != 0) {
+        fprintf(stderr, "Error: %s\n", SDL_GetError());
+        return -1;
+    }
+
+    // resize img when it exceeds the screen
+    downscale_img_to_screen(img0, NULL);
+
     std::shared_ptr<sam_state> state = sam_load_model(params);
     if (!state) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
diff --git a/ggml b/ggml
index 69bf842..dd92cfd 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit 69bf842b39e6a53218eda35852a91d395d357e18
+Subproject commit dd92cfd4b188e9202dddb7b85eb8bc1e51cf8288
diff --git a/sam.cpp b/sam.cpp
index 555e51e..d7d6789 100644
--- a/sam.cpp
+++ b/sam.cpp
@@ -3,6 +3,7 @@

 #include "ggml.h"
 #include "ggml-alloc.h"
+#include "ggml-backend.h"

 #include
 #include
@@ -15,17 +16,9 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static const size_t tensor_alignment = 32;
-
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
+static void ggml_graph_compute_helper(ggml_backend_t backend, ggml_cgraph * graph, int n_threads) {
+    ggml_backend_cpu_set_n_threads(backend, n_threads);
+    ggml_backend_graph_compute(backend, graph);
 }

 // RGB float32 image
@@ -250,6 +243,10 @@ struct sam_ggml_model {
     sam_encoder_prompt enc_prompt;
     sam_decoder_mask   dec;

+    ggml_backend_t backend = {};
+    ggml_backend_buffer_t buffer = {};
+
     struct ggml_context * ctx;
     std::map<std::string, struct ggml_tensor *> tensors;
@@ -265,16 +262,6 @@ struct sam_ggml_state {

     //struct ggml_tensor * tmp_save = {};
-
-    // buffer for `ggml_graph_plan.work_data`
-    std::vector<uint8_t> work_buffer;
-    // buffers to evaluate the model
-    std::vector<uint8_t> buf_alloc_img_enc;
-    std::vector<uint8_t> buf_compute_img_enc;
-
-    std::vector<uint8_t> buf_alloc_fast;
-    std::vector<uint8_t> buf_compute_fast;

     struct ggml_allocr * allocr = {};
 };
@@ -483,8 +470,8 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {

     auto & ctx = model.ctx;

-    const size_t ctx_size = [&]() {
-        size_t ctx_size = 0;
+    const size_t buf_size = [&]() {
+        size_t buf_size = 0;

         const auto & hparams = model.hparams;
@@ -503,59 +490,59 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {

         // image encoder
         {
-            ctx_size += n_enc_state*n_img_embd*n_img_embd*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_state*n_img_embd*n_img_embd*ggml_type_sizef(GGML_TYPE_F32);

-            ctx_size += n_enc_state*3*n_patch_size*n_patch_size*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_state*3*n_patch_size*n_patch_size*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_state*ggml_type_sizef(GGML_TYPE_F32);

-            ctx_size += n_enc_state*n_enc_out_chans*1*1*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_out_chans*n_enc_out_chans*3*3*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_state*n_enc_out_chans*1*1*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_out_chans*n_enc_out_chans*3*3*ggml_type_sizef(GGML_TYPE_F16);

-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);

-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
         }

         // image encoder layers
         {
-            ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);

-            ctx_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_sizef(GGML_TYPE_F16);

-            ctx_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_sizef(GGML_TYPE_F16);

-            ctx_size += n_enc_layer*3*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer*3*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*3*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer*3*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);

-            ctx_size += n_enc_layer*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);

-            ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);

-            ctx_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer*4*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer*4*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);

-            ctx_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer*4*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer*4*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
         }

-        ctx_size += (8 + 14*n_enc_layer)*ggml_tensor_overhead();
+        buf_size += (8 + 14*n_enc_layer)*ggml_tensor_overhead();

         // prompt encoder
         {
-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16); // 2*(n_enc_out_chans/2)
+            buf_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16); // 2*(n_enc_out_chans/2)

-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
         }

-        ctx_size += (2 + n_pt_embd)*ggml_tensor_overhead();
+        buf_size += (2 + n_pt_embd)*ggml_tensor_overhead();

         // mask decoder
         {
@@ -567,75 +554,75 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {
             const int n_hypernet_mpls_count = 4;

             // self_attn
-            ctx_size += tfm_layers_count*qkv_count*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += tfm_layers_count*qkv_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += tfm_layers_count*qkv_count*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += tfm_layers_count*qkv_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);

             // all norms
-            ctx_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);

             // cross_attn_token_to_img
-            ctx_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += tfm_layers_count*qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += tfm_layers_count*qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);

             // mlp
-            ctx_size += tfm_layers_count*8*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += tfm_layers_count*8*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += tfm_layers_count*n_enc_out_chans*8*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += tfm_layers_count*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += tfm_layers_count*8*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += tfm_layers_count*8*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += tfm_layers_count*n_enc_out_chans*8*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += tfm_layers_count*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);

             // cross_attn_img_to_token
-            ctx_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += tfm_layers_count*qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += tfm_layers_count*qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);

             // transformer_final_attn_token_to_img
-            ctx_size += qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_state* ggml_type_sizef(GGML_TYPE_F32);

             // transformer_norm_final
-            ctx_size += norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);

             // output_upscaling
-            ctx_size += n_enc_out_chans*n_img_embd*2*2*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += 3*n_img_embd* ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_enc_out_chans*n_img_embd*(n_img_embd/2)*2*2*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += (n_img_embd/2)* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_out_chans*n_img_embd*2*2*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += 3*n_img_embd* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_out_chans*n_img_embd*(n_img_embd/2)*2*2*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += (n_img_embd/2)* ggml_type_sizef(GGML_TYPE_F32);

             // output_hypernetworks_mlps
-            ctx_size += n_hypernet_mpls_count*2*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_hypernet_mpls_count*2*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_hypernet_mpls_count*n_enc_out_chans*(n_img_embd/2)*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_hypernet_mpls_count*(n_img_embd/2)* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_hypernet_mpls_count*2*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_hypernet_mpls_count*2*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_hypernet_mpls_count*n_enc_out_chans*(n_img_embd/2)*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_hypernet_mpls_count*(n_img_embd/2)* ggml_type_sizef(GGML_TYPE_F32);

             // iou_prediction_head
-            ctx_size += 2*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += 2*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_pt_embd* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += 2*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += 2*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_pt_embd* ggml_type_sizef(GGML_TYPE_F32);

             // iou_token_w
-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);

             // mask_tokens_w
-            ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
         }
     }

-        fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+        fprintf(stderr, "%s: ggml buffer size = %6.2f MB\n", __func__, buf_size/(1024.0*1024.0));

-        return ctx_size;
+        return buf_size;
     }();

     // create the ggml context
     {
         struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
+            /*.mem_size   =*/ ggml_tensor_overhead() * GGML_MAX_NODES,
             /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
+            /*.no_alloc   =*/ true,
         };

         ctx = ggml_init(params);
@@ -645,6 +632,20 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {
         }
     }

+    // initialize backend & allocate buffers
+    {
+        if (!model.backend) {
+            printf("Using CPU backend\n");
+            model.backend = ggml_backend_cpu_init();
+            if (!model.backend) {
+                fprintf(stderr, "%s: ggml_backend_cpu_init() failed\n", __func__);
+                return false;
+            }
+        }
+
+        model.buffer = ggml_backend_alloc_buffer(model.backend, buf_size);
+    }
+
     // prepare memory for the weights
     {
         const auto & hparams = model.hparams;
@@ -947,6 +948,8 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {

     // load weights
     {
+        ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer);
+
         int n_tensors = 0;
         size_t total_size = 0;
@@ -977,28 +980,29 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {
             std::string name(length, 0);
             fin.read(&name[0], length);

-            if (model.tensors.find(name.data()) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
+            if (model.tensors.find(name) == model.tensors.end()) {
+                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
                 return false;
             }

-            auto tensor = model.tensors[name.data()];
-            //printf("ne0 = %jd, ne1 = %jd, ne2 = %jd, ne3 = %jd\n", ne[0], ne[1], ne[2], ne[3]);
+            auto tensor = model.tensors[name];
+            ggml_set_name(tensor, name.c_str());
+
             if (ggml_nelements(tensor) != nelements) {
                 fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %d, expected %d\n",
-                        __func__, name.data(), (int) nelements, (int) ggml_nelements(tensor));
+                        __func__, name.c_str(), (int) nelements, (int) ggml_nelements(tensor));
                 return false;
             }

             if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2] || tensor->ne[3] != ne[3]) {
                 fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d, %d], expected [%d, %d, %d, %d]\n",
-                        __func__, name.data(),
+                        __func__, name.c_str(),
                         (int) ne[0], (int) ne[1], (int) ne[2], (int) ne[3],
                         (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], (int) tensor->ne[3]);
                 return false;
             }
+
             size_t bpe = 0;

             switch (ftype) {
@@ -1019,6 +1023,7 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {
                 return false;
             }

+            ggml_allocr_alloc(alloc, tensor);
             fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));

             total_size += ggml_nbytes(tensor);
@@ -1036,6 +1041,8 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {

         fprintf(stderr, " done\n");
         fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
+
+        ggml_allocr_free(alloc);
     }

     fin.close();
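The new loading scheme above separates tensor metadata from tensor storage: the ggml context is created with `no_alloc = true` and sized only for tensor/graph structs, while the actual weight data lives in a single backend buffer that a ggml-alloc allocator fills tensor by tensor. A minimal sketch of the same pattern, using only calls that appear in this patch (`load_weights_sketch`, `n_tensors_max`, and the single 256-element tensor are hypothetical placeholders):

```cpp
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Condensed version of the pattern used in sam_ggml_model_load: a no_alloc
// context owns only tensor metadata; the tensor data lives in one backend buffer.
static bool load_weights_sketch(size_t buf_size, int n_tensors_max) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * n_tensors_max,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // metadata only, no tensor data
    };
    struct ggml_context * ctx = ggml_init(params);

    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, buf_size);

    // place each tensor inside the backend buffer, then read its data into it
    ggml_allocr * alloc = ggml_allocr_new_from_buffer(buffer);
    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 256);
    ggml_allocr_alloc(alloc, t);
    // ... fin.read(reinterpret_cast<char *>(t->data), ggml_nbytes(t)); ...
    ggml_allocr_free(alloc);

    // ctx, buffer, and backend stay alive for the model's lifetime
    // (in the patch they are freed in sam_deinit)
    return true;
}
```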
@@ -1131,9 +1138,13 @@ struct ggml_cgraph  * sam_encode_image(
     const int32_t n_img_size    = hparams.n_img_size();
     const int32_t n_window_size = hparams.n_window_size();

+    // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
+    static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead();
+    static std::vector<uint8_t> buf(buf_size);
+
     struct ggml_init_params ggml_params = {
-        /*.mem_size   =*/ state.buf_compute_img_enc.size(),
-        /*.mem_buffer =*/ state.buf_compute_img_enc.data(),
+        /*.mem_size   =*/ buf.size(),
+        /*.mem_buffer =*/ buf.data(),
         /*.no_alloc   =*/ true, // skip allocating as we use ggml_alloc to allocate exact memory requirements
     };
@@ -1613,7 +1624,7 @@ bool sam_decode_mask(
             struct ggml_tensor * q_0 = ggml_add(ctx0, queries, tokens);

             struct ggml_tensor * self_attn = sam_decode_mask_transformer_attn(tfm_layer.self_attn, q_0, q_0, queries, ctx0, model);
-            queries = ggml_add_inplace(ctx0, queries, self_attn);
+            queries = ggml_add(ctx0, queries, self_attn);
         }

         queries = ggml_norm(ctx0, queries, hparams.eps_decoder_transformer);
@@ -1690,11 +1701,11 @@ bool sam_decode_mask(
     // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/mask_decoder.py#L136
     keys = ggml_cont(ctx0, ggml_transpose(ctx0, keys));
     keys = ggml_view_4d(ctx0, keys, srcNE[0], srcNE[1], srcNE[2], srcNE[3], srcNE[0]*keys->nb[0], keys->nb[1], keys->nb[2], 0);
+    // ggml_build_forward_expand(gf, keys);

     struct ggml_tensor * upscaled_embedding = {};
     {
         // ConvTranspose2d
         keys = ggml_conv_transpose_2d_p0(ctx0, dec.output_upscaling_0_w, keys, 2);
-        ggml_allocr_alloc(state.allocr, keys); // TODO: This alloc shouldn't be needed
         keys = ggml_add_inplace(ctx0, keys, ggml_repeat(ctx0, ggml_reshape_3d(ctx0, dec.output_upscaling_0_b, 1, 1, dec.output_upscaling_0_b->ne[0]), keys));
@@ -1706,7 +1717,6 @@ bool sam_decode_mask(

         // ConvTranspose2d
         keys = ggml_conv_transpose_2d_p0(ctx0, dec.output_upscaling_3_w, keys, 2);
-        ggml_allocr_alloc(state.allocr, keys); // TODO: This alloc shouldn't be needed
         keys = ggml_add_inplace(ctx0, ggml_repeat(ctx0, ggml_reshape_3d(ctx0, dec.output_upscaling_3_b, 1, 1, dec.output_upscaling_3_b->ne[0]), keys), keys);
@@ -1921,9 +1931,13 @@ struct ggml_cgraph * sam_build_fast_graph(
     int   ny,
     sam_point point) {

+    // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
+    static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead();
+    static std::vector<uint8_t> buf(buf_size);
+
     struct ggml_init_params ggml_params = {
-        /*.mem_size   =*/ state.buf_compute_fast.size(),
-        /*.mem_buffer =*/ state.buf_compute_fast.data(),
+        /*.mem_size   =*/ buf.size(),
+        /*.mem_buffer =*/ buf.data(),
         /*.no_alloc   =*/ true, // skip allocating as we use ggml_alloc to allocate exact memory requirements
     };
@@ -1953,6 +1967,7 @@ struct ggml_cgraph * sam_build_fast_graph(
 }

 std::shared_ptr<sam_state> sam_load_model(const sam_params & params) {
+    ggml_time_init();
     const int64_t t_start_ms = ggml_time_ms();

     sam_state state;
@@ -2006,20 +2021,21 @@ bool sam_compute_embd_img(const sam_image_u8 & img, int n_threads, sam_state & s
             model.hparams.n_img_embd(), model.hparams.n_img_embd(), model.hparams.n_enc_out_chans);

     // Encode the image
-    st.buf_compute_img_enc.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
-    st.allocr = ggml_allocr_new_measure(tensor_alignment);
+    const size_t alignment = ggml_backend_get_alignment(model.backend);
+    st.allocr = ggml_allocr_new_measure(alignment);
+
     struct ggml_cgraph * gf_measure = sam_encode_image(model, st, img1);
     if (!gf_measure) {
         fprintf(stderr, "%s: failed to encode image\n", __func__);
         return false;
     }

-    size_t alloc_size = ggml_allocr_alloc_graph(st.allocr, gf_measure) + tensor_alignment;
+    size_t alloc_size = ggml_allocr_alloc_graph(st.allocr, gf_measure);
     ggml_allocr_free(st.allocr);

     // recreate allocator with exact memory requirements
-    st.buf_alloc_img_enc.resize(alloc_size);
-    st.allocr = ggml_allocr_new(st.buf_alloc_img_enc.data(), st.buf_alloc_img_enc.size(), tensor_alignment);
+    ggml_backend_buffer_t buf_compute = ggml_backend_alloc_buffer(model.backend, alloc_size);
+    st.allocr = ggml_allocr_new_from_buffer(buf_compute);

     // compute the graph with the measured exact memory requirements from above
     ggml_allocr_reset(st.allocr);
@@ -2032,11 +2048,12 @@ bool sam_compute_embd_img(const sam_image_u8 & img, int n_threads, sam_state & s

     ggml_allocr_alloc_graph(st.allocr, gf);

-    ggml_graph_compute_helper(st.work_buffer, gf, n_threads);
+    ggml_graph_compute_helper(model.backend, gf, n_threads);

     ggml_allocr_free(st.allocr);
-    st.allocr = NULL;
-    st.work_buffer.clear();
+    ggml_backend_buffer_free(buf_compute);
+
+    st.allocr = {};

     state.t_compute_img_ms = ggml_time_ms() - t_start_ms;
@@ -2075,8 +2092,8 @@ std::vector<sam_image_u8> sam_compute_masks(

     st.iou_predictions = ggml_new_tensor_1d(st.ctx_masks, GGML_TYPE_F32, 3);

-    st.buf_compute_fast.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
-    st.allocr = ggml_allocr_new_measure(tensor_alignment);
+    const size_t alignment = ggml_backend_get_alignment(model.backend);
+    st.allocr = ggml_allocr_new_measure(alignment);

     // measure memory requirements for the graph
     struct ggml_cgraph * gf_measure = sam_build_fast_graph(model, st, img.nx, img.ny, pt);
@@ -2085,12 +2102,12 @@ std::vector<sam_image_u8> sam_compute_masks(
         return {};
     }

-    size_t alloc_size = ggml_allocr_alloc_graph(st.allocr, gf_measure) + tensor_alignment;
+    size_t alloc_size = ggml_allocr_alloc_graph(st.allocr, gf_measure);
     ggml_allocr_free(st.allocr);

     // recreate allocator with exact memory requirements
-    st.buf_alloc_fast.resize(alloc_size);
-    st.allocr = ggml_allocr_new(st.buf_alloc_fast.data(), st.buf_alloc_fast.size(), tensor_alignment);
+    ggml_backend_buffer_t buf_compute = ggml_backend_alloc_buffer(model.backend, alloc_size);
+    st.allocr = ggml_allocr_new_from_buffer(buf_compute);

     // compute the graph with the measured exact memory requirements from above
     ggml_allocr_reset(st.allocr);
@@ -2103,18 +2120,18 @@ std::vector<sam_image_u8> sam_compute_masks(

     ggml_allocr_alloc_graph(st.allocr, gf);

-    ggml_graph_compute_helper(st.work_buffer, gf, n_threads);
+    ggml_graph_compute_helper(model.backend, gf, n_threads);

     //print_t_f32("iou_predictions", st.iou_predictions);
     //print_t_f32("low_res_masks", st.low_res_masks);

-    ggml_allocr_free(st.allocr);
-    st.allocr = {};
-    st.buf_compute_fast.clear();
-    st.buf_alloc_fast.clear();

     std::vector<sam_image_u8> masks = sam_postprocess_masks(model.hparams, img.nx, img.ny, st, mask_on_val, mask_off_val);

+    ggml_allocr_free(st.allocr);
     ggml_free(st.ctx_masks);
+    ggml_backend_buffer_free(buf_compute);
+
+    st.allocr = {};
     st.ctx_masks = {};
     st.low_res_masks = {};
     st.iou_predictions = {};
@@ -2132,4 +2149,10 @@ void sam_deinit(sam_state & state) {
+        // free the backend resources before dropping the model
+        if (state.model->backend) {
+            ggml_backend_buffer_free(state.model->buffer);
+            ggml_backend_free(state.model->backend);
+        }
+
         state.model.reset();
         state.state.reset();
     }
 }
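With these changes, `sam_compute_embd_img` and `sam_compute_masks` share one measure/allocate/compute lifecycle. A condensed sketch of that flow, again restricted to calls that appear in this patch (the `build` callback is a stand-in for `sam_encode_image` / `sam_build_fast_graph`):

```cpp
#include <functional>

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// The measure -> allocate -> compute pattern used by sam_compute_embd_img and
// sam_compute_masks. `build` is expected to create the graph in a no_alloc
// context, as sam_encode_image and sam_build_fast_graph do.
static void compute_with_exact_memory(
        ggml_backend_t backend,
        ggml_allocr *& allocr,
        const std::function<ggml_cgraph *()> & build,
        int n_threads) {
    // 1) measure: build the graph once under a measuring allocator
    allocr = ggml_allocr_new_measure(ggml_backend_get_alignment(backend));
    ggml_cgraph * gf_measure = build();
    size_t alloc_size = ggml_allocr_alloc_graph(allocr, gf_measure);
    ggml_allocr_free(allocr);

    // 2) allocate: a backend buffer with the exact measured size
    ggml_backend_buffer_t buf_compute = ggml_backend_alloc_buffer(backend, alloc_size);
    allocr = ggml_allocr_new_from_buffer(buf_compute);

    // 3) compute: rebuild the graph, place its tensors in the buffer, run it
    ggml_allocr_reset(allocr);
    ggml_cgraph * gf = build();
    ggml_allocr_alloc_graph(allocr, gf);
    ggml_backend_cpu_set_n_threads(backend, n_threads); // CPU backend only
    ggml_backend_graph_compute(backend, gf);

    ggml_allocr_free(allocr);
    ggml_backend_buffer_free(buf_compute);
    allocr = {};
}
```

The graph is built twice on purpose: once under the measuring allocator to learn the exact allocation size, and once more under the real allocator backed by an exact-size backend buffer.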