From 189581985e4076f7af443ceb6dbfeb99ec3a5c65 Mon Sep 17 00:00:00 2001
From: YavorGIvanov
Date: Thu, 5 Oct 2023 17:12:39 +0300
Subject: [PATCH 1/4] Make sam.cpp work with ggml-backend

---
 ggml    |   2 +-
 sam.cpp | 262 +++++++++++++++++++++++++++++++------------------------
 2 files changed, 144 insertions(+), 120 deletions(-)

diff --git a/ggml b/ggml
index ef33685..abf2669 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit ef336850d5bfe8237ebca1ec82cdfb97d78baff1
+Subproject commit abf2669e765cca1162e2b5fc3a7427da9e7e0d51
diff --git a/sam.cpp b/sam.cpp
index 5d2cf19..39b0acc 100644
--- a/sam.cpp
+++ b/sam.cpp
@@ -3,6 +3,7 @@
 
 #include "ggml.h"
 #include "ggml-alloc.h"
+#include "ggml-backend.h"
 
 #include <cmath>
 #include <cstdio>
@@ -15,17 +16,9 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-static const size_t tensor_alignment = 32;
-
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
+static void ggml_graph_compute_helper(ggml_backend_t backend, ggml_cgraph * graph, int n_threads) {
+    ggml_backend_cpu_set_n_threads(backend, n_threads);
+    ggml_backend_graph_compute(backend, graph);
 }
 
 // RGB float32 image
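The deleted helper built a ggml_cplan by hand and kept a resizable work buffer alive between calls; with ggml-backend, planning and scratch memory are owned by the backend itself. A minimal sketch of the new compute path, assuming the CPU backend and the ggml revision pinned above (the wrapper name compute_with_backend is illustrative, not from the patch):

    #include "ggml.h"
    #include "ggml-backend.h"

    // Run an already-built graph through a backend instead of calling
    // ggml_graph_plan()/ggml_graph_compute() directly.
    static void compute_with_backend(struct ggml_cgraph * graph, int n_threads) {
        ggml_backend_t backend = ggml_backend_cpu_init();   // CPU backend, as in this patch
        ggml_backend_cpu_set_n_threads(backend, n_threads); // thread count is a CPU-only setting
        ggml_backend_graph_compute(backend, graph);         // plan + work buffer handled internally
        ggml_backend_free(backend);
    }

Since ggml_backend_cpu_set_n_threads() is specific to the CPU backend, the patched helper implicitly assumes model.backend was created by ggml_backend_cpu_init().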
@@ -250,6 +243,10 @@ struct sam_ggml_model {
     sam_encoder_prompt enc_prompt;
     sam_decoder_mask   dec;
 
+    ggml_backend_t backend = {};
+    ggml_backend_buffer_t buffer = {};
+
+    //
     struct ggml_context * ctx;
     std::map<std::string, struct ggml_tensor *> tensors;
@@ -265,16 +262,6 @@ struct sam_ggml_state {
 
     //struct ggml_tensor * tmp_save = {};
 
-    // buffer for `ggml_graph_plan.work_data`
-    std::vector<uint8_t> work_buffer;
-    // buffers to evaluate the model
-    std::vector<uint8_t> buf_alloc_img_enc;
-    std::vector<uint8_t> buf_compute_img_enc;
-
-    std::vector<uint8_t> buf_alloc_fast;
-    std::vector<uint8_t> buf_compute_fast;
-
     struct ggml_allocr * allocr = {};
 };
@@ -483,8 +470,8 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {
 
     auto & ctx = model.ctx;
 
-    const size_t ctx_size = [&]() {
-        size_t ctx_size = 0;
+    const size_t buf_size = [&]() {
+        size_t buf_size = 0;
 
         const auto & hparams = model.hparams;
@@ -503,59 +490,59 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {
 
         // image encoder
         {
-            ctx_size += n_enc_state*n_img_embd*n_img_embd*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_state*n_img_embd*n_img_embd*ggml_type_sizef(GGML_TYPE_F32);
 
-            ctx_size += n_enc_state*3*n_patch_size*n_patch_size*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_state*3*n_patch_size*n_patch_size*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
 
-            ctx_size += n_enc_state*n_enc_out_chans*1*1*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_out_chans*n_enc_out_chans*3*3*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_state*n_enc_out_chans*1*1*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_out_chans*n_enc_out_chans*3*3*ggml_type_sizef(GGML_TYPE_F16);
 
-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
 
-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
         }
 
         // image encoder layers
        {
-            ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
 
-            ctx_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_sizef(GGML_TYPE_F16);
 
-            ctx_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_sizef(GGML_TYPE_F16);
 
-            ctx_size += n_enc_layer*3*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer*3*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*3*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer*3*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
 
-            ctx_size += n_enc_layer*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
 
-            ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
 
-            ctx_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer*4*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer*4*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
 
-            ctx_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer*4*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
+            buf_size += n_enc_layer*4*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
         }
 
-        ctx_size += (8 + 14*n_enc_layer)*ggml_tensor_overhead();
+        buf_size += (8 + 14*n_enc_layer)*ggml_tensor_overhead();
 
         // prompt encoder
         {
-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16); // 2*(n_enc_out_chans/2)
+            buf_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16); // 2*(n_enc_out_chans/2)
 
-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+            buf_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
        }
 
-        ctx_size += (2 + n_pt_embd)*ggml_tensor_overhead();
+        buf_size += (2 + n_pt_embd)*ggml_tensor_overhead();
 
         // mask decoder
         {
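For a sense of scale, a worked example with assumed ViT-B hyperparameters (n_enc_state = 768, image size 1024, patch size 16, hence n_img_embd = 64; these concrete values are not part of the diff):

    // First image-encoder term above: the F32 positional embedding.
    const size_t n_enc_state = 768; // assumed ViT-B width
    const size_t n_img_embd  = 64;  // 1024 / 16
    const size_t pos_embd_bytes = n_enc_state*n_img_embd*n_img_embd*sizeof(float);
    // 768 * 64 * 64 * 4 = 12,582,912 bytes, i.e. about 12 MB of the total

The ggml_tensor_overhead() terms account for metadata the same way: one fixed-size tensor header per tensor, on top of the raw data bytes.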
@@ -567,75 +554,75 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {
                 const int n_hypernet_mpls_count = 4;
 
                 // self_attn
-                ctx_size += tfm_layers_count*qkv_count*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += tfm_layers_count*qkv_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
-                ctx_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+                buf_size += tfm_layers_count*qkv_count*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
+                buf_size += tfm_layers_count*qkv_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+                buf_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
 
                 // all norms
-                ctx_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
-                ctx_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+                buf_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+                buf_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
 
                 // cross_attn_token_to_img
-                ctx_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += tfm_layers_count*qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32);
-                ctx_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+                buf_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
+                buf_size += tfm_layers_count*qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32);
+                buf_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
 
                 // mlp
-                ctx_size += tfm_layers_count*8*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += tfm_layers_count*8*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
-                ctx_size += tfm_layers_count*n_enc_out_chans*8*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += tfm_layers_count*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
+                buf_size += tfm_layers_count*8*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
+                buf_size += tfm_layers_count*8*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
+                buf_size += tfm_layers_count*n_enc_out_chans*8*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
+                buf_size += tfm_layers_count*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
 
                 // cross_attn_img_to_token
-                ctx_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += tfm_layers_count*qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32);
-                ctx_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+                buf_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
+                buf_size += tfm_layers_count*qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32);
+                buf_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
 
                 // transformer_final_attn_token_to_img
-                ctx_size += qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32);
-                ctx_size += n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
+                buf_size += qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
+                buf_size += qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32);
+                buf_size += n_enc_state* ggml_type_sizef(GGML_TYPE_F32);
 
                 // transformer_norm_final
-                ctx_size += norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
-                ctx_size += norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+                buf_size += norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
+                buf_size += norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
 
                 // output_upscaling
-                ctx_size += n_enc_out_chans*n_img_embd*2*2*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += 3*n_img_embd* ggml_type_sizef(GGML_TYPE_F32);
-                ctx_size += n_enc_out_chans*n_img_embd*(n_img_embd/2)*2*2*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += (n_img_embd/2)* ggml_type_sizef(GGML_TYPE_F32);
+                buf_size += n_enc_out_chans*n_img_embd*2*2*ggml_type_sizef(GGML_TYPE_F16);
+                buf_size += 3*n_img_embd* ggml_type_sizef(GGML_TYPE_F32);
+                buf_size += n_enc_out_chans*n_img_embd*(n_img_embd/2)*2*2*ggml_type_sizef(GGML_TYPE_F16);
+                buf_size += (n_img_embd/2)* ggml_type_sizef(GGML_TYPE_F32);
 
                 // output_hypernetworks_mlps
-                ctx_size += n_hypernet_mpls_count*2*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += n_hypernet_mpls_count*2*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
-                ctx_size += n_hypernet_mpls_count*n_enc_out_chans*(n_img_embd/2)*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += n_hypernet_mpls_count*(n_img_embd/2)* ggml_type_sizef(GGML_TYPE_F32);
+                buf_size += n_hypernet_mpls_count*2*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
+                buf_size += n_hypernet_mpls_count*2*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
+                buf_size += n_hypernet_mpls_count*n_enc_out_chans*(n_img_embd/2)*ggml_type_sizef(GGML_TYPE_F16);
+                buf_size += n_hypernet_mpls_count*(n_img_embd/2)* ggml_type_sizef(GGML_TYPE_F32);
 
                 // iou_prediction_head
-                ctx_size += 2*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += 2*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
-                ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += n_pt_embd* ggml_type_sizef(GGML_TYPE_F32);
+                buf_size += 2*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
+                buf_size += 2*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32);
+                buf_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
+                buf_size += n_pt_embd* ggml_type_sizef(GGML_TYPE_F32);
 
                 // iou_token_w
-                ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+                buf_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
 
                 // mask_tokens_w
-                ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
+                buf_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
             }
         }
 
-        fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+        fprintf(stderr, "ggml buffer size = %6.2f MB\n", buf_size/(1024.0*1024.0));
 
-        return ctx_size;
+        return buf_size;
     }();
 
     // create the ggml context
     {
         struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
+            /*.mem_size   =*/ ggml_tensor_overhead() * GGML_MAX_NODES,
             /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
+            /*.no_alloc   =*/ true,
         };
 
         ctx = ggml_init(params);
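With no_alloc = true the context only has to hold tensor headers, which is why ggml_tensor_overhead() * GGML_MAX_NODES is a safe upper bound for mem_size. A sketch of what that split means, assuming the pinned ggml revision (the 2-d tensor is only an example):

    #include "ggml.h"

    // Metadata-only context: tensors created here get shape/type/name, no storage.
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * GGML_MAX_NODES,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);
    struct ggml_tensor  * t   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    // t->data == NULL at this point; it is assigned later, when an allocator
    // places the tensor inside a backend buffer (see the loading loop below).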
@@ -645,6 +632,20 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {
         }
     }
 
+    // initialize backend & allocate buffers
+    {
+        if (!model.backend) {
+            printf("Using CPU backend\n");
+            model.backend = ggml_backend_cpu_init();
+            if (!model.backend) {
+                fprintf(stderr, "%s: ggml_backend_cpu_init() failed\n", __func__);
+                return false;
+            }
+        }
+
+        model.buffer = ggml_backend_alloc_buffer(model.backend, buf_size);
+    }
+
     // prepare memory for the weights
     {
         const auto & hparams = model.hparams;
@@ -947,6 +948,8 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {
 
     // load weights
     {
+        ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer);
+
         int n_tensors = 0;
         size_t total_size = 0;
@@ -977,28 +980,29 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {
 
             std::string name(length, 0);
             fin.read(&name[0], length);
 
-            if (model.tensors.find(name.data()) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
+            if (model.tensors.find(name) == model.tensors.end()) {
+                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
                 return false;
             }
 
-            auto tensor = model.tensors[name.data()];
-            //printf("ne0 = %jd, ne1 = %jd, ne2 = %jd, ne3 = %jd\n", ne[0], ne[1], ne[2], ne[3]);
+            auto tensor = model.tensors[name];
+            ggml_set_name(tensor, name.c_str());
             if (ggml_nelements(tensor) != nelements) {
                 fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %d, expected %d\n",
-                        __func__, name.data(), (int) nelements, (int) ggml_nelements(tensor));
+                        __func__, name.c_str(), (int) nelements, (int) ggml_nelements(tensor));
                 return false;
             }
 
             if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2] || tensor->ne[3] != ne[3]) {
                 fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d, %d], expected [%d, %d, %d, %d]\n",
-                        __func__, name.data(),
+                        __func__, name.c_str(),
                         (int) ne[0], (int) ne[1], (int) ne[2], (int) ne[3],
                         (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], (int) tensor->ne[3]);
                 return false;
             }
 
+
             size_t bpe = 0;
 
             switch (ftype) {
@@ -1019,6 +1023,7 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {
                 return false;
             }
 
+            ggml_allocr_alloc(alloc, tensor);
             fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
 
             total_size += ggml_nbytes(tensor);
@@ -1036,6 +1041,8 @@ bool sam_ggml_model_load(const std::string & fname, sam_ggml_model & model) {
 
         fprintf(stderr, " done\n");
         fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
+
+        ggml_allocr_free(alloc);
     }
 
     fin.close();
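Condensed, the new loading path is: wrap the pre-allocated model.buffer in a temporary allocator, place each tensor, and stream its bytes straight in. A sketch only, reusing the variables from the hunks above: the real loop is driven by the file contents rather than the tensor map and validates shapes first, and it relies on tensor->data being host memory, which holds for the CPU backend:

    ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer);
    for (auto & kv : model.tensors) {
        struct ggml_tensor * tensor = kv.second;
        ggml_allocr_alloc(alloc, tensor); // assigns tensor->data inside model.buffer
        fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
    }
    ggml_allocr_free(alloc); // frees bookkeeping only; the weights stay in model.buffer

Dropping the allocator right after loading is safe because ownership of the memory rests with the ggml_backend_buffer_t, which lives until ggml_backend_buffer_free() in sam_deinit().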
@@ -1131,9 +1138,13 @@ struct ggml_cgraph * sam_encode_image(
     const int32_t n_img_size    = hparams.n_img_size();
     const int32_t n_window_size = hparams.n_window_size();
 
+    // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
+    static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead();
+    static std::vector<uint8_t> buf(buf_size);
+
     struct ggml_init_params ggml_params = {
-        /*.mem_size   =*/ state.buf_compute_img_enc.size(),
-        /*.mem_buffer =*/ state.buf_compute_img_enc.data(),
+        /*.mem_size   =*/ buf.size(),
+        /*.mem_buffer =*/ buf.data(),
         /*.no_alloc   =*/ true, // skip allocating as we use ggml_alloc to allocate exact memory requirements
     };
@@ -1922,9 +1933,13 @@ struct ggml_cgraph * sam_build_fast_graph(
     int nx,
     int ny,
     sam_point point) {
+    // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
+    static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead();
+    static std::vector<uint8_t> buf(buf_size);
+
     struct ggml_init_params ggml_params = {
-        /*.mem_size   =*/ state.buf_compute_fast.size(),
-        /*.mem_buffer =*/ state.buf_compute_fast.data(),
+        /*.mem_size   =*/ buf.size(),
+        /*.mem_buffer =*/ buf.data(),
         /*.no_alloc   =*/ true, // skip allocating as we use ggml_alloc to allocate exact memory requirements
     };
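Both graph builders now share one pattern: a static, metadata-only arena sized for GGML_MAX_NODES tensor headers plus one graph header, reused across calls. In isolation, and assuming ggml_new_graph()/ggml_graph_overhead() from the pinned ggml revision:

    #include "ggml.h"
    #include <cstdint>
    #include <vector>

    // Arena for graph construction: holds ggml_tensor and ggml_cgraph structs only;
    // tensor data is placed separately by ggml-alloc into a backend buffer.
    static size_t               buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead();
    static std::vector<uint8_t> buf(buf_size);

    struct ggml_init_params ggml_params = {
        /*.mem_size   =*/ buf.size(),
        /*.mem_buffer =*/ buf.data(),
        /*.no_alloc   =*/ true, // data comes from ggml-alloc, not from this arena
    };
    struct ggml_context * ctx0 = ggml_init(ggml_params);
    struct ggml_cgraph  * gf   = ggml_new_graph(ctx0); // graph header lives in the same arena

Making the arena static trades one small fixed allocation for not re-creating it on every call; since its size does not depend on the input, that is safe here.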
@@ -2008,20 +2023,21 @@ bool sam_compute_embd_img(const sam_image_u8 & img, int n_threads, sam_state & s
         model.hparams.n_img_embd(), model.hparams.n_img_embd(), model.hparams.n_enc_out_chans);
 
     // Encode the image
-    st.buf_compute_img_enc.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
-    st.allocr = ggml_allocr_new_measure(tensor_alignment);
+    const size_t alignment = ggml_backend_get_alignment(model.backend);
+    st.allocr = ggml_allocr_new_measure(alignment);
+
     struct ggml_cgraph * gf_measure = sam_encode_image(model, st, img1);
     if (!gf_measure) {
         fprintf(stderr, "%s: failed to encode image\n", __func__);
         return false;
     }
 
-    size_t alloc_size = ggml_allocr_alloc_graph(st.allocr, gf_measure) + tensor_alignment;
+    size_t alloc_size = ggml_allocr_alloc_graph(st.allocr, gf_measure) + alignment;
     ggml_allocr_free(st.allocr);
 
     // recreate allocator with exact memory requirements
-    st.buf_alloc_img_enc.resize(alloc_size);
-    st.allocr = ggml_allocr_new(st.buf_alloc_img_enc.data(), st.buf_alloc_img_enc.size(), tensor_alignment);
+    ggml_backend_buffer_t buf_compute = ggml_backend_alloc_buffer(model.backend, alloc_size);
+    st.allocr = ggml_allocr_new_from_buffer(buf_compute);
 
     // compute the graph with the measured exact memory requirements from above
     ggml_allocr_reset(st.allocr);
@@ -2034,11 +2050,12 @@ bool sam_compute_embd_img(const sam_image_u8 & img, int n_threads, sam_state & s
 
     ggml_allocr_alloc_graph(st.allocr, gf);
 
-    ggml_graph_compute_helper(st.work_buffer, gf, n_threads);
+    ggml_graph_compute_helper(model.backend, gf, n_threads);
 
     ggml_allocr_free(st.allocr);
-    st.allocr = NULL;
-    st.work_buffer.clear();
+    ggml_backend_buffer_free(buf_compute);
+
+    st.allocr = {};
 
     state.t_compute_img_ms = ggml_time_ms() - t_start_ms;
@@ -2077,8 +2094,8 @@ std::vector<sam_image_u8> sam_compute_masks(
 
     st.iou_predictions = ggml_new_tensor_1d(st.ctx_masks, GGML_TYPE_F32, 3);
 
-    st.buf_compute_fast.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
-    st.allocr = ggml_allocr_new_measure(tensor_alignment);
+    const size_t alignment = ggml_backend_get_alignment(model.backend);
+    st.allocr = ggml_allocr_new_measure(alignment);
 
     // measure memory requirements for the graph
     struct ggml_cgraph * gf_measure = sam_build_fast_graph(model, st, img.nx, img.ny, pt);
@@ -2087,12 +2104,12 @@ std::vector<sam_image_u8> sam_compute_masks(
         return {};
     }
 
-    size_t alloc_size = ggml_allocr_alloc_graph(st.allocr, gf_measure) + tensor_alignment;
+    size_t alloc_size = ggml_allocr_alloc_graph(st.allocr, gf_measure) + alignment;
     ggml_allocr_free(st.allocr);
 
     // recreate allocator with exact memory requirements
-    st.buf_alloc_fast.resize(alloc_size);
-    st.allocr = ggml_allocr_new(st.buf_alloc_fast.data(), st.buf_alloc_fast.size(), tensor_alignment);
+    ggml_backend_buffer_t buf_compute = ggml_backend_alloc_buffer(model.backend, alloc_size);
+    st.allocr = ggml_allocr_new_from_buffer(buf_compute);
 
     // compute the graph with the measured exact memory requirements from above
     ggml_allocr_reset(st.allocr);
@@ -2105,18 +2122,18 @@ std::vector<sam_image_u8> sam_compute_masks(
 
     ggml_allocr_alloc_graph(st.allocr, gf);
 
-    ggml_graph_compute_helper(st.work_buffer, gf, n_threads);
+    ggml_graph_compute_helper(model.backend, gf, n_threads);
 
     //print_t_f32("iou_predictions", st.iou_predictions);
     //print_t_f32("low_res_masks", st.low_res_masks);
 
-    ggml_allocr_free(st.allocr);
-    st.allocr = {};
-    st.buf_compute_fast.clear();
-    st.buf_alloc_fast.clear();
-
     std::vector<sam_image_u8> masks = sam_postprocess_masks(model.hparams, img.nx, img.ny, st, mask_on_val, mask_off_val);
+    ggml_allocr_free(st.allocr);
     ggml_free(st.ctx_masks);
+    ggml_backend_buffer_free(buf_compute);
+
+    st.allocr = {};
     st.ctx_masks = {};
     st.low_res_masks = {};
     st.iou_predictions = {};
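The same two-pass dance appears in both compute paths: measure with a dummy allocator, allocate an exactly-sized backend buffer, rebuild the graph, compute, free. Condensed, with build_graph standing in for sam_encode_image/sam_build_fast_graph:

    // Pass 1: measure. The measure allocator hands out fake addresses; no real memory moves.
    const size_t alignment = ggml_backend_get_alignment(model.backend);
    st.allocr = ggml_allocr_new_measure(alignment);
    struct ggml_cgraph * gf_measure = build_graph(model, st);
    size_t alloc_size = ggml_allocr_alloc_graph(st.allocr, gf_measure) + alignment; // pad dropped by patch 2 below
    ggml_allocr_free(st.allocr);

    // Pass 2: allocate exactly alloc_size in the backend, rebuild, compute.
    ggml_backend_buffer_t buf_compute = ggml_backend_alloc_buffer(model.backend, alloc_size);
    st.allocr = ggml_allocr_new_from_buffer(buf_compute);
    struct ggml_cgraph * gf = build_graph(model, st); // rebuilt: measured tensors point at fake addresses
    ggml_allocr_alloc_graph(st.allocr, gf);
    ggml_graph_compute_helper(model.backend, gf, n_threads);

    ggml_allocr_free(st.allocr);
    ggml_backend_buffer_free(buf_compute);

The graph really is constructed twice; the first build exists only so that ggml_allocr_alloc_graph() can add up tensor sizes.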
@@ -2134,4 +2151,11 @@ void sam_deinit(sam_state & state) {
         state.model.reset();
         state.state.reset();
     }
+
+    if (state.model) {
+        if (state.model->backend) {
+            ggml_backend_free(state.model->backend);
+            ggml_backend_buffer_free(state.model->buffer);
+        }
+    }
 }

From e8783a03ec1a2fca123b855ca7a324697ae2c87e Mon Sep 17 00:00:00 2001
From: YavorGIvanov
Date: Mon, 9 Oct 2023 14:51:33 +0300
Subject: [PATCH 2/4] Remove no longer needed alignment addition to alloc size

---
 sam.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sam.cpp b/sam.cpp
index 39b0acc..c60b3e6 100644
--- a/sam.cpp
+++ b/sam.cpp
@@ -2032,7 +2032,7 @@ bool sam_compute_embd_img(const sam_image_u8 & img, int n_threads, sam_state & s
         return false;
     }
 
-    size_t alloc_size = ggml_allocr_alloc_graph(st.allocr, gf_measure) + alignment;
+    size_t alloc_size = ggml_allocr_alloc_graph(st.allocr, gf_measure);
     ggml_allocr_free(st.allocr);
 
     // recreate allocator with exact memory requirements
@@ -2104,7 +2104,7 @@ std::vector<sam_image_u8> sam_compute_masks(
         return {};
     }
 
-    size_t alloc_size = ggml_allocr_alloc_graph(st.allocr, gf_measure) + alignment;
+    size_t alloc_size = ggml_allocr_alloc_graph(st.allocr, gf_measure);
     ggml_allocr_free(st.allocr);
 
     // recreate allocator with exact memory requirements

From ceafb7467bff7ec98e0c4f952e58a9eb8fd0238b Mon Sep 17 00:00:00 2001
From: YavorGIvanov
Date: Mon, 9 Oct 2023 14:51:58 +0300
Subject: [PATCH 3/4] Update ggml submodule version

---
 ggml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml b/ggml
index abf2669..94a4c07 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit abf2669e765cca1162e2b5fc3a7427da9e7e0d51
+Subproject commit 94a4c07c0e7a41d8929684f969ee86d6ca409c84

From 81002818eb0e2cb3b9a523286b067f80f8424431 Mon Sep 17 00:00:00 2001
From: YavorGIvanov
Date: Tue, 24 Oct 2023 23:49:42 +0300
Subject: [PATCH 4/4] Remove no longer needed allocs and update ggml version

---
 ggml    | 2 +-
 sam.cpp | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/ggml b/ggml
index 94a4c07..dd92cfd 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit 94a4c07c0e7a41d8929684f969ee86d6ca409c84
+Subproject commit dd92cfd4b188e9202dddb7b85eb8bc1e51cf8288
diff --git a/sam.cpp b/sam.cpp
index c60b3e6..d7d6789 100644
--- a/sam.cpp
+++ b/sam.cpp
@@ -1706,7 +1706,6 @@ bool sam_decode_mask(
     {
         // ConvTranspose2d
         keys = ggml_conv_transpose_2d_p0(ctx0, dec.output_upscaling_0_w, keys, 2);
-        ggml_allocr_alloc(state.allocr, keys); // TODO: This alloc shouldn't be needed
         keys = ggml_add_inplace(ctx0, keys,
                 ggml_repeat(ctx0,
                         ggml_reshape_3d(ctx0, dec.output_upscaling_0_b, 1, 1, dec.output_upscaling_0_b->ne[0]), keys));
@@ -1718,7 +1717,6 @@ bool sam_decode_mask(
 
         // ConvTranspose2d
         keys = ggml_conv_transpose_2d_p0(ctx0, dec.output_upscaling_3_w, keys, 2);
-        ggml_allocr_alloc(state.allocr, keys); // TODO: This alloc shouldn't be needed
         keys = ggml_add_inplace(ctx0,
                 ggml_repeat(ctx0,
                         ggml_reshape_3d(ctx0, dec.output_upscaling_3_b, 1, 1, dec.output_upscaling_3_b->ne[0]), keys), keys);
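Taken together, the series leaves the following lifecycle. The sketch is reconstructed around the functions visible in these diffs; the exact public signatures in sam.h (sam_load_model, the sam_params fields, the sam_point argument of sam_compute_masks) are assumptions rather than part of the patches:

    #include "sam.h"   // assumed public header of this repo
    #include <memory>
    #include <vector>

    // Hypothetical end-to-end flow after this series (error handling omitted).
    sam_params params;                                          // assumed: defaults from sam.h
    std::shared_ptr<sam_state> state = sam_load_model(params);  // patch 1: CPU backend + weight buffer

    sam_image_u8 img;                                           // filled by the app's image loader (placeholder)
    sam_compute_embd_img(img, params.n_threads, *state);        // measure + run the image-encoder graph

    sam_point pt = { 0.5f*img.nx, 0.5f*img.ny };                // placeholder click in image coordinates
    std::vector<sam_image_u8> masks = sam_compute_masks(img, params.n_threads, pt, *state);

    sam_deinit(*state);                                         // patch 1: also frees backend and buffer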