diff --git a/CMakeLists.txt b/CMakeLists.txt index bca4bdb..b983917 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,7 @@ cmake_minimum_required(VERSION 3.12) project("CLIP.cpp" C CXX) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) diff --git a/build.bat b/build.bat new file mode 100644 index 0000000..b955093 --- /dev/null +++ b/build.bat @@ -0,0 +1,8 @@ +rd /s /q build +md build +cd build +git submodule update --recursive +cmake .. +cmake --build . --config Release +@REM cmake --build . --target CLIP.cpp --config Release +pause \ No newline at end of file diff --git a/clip.cpp b/clip.cpp index ff1f5ca..3247c89 100644 --- a/clip.cpp +++ b/clip.cpp @@ -5,7 +5,8 @@ #include #include #include -#include +//#include +#include #include #include #include @@ -803,7 +804,7 @@ typedef struct { } ImageData; // Function to preprocess a single image in a thread -void * preprocess_image(void * arg) { +void preprocess_image(void * arg) { ImageData * imageData = static_cast(arg); const clip_image_u8 * input = imageData->input; clip_image_f32 * resized = imageData->resized; @@ -812,7 +813,7 @@ void * preprocess_image(void * arg) { // Call the original preprocess function on the image clip_image_preprocess(ctx, input, resized); - pthread_exit(NULL); + //pthread_exit(NULL); } // Function to batch-preprocess multiple images i @@ -834,7 +835,8 @@ void clip_image_batch_preprocess(const clip_ctx * ctx, const int n_threads, cons } else { // Multi-threaded case - std::vector threads(num_threads); + //std::vector threads(num_threads); + std::vector threads; std::vector imageData(img_inputs->size); for (t = 0; t < num_threads; t++) { @@ -849,12 +851,15 @@ void clip_image_batch_preprocess(const clip_ctx * ctx, const int n_threads, cons } // Create a thread for each batch of images - pthread_create(&threads[t], NULL, preprocess_image, static_cast(&imageData[start_index])); + //pthread_create(&threads[t], NULL, preprocess_image, static_cast(&imageData[start_index])); + std::thread proc_thread(preprocess_image,static_cast(&imageData[start_index])); + threads.push_back(std::move(proc_thread)); } // Wait for all threads to finish for (t = 0; t < num_threads; t++) { - pthread_join(threads[t], NULL); + //pthread_join(threads[t], NULL); + threads[t].join(); } } } @@ -1392,8 +1397,10 @@ bool clip_compare_text_and_image(const clip_ctx * ctx, const int n_threads, cons // prepare image and text vectors const int projection_dim = ctx->vision_model.hparams.projection_dim; - float img_vec[projection_dim]; - float txt_vec[projection_dim]; + //float img_vec[projection_dim]; + //float txt_vec[projection_dim]; + float * img_vec = new float[projection_dim]; + float * txt_vec = new float[projection_dim]; // tokenize and encode text clip_tokens tokens; @@ -1419,6 +1426,8 @@ bool clip_compare_text_and_image(const clip_ctx * ctx, const int n_threads, cons // compute similarity *score = clip_similarity_score(img_vec, txt_vec, projection_dim); + delete[] img_vec; + delete[] txt_vec; return true; } @@ -1487,14 +1496,17 @@ bool clip_zero_shot_label_image(struct clip_ctx * ctx, const int n_threads, cons clip_image_preprocess(ctx, input_img, &img_res); - float img_vec[vec_dim]; + //float img_vec[vec_dim]; + float * img_vec = new float[vec_dim]; if (!clip_image_encode(ctx, n_threads, &img_res, img_vec, false)) { return false; } // encode texts and compute similarities - float txt_vec[vec_dim]; - float similarities[n_labels]; + //float txt_vec[vec_dim]; + //float similarities[n_labels]; + float * txt_vec = new float[vec_dim]; + float * similarities = new float[n_labels]; for (int i = 0; i < n_labels; i++) { const auto & text = labels[i]; @@ -1503,10 +1515,14 @@ bool clip_zero_shot_label_image(struct clip_ctx * ctx, const int n_threads, cons clip_text_encode(ctx, n_threads, &tokens, txt_vec, false); similarities[i] = clip_similarity_score(img_vec, txt_vec, vec_dim); } + delete[] img_vec; + delete[] txt_vec; // apply softmax and sort scores softmax_with_sorting(similarities, n_labels, scores, indices); + delete[] similarities; + return true; } diff --git a/examples/extract.cpp b/examples/extract.cpp index 22e736e..ab965ec 100644 --- a/examples/extract.cpp +++ b/examples/extract.cpp @@ -45,12 +45,14 @@ int main(int argc, char ** argv) { const int vec_dim = clip_get_vision_hparams(ctx)->projection_dim; int shape[2] = {1, vec_dim}; - float vec[vec_dim]; + //float vec[vec_dim]; + float* vec = new float[vec_dim]; clip_image_encode(ctx, params.n_threads, &img_res, vec, false); // Generate a unique output filename for each image std::string output_filename = "./img_vec_" + img_path.substr(img_path.find_last_of('/') + 1) + ".npy"; writeNpyFile(output_filename.c_str(), vec, shape, 2); + delete[] vec; // Update progress processedInputs++; @@ -69,7 +71,8 @@ int main(int argc, char ** argv) { const int vec_dim = clip_get_text_hparams(ctx)->projection_dim; int shape[2] = {1, vec_dim}; - float vec[vec_dim]; + //float vec[vec_dim]; + float* vec = new float[vec_dim]; if (!clip_text_encode(ctx, params.n_threads, &tokens, vec, false)) { printf("Unable to encode text\n"); @@ -85,6 +88,8 @@ int main(int argc, char ** argv) { // Generate a unique output filename for each text std::string output_filename = "./text_vec_" + std::to_string(textCounter++) + ".npy"; writeNpyFile(output_filename.c_str(), vec, shape, 2); + + delete[] vec; } printf("\n"); // Print a newline to clear the progress bar line diff --git a/examples/simple.c b/examples/simple.c index 36aa0d3..1d39aee 100644 --- a/examples/simple.c +++ b/examples/simple.c @@ -35,7 +35,8 @@ int main() { } // Encode image - float img_vec[vec_dim]; + //float img_vec[vec_dim]; + float * img_vec = (float *)malloc(sizeof(float) * vec_dim); if (!clip_image_encode(ctx, n_threads, img_res, img_vec, true)) { fprintf(stderr, "%s: failed to encode image\n", __func__); return 1; @@ -46,7 +47,8 @@ int main() { clip_tokenize(ctx, text, tokens); // Encode text - float txt_vec[vec_dim]; + //float txt_vec[vec_dim]; + float * txt_vec = (float *)malloc(sizeof(float) * vec_dim); if (!clip_text_encode(ctx, n_threads, tokens, txt_vec, true)) { fprintf(stderr, "%s: failed to encode text\n", __func__); return 1; @@ -55,6 +57,9 @@ int main() { // Calculate image-text similarity float score = clip_similarity_score(img_vec, txt_vec, vec_dim); + free(img_vec); + free(txt_vec); + // Alternatively, you can replace the above steps with: // float score; // if (!clip_compare_text_and_image_c(ctx, n_threads, text, img0, &score)) { diff --git a/examples/zsl.cpp b/examples/zsl.cpp index b13f832..2610844 100644 --- a/examples/zsl.cpp +++ b/examples/zsl.cpp @@ -15,7 +15,8 @@ int main(int argc, char ** argv) { printf("%s: You must specify at least 2 texts for zero-shot labeling\n", __func__); } - const char * labels[n_labels]; + // const char * labels[n_labels]; + const char** labels = new const char*[n_labels]; for (size_t i = 0; i < n_labels; ++i) { labels[i] = params.texts[i].c_str(); } @@ -34,8 +35,11 @@ int main(int argc, char ** argv) { return 1; } - float sorted_scores[n_labels]; - int sorted_indices[n_labels]; + //float sorted_scores[n_labels]; + //int sorted_indices[n_labels]; + float* sorted_scores = new float[n_labels]; + int* sorted_indices = new int[n_labels]; + if (!clip_zero_shot_label_image(ctx, params.n_threads, &input_img, labels, n_labels, sorted_scores, sorted_indices)) { fprintf(stderr, "Unable to apply ZSL\n"); return 1; @@ -46,6 +50,9 @@ int main(int argc, char ** argv) { float score = sorted_scores[i]; printf("%s = %1.4f\n", label, score); } + delete[] labels; + delete[] sorted_scores; + delete[] sorted_indices; clip_free(ctx); diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp index 3640ae5..959a666 100644 --- a/tests/benchmark.cpp +++ b/tests/benchmark.cpp @@ -52,7 +52,8 @@ int main(int argc, char ** argv) { const int vec_dim = clip_get_text_hparams(ctx)->projection_dim; - float txt_vecs[n_labels * vec_dim]; + //float txt_vecs[n_labels * vec_dim]; + float* txt_vecs = new float[n_labels * vec_dim]; ggml_time_init(); @@ -79,11 +80,15 @@ int main(int argc, char ** argv) { int n_total_items = 0; // total number of images processed float total_acc1_score = 0.0f; // total accuracy at 1 for the intire dataset float total_acc5_score = 0.0f; // total accuracy at 5 in intitre dataset - float img_vecs[vec_dim * batch_size]; - - float similarities[n_labels]; - float sorted_scores[n_labels]; - int indices[n_labels]; + //float img_vecs[vec_dim * batch_size]; + float* img_vecs = new float[vec_dim * batch_size]; + + //float similarities[n_labels]; + //float sorted_scores[n_labels]; + //int indices[n_labels]; + float* similarities = new float[n_labels]; + float* sorted_scores = new float[n_labels]; + int* indices = new int[n_labels]; std::vector img_inputs(batch_size); std::vector imgs_resized(batch_size); @@ -138,6 +143,10 @@ int main(int argc, char ** argv) { n_total_items += 1; } } + delete[] img_vecs; + delete[] similarities; + delete[] sorted_scores; + delete[] indices; float acc1_score = (float)n_acc1 / n_items; float acc5_score = (float)n_acc5 / n_items;