diff --git a/README.md b/README.md
index 0da9ddc..f79bf8c 100644
--- a/README.md
+++ b/README.md
@@ -224,4 +224,4 @@ output_file: optional. if specified, dump the output to this file instead of std
 See [tests/README.md](tests/README.md) for more info about benchmarking.
 
 ## Future Work
-- [ ] Implement proper bicubic interpolation (PIL uses a convolutions-based algorithm, and it's more stable than affine transformations).
+- [ ] Integrate up to the latest GGML and support multiple backends.
diff --git a/clip.cpp b/clip.cpp
index a3710de..b797394 100644
--- a/clip.cpp
+++ b/clip.cpp
@@ -537,8 +537,8 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
         int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
         for (int i = 0; i < 3; ++i) {
-            new_clip->image_mean[i] = *((float *)gguf_get_arr_data(ctx, idx_mean));
-            new_clip->image_std[i] = *((float *)gguf_get_arr_data(ctx, idx_std));
+            new_clip->image_mean[i] = ((float *)gguf_get_arr_data(ctx, idx_mean))[i];
+            new_clip->image_std[i] = ((float *)gguf_get_arr_data(ctx, idx_std))[i];
         }
 
         if (verbosity >= 2) {
@@ -725,8 +725,96 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
     return true;
 }
 
+// Cubic convolution kernel with a = -0.5, evaluated at distance x from the
+// sample center. Symmetric in x and zero for |x| >= 2.
+static inline double bicubic_filter(double x) {
+    const double a = -0.5;
+    if (x < 0.0) {
+        x = -x;
+    }
+    if (x < 1.0) {
+        return ((a + 2.0) * x - (a + 3.0)) * x * x + 1;
+    }
+    if (x < 2.0) {
+        return (((x - 5) * x + 8) * x - 4) * a;
+    }
+    return 0.0;
+}
+
+// Builds the 1-D resampling table that maps inSize source samples (the
+// window [in0, in1)) onto outSize destination samples.
+//   *kkp     - outSize rows of *ksize weights, scaled to sum to 1 (unused tail is 0)
+//   *boundsp - outSize pairs {first source index, number of taps}
+// Both arrays come from malloc() and must be released with free() by the
+// caller. Returns false, with *kkp and *boundsp set to NULL, when a size is
+// not positive or an allocation fails.
+static bool precompute_coeffs(int inSize, float in0, float in1, int outSize, double ** kkp, int ** boundsp, int * ksize) {
+    // Publish safe values first so a caller may free() the outputs
+    // unconditionally, even after a failure.
+    *kkp = NULL;
+    *boundsp = NULL;
+    *ksize = 0;
+    if (inSize <= 0 || outSize <= 0) {
+        return false;
+    }
+
+    double support = 2.0; // Bicubic filter support from Resample.c
+    double filterscale = (double)(in1 - in0) / outSize;
+    if (filterscale < 1.0) {
+        filterscale = 1.0;
+    }
+    support *= filterscale;
+    int ksize_local = (int)ceil(support) * 2 + 1;
+
+    // Widen to size_t before multiplying so large images cannot overflow int.
+    double * kk = (double *)malloc((size_t)outSize * ksize_local * sizeof(double));
+    int * bounds = (int *)malloc((size_t)outSize * 2 * sizeof(int));
+    if (!kk || !bounds) {
+        free(kk);
+        free(bounds);
+        return false;
+    }
+
+    for (int xx = 0; xx < outSize; xx++) {
+        double center = in0 + (xx + 0.5) * (in1 - in0) / outSize;
+        double ww = 0.0;
+        double ss = 1.0 / filterscale;
+        int xmin = (int)(center - support + 0.5);
+        if (xmin < 0)
+            xmin = 0;
+        int xmax = (int)(center + support + 0.5);
+        if (xmax > inSize)
+            xmax = inSize;
+        xmax -= xmin;
+
+        double * k = &kk[xx * ksize_local];
+        for (int x = 0; x < xmax; x++) {
+            double w = bicubic_filter((x + xmin - center + 0.5) * ss);
+            k[x] = w;
+            ww += w;
+        }
+        for (int x = 0; x < xmax; x++) {
+            if (ww != 0.0) {
+                k[x] /= ww;
+            }
+        }
+        for (int x = xmax; x < ksize_local; x++) {
+            k[x] = 0.0;
+        }
+        bounds[xx * 2 + 0] = xmin;
+        bounds[xx * 2 + 1] = xmax;
+    }
+
+    *kkp = kk;
+    *boundsp = bounds;
+    *ksize = ksize_local;
+    return true;
+}
+
 // normalize: x = (x - mean) / std
-// TODO: implement bicubic interpolation instead of linear.
+// Resizes img with a separable bicubic filter so that its shorter side equals
+// the model's image_size, center-crops to image_size x image_size, and stores
+// the normalized float pixels in res. Returns false on failure.
 bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res) {
     if (!ctx->has_vision_encoder) {
         printf("This gguf file seems to have no vision encoder\n");
@@ -735,63 +823,138 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
 
     const int nx = img->nx;
     const int ny = img->ny;
-
     const int nx2 = ctx->vision_model.hparams.image_size;
     const int ny2 = ctx->vision_model.hparams.image_size;
 
+    // A zero-sized image would make the scale below zero or NaN.
+    if (nx <= 0 || ny <= 0 || nx2 <= 0) {
+        printf("Invalid image dimensions\n");
+        return false;
+    }
+
+    // Setting the output image size and allocating memory
     res->nx = nx2;
     res->ny = ny2;
     res->size = 3 * nx2 * ny2;
     res->data = new float[res->size]();
+    if (!res->data) {
+        printf("clip_image_f32 Memory allocation failed\n");
+        return false;
+    }
 
-    const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size;
-
-    const int nx3 = int(nx / scale + 0.5f);
-    const int ny3 = int(ny / scale + 0.5f);
+    // Calculate aspect ratio maintaining scaling
+    const float scale = std::min((float)nx, (float)ny) / (float)ctx->vision_model.hparams.image_size;
+    const int nx3 = (int)(nx / scale + 0.5f);
+    const int ny3 = (int)(ny / scale + 0.5f);
+
+    const auto & m3 = ctx->image_mean;
+    const auto & s3 = ctx->image_std;
+
+    // Calculating horizontal and vertical coeffs.
+    // The tables start as NULL so every error path can free() them safely.
+    double *kk_horiz = NULL, *kk_vert = NULL;
+    int *bounds_horiz = NULL, *bounds_vert = NULL;
+    int ksize_horiz, ksize_vert;
+
+    if (!precompute_coeffs(nx, 0.0f, (float)nx, nx3, &kk_horiz, &bounds_horiz, &ksize_horiz) ||
+        !precompute_coeffs(ny, 0.0f, (float)ny, ny3, &kk_vert, &bounds_vert, &ksize_vert)) {
+        delete[] res->data;
+        res->data = NULL;
+        free(kk_horiz);
+        free(bounds_horiz);
+        free(kk_vert);
+        free(bounds_vert);
+        printf("Failed to calculate coeffs\n");
+        return false;
+    }
 
-    const auto & m3 = ctx->image_mean; // {0.48145466f, 0.4578275f, 0.40821073f};
-    const auto & s3 = ctx->image_std; // {0.26862954f, 0.26130258f, 0.27577711f};
+    // Intermediate image buffer (stores horizontal resampling results).
+    // calloc() zero-fills and returns NULL on failure, so the check below is live.
+    float * temp = (float *)calloc((size_t)3 * nx3 * ny, sizeof(float));
+    if (!temp) {
+        delete[] res->data;
+        res->data = NULL;
+        free(kk_horiz);
+        free(bounds_horiz);
+        free(kk_vert);
+        free(bounds_vert);
+        printf("Failed to allocate intermediate buffer memory\n");
+        return false;
+    }
 
-    for (int y = 0; y < ny3; y++) {
-        for (int x = 0; x < nx3; x++) {
+    // Horizontal bicubic resampling
+    for (int y = 0; y < ny; y++) {
+        for (int xx = 0; xx < nx3; xx++) {
+            int xmin = bounds_horiz[xx * 2 + 0];
+            int xmax = bounds_horiz[xx * 2 + 1];
+            double * k = &kk_horiz[xx * ksize_horiz];
             for (int c = 0; c < 3; c++) {
-                // linear interpolation
-                const float sx = (x + 0.5f) * scale - 0.5f;
-                const float sy = (y + 0.5f) * scale - 0.5f;
-
-                const int x0 = std::max(0, (int)std::floor(sx));
-                const int y0 = std::max(0, (int)std::floor(sy));
-
-                const int x1 = std::min(x0 + 1, nx - 1);
-                const int y1 = std::min(y0 + 1, ny - 1);
-
-                const float dx = sx - x0;
-                const float dy = sy - y0;
-
-                const int j00 = 3 * (y0 * nx + x0) + c;
-                const int j01 = 3 * (y0 * nx + x1) + c;
-                const int j10 = 3 * (y1 * nx + x0) + c;
-                const int j11 = 3 * (y1 * nx + x1) + c;
-
-                const float v00 = img->data[j00];
-                const float v01 = img->data[j01];
-                const float v10 = img->data[j10];
-                const float v11 = img->data[j11];
-
-                const float v0 = v00 * (1.0f - dx) + v01 * dx;
-                const float v1 = v10 * (1.0f - dx) + v11 * dx;
+                double ss = 0.0;
+                for (int x = 0; x < xmax; x++) {
+                    int src_idx = 3 * (y * nx + (x + xmin)) + c;
+                    ss += (double)img->data[src_idx] * k[x];
+                }
+                int dst_idx = 3 * (y * nx3 + xx) + c;
+                temp[dst_idx] = std::min(std::max((float)ss, 0.0f), 255.0f);
+            }
+        }
+    }
 
-                const float v = v0 * (1.0f - dy) + v1 * dy;
+    // Vertical bicubic resampling
+    float * resampled = (float *)calloc((size_t)3 * nx3 * ny3, sizeof(float));
+    if (!resampled) {
+        free(temp);
+        delete[] res->data;
+        res->data = NULL;
+        free(kk_horiz);
+        free(bounds_horiz);
+        free(kk_vert);
+        free(bounds_vert);
+        printf("Failed to allocate resampling buffer memory\n");
+        return false;
+    }
 
-                const uint8_t v2 = std::min(std::max(std::round(v), 0.0f), 255.0f);
+    for (int yy = 0; yy < ny3; yy++) {
+        int ymin = bounds_vert[yy * 2 + 0];
+        int ymax = bounds_vert[yy * 2 + 1];
+        double * k = &kk_vert[yy * ksize_vert];
+        for (int x = 0; x < nx3; x++) {
+            for (int c = 0; c < 3; c++) {
+                double ss = 0.0;
+                for (int y = 0; y < ymax; y++) {
+                    int src_idx = 3 * ((y + ymin) * nx3 + x) + c;
+                    ss += (double)temp[src_idx] * k[y];
+                }
+                int dst_idx = 3 * (yy * nx3 + x) + c;
+                resampled[dst_idx] = std::min(std::max((float)ss, 0.0f), 255.0f);
+            }
+        }
+    }
 
-                const int i = 3 * (y * nx3 + x) + c;
+    // Center crop and normalize
+    int x_offset = (nx3 - nx2) / 2;
+    int y_offset = (ny3 - ny2) / 2;
 
-                res->data[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
+    for (int yy = 0; yy < ny2; yy++) {
+        for (int x = 0; x < nx2; x++) {
+            int src_y = yy + y_offset;
+            int src_x = x + x_offset;
+            int src_idx = 3 * (src_y * nx3 + src_x);
+            int dst_idx = 3 * (yy * nx2 + x);
+            for (int c = 0; c < 3; c++) {
+                float v = resampled[src_idx + c];
+                res->data[dst_idx + c] = ((v / 255.0f) - m3[c]) / s3[c];
             }
         }
     }
 
+    free(resampled);
+    free(temp);
+    free(kk_horiz);
+    free(bounds_horiz);
+    free(kk_vert);
+    free(bounds_vert);
+
     return true;
 }
 
@@ -871,6 +1034,8 @@ void clip_image_batch_preprocess(const clip_ctx * ctx, const int n_threads, cons
         for (t = 0; t < num_threads; t++) {
             pthread_join(threads[t], NULL);
         }
+
+        delete[] imageDataRange;
     }
 }
 
diff --git a/examples/python_bindings/clip_cpp/clip.py b/examples/python_bindings/clip_cpp/clip.py
index d927652..85cb33b 100644
--- a/examples/python_bindings/clip_cpp/clip.py
+++ b/examples/python_bindings/clip_cpp/clip.py
@@ -199,11 +199,11 @@ class ClipContext(ctypes.Structure):
 # ]
 # clip_image_batch_encode.restype = ctypes.c_bool
 
-make_clip_image_u8 = clip_lib.make_clip_image_u8
+make_clip_image_u8 = clip_lib.clip_image_u8_make
 make_clip_image_u8.argtypes = []
 make_clip_image_u8.restype = ctypes.POINTER(ClipImageU8)
 
-make_clip_image_f32 = clip_lib.make_clip_image_f32
+make_clip_image_f32 = clip_lib.clip_image_f32_make
 make_clip_image_f32.argtypes = []
 make_clip_image_f32.restype = ctypes.POINTER(ClipImageF32)
 