diff --git a/README.md b/README.md
index 0da9ddc..f79bf8c 100644
--- a/README.md
+++ b/README.md
@@ -224,4 +224,6 @@ output_file: optional. if specified, dump the output to this file instead of std
 See [tests/README.md](tests/README.md) for more info about benchmarking.
 
 ## Future Work
--   [ ] Implement proper bicubic interpolation (PIL uses a convolutions-based algorithm, and it's more stable than affine transformations).
+-   [ ] Update to the latest GGML and support multiple backends.
+
+-   [ ] 
diff --git a/clip.cpp b/clip.cpp
index a3710de..b797394 100644
--- a/clip.cpp
+++ b/clip.cpp
@@ -537,8 +537,8 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
         int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
         for (int i = 0; i < 3; ++i) {
-            new_clip->image_mean[i] = *((float *)gguf_get_arr_data(ctx, idx_mean));
-            new_clip->image_std[i] = *((float *)gguf_get_arr_data(ctx, idx_std));
+            new_clip->image_mean[i] = ((float *)gguf_get_arr_data(ctx, idx_mean))[i];
+            new_clip->image_std[i] = ((float *)gguf_get_arr_data(ctx, idx_std))[i];
         }
 
         if (verbosity >= 2) {
@@ -725,8 +725,75 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
     return true;
 }
 
+// Cubic convolution (bicubic) kernel with a = -0.5: weight of a sample at signed distance x; 0 once |x| >= 2.
+static inline double bicubic_filter(double x) {
+    constexpr double a = -0.5; // typed local constant instead of an unparenthesized function-local macro
+    if (x < 0.0) {
+        x = -x; // the kernel is symmetric, so evaluate it on |x|
+    }
+    if (x < 1.0) {
+        return ((a + 2.0) * x - (a + 3.0)) * x * x + 1; // inner lobe, 0 <= |x| < 1
+    }
+    if (x < 2.0) {
+        return (((x - 5) * x + 8) * x - 4) * a; // outer lobe, 1 <= |x| < 2
+    }
+    return 0.0;
+}
+
+// Precompute the bicubic resampling weights for one image axis (port of the coefficient setup in PIL's Resample.c).
+// Maps the source span [in0, in1) of an axis holding inSize samples onto outSize output samples.
+// On success returns true and stores malloc'ed tables that the caller must free(): *kkp holds outSize rows of *ksize
+// weights, *boundsp holds outSize {first source index, tap count} pairs. On failure returns false with both set to null.
+static bool precompute_coeffs(int inSize, float in0, float in1, int outSize, double ** kkp, int ** boundsp, int * ksize) {
+    *kkp = nullptr; // outputs are defined on every path, so a caller's cleanup can always free() them
+    *boundsp = nullptr;
+    *ksize = 0;
+    if (outSize <= 0) {
+        return false; // nothing to produce; also avoids the division by zero below
+    }
+    const double scale = (double)(in1 - in0) / outSize;
+    const double filterscale = scale < 1.0 ? 1.0 : scale; // the kernel is only widened when downscaling
+    const double support = 2.0 * filterscale;             // bicubic filter support (2.0) from Resample.c
+    const int ksize_local = (int)ceil(support) * 2 + 1;   // upper bound on taps per output sample
+
+    // Sizes are computed in size_t so that large images cannot overflow int arithmetic.
+    double * kk = (double *)malloc((size_t)outSize * ksize_local * sizeof(double));
+    int * bounds = (int *)malloc((size_t)outSize * 2 * sizeof(int));
+    if (!kk || !bounds) {
+        free(kk);
+        free(bounds);
+        return false;
+    }
+
+    const double ss = 1.0 / filterscale;
+    for (int xx = 0; xx < outSize; xx++) {
+        const double center = in0 + (xx + 0.5) * (in1 - in0) / outSize;
+        const int xmin = std::max((int)(center - support + 0.5), 0);      // first source sample of the window
+        const int xend = std::min((int)(center + support + 0.5), inSize); // one past the last source sample
+        const int xmax = std::max(xend - xmin, 0); // tap count; clamped so a window outside the source stays in bounds
+
+        double * k = &kk[(size_t)xx * ksize_local];
+        double ww = 0.0;
+        for (int x = 0; x < xmax; x++) {
+            k[x] = bicubic_filter((x + xmin - center + 0.5) * ss);
+            ww += k[x];
+        }
+        for (int x = 0; x < xmax && ww != 0.0; x++) {
+            k[x] /= ww; // normalize so the taps sum to 1
+        }
+        for (int x = xmax; x < ksize_local; x++) {
+            k[x] = 0.0; // unused tail of the row stays zero
+        }
+        bounds[xx * 2 + 0] = xmin;
+        bounds[xx * 2 + 1] = xmax;
+    }
+    *kkp = kk;
+    *boundsp = bounds;
+    *ksize = ksize_local;
+    return true;
+}
+
 // normalize: x = (x - mean) / std
-// TODO: implement bicubic interpolation instead of linear.
 bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res) {
     if (!ctx->has_vision_encoder) {
         printf("This gguf file seems to have no vision encoder\n");
@@ -735,63 +802,127 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
 
     const int nx = img->nx;
     const int ny = img->ny;
-
     const int nx2 = ctx->vision_model.hparams.image_size;
     const int ny2 = ctx->vision_model.hparams.image_size;
 
+    // Pipeline: bicubic resize (shorter side -> image_size), center crop to image_size x image_size, normalize.
     res->nx = nx2;
     res->ny = ny2;
     res->size = 3 * nx2 * ny2;
     res->data = new float[res->size]();
+    if (!res->data) { // NOTE(review): plain new[] throws std::bad_alloc on failure rather than returning null
+        printf("clip_image_f32 Memory allocation failed\n");
+        return false;
+    }
 
-    const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size;
-
-    const int nx3 = int(nx / scale + 0.5f);
-    const int ny3 = int(ny / scale + 0.5f);
+    // Scale so that the shorter side becomes image_size; both axes use the same factor, preserving the aspect ratio
+    const float scale = std::min((float)nx, (float)ny) / (float)ctx->vision_model.hparams.image_size;
+    const int nx3 = (int)(nx / scale + 0.5f);
+    const int ny3 = (int)(ny / scale + 0.5f);
+
+    const auto & m3 = ctx->image_mean;
+    const auto & s3 = ctx->image_std;
+
+    // Per-axis coefficient tables; initialized so the failure path below never free()s an indeterminate pointer
+    double *kk_horiz = nullptr, *kk_vert = nullptr;
+    int *bounds_horiz = nullptr, *bounds_vert = nullptr;
+    int ksize_horiz = 0, ksize_vert = 0;
+
+    if (!precompute_coeffs(nx, 0.0f, (float)nx, nx3, &kk_horiz, &bounds_horiz, &ksize_horiz) ||
+        !precompute_coeffs(ny, 0.0f, (float)ny, ny3, &kk_vert, &bounds_vert, &ksize_vert)) {
+        delete[] res->data;
+        free(kk_horiz); // free(nullptr) is a no-op, so this is safe whichever call failed
+        free(bounds_horiz);
+        free(kk_vert);
+        free(bounds_vert);
+        printf("Failed to calculate coeffs\n");
+        return false;
+    }
 
-    const auto & m3 = ctx->image_mean; // {0.48145466f, 0.4578275f, 0.40821073f};
-    const auto & s3 = ctx->image_std;  // {0.26862954f, 0.26130258f, 0.27577711f};
+    // Intermediate image buffer (stores horizontal resampling results): nx3 columns by ny source rows
+    float * temp = new float[3 * nx3 * ny]();
+    if (!temp) {
+        delete[] res->data;
+        free(kk_horiz);
+        free(bounds_horiz);
+        free(kk_vert);
+        free(bounds_vert);
+        printf("Failed to allocate intermediate buffer memory\n");
+        return false;
+    }
 
-    for (int y = 0; y < ny3; y++) {
-        for (int x = 0; x < nx3; x++) {
+    // Horizontal bicubic resampling: every source row is resized from nx to nx3 samples
+    for (int y = 0; y < ny; y++) {
+        for (int xx = 0; xx < nx3; xx++) {
+            int xmin = bounds_horiz[xx * 2 + 0]; // first contributing source column
+            int xmax = bounds_horiz[xx * 2 + 1]; // number of contributing columns (a count, not an index)
+            double * k = &kk_horiz[xx * ksize_horiz];
             for (int c = 0; c < 3; c++) {
-                // linear interpolation
-                const float sx = (x + 0.5f) * scale - 0.5f;
-                const float sy = (y + 0.5f) * scale - 0.5f;
-
-                const int x0 = std::max(0, (int)std::floor(sx));
-                const int y0 = std::max(0, (int)std::floor(sy));
-
-                const int x1 = std::min(x0 + 1, nx - 1);
-                const int y1 = std::min(y0 + 1, ny - 1);
-
-                const float dx = sx - x0;
-                const float dy = sy - y0;
-
-                const int j00 = 3 * (y0 * nx + x0) + c;
-                const int j01 = 3 * (y0 * nx + x1) + c;
-                const int j10 = 3 * (y1 * nx + x0) + c;
-                const int j11 = 3 * (y1 * nx + x1) + c;
-
-                const float v00 = img->data[j00];
-                const float v01 = img->data[j01];
-                const float v10 = img->data[j10];
-                const float v11 = img->data[j11];
-
-                const float v0 = v00 * (1.0f - dx) + v01 * dx;
-                const float v1 = v10 * (1.0f - dx) + v11 * dx;
+                double ss = 0.0;
+                for (int x = 0; x < xmax; x++) {
+                    int src_idx = 3 * (y * nx + (x + xmin)) + c;
+                    ss += (double)img->data[src_idx] * k[x];
+                }
+                int dst_idx = 3 * (y * nx3 + xx) + c;
+                temp[dst_idx] = std::min(std::max((float)ss, 0.0f), 255.0f); // bicubic weights can overshoot [0, 255]
+            }
+        }
+    }
 
-                const float v = v0 * (1.0f - dy) + v1 * dy;
+    // Vertical bicubic resampling: every column of temp is resized from ny to ny3 samples
+    float * resampled = new float[3 * nx3 * ny3]();
+    if (!resampled) {
+        delete[] temp;
+        delete[] res->data;
+        free(kk_horiz);
+        free(bounds_horiz);
+        free(kk_vert);
+        free(bounds_vert);
+        printf("Failed to allocate resampling buffer memory\n");
+        return false;
+    }
 
-                const uint8_t v2 = std::min(std::max(std::round(v), 0.0f), 255.0f);
+    for (int yy = 0; yy < ny3; yy++) {
+        int ymin = bounds_vert[yy * 2 + 0]; // first contributing row of temp
+        int ymax = bounds_vert[yy * 2 + 1]; // number of contributing rows (a count, not an index)
+        double * k = &kk_vert[yy * ksize_vert];
+        for (int x = 0; x < nx3; x++) {
+            for (int c = 0; c < 3; c++) {
+                double ss = 0.0;
+                for (int y = 0; y < ymax; y++) {
+                    int src_idx = 3 * ((y + ymin) * nx3 + x) + c;
+                    ss += (double)temp[src_idx] * k[y];
+                }
+                int dst_idx = 3 * (yy * nx3 + x) + c;
+                resampled[dst_idx] = std::min(std::max((float)ss, 0.0f), 255.0f);
+            }
+        }
+    }
 
-                const int i = 3 * (y * nx3 + x) + c;
+    // Center crop the nx3 x ny3 image to nx2 x ny2, then normalize: x = (x / 255 - mean) / std
+    int x_offset = (nx3 - nx2) / 2;
+    int y_offset = (ny3 - ny2) / 2;
 
-                res->data[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
+    for (int yy = 0; yy < ny2; yy++) {
+        for (int x = 0; x < nx2; x++) {
+            int src_y = yy + y_offset;
+            int src_x = x + x_offset;
+            int src_idx = 3 * (src_y * nx3 + src_x);
+            int dst_idx = 3 * (yy * nx2 + x);
+            for (int c = 0; c < 3; c++) {
+                float v = resampled[src_idx + c];
+                res->data[dst_idx + c] = ((v / 255.0f) - m3[c]) / s3[c];
             }
         }
     }
 
+    delete[] resampled;
+    delete[] temp;
+    free(kk_horiz);
+    free(bounds_horiz);
+    free(kk_vert);
+    free(bounds_vert);
+
     return true;
 }
 
@@ -871,6 +1002,8 @@ void clip_image_batch_preprocess(const clip_ctx * ctx, const int n_threads, cons
         for (t = 0; t < num_threads; t++) {
             pthread_join(threads[t], NULL);
         }
+
+        delete[] imageDataRange;
     }
 }
 
diff --git a/examples/python_bindings/clip_cpp/clip.py b/examples/python_bindings/clip_cpp/clip.py
index d927652..85cb33b 100644
--- a/examples/python_bindings/clip_cpp/clip.py
+++ b/examples/python_bindings/clip_cpp/clip.py
@@ -199,11 +199,11 @@ class ClipContext(ctypes.Structure):
 # ]
 # clip_image_batch_encode.restype = ctypes.c_bool
 
-make_clip_image_u8 = clip_lib.make_clip_image_u8
+make_clip_image_u8 = clip_lib.clip_image_u8_make  # Python-side name make_clip_image_u8 is bound to the library symbol clip_image_u8_make
 make_clip_image_u8.argtypes = []
 make_clip_image_u8.restype = ctypes.POINTER(ClipImageU8)
 
-make_clip_image_f32 = clip_lib.make_clip_image_f32
+make_clip_image_f32 = clip_lib.clip_image_f32_make  # Python-side name make_clip_image_f32 is bound to the library symbol clip_image_f32_make
 make_clip_image_f32.argtypes = []
 make_clip_image_f32.restype = ctypes.POINTER(ClipImageF32)