adding parallel inference code

kherud · vaiju1981 · Mar 14, 2025 · Mar 15, 2025 · Mar 18, 2025 · Mar 18, 2025
commit 29bef1a41b0bc7acc7dc29217ad6cc4d85926ee2
diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp
@@ -2304,4 +2304,130 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleKVCacheAction(JN
         env->ThrowNew(c_llama_error, e.what());
         return nullptr;
     }
+}
+
+/**
+ * Configure parallel inference settings.
+ * Controls how inference tasks are distributed and executed in parallel.
+ */
+JNIEXPORT jboolean JNICALL Java_de_kherud_llama_LlamaModel_configureParallelInference(JNIEnv* env, jobject obj, jstring jconfig) {
+    try {
+        // Get server context pointer from Java object
+        jlong server_handle = env->GetLongField(obj, f_model_pointer);
+        if (server_handle == 0) {
+            env->ThrowNew(c_llama_error, "Model is not loaded");
+            return JNI_FALSE;
+        }
+
+        auto* ctx_server = reinterpret_cast<server_context*>(server_handle);
+
+        // Parse configuration from JSON
+        std::string config_str = parse_jstring(env, jconfig);
+        json config = json::parse(config_str);
+
+        // Store original settings for rollback in case of failure
+        int original_n_parallel = ctx_server->params_base.n_parallel;
+        float original_similarity_threshold = ctx_server->slot_prompt_similarity;
+
+        // Track changes to report
+        json changes = json::object();
+        bool changes_made = false;
+
+        if (config.contains("n_parallel")) {
+            int n_parallel = config["n_parallel"].get<int>();
+            if (n_parallel <= 0) {
+                env->ThrowNew(c_llama_error, "n_parallel must be greater than 0");
+                return JNI_FALSE;
+            }
+
+            if (n_parallel != ctx_server->params_base.n_parallel) {
+                // Changing the number of parallel slots requires model reloading
+                // which isn't supported at runtime, so we'll throw an error
+                env->ThrowNew(c_llama_error, "Changing the number of parallel slots requires restarting the model");
+                return JNI_FALSE;
+            }
+
+            changes["n_parallel"] = n_parallel;
+        }
+
+        if (config.contains("slot_prompt_similarity")) {
+            float similarity = config["slot_prompt_similarity"].get<float>();
+            if (similarity < 0.0f || similarity > 1.0f) {
+                env->ThrowNew(c_llama_error, "slot_prompt_similarity must be between 0.0 and 1.0");
+                return JNI_FALSE;
+            }
+
+            ctx_server->slot_prompt_similarity = similarity;
+            changes["slot_prompt_similarity"] = similarity;
+            changes_made = true;
+        }
+
+        // Check for other parameters in server context that you want to configure
+        // For example, n_threads, n_threads_batch, etc.
+        if (config.contains("n_threads")) {
+            int n_threads = config["n_threads"].get<int>();
+            if (n_threads <= 0) {
+                env->ThrowNew(c_llama_error, "n_threads must be greater than 0");
+                return JNI_FALSE;
+            }
+
+            ctx_server->params_base.cpuparams.n_threads = n_threads;
+            changes["n_threads"] = n_threads;
+            changes_made = true;
+        }
+
+        if (config.contains("n_threads_batch")) {
+            int n_threads_batch = config["n_threads_batch"].get<int>();
+            if (n_threads_batch <= 0) {
+                env->ThrowNew(c_llama_error, "n_threads_batch must be greater than 0");
+                return JNI_FALSE;
+            }
+
+            ctx_server->params_base.cpuparams_batch.n_threads = n_threads_batch;
+            changes["n_threads_batch"] = n_threads_batch;
+            changes_made = true;
+        }
+
+        // Since there's no dedicated task type for updating parallel config,
+        // we'll use the metrics task to ensure the changes are propagated
+        // through the server context
+        if (changes_made) {
+            // Request metrics to ensure changes are propagated
+            server_task task(SERVER_TASK_TYPE_METRICS);
+            task.id = ctx_server->queue_tasks.get_new_id();
+
+            ctx_server->queue_results.add_waiting_task_id(task.id);
+            ctx_server->queue_tasks.post(task, true);  // High priority
+
+            // Wait for the result
+            server_task_result_ptr result = ctx_server->queue_results.recv(task.id);
+            ctx_server->queue_results.remove_waiting_task_id(task.id);
+
+            if (result->is_error()) {
+                // Rollback changes if there was an error
+                ctx_server->params_base.n_parallel = original_n_parallel;
+                ctx_server->slot_prompt_similarity = original_similarity_threshold;
+
+                std::string error_msg = result->to_json()["message"].get<std::string>();
+                env->ThrowNew(c_llama_error, error_msg.c_str());
+                return JNI_FALSE;
+            }
+
+            // Create a success response
+            json response = {
+                {"success", true},
+                {"changes", changes}
+            };
+
+            SRV_INF("Parallel inference configuration updated: %s\n", changes.dump().c_str());
+            return JNI_TRUE;
+        } else {
+            SRV_INF("No parallel inference parameters were changed\n", " ");
+            return JNI_TRUE;
+        }
+    } catch (const std::exception& e) {
+        SRV_ERR("Exception in configureParallelInference: %s\n", e.what());
+        env->ThrowNew(c_llama_error, e.what());
+        return JNI_FALSE;
+    }
 }
diff --git a/src/main/cpp/jllama.h b/src/main/cpp/jllama.h
@@ -157,6 +157,8 @@ JNIEXPORT jintArray JNICALL Java_de_kherud_llama_LlamaModel_encode(JNIEnv * , jo
  */
 JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleKVCacheAction(JNIEnv* env, jobject obj, jint action, jint slotId, jstring jfilename);
 
+JNIEXPORT jboolean JNICALL Java_de_kherud_llama_LlamaModel_configureParallelInference(JNIEnv* , jobject , jstring );
+
   #ifdef __cplusplus
 }
 #endif

diff --git a/src/main/java/de/kherud/llama/LlamaModel.java b/src/main/java/de/kherud/llama/LlamaModel.java
@@ -308,4 +308,7 @@ public void close() throws Exception {
 	public static final int KVCACHE_ACTION_CLEAR = 1;
 	public static final int KVCACHE_ACTION_SAVE = 2;
 	public static final int KVCACHE_ACTION_LOAD = 3;
+
+
+	public native boolean configureParallelInference(String config);
 }
diff --git a/src/test/java/de/kherud/llama/LlamaEmbedingModelTest.java b/src/test/java/de/kherud/llama/LlamaEmbedingModelTest.java
@@ -34,6 +34,8 @@ public static void tearDown() throws Exception {
 
 	@Test
 	public void testEmbedding() {
+
+		model.handleKVCacheAction(LlamaModel.KVCACHE_ACTION_CLEAR, 0, null);
 	    // Create the request in JSON format
 	    String request = "{\"content\": \"You are an AI Assistant\"}";
 

diff --git a/src/test/java/de/kherud/llama/ParallelTests.java b/src/test/java/de/kherud/llama/ParallelTests.java
@@ -0,0 +1,149 @@
+package de.kherud.llama;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Ignore;
+
+import com.fasterxml.jackson.databind.JsonNode;
+
+public class ParallelTests {
+
+	private static LlamaModel model;
+
+	@BeforeClass
+	public static void setup() {
+		model = new LlamaModel(new ModelParameters()
+				.setModel("models/Phi-4-mini-instruct-Q2_K.gguf")
+				.setGpuLayers(43)
+				.enableLogTimestamps()
+				.enableLogPrefix()
+				.enableJinja()
+				.slotSavePath("models"));
+				;
+	}
+
+	@AfterClass
+	public static void tearDown() throws Exception {
+		if (model != null) {
+			model.close();
+		}
+	}
+
+	@Ignore
+	public void testParallelInference() {
+	    System.out.println("***** Running the test: testParallelInference");
+
+	    // 1. Configure parallel inference with specific parameters
+	    String config = "{\"slot_prompt_similarity\": 0.8, \"batch_mode\": true, \"defer_when_full\": true}";
+	    boolean configSuccess = model.configureParallelInference(config);
+	    Assert.assertTrue("Failed to configure parallel inference", configSuccess);
+
+	    // 2. Create multiple inference tasks with different prompts
+	    List<String> prompts = Arrays.asList(
+	        "The quick brown fox",
+	        "Once upon a time",
+	        "In a galaxy far far away",
+	        "Four score and seven years ago"
+	    );
+
+	    // 3. Execute tasks concurrently and measure response times
+	    List<Callable<Long>> tasks = new ArrayList<>();
+	    List<Future<Long>> futures = new ArrayList<>();
+	    ExecutorService executor = Executors.newFixedThreadPool(prompts.size());
+
+	    for (String prompt : prompts) {
+	        tasks.add(() -> {
+	            long startTime = System.currentTimeMillis();
+
+	            InferenceParameters params = new InferenceParameters()
+	                .setPrompt(prompt)
+	                .setNPredict(10);
+
+	            // Run completion and wait for result
+	            String result = model.handleCompletions(params.toString(), false);
+
+	            // Calculate execution time
+	            return System.currentTimeMillis() - startTime;
+	        });
+	    }
+
+	    try {
+	        // Submit all tasks
+	        futures = executor.invokeAll(tasks);
+
+	        // Collect execution times
+	        List<Long> executionTimes = new ArrayList<>();
+	        for (Future<Long> future : futures) {
+	            executionTimes.add(future.get());
+	        }
+
+	        // 4. Verify parallel execution happened
+	        // Calculate total and average execution time
+	        long totalTime = executionTimes.stream().mapToLong(Long::longValue).sum();
+	        long avgTime = totalTime / executionTimes.size();
+
+	        System.out.println("Individual execution times: " + executionTimes);
+	        System.out.println("Total execution time: " + totalTime + "ms");
+	        System.out.println("Average execution time: " + avgTime + "ms");
+
+	        // 5. Validate the results - if parallel inference is working correctly:
+	        // - Total time should be less than sum of individual times if run sequentially
+	        // - Individual times should be reasonable given the prompt length
+
+	        // Here we're assuming that if parallel inference is working correctly,
+	        // the total time should be significantly less than 4x the average time
+	        // This is a heuristic and might need adjustment based on your hardware
+	        Assert.assertTrue("Parallel inference doesn't appear to be working efficiently",
+	                          totalTime < (avgTime * executionTimes.size() * 0.8));
+
+	    } catch (InterruptedException | ExecutionException e) {
+	        Assert.fail("Error during parallel execution: " + e.getMessage());
+	    } finally {
+	        executor.shutdown();
+	    }
+
+	    // 6. Test slot reuse with similar prompts
+	    String similarPrompt1 = "The quick brown fox jumps over the lazy dog";
+	    String similarPrompt2 = "The quick brown fox jumps over the fence";
+
+	    try {
+	        // First run with one prompt
+	        InferenceParameters params1 = new InferenceParameters()
+	            .setPrompt(similarPrompt1)
+	            .setNPredict(5);
+
+	        String result1 = model.handleCompletions(params1.toString(), false);
+
+	        // Then quickly run with a similar prompt - should reuse the slot
+	        InferenceParameters params2 = new InferenceParameters()
+	            .setPrompt(similarPrompt2)
+	            .setNPredict(5);
+
+	        String result2 = model.handleCompletions(params2.toString(), false);
+
+	        // Both operations should succeed
+	        JsonNode jsonNode1 = JsonUtils.INSTANCE.jsonToNode(result1);
+	        JsonNode jsonNode2 = JsonUtils.INSTANCE.jsonToNode(result2);
+
+	        Assert.assertTrue(jsonNode1.has("result"));
+	        Assert.assertTrue(jsonNode2.has("result"));
+
+	        // We can't directly verify slot reuse from the API, but we can check
+	        // that both operations completed successfully
+	        System.out.println("Successfully processed similar prompts, likely with slot reuse");
+
+	    } catch (Exception e) {
+	        Assert.fail("Error during slot reuse test: " + e.getMessage());
+	    }
+	}
+}
-Original file line number
+Diff line change
@@ Expand Up @@
      */
     JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleKVCacheAction(JNIEnv* env, jobject obj, jint action, jint slotId, jstring jfilename);
+    JNIEXPORT jboolean JNICALL Java_de_kherud_llama_LlamaModel_configureParallelInference(JNIEnv* , jobject , jstring );
       #ifdef __cplusplus
     }
     #endif
@@ Expand Down @@