quarkiverse · cescoffier · May 15, 2024 · May 15, 2024
@@ -74,5 +74,11 @@ interface DevServicesConfig {
          */
         @WithDefault("11434")
         Integer port();
+
+        /**
+         * Instructs Ollama to preload a model in order to get faster response times.
+         * <p>
+         * Enabled by default. The dev service only preloads when it finds exactly one Ollama chat model,
+         * because it only makes sense to load a single model.
+         */
+        @WithDefault("true")
+        boolean preload();
     }
 }
@@ -139,6 +139,14 @@ public void onComplete() {
                     throw new RuntimeException(e.getCause());
                 }
             }
+
+            // preload model - it only makes sense to load a single model
+            if ((ollamaChatModels.size() == 1) && (config.devservices().preload())) {
+                String modelName = ollamaChatModels.get(0).getModelName();
+                LOGGER.infof("Preloading model %s", modelName);
+                client.preloadChatModel(modelName);
+            }
+
             compressor.close();
 
             String ollamaBaseUrl = String.format("http://localhost:%d", config.devservices().port());

@@ -124,6 +124,33 @@ public Flow.Publisher<PullAsyncLine> pullAsync(String modelName) {
         }
     }
 
+    /**
+     * Asks the Ollama server to preload {@code modelName} so that later requests get faster responses.
+     * <p>
+     * Sends a {@code POST} to {@code /api/chat} whose JSON body contains only the model name, and blocks
+     * until the server has answered.
+     *
+     * @param modelName the name of the model to preload
+     * @throws OllamaClient.ServerUnavailableException if the connection to the server cannot be established
+     * @throws UncheckedIOException if any other I/O error occurs while talking to the server
+     * @throws RuntimeException if the server answers with a status other than 200, or if the calling thread
+     *         is interrupted while waiting (the interrupt flag is restored before throwing)
+     */
+    @Override
+    public void preloadChatModel(String modelName) {
+        String serverUrl = String.format("http://%s:%d/api/chat", options.host(), options.port());
+        try {
+            // Serialize the name through Jackson so quotes or backslashes in it cannot break the JSON body
+            String payload = String.format("{\"model\": %s}", new ObjectMapper().writeValueAsString(modelName));
+
+            HttpRequest httpRequest = HttpRequest.newBuilder()
+                    .uri(new URI(serverUrl))
+                    .header("Content-Type", "application/json")
+                    .POST(HttpRequest.BodyPublishers.ofString(payload))
+                    .build();
+
+            HttpResponse<String> httpResponse = HttpClient.newHttpClient().send(httpRequest,
+                    HttpResponse.BodyHandlers.ofString());
+            if (httpResponse.statusCode() != 200) {
+                throw new RuntimeException(
+                        "Unexpected response code: " + httpResponse.statusCode() + " response body: "
+                                + httpResponse.body());
+            }
+        } catch (URISyntaxException e) {
+            throw new IllegalStateException("Unable to convert " + serverUrl + " to URI", e);
+        } catch (ConnectException e) {
+            throw new OllamaClient.ServerUnavailableException(options.host(), options.port());
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        } catch (InterruptedException e) {
+            // Restore the interrupt flag so callers further up the stack can still observe the interruption
+            Thread.currentThread().interrupt();
+            throw new RuntimeException(e);
+        }
+    }
+
     private record PullAsyncLineSubscriber(MultiEmitter<? super PullAsyncLine> emitter, ObjectMapper objectMapper,
             String modelName) implements Flow.Subscriber<String> {
 

@@ -29,6 +29,14 @@ static OllamaClient create(Options options) {
      */
     Flow.Publisher<PullAsyncLine> pullAsync(String modelName);
 
+    /**
+     * Instructs Ollama to preload a model in order to get faster response times.
+     * See <a href=
+     * "https://github.com/ollama/ollama/blob/main/docs/faq.md#how-can-i-pre-load-a-model-to-get-faster-response-times">the
+     * Ollama FAQ</a> for more information.
+     *
+     * @param modelName the name of the model to preload
+     */
+    void preloadChatModel(String modelName);
+
     record ModelInfo(String name, @JsonProperty("modelfile") String modelFile, String parameters, Details details) {
 
         public record Details(String family, String parameterSize) {

@@ -220,4 +220,26 @@ public void testRealPullAsync() {
         failedSubscriber.assertCompleted();
     }
 
+    /**
+     * Stubs {@code POST /api/chat} with a canned JSON answer whose {@code done_reason} is {@code load} and
+     * checks that {@code preloadChatModel} completes without throwing.
+     */
+    @Test
+    public void testPreLoad() {
+        String loadedResponse = """
+                {
+                  "model": "llama3",
+                  "created_at": "2024-05-15T06:20:50.537329742Z",
+                  "message": {
+                    "role": "assistant",
+                    "content": ""
+                  },
+                  "done_reason": "load",
+                  "done": true
+                }""";
+
+        wiremock().register(post(urlEqualTo("/api/chat"))
+                .willReturn(aResponse()
+                        .withHeader("Content-Type", "application/json")
+                        .withBody(loadedResponse)));
+
+        client.preloadChatModel("llama3");
+    }
+
 }