serge-chat · gaby · Sep 26, 2023 · Sep 25, 2023 · Sep 26, 2023 · Sep 26, 2023
diff --git a/api/src/serge/routers/chat.py b/api/src/serge/routers/chat.py
@@ -1,5 +1,3 @@
-import os
-
 from typing import Optional
 from fastapi import APIRouter
 from langchain.memory import RedisChatMessageHistory
@@ -30,6 +28,7 @@ async def create_new_chat(
    repeat_last_n: int = 64,
     repeat_penalty: float = 1.3,
     init_prompt: str = "Below is an instruction that describes a task. Write a response that appropriately completes the request.",
+    n_threads: int = 4,
 ):
     try:
         client = Llama(
@@ -52,7 +51,7 @@ async def create_new_chat(
         n_gpu_layers=gpu_layers,
         last_n_tokens_size=repeat_last_n,
         repeat_penalty=repeat_penalty,
-        n_threads=len(os.sched_getaffinity(0)),
+        n_threads=n_threads,
         init_prompt=init_prompt,
     )
     # create the chat

diff --git a/api/src/serge/utils/llm.py b/api/src/serge/utils/llm.py
@@ -119,8 +119,8 @@ def _default_params(self) -> dict[str, Any]:
             "stop_sequences": self.stop_sequences,
             "repeat_penalty": self.repeat_penalty,
             "top_k": self.top_k,
-            "n_ctx": self.n_ctx,
             "n_threads": self.n_threads,
+            "n_ctx": self.n_ctx,
             "n_gpu_layers": self.n_gpu_layers,
             "n_parts": self.n_parts,
             "seed": self.seed,

diff --git a/web/src/routes/+layout.svelte b/web/src/routes/+layout.svelte
@@ -104,7 +104,7 @@
         `/api/chat/?model=${dataCht.params.model_path}&temperature=${dataCht.params.temperature}&top_k=${dataCht.params.top_k}` +
           `&top_p=${dataCht.params.top_p}&max_length=${dataCht.params.max_tokens}&context_window=${dataCht.params.n_ctx}` +
           `&repeat_last_n=${dataCht.params.last_n_tokens_size}&repeat_penalty=${dataCht.params.repeat_penalty}` +
-          `&init_prompt=${dataCht.history[0].data.content}` +
+          `&n_threads=${dataCht.params.n_threads}&init_prompt=${dataCht.history[0].data.content}` +
           `&gpu_layers=${dataCht.params.n_gpu_layers}`,
 
         {

diff --git a/web/src/routes/+page.svelte b/web/src/routes/+page.svelte
@@ -23,6 +23,7 @@
   let init_prompt =
     "Below is an instruction that describes a task. Write a response that appropriately completes the request.";
 
+  let n_threads = 4;
   let context_window = 2048;
   let gpu_layers = 0;
 
@@ -225,6 +226,20 @@
             {/each}
           </select>
         </div>
+        <div
+          class="tooltip flex flex-col"
+          data-tip="Number of threads to run LLaMA on."
+        >
+          <label for="n_threads" class="label-text pb-1">n_threads</label>
+          <input
+            class="input-bordered input w-full max-w-xs"
+            name="n_threads"
+            type="number"
+            bind:value={n_threads}
+            min="0"
+            max="64"
+          />
+        </div>
         <div
           class="tooltip flex flex-col"
           data-tip="The weight of the penalty to avoid repeating the last repeat_last_n tokens."

diff --git a/web/src/routes/chat/[id]/+page.svelte b/web/src/routes/chat/[id]/+page.svelte
@@ -125,7 +125,7 @@
       `/api/chat/?model=${data.chat.params.model_path}&temperature=${data.chat.params.temperature}&top_k=${data.chat.params.top_k}` +
         `&top_p=${data.chat.params.top_p}&max_length=${data.chat.params.max_tokens}&context_window=${data.chat.params.n_ctx}` +
         `&repeat_last_n=${data.chat.params.last_n_tokens_size}&repeat_penalty=${data.chat.params.repeat_penalty}` +
-        `&init_prompt=${data.chat.history[0].data.content}` +
+        `&n_threads=${data.chat.params.n_threads}&init_prompt=${data.chat.history[0].data.content}` +
         `&gpu_layers=${data.chat.params.n_gpu_layers}`,
 
       {
@@ -337,6 +337,27 @@
             {data.chat.params.n_ctx}/{data.chat.params.max_tokens}
           </span>
         </div>
+        {#if data.chat.params.n_threads > 0}
+          <div class="pl-4 hidden sm:flex flex-row items-center justify-center">
+            <svg
+              xmlns="http://www.w3.org/2000/svg"
+              fill="none"
+              viewBox="0 0 24 24"
+              stroke-width="1.5"
+              stroke="currentColor"
+              class="w-4 h-4"
+            >
+              <path
+                stroke-linecap="round"
+                stroke-linejoin="round"
+                d="M8.25 3v1.5M4.5 8.25H3m18 0h-1.5M4.5 12H3m18 0h-1.5m-15 3.75H3m18 0h-1.5M8.25 19.5V21M12 3v1.5m0 15V21m3.75-18v1.5m0 15V21m-9-1.5h10.5a2.25 2.25 0 002.25-2.25V6.75a2.25 2.25 0 00-2.25-2.25H6.75A2.25 2.25 0 004.5 6.75v10.5a2.25 2.25 0 002.25 2.25zm.75-12h9v9h-9v-9z"
+              />
+            </svg>
+            <span class="ml-2 inline-block text-center text-sm font-semibold">
+              {data.chat.params.n_threads}
+            </span>
+          </div>
+        {/if}
         {#if data.chat.params.n_gpu_layers > 0}
           <div class="pl-4 hidden sm:flex flex-row items-center justify-center">
             <svg

diff --git a/web/src/routes/chat/[id]/+page.ts b/web/src/routes/chat/[id]/+page.ts
@@ -15,6 +15,7 @@ interface Params {
   model_path: string;
   n_ctx: number;
   n_gpu_layers: number;
+  n_threads: number;
   last_n_tokens_size: number;
   max_tokens: number;
   temperature: number;