mirage-project · NorthmanPKU · Jun 28, 2025 · Jun 28, 2025 · Jun 29, 2025 · Jun 29, 2025
diff --git a/demo/qwen3/demo.py b/demo/qwen3/demo.py
@@ -12,6 +12,13 @@
     parser.add_argument(
         "--profiling", action="store_true", help="Use Profiler to generate trace"
     )
+    # lookahead or promptlookup
+    parser.add_argument(
+        "--spec-decode",
+        default=None,
+        choices=["promptlookup", "lookahead"],
+        help="Enable speculative decoding with 'lookahead' or 'promptlookup' mode.",
+    )
     args = parser.parse_args()
     try:
         from mpi4py import MPI
@@ -111,6 +118,13 @@
             ).contiguous()
         else:
             profiler_tensor = None
+        if args.spec_decode:
+            spec_decode_config = mi.speculative.LookaheadConfig(
+                ngram_size=3,
+                spec_length=5,
+            )
+        else:
+            spec_decode_config = None
         mpk = mi.PersistentKernel(
             world_size=world_size,
             mpi_rank=rank,
@@ -119,7 +133,10 @@
             num_remote_schedulers=0,
             meta_tensors=[step, tokens],
             profiler_tensor=profiler_tensor,
+            spec_decode_config=spec_decode_config,
         )
+        if args.spec_decode == "promptlookup":
+            all_tokens = mpk.attach_input(torch_tensor=tokens, name="all_tokens")
         x = mpk.attach_input(torch_tensor=input_tokens, name="input_token")
         cos_pos_embed = mpk.attach_input(
             torch_tensor=position_embeddings[0][0, :4096, :],
@@ -208,6 +225,15 @@
             io_category="cuda_tensor",
         )
 
+        # add spec tokens layer
+        if args.spec_decode:
+            spec_tokens = mpk.draft_forward_layer_dispatcher(
+                spec_decode = args.spec_decode, 
+                tokens = all_tokens,
+                grid_dim=(96, 1, 1),
+                block_dim=(128, 1, 1),
+            )
+            x = spec_tokens
         # Add Embed
         w = mpk.attach_input(
             torch_tensor=model.model.embed_tokens.weight, name="embed_tokens"
@@ -369,6 +395,15 @@
             grid_dim=(1, 1, 1),
             block_dim=(128, 1, 1),
         )
+        if args.spec_decode:
+            # TODO:(Jianan Ji) Align the output of argmax_reduce with the spec tokens
+            verify_out = mpk.verify_layer_dispatcher(
+                spec_decode = args.spec_decode,
+                spec_tokens = spec_tokens,
+                target_output = argmax_out,
+                grid_dim = (1, 1, 1),
+                block_dim = (128, 1, 1),
+            )
 
         results = mpk.kn_graph.generate_task_graph(num_gpus=world_size)
         with open("task_graph.json", "w") as f:
@@ -436,7 +471,7 @@
         starter.record()
 
         step.fill_(prompt_len)
-        mpk()
+        mpk(output_dir="output")
 
         ender.record()
         torch.cuda.synchronize()

diff --git a/include/mirage/kernel/task_register.h b/include/mirage/kernel/task_register.h
@@ -42,6 +42,12 @@ class TaskRegister {
                                    std::vector<int> const &params);
   int register_argmax_reduce_task(threadblock::Graph const &bgraph,
                                   std::vector<int> const &params);
+  int register_find_ngram_partial_task(threadblock::Graph const &bgraph,
+                                        std::vector<int> const &params);
+  int register_find_ngram_global_task(threadblock::Graph const &bgraph,
+                                      std::vector<int> const &params);
+  int register_target_verify_greedy_task(threadblock::Graph const &bgraph,
+                                           std::vector<int> const &params);
   int register_task_variant(TaskType type, std::string const &code);
 
 public:

diff --git a/include/mirage/persistent_kernel/runtime_header.h b/include/mirage/persistent_kernel/runtime_header.h
@@ -50,6 +50,9 @@ enum TaskType {
   TASK_ARGMAX = 109,
   TASK_ARGMAX_PARTIAL = 110,
   TASK_ARGMAX_REDUCE = 111,
+  TASK_FIND_NGRAM_PARTIAL = 112,
+  TASK_FIND_NGRAM_GLOBAL = 113,
+  TASK_TARGET_VERIFY_GREEDY = 114,
   TASK_NVSHMEM_COPY = 199,
   TASK_SCHD_TASKS = 200,
   TASK_SCHD_EVENTS = 201,

diff --git a/include/mirage/persistent_kernel/tasks/argmax.cuh b/include/mirage/persistent_kernel/tasks/argmax.cuh
@@ -16,7 +16,7 @@
 #include "common.h"
 #include "utils.cuh"
 namespace kernel {
-
+// TODO:(Jianan Ji) These might be further merged as one task?
 template <typename T>
 __device__ __forceinline__ void warp_reduce_max_idx(T &val, long long &idx) {
 #pragma unroll

diff --git a/include/mirage/persistent_kernel/tasks/speculative_decoding/prompt_lookup.cuh b/include/mirage/persistent_kernel/tasks/speculative_decoding/prompt_lookup.cuh
@@ -0,0 +1,136 @@
+/* Copyright 2025 CMU
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "common.h"
+#include "element_unary.cuh"
+#include <climits>
+
+namespace kernel {
+// TODO:(Jianan Ji) These could be further merged as one?
+
+// Partial pass of the prompt-lookup n-gram search.
+//
+// The query n-gram is the last NGRAM_SIZE tokens of
+// input_ptr[0 .. input_token_num). Candidate start positions are
+// [0, input_token_num - NGRAM_SIZE), so the query never matches itself.
+// Thread block b checks windows of NUM_THREADS consecutive positions starting
+// at b * NUM_THREADS and advancing by NUM_WORKERS * NUM_THREADS, and stops at
+// the first window that contains a match.
+//
+// Template parameters:
+//   NGRAM_SIZE  - number of tokens in the query n-gram.
+//   NUM_WORKERS - number of thread blocks sharing the scan (window stride).
+// Parameters:
+//   input_ptr       - token ids; only indices below input_token_num are read.
+//   output_id_ptr   - output_id_ptr[0] receives the smallest matching start
+//                     index seen by this block, or INT_MAX if there is none
+//                     (presumably the caller passes a per-task slot - TODO
+//                     confirm).
+//   input_token_num - number of valid tokens in input_ptr.
+//
+// NUM_THREADS and NUM_THREADS_PER_WARP are not defined in this file
+// (presumably they come from common.h). The halo load below requires
+// NUM_THREADS >= NUM_THREADS_PER_WARP + NGRAM_SIZE - 1.
+template <int NGRAM_SIZE, int NUM_WORKERS>
+static __device__ __forceinline__ void
+    find_ngram_partial_kernel(long long const *__restrict__ input_ptr,
+                              long long *__restrict__ output_id_ptr,
+                              int input_token_num) {
+  int const t_id = threadIdx.x;
+  int const block_id = blockIdx.x;
+
+  // Number of candidate start positions (the trailing n-gram is excluded).
+  int const num_candidates = input_token_num - NGRAM_SIZE;
+  if (num_candidates <= 0) {
+    // Nothing to search: still publish the "no match" sentinel so the global
+    // pass never consumes a stale value from this slot.
+    if (t_id == 0) {
+      output_id_ptr[0] = INT_MAX;
+    }
+    return;
+  }
+
+  // Token ids are kept as long long so the comparison is exact.
+  __shared__ long long ngram[NGRAM_SIZE];
+  __shared__ long long input_tokens[NUM_THREADS + NGRAM_SIZE - 1];
+  __shared__ int block_min_idx;
+
+  if (t_id == 0) {
+    block_min_idx = INT_MAX;
+  }
+  if (t_id < NGRAM_SIZE) {
+    ngram[t_id] = input_ptr[num_candidates + t_id];
+  }
+
+  // The loop is driven by the window base, which is identical for every
+  // thread of the block, so all threads execute the same number of iterations
+  // and all of them reach both __syncthreads() calls.
+  for (int base = block_id * NUM_THREADS; base < num_candidates;
+       base += NUM_WORKERS * NUM_THREADS) {
+    int const idx = base + t_id;
+
+    // Stage the window [base, base + NUM_THREADS + NGRAM_SIZE - 1) in shared
+    // memory, skipping positions past the end of the sequence.
+    if (idx < input_token_num) {
+      input_tokens[t_id] = input_ptr[idx];
+    }
+    // The NGRAM_SIZE - 1 halo tokens that follow the window are loaded by the
+    // first threads of the second warp.
+    if (t_id >= NUM_THREADS_PER_WARP &&
+        t_id < NUM_THREADS_PER_WARP + NGRAM_SIZE - 1) {
+      int const halo = t_id - NUM_THREADS_PER_WARP;
+      int const halo_idx = base + NUM_THREADS + halo;
+      if (halo_idx < input_token_num) {
+        input_tokens[NUM_THREADS + halo] = input_ptr[halo_idx];
+      }
+    }
+    // Also orders the writes to ngram[] and block_min_idx before their use.
+    __syncthreads();
+
+    // Each in-range thread checks whether the n-gram starts at its position.
+    // Every shared entry read here lies below input_token_num, so it was
+    // loaded above.
+    if (idx < num_candidates) {
+      bool is_ngram = true;
+#pragma unroll
+      for (int i = 0; i < NGRAM_SIZE; i++) {
+        if (ngram[i] != input_tokens[t_id + i]) {
+          is_ngram = false;
+          break;
+        }
+      }
+      if (is_ngram) {
+        atomicMin(&block_min_idx, idx);
+      }
+    }
+    // Make block_min_idx visible to all threads and protect input_tokens[]
+    // from being overwritten by the next iteration while still in use.
+    __syncthreads();
+
+    // Windows are visited in increasing order, so the first window with a
+    // match holds this block's minimum; the decision is block-uniform.
+    if (block_min_idx != INT_MAX) {
+      break;
+    }
+  }
+
+  // Thread 0 publishes the block's result (INT_MAX when nothing matched).
+  if (t_id == 0) {
+    output_id_ptr[0] = block_min_idx;
+  }
+}
+
+// Global pass of the prompt-lookup n-gram search.
+//
+// Takes the minimum over the NUM_PARTIAL_TASKS partial results in input_array
+// (entries equal to INT_MAX or larger, and negative entries, are ignored) and
+// writes SPEC_LENGTH values to output_result: the tokens that follow the
+// matched n-gram, i.e. tokens_ptr[min + NGRAM_SIZE + t] for t in
+// [0, SPEC_LENGTH), or -1 in every slot when no partial task found a match.
+//
+// NOTE(review): the token reads are not bounded by the current sequence
+// length (it is not available here); confirm tokens_ptr has at least
+// SPEC_LENGTH - 1 readable entries past the last valid token.
+// NUM_THREADS is not defined in this file (presumably from common.h) and must
+// be at least SPEC_LENGTH for every output slot to be written.
+template <int NGRAM_SIZE, int SPEC_LENGTH, int NUM_PARTIAL_TASKS>
+static __device__ __forceinline__ void
+    find_ngram_global_kernel(long long const *__restrict__ input_array,
+                             long long const *__restrict__ tokens_ptr,
+                             long long *__restrict__ output_result) {
+  int const t_id = threadIdx.x;
+  __shared__ long long block_min_idx_shared;
+
+  if (t_id == 0) {
+    block_min_idx_shared = INT_MAX;
+  }
+  __syncthreads();
+
+  // Each thread visits entries t_id, t_id + NUM_THREADS, ... of the partial
+  // results and folds valid ones into the shared minimum.
+  for (int i = t_id; i < NUM_PARTIAL_TASKS; i += NUM_THREADS) {
+    long long const candidate = input_array[i];
+    if (candidate >= 0 && candidate < INT_MAX) {
+      atomicMin(&block_min_idx_shared, candidate);
+    }
+  }
+
+  __syncthreads();
+
+  if (t_id < SPEC_LENGTH) {
+    long long const match = block_min_idx_shared;
+    if (match != INT_MAX) {
+      output_result[t_id] = tokens_ptr[match + NGRAM_SIZE + t_id];
+    } else {
+      output_result[t_id] = -1;
+    }
+  }
+}
+
+// Sequential variant of the global reduction.
+//
+// Writes to output_result[0] the smallest valid start index among the
+// NUM_PARTIAL_TASKS entries of input_array, or -1 when none is valid. The
+// minimum is taken (rather than the first valid entry) because partial result
+// i covers strided windows, so a lower-numbered task can hold a later match;
+// this matches what find_ngram_global_kernel selects.
+template <int NUM_PARTIAL_TASKS>
+static __device__ __forceinline__ void
+    find_ngram_global_kernel_sequential(
+        long long const *__restrict__ input_array,
+        long long *__restrict__ output_result) {
+  long long best = INT_MAX;
+  for (int i = 0; i < NUM_PARTIAL_TASKS; i++) {
+    long long const candidate = input_array[i];
+    if (candidate >= 0 && candidate < best) {
+      best = candidate;
+    }
+  }
+  output_result[0] = (best != INT_MAX) ? best : -1;
+}
+
+} // namespace kernel
diff --git a/include/mirage/persistent_kernel/tasks/speculative_decoding/target_verify.cuh b/include/mirage/persistent_kernel/tasks/speculative_decoding/target_verify.cuh
@@ -0,0 +1,38 @@
+/* Copyright 2025 CMU
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "common.h"
+#include "utils.cuh"
+namespace kernel {
+
+// Greedy verification of speculative tokens (simple sequential scan).
+//
+// Compares spec_token_id[i] with target_token_id[i] for i in
+// [0, NUM_SPEC_TOKENS) and writes one int to final_output_ptr[0]: the index
+// of the first mismatch, or NUM_SPEC_TOKENS + 1 when every position matches.
+//
+// Parameters:
+//   spec_token_id_ptr   - NUM_SPEC_TOKENS drafted token ids, read as long long.
+//   target_token_id_ptr - NUM_SPEC_TOKENS target token ids, read as long long.
+//   final_output_ptr    - receives the single int result.
+//
+// Token ids are read as 64-bit values because the token buffers written by
+// the other speculative-decoding kernels (e.g. the output_result of
+// find_ngram_global_kernel) are long long - TODO confirm both inputs are
+// int64 at the call site.
+// NOTE(review): the all-match result is NUM_SPEC_TOKENS + 1 while a mismatch
+// at position i yields i; confirm the consumer expects that extra +1.
+// NOTE(review): every calling thread runs the same scan and stores the same
+// value; confirm whether this should be limited to a single thread.
+template <int NUM_SPEC_TOKENS>
+__device__ __forceinline__ void
+    target_verify_greedy_kernel(void const *__restrict__ spec_token_id_ptr,
+                                void const *__restrict__ target_token_id_ptr,
+                                void *__restrict__ final_output_ptr) {
+  long long const *__restrict__ spec_token_id =
+      static_cast<long long const *>(spec_token_id_ptr);
+  long long const *__restrict__ target_token_id =
+      static_cast<long long const *>(target_token_id_ptr);
+  int *__restrict__ accepted_spec_token_num =
+      static_cast<int *>(final_output_ptr);
+
+  for (int i = 0; i < NUM_SPEC_TOKENS; i++) {
+    // Index the typed pointers: the void parameters cannot be subscripted.
+    if (spec_token_id[i] != target_token_id[i]) {
+      accepted_spec_token_num[0] = i;
+      return;
+    }
+  }
+  accepted_spec_token_num[0] = NUM_SPEC_TOKENS + 1;
+}
+
+} // namespace kernel
diff --git a/include/mirage/persistent_kernel/tasks/speculative_decoding/verify_argmax.cuh b/include/mirage/persistent_kernel/tasks/speculative_decoding/verify_argmax.cuh
@@ -0,0 +1,62 @@
+/* Copyright 2025 CMU
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "common.h"
+#include "utils.cuh"
+namespace kernel {
+
+// Final reduction step of a chunked argmax that also records the winning
+// token in the running token buffer.
+//
+// Template parameters:
+//   T                 - element type of the partial maxima.
+//   CHUNK_SIZE        - multiplier applied to the winning chunk index when
+//                       rebuilding the final index.
+//   NUM_PARTIAL_TASKS - number of (value, index) pairs to reduce.
+// Parameters:
+//   input_val_ptr    - NUM_PARTIAL_TASKS partial maxima of type T.
+//   input_idx_ptr    - NUM_PARTIAL_TASKS long long indices, one per partial
+//                      maximum (combined below as an offset inside a chunk).
+//   final_output_ptr - final_output_ptr[0] receives the resulting index as a
+//                      long long, or -1 if no pair was selected.
+//   step             - the same result is also stored at tokens[step + 1].
+//   tokens           - token buffer updated by thread 0.
+//
+// `inf` and block_reduce_max_idx are not defined in this file (presumably
+// provided by the included headers).
+template <typename T, int CHUNK_SIZE, int NUM_PARTIAL_TASKS>
+__device__ __forceinline__ void
+    verify_argmax_reduce_kernel(void const *__restrict__ input_val_ptr,
+                                void const *__restrict__ input_idx_ptr,
+                                void *__restrict__ final_output_ptr,
+                                int step,
+                                long long *tokens) {
+  T const *__restrict__ chunk_vals = static_cast<T const *>(input_val_ptr);
+  long long const *__restrict__ chunk_idxs =
+      static_cast<long long const *>(input_idx_ptr);
+  long long *__restrict__ result = static_cast<long long *>(final_output_ptr);
+
+  int const tid = threadIdx.x;
+
+  // Per-thread running maximum; the matching index is stored packed as
+  // (chunk number << 32) | index inside the chunk, with -1 meaning "none".
+  T best_val = T(-inf);
+  long long best_packed = -1;
+
+  for (int chunk = tid; chunk < NUM_PARTIAL_TASKS; chunk += blockDim.x) {
+    T candidate = chunk_vals[chunk];
+    if (candidate > best_val) {
+      best_val = candidate;
+      best_packed = ((long long)chunk << 32) | chunk_idxs[chunk];
+    }
+  }
+
+  // Combine the per-thread candidates; thread 0 consumes the result below.
+  block_reduce_max_idx(best_val, best_packed);
+
+  if (tid == 0) {
+    long long token = -1;
+    if (best_packed != -1) {
+      // Unpack and rebuild the final index.
+      long long const chunk_no = best_packed >> 32;
+      long long const offset = best_packed & 0xFFFFFFFF;
+      token = chunk_no * CHUNK_SIZE + offset;
+    }
+    result[0] = token;
+    tokens[step + 1] = token;
+  }
+}
+
+} // namespace kernel