Commit 9ff11f5

Merge branch 'master' of github.com:ggml-org/llama.cpp
* 'master' of github.com:ggml-org/llama.cpp: (33 commits)
  - convert : better mtp check and fix return [no ci] (ggml-org#20419)
  - vulkan: fix SSM_CONV PP scaling with large ubatch sizes (ggml-org#20379)
  - New conversations now auto-select the first loaded model (ggml-org#20403)
  - ggml-virtgpu: Fix some build commands (ggml-org#20341)
  - metal : avoid divisions in bin kernel (ggml-org#20426)
  - ci: Setup self-hosted CI for Intel Linux Vulkan backend (ggml-org#20154)
  - vulkan: fix l2_norm epsilon handling (ggml-org#20350)
  - vulkan: fix OOB check in flash_attn_mask_opt (ggml-org#20296)
  - vulkan: Fix ErrorOutOfHostMemory on Intel GPU when loading large models with --no-mmap (ggml-org#20059)
  - opencl: use larger workgroup size for get_rows (ggml-org#20316)
  - opencl: add cumsum op (ggml-org#18981)
  - hip: compile debug builds with -O2 on hip to avoid a compiler bug (ggml-org#20392)
  - common/parser: add GigaChatV3/3.1 models support (ggml-org#19931)
  - model : add support for Phi4ForCausalLMV (ggml-org#20168)
  - graph : add optional scale parameter to build_lora_mm [no ci] (ggml-org#20427)
  - common : fix --n-cpu-moe, --cpu-moe for models with fused gate + up (ggml-org#20416)
  - ggml-webgpu: Add supports for `GGML_OP_REPEAT` (ggml-org#20230)
  - llama : enable chunked fused GDN path (ggml-org#20340)
  - llama : whitespace cleanup (ggml-org#20422)
  - ggml : add NVFP4 quantization type support (ggml-org#19769)
  - ...
2 parents 47df07f + c3e3f9e commit 9ff11f5

File tree: 102 files changed, +3980 −467 lines

.github/workflows/build.yml

Lines changed: 17 additions & 0 deletions

```diff
@@ -469,6 +469,7 @@ jobs:
           cd build
           export GGML_VK_VISIBLE_DEVICES=0
           export GGML_VK_DISABLE_F16=1
+          export GGML_VK_DISABLE_COOPMAT=1
           # This is using llvmpipe and runs slower than other backends
           ctest -L main --verbose --timeout 4800
@@ -1726,6 +1727,22 @@ jobs:
         vulkaninfo --summary
         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+  ggml-ci-x64-linux-intel-vulkan:
+    runs-on: [self-hosted, Linux, X64, Intel]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+        with:
+          persist-credentials: false
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          vulkaninfo --summary
+          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
   ggml-ci-arm64-cpu-kleidiai:
     runs-on: ubuntu-22.04-arm
```

Lines changed: 72 additions & 0 deletions (new file)

# NVIDIA DGX Spark

## System info

```bash
uname --all
Linux spark-17ed 6.11.0-1016-nvidia #16-Ubuntu SMP PREEMPT_DYNAMIC Sun Sep 21 16:52:46 UTC 2025 aarch64 aarch64 aarch64 GNU/Linux

g++ --version
g++ (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0

nvidia-smi
Fri Mar  6 11:39:45 2026
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GB10                    On  |   0000000F:01:00.0 Off |                  N/A |
| N/A   52C    P0             13W /  N/A  |      Not Supported     |     0%       Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
```

## ggml-org/nemotron-3-super-120b-GGUF

Model: https://huggingface.co/ggml-org/nemotron-3-super-120b-GGUF

- `llama-batched-bench`

main: n_kv_max = 303104, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = 99, n_threads = 20, n_threads_batch = 20

| PP    | TG     | B    | N_KV   | T_PP s   | S_PP t/s | T_TG s   | S_TG t/s | T s      | S t/s    |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
|   512 |     32 |    1 |    544 |    1.094 |   468.05 |    1.621 |    19.74 |    2.715 |   200.37 |
|   512 |     32 |    2 |   1088 |    1.463 |   700.16 |    2.437 |    26.26 |    3.900 |   279.01 |
|   512 |     32 |    4 |   2176 |    2.647 |   773.76 |    4.043 |    31.66 |    6.689 |   325.29 |
|   512 |     32 |    8 |   4352 |    5.291 |   774.14 |    6.151 |    41.62 |   11.442 |   380.37 |
|   512 |     32 |   16 |   8704 |   10.603 |   772.62 |   10.385 |    49.30 |   20.987 |   414.72 |
|   512 |     32 |   32 |  17408 |   21.231 |   771.69 |   18.235 |    56.16 |   39.466 |   441.09 |
|  4096 |     32 |    1 |   4128 |    5.340 |   767.05 |    1.616 |    19.81 |    6.956 |   593.47 |
|  4096 |     32 |    2 |   8256 |   10.673 |   767.55 |    2.454 |    26.08 |   13.127 |   628.94 |
|  4096 |     32 |    4 |  16512 |   21.348 |   767.46 |    4.072 |    31.44 |   25.420 |   649.57 |
|  4096 |     32 |    8 |  33024 |   42.714 |   767.15 |    6.277 |    40.78 |   48.991 |   674.08 |
|  4096 |     32 |   16 |  66048 |   85.385 |   767.54 |   10.596 |    48.32 |   95.981 |   688.14 |
|  4096 |     32 |   32 | 132096 |  170.819 |   767.32 |   18.619 |    55.00 |  189.437 |   697.31 |
|  8192 |     32 |    1 |   8224 |   10.690 |   766.32 |    1.619 |    19.76 |   12.310 |   668.10 |
|  8192 |     32 |    2 |  16448 |   21.382 |   766.24 |    2.467 |    25.94 |   23.850 |   689.65 |
|  8192 |     32 |    4 |  32896 |   42.782 |   765.92 |    4.098 |    31.23 |   46.881 |   701.69 |
|  8192 |     32 |    8 |  65792 |   85.582 |   765.77 |    6.368 |    40.20 |   91.951 |   715.52 |
|  8192 |     32 |   16 | 131584 |  171.066 |   766.21 |   10.774 |    47.52 |  181.840 |   723.62 |
|  8192 |     32 |   32 | 263168 |  342.140 |   766.19 |   18.969 |    53.98 |  361.109 |   728.78 |

- `llama-bench`

| model                   |       size |     params | backend    | n_ubatch | fa | test            |                  t/s |
| ----------------------- | ---------: | ---------: | ---------- | -------: | -: | --------------: | -------------------: |
| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 | pp2048          |        768.84 ± 0.90 |
| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 | tg32            |         19.94 ± 0.16 |
| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 | pp2048 @ d4096  |        764.51 ± 0.50 |
| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 | tg32 @ d4096    |         19.95 ± 0.18 |
| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 | pp2048 @ d8192  |        759.53 ± 0.71 |
| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 | tg32 @ d8192    |         19.83 ± 0.18 |
| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 | pp2048 @ d16384 |        747.98 ± 1.58 |
| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 | tg32 @ d16384   |         19.84 ± 0.18 |
| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 | pp2048 @ d32768 |        724.40 ± 2.70 |
| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 | tg32 @ d32768   |         19.45 ± 0.18 |

build: 04a65daab (8268)

common/CMakeLists.txt

Lines changed: 2 additions & 0 deletions

```diff
@@ -81,6 +81,8 @@ add_library(${TARGET} STATIC
     preset.cpp
     preset.h
     regex-partial.cpp
+    reasoning-budget.cpp
+    reasoning-budget.h
     regex-partial.h
     sampling.cpp
     sampling.h
```

common/arg.cpp

Lines changed: 31 additions & 2 deletions

```diff
@@ -2913,6 +2913,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             auto parsed = json::parse(value);
             for (const auto & item : parsed.items()) {
+                if (item.key() == "enable_thinking") {
+                    LOG_WRN("Setting 'enable_thinking' via --chat-template-kwargs is deprecated. "
+                            "Use --reasoning on / --reasoning off instead.\n");
+                }
                 params.default_template_kwargs[item.key()] = item.value().dump();
             }
         }
@@ -3048,14 +3052,39 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.reasoning_format = common_reasoning_format_from_name(value);
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
+    add_opt(common_arg(
+        {"-rea", "--reasoning"}, "[on|off|auto]",
+        "Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))",
+        [](common_params & params, const std::string & value) {
+            if (is_truthy(value)) {
+                params.enable_reasoning = 1;
+                params.default_template_kwargs["enable_thinking"] = "true";
+            } else if (is_falsey(value)) {
+                params.enable_reasoning = 0;
+                params.default_template_kwargs["enable_thinking"] = "false";
+            } else if (is_autoy(value)) {
+                params.enable_reasoning = -1;
+            } else {
+                throw std::invalid_argument(
+                    string_format("error: unknown value for --reasoning: '%s'\n", value.c_str()));
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_REASONING"));
     add_opt(common_arg(
         {"--reasoning-budget"}, "N",
-        "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
+        "token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)",
         [](common_params & params, int value) {
-            if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
+            if (value < -1) { throw std::invalid_argument("invalid value"); }
             params.reasoning_budget = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
+    add_opt(common_arg(
+        {"--reasoning-budget-message"}, "MESSAGE",
+        "message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)",
+        [](common_params & params, const std::string & value) {
+            params.reasoning_budget_message = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
```

common/chat-auto-parser-generator.cpp

Lines changed: 3 additions & 1 deletion

```diff
@@ -135,7 +135,9 @@ common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) co
     if (thinking_forced_open || thinking_forced_closed) {
         // Thinking is forced open OR forced closed with enable_thinking=true
         // In both cases, expect only the closing tag (opening was in template)
-        return p.reasoning(p.until(end)) + end;
+        // However, since we might have incorrectly detected the open/close pattern,
+        // we admit an optional starting marker
+        return p.optional(p.literal(start)) + p.reasoning(p.until(end)) + end;
     }
     if (mode == reasoning_mode::TAG_BASED || mode == reasoning_mode::TOOLS_ONLY) {
         // Standard tag-based reasoning OR tools-only mode (reasoning appears with tools)
```

common/chat-peg-parser.cpp

Lines changed: 24 additions & 24 deletions

```diff
@@ -6,7 +6,7 @@

 #include <nlohmann/json.hpp>

-using json = nlohmann::ordered_json;
+using ordered_json = nlohmann::ordered_json;

 static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
     int count = 0;
@@ -68,7 +68,7 @@ static int json_brace_depth(const std::string & s) {

 // JSON-escape a string and return the inner content (without surrounding quotes).
 static std::string escape_json_string_inner(const std::string & s) {
-    std::string escaped = json(s).dump();
+    std::string escaped = ordered_json(s).dump();
     if (escaped.size() >= 2 && escaped.front() == '"' && escaped.back() == '"') {
         return escaped.substr(1, escaped.size() - 2);
     }
@@ -309,7 +309,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
         if (arg_count > 0) {
             arg_entry = ",";
         }
-        arg_entry += json(trim(node.text)).dump() + ":";
+        arg_entry += ordered_json(trim(node.text)).dump() + ":";
         ++arg_count;

         auto & target = args_target();
@@ -343,7 +343,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {

         // Try to parse as JSON value (number, bool, null, object, array)
         try {
-            json parsed = json::parse(value_content);
+            ordered_json parsed = ordered_json::parse(value_content);
             if (parsed.is_string()) {
                 // Don't add closing quote yet (added by arg_close) for monotonic streaming
                 std::string escaped = parsed.dump();
@@ -408,7 +408,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {

 common_peg_parser common_chat_peg_builder::standard_constructed_tools(
     const std::map<std::string, std::string> & markers,
-    const nlohmann::json & tools,
+    const ordered_json & tools,
     bool parallel_tool_calls,
     bool force_tool_calls) {
     if (!tools.is_array() || tools.empty()) {
@@ -439,7 +439,7 @@ common_peg_parser common_chat_peg_builder::standard_constructed_tools(
         }
         const auto & function = tool_def.at("function");
         std::string name = function.at("name");
-        nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();
+        ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();

         // Build argument parsers
         auto args = eps();
@@ -479,8 +479,8 @@ common_peg_parser common_chat_peg_builder::standard_constructed_tools(
 // Python-style tool calls: name(arg1="value1", arg2=123)
 // Used only by LFM2 for now, so we don't merge it into autoparser
 common_peg_parser common_chat_peg_builder::python_style_tool_calls(
-    const nlohmann::json & tools,
-    bool parallel_tool_calls) {
+    const ordered_json & tools,
+    bool parallel_tool_calls) {
     if (!tools.is_array() || tools.empty()) {
         return eps();
     }
@@ -493,7 +493,7 @@ common_peg_parser common_chat_peg_builder::python_style_tool_calls(
         }
         const auto & function = tool_def.at("function");
         std::string name = function.at("name");
-        nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();
+        ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();

         auto args = eps();
         if (params.contains("properties") && !params["properties"].empty()) {
@@ -555,11 +555,11 @@ static std::pair<std::string, std::string> parse_key_spec(const std::string & ke

 // Mode 1: function_is_key — parse {"function_name": {...}}
 common_peg_parser common_chat_peg_builder::build_json_tools_function_is_key(
-    const nlohmann::json & tools,
-    const std::string & args_key,
-    const std::string & effective_args_key,
-    const std::string & call_id_key,
-    const std::string & gen_call_id_key) {
+    const ordered_json & tools,
+    const std::string & args_key,
+    const std::string & effective_args_key,
+    const std::string & call_id_key,
+    const std::string & gen_call_id_key) {

     auto tool_choices = choice();

@@ -569,7 +569,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_function_is_key(
         }
         const auto & function = tool_def.at("function");
         std::string name = function.at("name");
-        nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();
+        ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();

         // Build inner object fields
         std::vector<common_peg_parser> inner_fields;
@@ -634,11 +634,11 @@ common_peg_parser common_chat_peg_builder::build_json_tools_function_is_key(

 // Mode 2: Nested keys (dot notation like "function.name")
 common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
-    const nlohmann::json & tools,
-    const std::string & effective_name_key,
-    const std::string & effective_args_key,
-    const std::string & call_id_key,
-    const std::string & gen_call_id_key) {
+    const ordered_json & tools,
+    const std::string & effective_name_key,
+    const std::string & effective_args_key,
+    const std::string & call_id_key,
+    const std::string & gen_call_id_key) {

     auto tool_choices = choice();

@@ -655,7 +655,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
         }
         const auto & function = tool_def.at("function");
         std::string name = function.at("name");
-        nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();
+        ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();

         auto nested_name = literal("\"" + nested_name_field + "\"") + space() + literal(":") + space() +
                            literal("\"") + tool_name(literal(name)) + literal("\"");
@@ -706,7 +706,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(

 // Mode 3: Flat keys with optional ID fields and parameter ordering
 common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
-    const nlohmann::json & tools,
+    const ordered_json & tools,
     const std::string & effective_name_key,
     const std::string & effective_args_key,
     const std::string & call_id_key,
@@ -723,7 +723,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
         }
         const auto & function = tool_def.at("function");
         std::string name = function.at("name");
-        nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();
+        ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();

         auto tool_name_ = name_key_parser + space() + literal(":") + space() +
                           literal("\"") + tool_name(literal(name)) + literal("\"");
@@ -791,7 +791,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
 common_peg_parser common_chat_peg_builder::standard_json_tools(
     const std::string & section_start,
     const std::string & section_end,
-    const nlohmann::json & tools,
+    const ordered_json & tools,
     bool parallel_tool_calls,
     bool force_tool_calls,
     const std::string & name_key,
```
