Name and Version
image: ghcr.io/ggml-org/llama.cpp:server-rocm
build 9191
Operating systems
Linux
GGML backends
CUDA
Hardware
7900xtx
Models
https://huggingface.co/unsloth/Qwen3.6-27B-MTP-GGUF/tree/main
Qwen3.6-27B-UD-Q4_K_XL.gguf
https://huggingface.co/llmfan46/Qwen3.6-27B-uncensored-heretic-v2-Native-MTP-Preserved-GGUF
Qwen3.6-27B-uncensored-heretic-v2-Native-MTP-Preserved-Q4_K_M.gguf
Problem description & steps to reproduce
Start docker container, browse to 127.0.0.1:8089, prompt "say hi"
I've tried 50k tokens; it still runs out of memory (OOM).
# Docker Compose stack: a llama.cpp server (ROCm image) plus Open WebUI as a
# front end that reaches it through the OpenAI-compatible API.
services:
  llamacpp-server:
    # ROCm build of the llama.cpp server image, for the AMD GPU whose device
    # nodes are passed through below.
    image: ghcr.io/ggml-org/llama.cpp:server-rocm
    restart: unless-stopped
    ports:
      # NOTE(review): this publishes the port on every host interface while
      # --webui-mcp-proxy and --tools all are enabled; the server's own startup
      # log warns not to expose those features to untrusted environments.
      # Consider "127.0.0.1:8089:8089" if only local access is needed.
      - "8089:8089"
    shm_size: '6gb'
    devices:
      # Device nodes handed to the container for GPU access.
      - "/dev/kfd:/dev/kfd"
      - "/dev/dri:/dev/dri"
    group_add:
      - video
    environment:
      # Critical for 7900 XTX (gfx1100)
      - HSA_OVERRIDE_GFX_VERSION=11.0.0
    volumes:
      - /etc/localtime:/etc/localtime:ro
      # Host model directory; the -m path below is resolved inside /models.
      - /opt/llamacpp:/models:rw
    # Folded scalar: the lines below are joined with single spaces into one
    # command string. Do not put comments inside it; they would become
    # arguments.
    #
    # --host: the server reports at startup that LLAMA_ARG_HOST is already set
    #   and that this flag overrides it.
    # NOTE(review): no --ctx-size is given, so the context size is left to
    #   --fit; the attached log shows it settling on n_ctx = 151808 before the
    #   out-of-memory abort.
    command: >
      -m /models/Qwen3.6-27B-uncensored-heretic-v2-Native-MTP-Preserved-Q4_K_M.gguf
      --alias qwen
      --host 0.0.0.0
      --port 8089
      --parallel 1
      --threads 4
      --fit on
      --fit-target 2048
      --fit-ctx 76800
      --spec-type draft-mtp
      --spec-draft-n-max 2
      --kv-unified
      --reasoning-budget 2048
      --reasoning-budget-message "\n\nThinking limit reached. Output the best answer possible given the reasoning so far."
      --n-gpu-layers 999
      --no-mmproj
      --no-mmap
      --temp 0
      --min-p 0.05
      --top-p 0.95
      --top-k 20
      --repeat-penalty 1.0
      --presence-penalty 0.0
      --flash-attn on
      --cache-ram 4096
      --jinja
      --cache-type-k q8_0
      --cache-type-v q8_0
      --webui-mcp-proxy
      --tools all

  # Disabled CPU-only embedding server (kept for reference).
  # llamacpp-embedding-server:
  #   image: ghcr.io/ggml-org/llama.cpp:server
  #   restart: unless-stopped
  #   ports:
  #     - "8088:8088"
  #   volumes:
  #     - /etc/localtime:/etc/localtime:ro
  #     - /opt/llamacpp:/models:rw
  #   command: >
  #     -m /models/nomic-embed-text-v1.Q8_0.gguf
  #     --host 0.0.0.0
  #     --port 8088
  #     --threads 8
  #     --ctx-size 2048
  #     --n-gpu-layers 0
  #     --embedding
  #     --alias nomic

  open-webui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: open-webui
    ports:
      # Host port 3000 maps to the container's port 8080.
      - "3000:8080"
    volumes:
      - open-webui-data:/app/backend/data
    environment:
      # Tell Open WebUI to look for an OpenAI-compatible API (llama.cpp);
      # the host name is the llamacpp-server service defined above.
      - 'OPENAI_API_BASE_URLS=http://llamacpp-server:8089/v1'
      - 'OPENAI_API_KEYS=sk-no-key-required'
      - 'ENABLE_OLLAMA_API=False'  # Optional: disables the default Ollama check
    restart: always

# Named volume backing Open WebUI's /app/backend/data.
volumes:
  open-webui-data: {}
First Bad Commit
No response
Relevant log output
warn: LLAMA_ARG_HOST environment variable is set, but will be overwritten by command line argument --host
0.00.048.295 I log_info: verbosity = 3 (adjust with the -lv N CLI arg)
0.00.048.297 I device_info:
0.00.048.313 I - ROCm0 : AMD Radeon RX 7900 XTX (24560 MiB, 24510 MiB free)
0.00.048.317 I - CPU : AMD Ryzen 7 5800X 8-Core Processor (48091 MiB, 48091 MiB free)
0.00.048.340 I system_info: n_threads = 4 (n_threads_batch = 4) / 16 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
0.00.048.363 I srv init: running without SSL
0.00.048.377 I srv init: using 15 threads for HTTP server
0.00.048.431 W srv main: -----------------
0.00.048.432 W srv main: CORS proxy is enabled, do not expose server to untrusted environments
0.00.048.432 W srv main: This feature is EXPERIMENTAL and may be removed or changed in future versions
0.00.048.432 W srv main: -----------------
0.00.048.437 W srv main: -----------------
0.00.048.437 W srv main: Built-in tools are enabled, do not expose server to untrusted environments
0.00.048.438 W srv main: This feature is EXPERIMENTAL and may be changed in the future
0.00.048.438 W srv main: -----------------
0.00.048.440 I srv start: binding port with default address family
0.00.049.568 I srv main: loading model
0.00.049.570 I srv load_model: loading model '/models/Qwen3.6-27B-uncensored-heretic-v2-Native-MTP-Preserved-Q4_K_M.gguf'
0.00.049.585 I common_init_result: fitting params to device memory ...
0.00.049.587 I common_init_result: (for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on)
0.12.435.350 W llama_context: n_ctx_seq (151808) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
0.12.554.781 W common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
0.12.770.253 I srv load_model: creating MTP draft context against the target model '/models/Qwen3.6-27B-uncensored-heretic-v2-Native-MTP-Preserved-Q4_K_M.gguf'
0.13.033.220 I srv load_model: initializing slots, n_slots = 1
0.13.055.153 I common_context_can_seq_rm: the context supports bounded partial sequence removal
0.13.104.161 I common_speculative_init: adding speculative implementation 'draft-mtp'
0.13.104.219 I srv load_model: speculative decoding context initialized
0.13.104.220 I slot load_model: id 0 | task -1 | new slot, n_ctx = 151808
0.13.104.257 I srv load_model: prompt cache is enabled, size limit: 4096 MiB
0.13.104.258 I srv load_model: use --cache-ram 0 to disable the prompt cache
0.13.104.258 I srv load_model: for more info see #16391
0.13.104.268 I srv init: idle slots will be saved to prompt cache and cleared upon starting a new task
0.13.117.755 I init: chat template, example_format: '<|im_start|>system
You are a helpful assistant<|im_end|>
<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there<|im_end|>
<|im_start|>user
How are you?<|im_end|>
<|im_start|>assistant
'
0.13.127.716 I srv init: init: chat template, thinking = 1
0.13.127.730 I srv main: model loaded
0.13.127.731 I srv main: server is listening on http://0.0.0.0:8089
0.13.127.733 I srv update_slots: all slots are idle
0.19.883.448 W srv operator(): got exception: {"error":{"code":500,"message":"invalid URL: no scheme","type":"server_error"}}
0.25.306.452 I srv params_from_: Chat format: peg-native
0.25.307.358 I slot get_availabl: id 0 | task -1 | selected slot by LRU, t_last = -1
0.25.307.359 I srv get_availabl: updating prompt cache
0.25.307.362 I srv load: - looking for better prompt, base f_keep = -1.000, sim = 0.000
0.25.307.365 I srv update: - cache state: 0 prompts, 0.000 MiB (limits: 4096.000 MiB, 151808 tokens, 4294967296 est)
0.25.307.365 I srv get_availabl: prompt cache update took 0.01 ms
0.25.313.167 I reasoning-budget: activated, budget=2048 tokens
0.25.313.179 I slot launch_slot_: id 0 | task 0 | processing task, is_child = 0
0.31.441.196 E ROCm error: out of memory
0.31.441.199 E /app/ggml/src/ggml-cuda/ggml-cuda.cu:102: ROCm error
current device: 0, in function alloc at /app/ggml/src/ggml-cuda/ggml-cuda.cu:448
0.31.441.199 E err
libggml-base.so.0(+0x1addb)[0x754c923f8ddb]
libggml-base.so.0(ggml_print_backtrace+0x21c)[0x754c923f925c]
libggml-base.so.0(ggml_abort+0x15b)[0x754c923f943b]
/app/libggml-hip.so(+0x240d3af2)[0x754c8b710af2]
/app/libggml-hip.so(_ZN18ggml_cuda_pool_leg5allocEmPm+0x2c0)[0x754c8b7279b0]
/app/libggml-hip.so(+0x240e8f76)[0x754c8b725f76]
/app/libggml-hip.so(+0x240e7ef3)[0x754c8b724ef3]
/app/libggml-hip.so(+0x240e290c)[0x754c8b71f90c]
/app/libggml-hip.so(+0x240ddb0a)[0x754c8b71ab0a]
/app/libggml-hip.so(+0x240da636)[0x754c8b717636]
libggml-base.so.0(ggml_backend_sched_graph_compute_async+0x817)[0x754c924168e7]
libllama.so.0(_ZN13llama_context13graph_computeEP11ggml_cgraphb+0xa1)[0x754c925781a1]
libllama.so.0(_ZN13llama_context14process_ubatchERK12llama_ubatch14llm_graph_typeP22llama_memory_context_iR11ggml_status+0x114)[0x754c9257a934]
libllama.so.0(_ZN13llama_context6decodeERK11llama_batch+0x387)[0x754c92582187]
libllama.so.0(llama_decode+0xf)[0x754c92583e8f]
/app/llama-server(+0x1101d0)[0x647f8f73e1d0]
/app/llama-server(+0x1a2171)[0x647f8f7d0171]
/app/llama-server(+0x6c207)[0x647f8f69a207]
/lib/x86_64-linux-gnu/libc.so.6(+0x2a1ca)[0x754c91e5a1ca]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0x8b)[0x754c91e5a28b]
/app/llama-server(+0x6ca55)[0x647f8f69aa55]
Name and Version
image: ghcr.io/ggml-org/llama.cpp:server-rocm
build 9191
Operating systems
Linux
GGML backends
CUDA
Hardware
7900xtx
Models
https://huggingface.co/unsloth/Qwen3.6-27B-MTP-GGUF/tree/main
Qwen3.6-27B-UD-Q4_K_XL.gguf
https://huggingface.co/llmfan46/Qwen3.6-27B-uncensored-heretic-v2-Native-MTP-Preserved-GGUF
Qwen3.6-27B-uncensored-heretic-v2-Native-MTP-Preserved-Q4_K_M.gguf
Problem description & steps to reproduce
Start docker container, browse to 127.0.0.1:8089, prompt "say hi"
I've tried 50k tokens; it still runs out of memory (OOM).
First Bad Commit
No response
Relevant log output
warn: LLAMA_ARG_HOST environment variable is set, but will be overwritten by command line argument --host
0.00.048.295 I log_info: verbosity = 3 (adjust with the -lv N CLI arg)
0.00.048.297 I device_info:
0.00.048.313 I - ROCm0 : AMD Radeon RX 7900 XTX (24560 MiB, 24510 MiB free)
0.00.048.317 I - CPU : AMD Ryzen 7 5800X 8-Core Processor (48091 MiB, 48091 MiB free)
0.00.048.340 I system_info: n_threads = 4 (n_threads_batch = 4) / 16 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
0.00.048.363 I srv init: running without SSL
0.00.048.377 I srv init: using 15 threads for HTTP server
0.00.048.431 W srv main: -----------------
0.00.048.432 W srv main: CORS proxy is enabled, do not expose server to untrusted environments
0.00.048.432 W srv main: This feature is EXPERIMENTAL and may be removed or changed in future versions
0.00.048.432 W srv main: -----------------
0.00.048.437 W srv main: -----------------
0.00.048.437 W srv main: Built-in tools are enabled, do not expose server to untrusted environments
0.00.048.438 W srv main: This feature is EXPERIMENTAL and may be changed in the future
0.00.048.438 W srv main: -----------------
0.00.048.440 I srv start: binding port with default address family
0.00.049.568 I srv main: loading model
0.00.049.570 I srv load_model: loading model '/models/Qwen3.6-27B-uncensored-heretic-v2-Native-MTP-Preserved-Q4_K_M.gguf'
0.00.049.585 I common_init_result: fitting params to device memory ...
0.00.049.587 I common_init_result: (for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on)
0.12.435.350 W llama_context: n_ctx_seq (151808) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
0.12.554.781 W common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
0.12.770.253 I srv load_model: creating MTP draft context against the target model '/models/Qwen3.6-27B-uncensored-heretic-v2-Native-MTP-Preserved-Q4_K_M.gguf'
0.13.033.220 I srv load_model: initializing slots, n_slots = 1
0.13.055.153 I common_context_can_seq_rm: the context supports bounded partial sequence removal
0.13.104.161 I common_speculative_init: adding speculative implementation 'draft-mtp'
0.13.104.219 I srv load_model: speculative decoding context initialized
0.13.104.220 I slot load_model: id 0 | task -1 | new slot, n_ctx = 151808
0.13.104.257 I srv load_model: prompt cache is enabled, size limit: 4096 MiB
0.13.104.258 I srv load_model: use --cache-ram 0 to disable the prompt cache
0.13.104.258 I srv load_model: for more info see #16391
0.13.104.268 I srv init: idle slots will be saved to prompt cache and cleared upon starting a new task
0.13.117.755 I init: chat template, example_format: '<|im_start|>system
You are a helpful assistant<|im_end|>
<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there<|im_end|>
<|im_start|>user
How are you?<|im_end|>
<|im_start|>assistant
'
0.13.127.716 I srv init: init: chat template, thinking = 1
0.13.127.730 I srv main: model loaded
0.13.127.731 I srv main: server is listening on http://0.0.0.0:8089
0.13.127.733 I srv update_slots: all slots are idle
0.19.883.448 W srv operator(): got exception: {"error":{"code":500,"message":"invalid URL: no scheme","type":"server_error"}}
0.25.306.452 I srv params_from_: Chat format: peg-native
0.25.307.358 I slot get_availabl: id 0 | task -1 | selected slot by LRU, t_last = -1
0.25.307.359 I srv get_availabl: updating prompt cache
0.25.307.362 I srv load: - looking for better prompt, base f_keep = -1.000, sim = 0.000
0.25.307.365 I srv update: - cache state: 0 prompts, 0.000 MiB (limits: 4096.000 MiB, 151808 tokens, 4294967296 est)
0.25.307.365 I srv get_availabl: prompt cache update took 0.01 ms
0.25.313.167 I reasoning-budget: activated, budget=2048 tokens
0.25.313.179 I slot launch_slot_: id 0 | task 0 | processing task, is_child = 0
0.31.441.196 E ROCm error: out of memory
0.31.441.199 E /app/ggml/src/ggml-cuda/ggml-cuda.cu:102: ROCm error
current device: 0, in function alloc at /app/ggml/src/ggml-cuda/ggml-cuda.cu:448
0.31.441.199 E err
libggml-base.so.0(+0x1addb)[0x754c923f8ddb]
libggml-base.so.0(ggml_print_backtrace+0x21c)[0x754c923f925c]
libggml-base.so.0(ggml_abort+0x15b)[0x754c923f943b]
/app/libggml-hip.so(+0x240d3af2)[0x754c8b710af2]
/app/libggml-hip.so(_ZN18ggml_cuda_pool_leg5allocEmPm+0x2c0)[0x754c8b7279b0]
/app/libggml-hip.so(+0x240e8f76)[0x754c8b725f76]
/app/libggml-hip.so(+0x240e7ef3)[0x754c8b724ef3]
/app/libggml-hip.so(+0x240e290c)[0x754c8b71f90c]
/app/libggml-hip.so(+0x240ddb0a)[0x754c8b71ab0a]
/app/libggml-hip.so(+0x240da636)[0x754c8b717636]
libggml-base.so.0(ggml_backend_sched_graph_compute_async+0x817)[0x754c924168e7]
libllama.so.0(_ZN13llama_context13graph_computeEP11ggml_cgraphb+0xa1)[0x754c925781a1]
libllama.so.0(_ZN13llama_context14process_ubatchERK12llama_ubatch14llm_graph_typeP22llama_memory_context_iR11ggml_status+0x114)[0x754c9257a934]
libllama.so.0(_ZN13llama_context6decodeERK11llama_batch+0x387)[0x754c92582187]
libllama.so.0(llama_decode+0xf)[0x754c92583e8f]
/app/llama-server(+0x1101d0)[0x647f8f73e1d0]
/app/llama-server(+0x1a2171)[0x647f8f7d0171]
/app/llama-server(+0x6c207)[0x647f8f69a207]
/lib/x86_64-linux-gnu/libc.so.6(+0x2a1ca)[0x754c91e5a1ca]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0x8b)[0x754c91e5a28b]
/app/llama-server(+0x6ca55)[0x647f8f69aa55]