model.save_pretrained_gguf error

from unsloth import FastLanguageModel

max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.




# 基于unsloth加载Llama的蒸馏模型
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="./model/unsloth/DeepSeek-R1-Distill-Llama-8B-unsloth-bnb-4bit", #DeepSeek-R1-Distill-Llama-8B
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)



prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning.
Please answer the following medical question.

### Question:
{}

### Response:
<think>{}"""

question = "一个患有急性阑尾炎的病人已经发病5天，腹痛稍有减轻但仍然发热，在体检时发现右下腹有压痛的包块，此时应如何处理？"


FastLanguageModel.for_inference(model)
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)

response = tokenizer.batch_decode(outputs)
print(response[0].split("### Response:")[1])





#  开始训练工作

train_prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning.
Please answer the following medical question.

### Question:
{}

### Response:
<think>
{}
</think>
{}"""


EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN


# 迭代训练集数据，处理prompt
def formatting_prompts_func(examples):
    inputs = examples["Question"]
    cots = examples["Complex_CoT"]
    outputs = examples["Response"]
    texts = []
    for input, cot, output in zip(inputs, cots, outputs):
        text = train_prompt_style.format(input, cot, output) + EOS_TOKEN
        texts.append(text)
    return {
        "text": texts,
    }


from datasets import load_dataset

dataset = load_dataset("./datasets/FreedomIntelligence", 'zh', split="train[0:500]", trust_remote_code=True)
print(dataset.column_names)

dataset = dataset.map(formatting_prompts_func, batched=True)


FastLanguageModel.for_training(model)

model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)


from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=5,
        # num_train_epochs = 1, # For longer training runs!
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",  # Use this for WandB etc
    ),
)

# 训练
trainer_stats = trainer.train()

FastLanguageModel.for_inference(model)  # Unsloth has 2x faster inference!
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)
response = tokenizer.batch_decode(outputs)
print(response[0].split("### Response:")[1])

model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")


# Save to 8bit Q8_0
if True: model.save_pretrained_gguf("model", tokenizer,)

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model_f16", tokenizer, quantization_method = "f16")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")





Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.
make: *** llama.cpp: No such file or directory.  Stop.
CMake Error: The source directory "/home/wb/dev/Unsloth/llama.cpp" does not exist.
Specify --help for usage, or press the help button on the CMake GUI.
Traceback (most recent call last):
  File "/home/wb/dev/Unsloth/医疗专家诊断模型训练.py", line 175, in <module>
    if True: model.save_pretrained_gguf("model", tokenizer,)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/anaconda3/envs/unsloth/lib/python3.11/site-packages/unsloth/save.py", line 1697, in unsloth_save_pretrained_gguf
    makefile = install_llama_cpp_make_non_blocking()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/anaconda3/envs/unsloth/lib/python3.11/site-packages/unsloth/save.py", line 783, in install_llama_cpp_make_non_blocking
    raise RuntimeError(f"*** Unsloth: Failed compiling llama.cpp using os.system(...) with error {check}. Please report this ASAP!")
RuntimeError: *** Unsloth: Failed compiling llama.cpp using os.system(...) with error 256. Please report this ASAP!
ERROR conda.cli.main_run:execute(125): `conda run python /home/wb/dev/Unsloth/医疗专家诊断模型训练.py` failed. (See above for error)


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

model.save_pretrained_gguf error #1846

基于unsloth加载Llama的蒸馏模型

Instruction:

Question:

Response:

开始训练工作

Instruction:

Question:

Response:

迭代训练集数据，处理prompt

训练

Save to 8bit Q8_0

Save to 16bit GGUF

Save to q4_k_m GGUF

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Uh oh!

model.save_pretrained_gguf error #1846

Description

基于unsloth加载Llama的蒸馏模型

Instruction:

Question:

Response:

开始训练工作

Instruction:

Question:

Response:

迭代训练集数据，处理prompt

训练

Save to 8bit Q8_0

Save to 16bit GGUF

Save to q4_k_m GGUF

Metadata

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Issue actions