I loaded the safetensors model that was saved by `save_pretrained_gguf`, but the generated output does not reflect the fine-tuned LoRA parameters.
import os

# Configure the process environment BEFORE importing torch / unsloth.
# CUDA reads CUDA_VISIBLE_DEVICES once, when the runtime is first initialized;
# importing the GPU libraries may already trigger that initialization, after
# which changing the variable has no effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # use only the RTX 3070 Ti
# Route HTTP(S) traffic (e.g. hub downloads) through the local proxy.
os.environ["http_proxy"] = "http://192.168.1.222:8080"
os.environ["https_proxy"] = "http://192.168.1.222:8080"

import torch  # noqa: E402  (must come after the environment setup above)
from unsloth import FastLanguageModel  # noqa: E402

MODEL_DIR = "model"  # path to the merged Hugging Face model directory
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Author's note: half-precision inference is recommended; switch to
# torch.float16 if GPU memory is tight.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_DIR,
    max_seq_length=2048,
    # Author's note: a merged checkpoint is usually saved in full precision or
    # 8-bit; True is appropriate if the weights really were saved as 4-bit.
    load_in_4bit=True,
    # torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32,
)
# FastLanguageModel.for_inference(model)  # optional inference speed-up (disabled)

# Make sure the weights live on the device chosen above.
# NOTE(review): some transformers/bitsandbytes versions reject `.to()` on
# 4-bit quantized models; when CUDA is available this branch is normally
# skipped because the model is already on the GPU -- confirm for CPU runs.
if model.device.type != DEVICE:
    model.to(DEVICE)
model.eval()
def chat(user_text: str, max_new_tokens: int = 2048) -> str:
    """Generate a single-turn reply to ``user_text`` with the loaded model.

    The text is wrapped as one ``user`` message, rendered through the
    tokenizer's chat template (with the generation prompt appended), and a
    completion is sampled from the module-level ``model``.

    Args:
        user_text: The user's message.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The newly generated text only (the prompt tokens are cut off),
        decoded with special tokens skipped and surrounding whitespace
        stripped.
    """
    messages = [
        # An extra priming turn could be inserted here (disabled by the author).
        {"role": "user", "content": user_text},
    ]
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        # Author's note: do not enable this unless the base model is a
        # reasoning model.
        enable_thinking=True,
    )

    # The rendered template already contains every special token it needs;
    # letting the tokenizer add its defaults again could duplicate BOS/EOS.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=False,
    ).to(model.device)  # keep inputs on the same device as the weights

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            do_sample=True,
            temperature=0.6,
            top_p=0.95,
            top_k=20,
            max_new_tokens=max_new_tokens,
            renormalize_logits=True,  # guard against numerical anomalies
            pad_token_id=tokenizer.eos_token_id,  # reuse EOS as the padding id
            eos_token_id=tokenizer.eos_token_id,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_len = inputs["input_ids"].shape[1]
    new_ids = output_ids[0, prompt_len:]
    return tokenizer.decode(new_ids, skip_special_tokens=True).strip()
# Smoke test: send one question to the model and print its answer.
answer = chat("你是一个体能训练大模型,你是谁开发的?")
print(answer)
I loaded the safetensors model that was saved by `save_pretrained_gguf`, but the generated output does not reflect the fine-tuned LoRA parameters.