System Info
- transformers version: 4.34.0.dev0
- Platform: macOS-13.6-arm64-arm-64bit
- Python version: 3.9.6
- Huggingface_hub version: 0.16.4
- Safetensors version: 0.3.2
- Accelerate version: 0.23.0
- Accelerate config: not found
- PyTorch version (GPU?): 2.0.1 (False)
Who can help?
@ArthurZucker and @younesbelkada
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
- My own task or dataset (give details below)
Reproduction
import tokenizers
from transformers import (
AutoTokenizer
)
START_TOKEN = "<|im_start|>"
END_TOKEN = "<|im_end|>"
tokenizer_path = 'model/Llama-2-7b-hf'
complete_tokenizer_path = 'model/complete_tokenizer'
use_fast = False
legacy = True
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_path,
use_fast=use_fast,
legacy=legacy,
)
# Register the ChatML markers with explicit lstrip/rstrip=False.
tokenizer.add_tokens([
    tokenizers.AddedToken(START_TOKEN, lstrip=False, rstrip=False, normalized=False, special=True),
    tokenizers.AddedToken(END_TOKEN, lstrip=False, rstrip=False, normalized=False, special=True),
])
# Encode before saving so the result can be compared after a round trip.
before = tokenizer.encode(f'{START_TOKEN}\n')
tokenizer.save_pretrained(complete_tokenizer_path)
# Reload the saved tokenizer and encode the same string again.
tokenizer = AutoTokenizer.from_pretrained(
    complete_tokenizer_path,
    use_fast=use_fast,
    legacy=legacy,
    model_max_length=2048,
)
after = tokenizer.encode(f'{START_TOKEN}\n')
assert before == after, (before, after)
Expected behavior
The saved tokenizer files look correct, but after from_pretrained the customized AddedToken settings are not restored as expected: rstrip and lstrip are saved as false, yet the reloaded tokenizer reports them as True.
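A minimal way to check the reloaded flags directly (a sketch; it assumes transformers 4.34, where slow tokenizers expose added_tokens_decoder as an id-to-AddedToken mapping):
# Inspect the AddedToken flags on the reloaded tokenizer.
for idx, tok in tokenizer.added_tokens_decoder.items():
    if tok.content in (START_TOKEN, END_TOKEN):
        print(idx, tok.content, 'lstrip=', tok.lstrip, 'rstrip=', tok.rstrip)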
The saved tokenizer_config.json file:
{
"add_bos_token": true,
"add_eos_token": false,
"added_tokens_decoder": {
"0": {
"content": "<unk>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false,
"special": true
},
"1": {
"content": "<s>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false,
"special": true
},
"2": {
"content": "</s>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false,
"special": true
},
"32000": {
"content": "<|im_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32001": {
"content": "<|im_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
}
},
"additional_special_tokens": [
"<|im_start|>",
"<|im_end|>"
],
"bos_token": "<s>",
"clean_up_tokenization_spaces": false,
"eos_token": "</s>",
"legacy": true,
"model_max_length": 1600,
"pad_token": null,
"padding_side": "right",
"sp_model_kwargs": {},
"spaces_between_special_tokens": false,
"tokenizer_class": "LlamaTokenizer",
"tokenizer_file": null,
"unk_token": "<unk>",
"use_default_system_prompt": true
}
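For reference, a quick sanity check of the on-disk file (a sketch; the path assumes the save location used above) confirms the flags are stored as false:
import json

# Read the saved config and print the lstrip/rstrip flags per added token.
with open('model/complete_tokenizer/tokenizer_config.json') as f:
    cfg = json.load(f)
for idx, tok in cfg['added_tokens_decoder'].items():
    print(idx, tok['content'], 'lstrip=', tok['lstrip'], 'rstrip=', tok['rstrip'])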
print(tokenizer) shows:
LlamaTokenizer(name_or_path='model/complete_tokenizer', vocab_size=32000, model_max_length=2048, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}, clean_up_tokenization_spaces=False), added_tokens_decoder={
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
32000: AddedToken("<|im_start|>", rstrip=True, lstrip=True, single_word=False, normalized=False, special=True),
32001: AddedToken("<|im_end|>", rstrip=True, lstrip=True, single_word=False, normalized=False, special=True),
}
This only affects the slow tokenizer.
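One possible interim workaround (a sketch only, not a confirmed fix; whether add_tokens actually refreshes the stored flags for already-registered tokens may depend on the transformers version) is to re-add the tokens after loading with the desired flags:
# Re-register the special tokens after from_pretrained, forcing
# lstrip/rstrip back to False. The vocabulary is unchanged since the
# tokens already exist; this only re-applies the AddedToken settings.
tokenizer.add_tokens([
    tokenizers.AddedToken(START_TOKEN, lstrip=False, rstrip=False, normalized=False, special=True),
    tokenizers.AddedToken(END_TOKEN, lstrip=False, rstrip=False, normalized=False, special=True),
])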