# xlm-roberta-base directory: git clone https://huggingface.co/xlm-roberta-base
from transformers import XLMRobertaTokenizer

# Build the same tokenizer two ways: (a) from the full pretrained directory,
# (b) directly from the raw SentencePiece model file, then compare how each
# tokenizes a string containing the '<s>' special token.
tok_from_dir = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base/')
tok_from_spm = XLMRobertaTokenizer('xlm-roberta-base/sentencepiece.bpe.model')

sample = 'texta<s>textb'
print(tok_from_dir.tokenize(sample))
print(tok_from_spm.tokenize(sample))
# What I expect is that both tokenizers produce the same output:
# ['▁text', 'a', '<s>', '▁text', 'b']
# ['▁text', 'a', '<s>', '▁text', 'b']
# However, in reality, their outputs are as follows:
# ['▁text', 'a', '<s>', '▁text', 'b']
# ['▁text', 'a', '<', 's', '>', 'text', 'b']
# i.e. the tokenizer built directly from the SentencePiece model does not
# treat '<s>' as a single special token and splits it into raw pieces.