Skip to content

Commit f2ec3b2

Browse files
authored
remove enforcement of non-special when adding tokens (#1521)
* remove enforcement of non-special when adding tokens * mut no longer needed * add a small test * nit * style * audit * ignore cargo audit's own vulnerability * update * revert * remove CVE
1 parent 71c2a8d commit f2ec3b2

4 files changed

Lines changed: 19 additions & 2 deletions

File tree

.github/workflows/python.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,9 @@ jobs:
9595
command: clippy
9696
args: --manifest-path ./bindings/python/Cargo.toml --all-targets --all-features -- -D warnings
9797

98+
- name: Install cargo-audit
99+
run: cargo install cargo-audit
100+
98101
- name: Run Audit
99102
uses: actions-rs/cargo@v1
100103
with:

.github/workflows/rust.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,9 @@ jobs:
8181
command: test
8282
args: --verbose --manifest-path ./tokenizers/Cargo.toml --doc
8383

84+
- name: Install cargo-audit
85+
run: cargo install cargo-audit
86+
8487
- name: Run Audit
8588
uses: actions-rs/cargo@v1
8689
with:

bindings/python/src/tokenizer.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1151,8 +1151,7 @@ impl PyTokenizer {
11511151
.map(|token| {
11521152
if let Ok(content) = token.extract::<String>() {
11531153
Ok(PyAddedToken::from(content, Some(false)).get_token())
1154-
} else if let Ok(mut token) = token.extract::<PyRefMut<PyAddedToken>>() {
1155-
token.special = false;
1154+
} else if let Ok(token) = token.extract::<PyRefMut<PyAddedToken>>() {
11561155
Ok(token.get_token())
11571156
} else {
11581157
Err(exceptions::PyTypeError::new_err(

bindings/python/tests/bindings/test_tokenizer.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -535,3 +535,15 @@ def test_splitting(self):
535535
"▁▁▁▁▁▁",
536536
"▁.",
537537
]
538+
539+
def test_decode_special(self):
540+
tokenizer = Tokenizer(BPE())
541+
tokenizer.add_tokens([AddedToken("my", special=True), AddedToken("name", special=False), "is", "john", "pair"])
542+
543+
# Can decode single sequences
544+
output = tokenizer.decode([0, 1, 2, 3], skip_special_tokens=False)
545+
assert output == "my name is john"
546+
547+
output = tokenizer.decode([0, 1, 2, 3], skip_special_tokens=True)
548+
assert output == "name is john"
549+
assert tokenizer.get_added_tokens_decoder()[0] == AddedToken("my", special=True)

0 commit comments

Comments (0)