Skip to content

Commit f2ec3b2

Browse files
authored
remove enforcement of non-special when adding tokens (#1521)
* remove enforcement of non-special when adding tokens * mut no longer needed * add a small test * nit * style * audit * ignore cargo audit's own vulnerability * update * revert * remove CVE
1 parent 71c2a8d commit f2ec3b2

4 files changed

Lines changed: 19 additions & 2 deletions

File tree

.github/workflows/python.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,9 @@ jobs:
9595
command: clippy
9696
args: --manifest-path ./bindings/python/Cargo.toml --all-targets --all-features -- -D warnings
9797

98+
- name: Install cargo-audit
99+
run: cargo install cargo-audit
100+
98101
- name: Run Audit
99102
uses: actions-rs/cargo@v1
100103
with:

.github/workflows/rust.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,9 @@ jobs:
8181
command: test
8282
args: --verbose --manifest-path ./tokenizers/Cargo.toml --doc
8383

84+
- name: Install cargo-audit
85+
run: cargo install cargo-audit
86+
8487
- name: Run Audit
8588
uses: actions-rs/cargo@v1
8689
with:

bindings/python/src/tokenizer.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1151,8 +1151,7 @@ impl PyTokenizer {
11511151
.map(|token| {
11521152
if let Ok(content) = token.extract::<String>() {
11531153
Ok(PyAddedToken::from(content, Some(false)).get_token())
1154-
} else if let Ok(mut token) = token.extract::<PyRefMut<PyAddedToken>>() {
1155-
token.special = false;
1154+
} else if let Ok(token) = token.extract::<PyRefMut<PyAddedToken>>() {
11561155
Ok(token.get_token())
11571156
} else {
11581157
Err(exceptions::PyTypeError::new_err(

bindings/python/tests/bindings/test_tokenizer.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -535,3 +535,15 @@ def test_splitting(self):
535535
"▁▁▁▁▁▁",
536536
"▁.",
537537
]
538+
539+
def test_decode_special(self):
540+
tokenizer = Tokenizer(BPE())
541+
tokenizer.add_tokens([AddedToken("my", special=True), AddedToken("name", special=False), "is", "john", "pair"])
542+
543+
# Can decode single sequences
544+
output = tokenizer.decode([0, 1, 2, 3], skip_special_tokens=False)
545+
assert output == "my name is john"
546+
547+
output = tokenizer.decode([0, 1, 2, 3], skip_special_tokens=True)
548+
assert output == "name is john"
549+
assert tokenizer.get_added_tokens_decoder()[0] == AddedToken("my", special=True)

0 commit comments

Comments (0)