Skip to content

Commit cdabef1

Browse files
authored
Changing Decoder trait to be more composable. (#938)
* Changing `Decoder` trait to be more composable. Fix #872 * Fixing Python side. * Fixing test. * Updating cleanup signature, removing turbofish.
1 parent 1f1f86d commit cdabef1

11 files changed

Lines changed: 147 additions & 80 deletions

File tree

bindings/node/lib/bindings/decoders.test.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ describe("wordPieceDecoder", () => {
1212
it("can decode arrays of strings", () => {
1313
expect(
1414
wordPieceDecoder().decode(["Hel", "##lo", "there", "my", "fr", "##iend"])
15-
).toEqual("Hello there my friend");
15+
).toEqual(["Hel", "lo", " there", " my", " fr", "iend"]);
1616
});
1717
});
1818

@@ -39,6 +39,6 @@ describe("ctcDecoder", () => {
3939
it("encodes correctly", () => {
4040
expect(
4141
ctcDecoder().decode(["<pad>", "h", "h", "e", "e", "l", "l", "<pad>", "l", "l", "o"])
42-
).toEqual("hello");
42+
).toEqual(["h", "e", "l", "l", "o"]);
4343
});
4444
});

bindings/node/native/src/decoders.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ pub struct Decoder {
1414
}
1515

1616
impl tk::Decoder for Decoder {
17-
fn decode(&self, tokens: Vec<String>) -> tk::Result<String> {
17+
fn decode(&self, tokens: Vec<String>) -> tk::Result<Vec<String>> {
1818
self.decoder
1919
.as_ref()
2020
.ok_or("Uninitialized Decoder")?
@@ -41,7 +41,13 @@ declare_types! {
4141
.decode(tokens)
4242
.map_err(|e| Error(format!("{}", e)))?;
4343

44-
Ok(cx.string(output).upcast())
44+
let decoded = JsArray::new(&mut cx, output.len() as u32);
45+
for (i, token) in output.into_iter().enumerate() {
46+
let js_token = cx.string(token);
47+
decoded.set(&mut cx, i as u32, js_token)?;
48+
}
49+
50+
Ok(decoded.upcast())
4551
}
4652
}
4753
}

bindings/python/src/decoders.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ impl PyDecoder {
5151
}
5252

5353
impl Decoder for PyDecoder {
54-
fn decode(&self, tokens: Vec<String>) -> tk::Result<String> {
54+
fn decode(&self, tokens: Vec<String>) -> tk::Result<Vec<String>> {
5555
self.decoder.decode(tokens)
5656
}
5757
}
@@ -98,7 +98,7 @@ impl PyDecoder {
9898
/// Returns:
9999
/// :obj:`List[str]`: The decoded tokens
100100
#[text_signature = "(self, tokens)"]
101-
fn decode(&self, tokens: Vec<String>) -> PyResult<String> {
101+
fn decode(&self, tokens: Vec<String>) -> PyResult<Vec<String>> {
102102
ToPyResult(self.decoder.decode(tokens)).into()
103103
}
104104
}
@@ -337,12 +337,12 @@ impl CustomDecoder {
337337
}
338338

339339
impl Decoder for CustomDecoder {
340-
fn decode(&self, tokens: Vec<String>) -> tk::Result<String> {
340+
fn decode(&self, tokens: Vec<String>) -> tk::Result<Vec<String>> {
341341
Python::with_gil(|py| {
342342
let decoded = self
343343
.inner
344344
.call_method(py, "decode", (tokens,), None)?
345-
.extract::<String>(py)?;
345+
.extract(py)?;
346346
Ok(decoded)
347347
})
348348
}
@@ -396,7 +396,7 @@ where
396396
}
397397

398398
impl Decoder for PyDecoderWrapper {
399-
fn decode(&self, tokens: Vec<String>) -> tk::Result<String> {
399+
fn decode(&self, tokens: Vec<String>) -> tk::Result<Vec<String>> {
400400
match self {
401401
PyDecoderWrapper::Wrapped(inner) => inner.read().unwrap().decode(tokens),
402402
PyDecoderWrapper::Custom(inner) => inner.read().unwrap().decode(tokens),

bindings/python/tests/bindings/test_decoders.py

Lines changed: 43 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def test_instantiate(self):
1414

1515
def test_decoding(self):
1616
decoder = ByteLevel()
17-
assert decoder.decode(["My", "Ġname", "Ġis", "ĠJohn"]) == "My name is John"
17+
assert decoder.decode(["My", "Ġname", "Ġis", "ĠJohn"]) == ["My name is John"]
1818

1919
def test_manual_reload(self):
2020
byte_level = ByteLevel()
@@ -34,11 +34,25 @@ def test_instantiate(self):
3434

3535
def test_decoding(self):
3636
decoder = WordPiece()
37-
assert decoder.decode(["My", "na", "##me", "is", "Jo", "##hn"]) == "My name is John"
38-
assert decoder.decode(["I", "'m", "Jo", "##hn"]) == "I'm John"
37+
assert decoder.decode(["My", "na", "##me", "is", "Jo", "##hn"]) == [
38+
"My",
39+
" na",
40+
"me",
41+
" is",
42+
" Jo",
43+
"hn",
44+
]
45+
assert decoder.decode(["I", "'m", "Jo", "##hn"]) == ["I", "'m", " Jo", "hn"]
3946
decoder = WordPiece(prefix="__", cleanup=False)
40-
assert decoder.decode(["My", "na", "__me", "is", "Jo", "__hn"]) == "My name is John"
41-
assert decoder.decode(["I", "'m", "Jo", "__hn"]) == "I 'm John"
47+
assert decoder.decode(["My", "na", "__me", "is", "Jo", "__hn"]) == [
48+
"My",
49+
" na",
50+
"me",
51+
" is",
52+
" Jo",
53+
"hn",
54+
]
55+
assert decoder.decode(["I", "'m", "Jo", "__hn"]) == ["I", " 'm", " Jo", "hn"]
4256

4357
def test_can_modify(self):
4458
decoder = WordPiece(prefix="$$", cleanup=False)
@@ -66,9 +80,9 @@ def test_instantiate(self):
6680

6781
def test_decoding(self):
6882
decoder = Metaspace()
69-
assert decoder.decode(["▁My", "▁name", "▁is", "▁John"]) == "My name is John"
83+
assert decoder.decode(["▁My", "▁name", "▁is", "▁John"]) == ["My", " name", " is", " John"]
7084
decoder = Metaspace(replacement="-", add_prefix_space=False)
71-
assert decoder.decode(["-My", "-name", "-is", "-John"]) == " My name is John"
85+
assert decoder.decode(["-My", "-name", "-is", "-John"]) == [" My", " name", " is", " John"]
7286

7387
def test_can_modify(self):
7488
decoder = Metaspace(replacement="*", add_prefix_space=False)
@@ -93,12 +107,23 @@ def test_instantiate(self):
93107

94108
def test_decoding(self):
95109
decoder = BPEDecoder()
96-
assert (
97-
decoder.decode(["My</w>", "na", "me</w>", "is</w>", "Jo", "hn</w>"])
98-
== "My name is John"
99-
)
110+
assert decoder.decode(["My</w>", "na", "me</w>", "is</w>", "Jo", "hn</w>"]) == [
111+
"My ",
112+
"na",
113+
"me ",
114+
"is ",
115+
"Jo",
116+
"hn",
117+
]
100118
decoder = BPEDecoder(suffix="_")
101-
assert decoder.decode(["My_", "na", "me_", "is_", "Jo", "hn_"]) == "My name is John"
119+
assert decoder.decode(["My_", "na", "me_", "is_", "Jo", "hn_"]) == [
120+
"My ",
121+
"na",
122+
"me ",
123+
"is ",
124+
"Jo",
125+
"hn",
126+
]
102127

103128
def test_can_modify(self):
104129
decoder = BPEDecoder(suffix="123")
@@ -120,19 +145,13 @@ def test_instantiate(self):
120145

121146
def test_decoding(self):
122147
decoder = CTC()
123-
assert (
124-
decoder.decode(
125-
["<pad>", "<pad>", "h", "e", "e", "l", "l", "<pad>", "l", "o", "o", "o", "<pad>"]
126-
)
127-
== "hello"
128-
)
148+
assert decoder.decode(
149+
["<pad>", "<pad>", "h", "e", "e", "l", "l", "<pad>", "l", "o", "o", "o", "<pad>"]
150+
) == ["h", "e", "l", "l", "o"]
129151
decoder = CTC(pad_token="[PAD]")
130-
assert (
131-
decoder.decode(
132-
["[PAD]", "[PAD]", "h", "e", "e", "l", "l", "[PAD]", "l", "o", "o", "o", "[PAD]"]
133-
)
134-
== "hello"
135-
)
152+
assert decoder.decode(
153+
["[PAD]", "[PAD]", "h", "e", "e", "l", "l", "[PAD]", "l", "o", "o", "o", "[PAD]"]
154+
) == ["h", "e", "l", "l", "o"]
136155

137156
def test_can_modify(self):
138157
decoder = CTC(pad_token="[PAD]")

tokenizers/src/decoders/bpe.rs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,15 @@ impl Default for BPEDecoder {
2424
}
2525

2626
impl Decoder for BPEDecoder {
27-
fn decode(&self, tokens: Vec<String>) -> Result<String> {
28-
Ok(tokens.join("").replace(&self.suffix, " ").trim().to_owned())
27+
fn decode(&self, tokens: Vec<String>) -> Result<Vec<String>> {
28+
let n = tokens.len() - 1;
29+
Ok(tokens
30+
.into_iter()
31+
.enumerate()
32+
.map(|(i, token)| {
33+
let replacement = if i == n { "" } else { " " };
34+
token.replace(&self.suffix, replacement)
35+
})
36+
.collect())
2937
}
3038
}

tokenizers/src/decoders/ctc.rs

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -42,16 +42,23 @@ impl Default for CTC {
4242
}
4343

4444
impl Decoder for CTC {
45-
fn decode(&self, tokens: Vec<String>) -> Result<String> {
46-
let mut output = tokens
45+
fn decode(&self, tokens: Vec<String>) -> Result<Vec<String>> {
46+
Ok(tokens
4747
.into_iter()
4848
.dedup()
49-
.join("")
50-
.replace(&self.pad_token, "");
51-
if self.cleanup {
52-
output = wordpiece::cleanup(output).replace(&self.word_delimiter_token, " ");
53-
}
54-
Ok(output)
49+
.filter_map(|token| {
50+
let mut replaced = token.replace(&self.pad_token, "");
51+
if self.cleanup {
52+
replaced =
53+
wordpiece::cleanup(&replaced).replace(&self.word_delimiter_token, " ");
54+
}
55+
if replaced.is_empty() {
56+
None
57+
} else {
58+
Some(replaced)
59+
}
60+
})
61+
.collect())
5562
}
5663
}
5764

@@ -67,7 +74,7 @@ mod tests {
6774
.collect();
6875
assert_eq!(
6976
ctc_decoder.decode(id_to_string_result).unwrap(),
70-
"hello".to_string()
77+
vec!["h", "e", "l", "l", "o"]
7178
);
7279
}
7380
#[test]
@@ -79,7 +86,7 @@ mod tests {
7986
.collect();
8087
assert_eq!(
8188
ctc_decoder.decode(id_to_string_result).unwrap(),
82-
"hello world".to_string()
89+
vec!["h", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d"]
8390
);
8491
}
8592
#[test]
@@ -88,7 +95,11 @@ mod tests {
8895
let id_to_string_result = "<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> A | | <pad> M <pad> <pad> <pad> <pad> A <pad> <pad> N <pad> <pad> <pad> | | | <pad> <pad> <pad> <pad> S <pad> <pad> <pad> A I <pad> D D | | T T <pad> O <pad> | | T H E E | | | <pad> U U <pad> N N <pad> I <pad> <pad> V <pad> <pad> <pad> E R R <pad> <pad> <pad> S E E | | <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> S S <pad> <pad> <pad> <pad> I <pad> R R <pad> <pad> | | | <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> I <pad> <pad> <pad> | <pad> <pad> <pad> E X <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> I <pad> S <pad> <pad> T <pad> <pad> | | <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>".split(' ').map(|s| s.to_string()).collect();
8996
assert_eq!(
9097
ctc_decoder.decode(id_to_string_result).unwrap(),
91-
"A MAN SAID TO THE UNIVERSE SIR I EXIST ".to_string()
98+
vec![
99+
"A", " ", "M", "A", "N", " ", "S", "A", "I", "D", " ", "T", "O", " ", "T", "H",
100+
"E", " ", "U", "N", "I", "V", "E", "R", "S", "E", " ", "S", "I", "R", " ", "I",
101+
" ", "E", "X", "I", "S", "T", " "
102+
]
92103
);
93104
}
94105
#[test]
@@ -97,7 +108,13 @@ mod tests {
97108
let id_to_string_result = "<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> H <pad> I <pad> S S | | <pad> <pad> <pad> I N <pad> <pad> S <pad> T T <pad> <pad> A N C C T <pad> | | | | | <pad> <pad> <pad> <pad> P <pad> <pad> <pad> <pad> A <pad> <pad> N N N <pad> <pad> I <pad> C <pad> <pad> | | <pad> W <pad> <pad> A S <pad> | | <pad> <pad> <pad> F <pad> <pad> O L <pad> <pad> L L O O W E E D | | <pad> B <pad> <pad> <pad> Y <pad> | | | A | | <pad> S S S <pad> M M <pad> <pad> <pad> A L L <pad> <pad> <pad> <pad> L <pad> | | | <pad> <pad> <pad> <pad> S H H <pad> <pad> <pad> <pad> A R R <pad> <pad> P <pad> <pad> | <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> B <pad> <pad> L L <pad> <pad> <pad> <pad> <pad> O W W <pad> <pad> | | | <pad> <pad> <pad> <pad> <pad> <pad> <pad> H <pad> <pad> <pad> <pad> <pad> <pad> <pad> I G H H | | <pad> <pad> O N <pad> | | H <pad> I S S | | <pad> <pad> C H H <pad> <pad> <pad> E <pad> S S <pad> T T <pad> <pad> | | | <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>".split(' ').map(|s| s.to_string()).collect();
98109
assert_eq!(
99110
ctc_decoder.decode(id_to_string_result).unwrap(),
100-
"HIS INSTANCT PANIC WAS FOLLOWED BY A SMALL SHARP BLOW HIGH ON HIS CHEST ".to_string()
111+
vec![
112+
"H", "I", "S", " ", "I", "N", "S", "T", "A", "N", "C", "T", " ", "P", "A", "N",
113+
"I", "C", " ", "W", "A", "S", " ", "F", "O", "L", "L", "O", "W", "E", "D", " ",
114+
"B", "Y", " ", "A", " ", "S", "M", "A", "L", "L", " ", "S", "H", "A", "R", "P",
115+
" ", "B", "L", "O", "W", " ", "H", "I", "G", "H", " ", "O", "N", " ", "H", "I",
116+
"S", " ", "C", "H", "E", "S", "T", " "
117+
]
101118
);
102119
}
103120
}

tokenizers/src/decoders/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ pub enum DecoderWrapper {
2626
}
2727

2828
impl Decoder for DecoderWrapper {
29-
fn decode(&self, tokens: Vec<String>) -> Result<String> {
29+
fn decode(&self, tokens: Vec<String>) -> Result<Vec<String>> {
3030
match self {
3131
Self::BPE(bpe) => bpe.decode(tokens),
3232
Self::ByteLevel(bl) => bl.decode(tokens),

tokenizers/src/decoders/wordpiece.rs

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ impl Default for WordPiece {
2828
}
2929
}
3030
}
31-
pub fn cleanup(dirty_input: String) -> String {
31+
pub fn cleanup(dirty_input: &str) -> String {
3232
dirty_input
3333
.replace(" .", ".")
3434
.replace(" ?", "?")
@@ -44,12 +44,21 @@ pub fn cleanup(dirty_input: String) -> String {
4444
}
4545

4646
impl Decoder for WordPiece {
47-
fn decode(&self, tokens: Vec<String>) -> Result<String> {
48-
let mut output = tokens.join(" ").replace(&format!(" {}", self.prefix), "");
49-
if self.cleanup {
50-
output = cleanup(output);
51-
}
52-
53-
Ok(output)
47+
fn decode(&self, mut tokens: Vec<String>) -> Result<Vec<String>> {
48+
tokens
49+
.iter_mut()
50+
.enumerate()
51+
.map(|(i, token)| {
52+
if token.starts_with(&self.prefix) {
53+
*token = token.replacen(&self.prefix, "", 1);
54+
} else if i != 0 {
55+
*token = format!(" {}", token);
56+
}
57+
if self.cleanup {
58+
*token = cleanup(token);
59+
}
60+
Ok(token.to_string())
61+
})
62+
.collect::<Result<_>>()
5463
}
5564
}

tokenizers/src/pre_tokenizers/byte_level.rs

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -124,8 +124,11 @@ impl PreTokenizer for ByteLevel {
124124

125125
/// As a `Decoder`, `ByteLevel` is in charge of converting any byte-level characters to their
126126
/// unicode counterpart, before merging everything back into a single String.
127+
/// This decoder will consume the tokens and merge them in one step to alleviate
128+
/// the fact that a single decoded token might be a byte not representable
129+
/// as a String.
127130
impl Decoder for ByteLevel {
128-
fn decode(&self, tokens: Vec<String>) -> Result<String> {
131+
fn decode(&self, tokens: Vec<String>) -> Result<Vec<String>> {
129132
let toks = tokens
130133
.into_iter()
131134
.flat_map(|t| {
@@ -138,8 +141,8 @@ impl Decoder for ByteLevel {
138141
})
139142
.unwrap_or_else(|| t.as_bytes().to_vec())
140143
})
141-
.collect::<Vec<_>>();
142-
Ok(String::from_utf8_lossy(&toks).into_owned())
144+
.collect::<Vec<u8>>();
145+
Ok(vec![String::from_utf8_lossy(&toks).to_string()])
143146
}
144147
}
145148

@@ -248,7 +251,6 @@ mod tests {
248251
fn decoding() {
249252
let bytelevel = ByteLevel::default().add_prefix_space(false);
250253
assert_eq!(
251-
"Hello my friend, how is your day going?",
252254
bytelevel
253255
.decode(
254256
vec![
@@ -259,7 +261,8 @@ mod tests {
259261
.map(|s| s.into())
260262
.collect::<Vec<String>>()
261263
)
262-
.unwrap()
264+
.unwrap(),
265+
vec!["Hello my friend, how is your day going?"]
263266
);
264267
}
265268

@@ -311,7 +314,7 @@ mod tests {
311314
.iter()
312315
.flat_map(|(s, _, _)| s.split("").map(|t| t.into()))
313316
.collect::<Vec<_>>();
314-
assert_eq!(sample, bytelevel.decode(separated_tokens).unwrap());
317+
assert_eq!(sample, bytelevel.decode(separated_tokens).unwrap().join(""));
315318
}
316319
}
317320

@@ -507,7 +510,7 @@ mod tests {
507510
"[PA D]".into()
508511
])
509512
.unwrap(),
510-
"Hello there dear friend! [PA D]"
513+
vec!["Hello there dear friend! [PA D]"]
511514
);
512515
}
513516
}

0 commit comments

Comments
 (0)