Skip to content

Commit

Permalink
Revert "[BREAKING CHANGE] Ignore added_tokens (both special and not) in the decoder (#1513)" (#1569)
Browse files Browse the repository at this point in the history

* Revert "[BREAKING CHANGE] Ignore added_tokens (both special and not) in the decoder (#1513)"

This reverts commit 25aee8b.

* don't remove audit

* deprecate id_to_token

* use simple id to token

* don't break id_to_token since we are deprecating anyways?
  • Loading branch information
ArthurZucker authored Jul 12, 2024
1 parent fdd26ba commit f2a44dc
Showing 1 changed file with 16 additions and 28 deletions.
44 changes: 16 additions & 28 deletions tokenizers/src/tokenizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -847,35 +847,23 @@ where

  /// Decode the given ids, back to a String
  pub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String> {
- let mut result = String::with_capacity(ids.len());
- let mut chunks = Vec::with_capacity(ids.len());
- for id in ids {
- if let Some(added_token) = self.added_vocabulary.simple_id_to_token(*id) {
- if skip_special_tokens && self.added_vocabulary.is_special_token(&added_token) {
- continue;
- }
- let text_chunk = if let Some(decoder) = &self.decoder {
- decoder.decode(chunks.clone())?
- } else {
- chunks.join(" ")
- };
- result.push_str(&text_chunk);
- if !result.is_empty() && self.decoder.is_none() {
- result.push(' ');
- }
- result.push_str(&added_token);
- chunks.clear();
- } else if let Some(token) = self.model.id_to_token(*id) {
- chunks.push(token);
- }
- }
- let text_chunk = if let Some(decoder) = &self.decoder {
- decoder.decode(chunks.clone())?
+ let tokens = ids
+ .iter()
+ .filter_map(|id| {
+ self.added_vocabulary
+ .simple_id_to_token(*id)
+ .or_else(|| self.model.id_to_token(*id))
+ .filter(|token| {
+ !skip_special_tokens || !self.added_vocabulary.is_special_token(token)
+ })
+ })
+ .collect::<Vec<_>>();
+
+ if let Some(decoder) = &self.decoder {
+ decoder.decode(tokens)
  } else {
- chunks.join(" ")
- };
- result.push_str(&text_chunk);
- Ok(result)
+ Ok(tokens.join(" "))
+ }
  }
  }

Expand Down

0 comments on commit f2a44dc

Please sign in to comment.