Skip to content

Commit

Permalink
Fix chance of lossy string replacement character pushing string over SUDACHI_MAX_TOKENIZER_LENGTH
Browse files: browse the repository at this point in the history
  • Loading branch information
Kuuuube committed Nov 10, 2024
1 parent 2aaa5f2 commit 8042a71
Showing 1 changed file with 9 additions and 1 deletion.
10 changes: 9 additions & 1 deletion src/file_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,15 @@ pub fn get_plain_file_data(filepaths: Vec<PathBuf>) -> Vec<String> {
let current_string_clone = current_string.clone();
let split_string = current_string_clone.as_bytes().split_at(SUDACHI_MAX_TOKENIZER_LENGTH);
current_string = String::from_utf8_lossy(split_string.1).to_string();
String::from_utf8_lossy(split_string.0).to_string()
let expected_string_len = split_string.0.len();
let lossy_string = String::from_utf8_lossy(split_string.0).to_string();
if lossy_string.len() > expected_string_len { //if `from_utf8_lossy` creates a replacement character `�` it needs to be chopped off
let mut lossy_chars = lossy_string.chars();
lossy_chars.next_back();
lossy_chars.collect()
} else {
lossy_string
}
} else {
current_string = "".to_string();
current_string.clone()
Expand Down

0 comments on commit 8042a71

Please sign in to comment.