From 4c8cf07fa341fff10d27cd9266378d4b7a1bf74b Mon Sep 17 00:00:00 2001
From: Pedro Cuenca
Date: Mon, 19 Aug 2024 12:26:43 +0200
Subject: [PATCH] Tokenizer fixes (#113)

* Bring over hf token envvar from preview branch

* Add tests for Gemma, including edge cases

Edge cases also added for other BPE tokenizers, but not for T5 yet.

* Sort added tokens by length (descending) to avoid early partial matches

Similar to https://github.com/xenova/transformers.js/commit/c305c3824f628f1f02806a6310bd3b18b0f7f8f5

* Store vocab as NSString to allow multiple tokens with the same Unicode canonical representation.

* Remove comments

* Go back to making vocab dictionaries private

* Use ungated copy of Gemma tokenizer

* Use NSString in UnigramTokenizer

* Switch test to microsoft tokenizer, verify in Python
---
 Sources/Hub/Hub.swift | 12 ++---
 Sources/Hub/HubApi.swift | 6 +--
 Sources/Tokenizers/BPETokenizer.swift | 24 +++++-----
 Sources/Tokenizers/Tokenizer.swift | 23 ++++++---
 Sources/Tokenizers/UnigramTokenizer.swift | 18 +++----
 Tests/HubTests/HubTests.swift | 21 +++++++++
 Tests/TokenizersTests/AddedTokensTests.swift | 15 ++++--
 .../Resources/gemma_encoded.json | 1 +
 .../Resources/tokenizer_tests.json | 2 +-
 Tests/TokenizersTests/TokenizerTests.swift | 47 ++++++++++++++++++-
 10 files changed, 129 insertions(+), 40 deletions(-)
 create mode 100644 Tests/TokenizersTests/Resources/gemma_encoded.json

diff --git a/Sources/Hub/Hub.swift b/Sources/Hub/Hub.swift
index 8deeee8..613cdd9 100644
--- a/Sources/Hub/Hub.swift
+++ b/Sources/Hub/Hub.swift
@@ -38,9 +38,9 @@ public extension Hub {
 @dynamicMemberLookup
 public struct Config {
-    public private(set) var dictionary: [String: Any]
+    public private(set) var dictionary: [NSString: Any]
-    public init(_ dictionary: [String: Any]) {
+    public init(_ dictionary: [NSString: Any]) {
         self.dictionary = dictionary
     }
@@ -76,8 +76,8 @@ public struct Config {
     public subscript(dynamicMember member: String) -> Config? {
-        let key = dictionary[member] != nil ? member : uncamelCase(member)
-        if let value = dictionary[key] as? [String: Any] {
+        let key = (dictionary[member as NSString] != nil ? member : uncamelCase(member)) as NSString
+        if let value = dictionary[key] as? [NSString: Any] {
             return Config(value)
         } else if let value = dictionary[key] {
             return Config(["value": value])
@@ -96,7 +96,7 @@ public struct Config {
     // Instead of doing this we could provide custom classes and decode to them
     public var arrayValue: [Config]? {
         guard let list = value as? [Any] else { return nil }
-        return list.map { Config($0 as! [String : Any]) }
+        return list.map { Config($0 as! [NSString : Any]) }
     }
     /// Tuple of token identifier and string value
@@ -206,7 +206,7 @@ public class LanguageModelConfigurationFromHub {
         do {
             let data = try Data(contentsOf: url)
             let parsed = try JSONSerialization.jsonObject(with: data, options: [])
-            guard let dictionary = parsed as? [String: Any] else { return nil }
+            guard let dictionary = parsed as? [NSString: Any] else { return nil }
             return Config(dictionary)
         } catch {
             return nil
diff --git a/Sources/Hub/HubApi.swift b/Sources/Hub/HubApi.swift
index 0d789da..355f5e2 100644
--- a/Sources/Hub/HubApi.swift
+++ b/Sources/Hub/HubApi.swift
@@ -17,7 +17,7 @@ public struct HubApi {
     public typealias Repo = Hub.Repo
     public init(downloadBase: URL? = nil, hfToken: String? = nil, endpoint: String = "https://huggingface.co", useBackgroundSession: Bool = false) {
-        self.hfToken = hfToken
+        self.hfToken = hfToken ?? ProcessInfo.processInfo.environment["HUGGING_FACE_HUB_TOKEN"]
         if let downloadBase {
             self.downloadBase = downloadBase
         } else {
@@ -102,7 +102,7 @@ public extension HubApi {
     func configuration(fileURL: URL) throws -> Config {
         let data = try Data(contentsOf: fileURL)
         let parsed = try JSONSerialization.jsonObject(with: data, options: [])
-        guard let dictionary = parsed as? [String: Any] else { throw Hub.HubClientError.parse }
+        guard let dictionary = parsed as? [NSString: Any] else { throw Hub.HubClientError.parse }
         return Config(dictionary)
     }
 }
@@ -116,7 +116,7 @@ public extension HubApi {
         let (data, _) = try await httpGet(for: url)
         let parsed = try JSONSerialization.jsonObject(with: data, options: [])
-        guard let dictionary = parsed as? [String: Any] else { throw Hub.HubClientError.parse }
+        guard let dictionary = parsed as? [NSString: Any] else { throw Hub.HubClientError.parse }
         return Config(dictionary)
     }
 }
diff --git a/Sources/Tokenizers/BPETokenizer.swift b/Sources/Tokenizers/BPETokenizer.swift
index bed4785..aa3889c 100644
--- a/Sources/Tokenizers/BPETokenizer.swift
+++ b/Sources/Tokenizers/BPETokenizer.swift
@@ -33,9 +33,11 @@ struct BytePair: Hashable {
 class BPETokenizer: PreTrainedTokenizerModel {
     let bpeRanks: Dictionary
-    private let tokensToIds: [String: Int]
-    private let idsToTokens: [Int: String]
-
+    private let tokensToIds: [NSString: Int]
+    private let idsToTokens: [Int: NSString]
+
+    var vocabCount: Int { tokensToIds.count }
+
     public let bosToken: String?
     public let bosTokenId: Int?
     public let eosToken: String?
@@ -45,7 +47,7 @@ class BPETokenizer: PreTrainedTokenizerModel {
     required init(tokenizerConfig: Config, tokenizerData: Config, addedTokens: [String : Int]) throws {
         guard let merges = tokenizerData.model?.merges?.value as? [String] else { fatalError("BPETokenizer requires merges") }
-        guard let vocab = tokenizerData.model?.vocab?.dictionary as? [String: Int] else {
+        guard let vocab = tokenizerData.model?.vocab?.dictionary as? [NSString: Int] else {
             throw TokenizerError.missingVocab
         }
         var bpeRanks: Dictionary = [:]
@@ -56,31 +58,31 @@ class BPETokenizer: PreTrainedTokenizerModel {
         }
         self.bpeRanks = bpeRanks
-        self.tokensToIds = vocab.merging(addedTokens) { $1 }
+        self.tokensToIds = vocab.merging(addedTokens as [NSString : Int]) { $1 }
         self.idsToTokens = Utils.invert(self.tokensToIds)
         // Populate tokens
         if let unknownToken = TokenizerModel.unknownToken(from: tokenizerConfig) {
             self.unknownToken = unknownToken
-            self.unknownTokenId = self.tokensToIds[unknownToken]
+            self.unknownTokenId = self.tokensToIds[unknownToken as NSString]
         } else {
             self.unknownToken = nil
             self.unknownTokenId = nil
         }
         eosToken = tokenizerConfig.eosToken?.stringValue
-        eosTokenId = eosToken == nil ? nil : tokensToIds[eosToken!]
+        eosTokenId = eosToken == nil ? nil : tokensToIds[eosToken! as NSString]
         bosToken = tokenizerConfig.bosToken?.stringValue
-        bosTokenId = bosToken == nil ? nil : tokensToIds[bosToken!]
+        bosTokenId = bosToken == nil ? nil : tokensToIds[bosToken! as NSString]
     }
     func convertTokenToId(_ token: String) -> Int? {
-        return tokensToIds[token] ?? self.unknownTokenId
+        return tokensToIds[token as NSString] ?? self.unknownTokenId
     }
     func convertIdToToken(_ id: Int) -> String? {
-        return idsToTokens[id]
+        return idsToTokens[id] as String?
     }
     func byteEncode(text: String) -> [String] {
@@ -162,7 +164,7 @@ class BPETokenizer: PreTrainedTokenizerModel {
         var tokens: [String] = []
         let bpeTokens = self.bpe(token: text).split(separator: " ").map { String($0) }
         for token in bpeTokens {
-            if let _ = tokensToIds[token] {
+            if convertTokenToId(token) != unknownTokenId {
                 tokens.append(token)
             } else {
                 // TODO: if config.byte_fallback is False, append the unknown token instead
diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift
index b06fdb3..34460bc 100644
--- a/Sources/Tokenizers/Tokenizer.swift
+++ b/Sources/Tokenizers/Tokenizer.swift
@@ -163,12 +163,23 @@ public class PreTrainedTokenizer: Tokenizer {
             }
         }
-        let addedTokensRegexString = (tokenizerData.addedTokens?.arrayValue ?? []).compactMap { addedToken in
-            guard let content = addedToken.content?.stringValue else { return nil }
-            let prefix = (addedToken.lstrip?.boolValue ?? false ? #"\s*"# : "")
-            let suffix = (addedToken.rstrip?.boolValue ?? false ? #"\s*"# : "")
-            let token = NSRegularExpression.escapedPattern(for: content)
-            return "\(prefix)(\(token))\(suffix)"
+        // Convert to tuples for easier access, then sort by length (descending) to avoid early partial matches
+        // (https://github.com/xenova/transformers.js/commit/c305c3824f628f1f02806a6310bd3b18b0f7f8f5)
+        let unwrappedAddedTokens : [(content: String, prefix: Bool, suffix: Bool)] = (tokenizerData.addedTokens?.arrayValue ?? []).compactMap { addedToken in
+            guard let content = addedToken.content?.stringValue else { return nil }
+            let prefix = addedToken.lstrip?.boolValue ?? false
+            let suffix = addedToken.rstrip?.boolValue ?? false
+            return (content: content, prefix: prefix, suffix: suffix)
+        }.sorted {
+            $0.content.count > $1.content.count
+        }
+
+        // then concatenate into regular expression
+        let addedTokensRegexString = unwrappedAddedTokens.map {
+            let token = NSRegularExpression.escapedPattern(for: $0.content)
+            let prefix = $0.prefix ? #"\s*"# : ""
+            let suffix = $0.suffix ? #"\s*"# : ""
+            return "\(prefix)(\(token))\(suffix)"
         }.joined(separator: "|")
         addedTokensRegex = try? NSRegularExpression(pattern: addedTokensRegexString, options: [])
diff --git a/Sources/Tokenizers/UnigramTokenizer.swift b/Sources/Tokenizers/UnigramTokenizer.swift
index 2ac672b..4a274a5 100644
--- a/Sources/Tokenizers/UnigramTokenizer.swift
+++ b/Sources/Tokenizers/UnigramTokenizer.swift
@@ -23,8 +23,8 @@ class UnigramTokenizer: PreTrainedTokenizerModel {
     public var unknownToken: String? { unknownPiece.token }
     let minScore: Float
-    let tokensToIds: [String: Int]
-
+    let tokensToIds: [NSString: Int]
+
     let bosToken: String? = " "
     let bosTokenId: Int?
     let eosToken: String?
@@ -63,12 +63,12 @@ class UnigramTokenizer: PreTrainedTokenizerModel {
         self.unknownTokenId = unknownTokenId
         self.unknownPiece = SentencePieceToken(token: vocab[unknownTokenId].token, score: minScore - 10)
-        tokensToIds = Dictionary(uniqueKeysWithValues: vocab.map { $0.token }.enumerated().map { ($1, $0) })
-        bosTokenId = tokensToIds[bosToken!] // May be nil
-
+        tokensToIds = Dictionary(uniqueKeysWithValues: vocab.map { $0.token as NSString }.enumerated().map { ($1, $0) })
+        bosTokenId = tokensToIds[bosToken! as NSString] // May be nil
+
         eosToken = tokenizerConfig.eosToken?.stringValue
-        eosTokenId = eosToken == nil ? nil : tokensToIds[eosToken!]
-
+        eosTokenId = eosToken == nil ? nil : tokensToIds[eosToken! 
as NSString] + trie = Trie() trie.append(contentsOf: vocab.map { $0.token }) @@ -76,7 +76,7 @@ class UnigramTokenizer: PreTrainedTokenizerModel { } func convertTokenToId(_ token: String) -> Int? { - return tokensToIds[token] ?? self.unknownTokenId + return tokensToIds[token as NSString] ?? self.unknownTokenId } func convertIdToToken(_ id: Int) -> String? { @@ -95,7 +95,7 @@ class UnigramTokenizer: PreTrainedTokenizerModel { let beginIndex = sentence.index(sentence.startIndex, offsetBy: beginPos) for token in trie.commonPrefixSearchIterator(sentence[beginIndex...]).map({ String($0) }) { - guard let tokenId = tokensToIds[token] else { fatalError("Token not in vocab: \(token)") } + guard let tokenId = tokensToIds[token as NSString] else { fatalError("Token not in vocab: \(token)") } let tokenScore = vocab[tokenId].score lattice.insert(startOffset: beginPos, length: token.count, score: tokenScore, tokenId: tokenId) if !hasSingleNode && token.count == mblen { diff --git a/Tests/HubTests/HubTests.swift b/Tests/HubTests/HubTests.swift index 9a1caea..1d7bc86 100644 --- a/Tests/HubTests/HubTests.swift +++ b/Tests/HubTests/HubTests.swift @@ -97,4 +97,25 @@ class HubTests: XCTestCase { XCTFail("Cannot download test configuration from the Hub: \(error)") } } + + func testConfigUnicode() { + // These are two different characters + let json = "{\"vocab\": {\"à\": 1, \"à\": 2}}" + let data = json.data(using: .utf8) + let dict = try! JSONSerialization.jsonObject(with: data!, options: []) as! [NSString: Any] + let config = Config(dict) + + let vocab_nsdict = config.dictionary["vocab"] as! NSDictionary + let vocab_nsstring = config.dictionary["vocab"] as! [NSString: Int] + let vocab = config.vocab!.dictionary + + XCTAssertEqual(vocab_nsdict.count, 2) + XCTAssertEqual(vocab_nsstring.count, 2) + XCTAssertEqual(vocab.count, 2) + + // This is expected because, unlike with NSString, String comparison uses the canonical Unicode representation + // https://developer.apple.com/documentation/swift/string#Modifying-and-Comparing-Strings + let vocab_dict = config.dictionary["vocab"] as! [String: Int] + XCTAssertNotEqual(vocab_dict.count, 2) + } } diff --git a/Tests/TokenizersTests/AddedTokensTests.swift b/Tests/TokenizersTests/AddedTokensTests.swift index 722609f..c82e45f 100644 --- a/Tests/TokenizersTests/AddedTokensTests.swift +++ b/Tests/TokenizersTests/AddedTokensTests.swift @@ -11,12 +11,21 @@ import Hub class AddedTokensTests: XCTestCase { func testPhiAddedTokens() async throws { - let tokenizer = try await AutoTokenizer.from(pretrained: "mlx-community/Phi-3-mini-128k-instruct-4bit") + let tokenizer = try await AutoTokenizer.from(pretrained: "microsoft/Phi-3-mini-128k-instruct") let inputIds = tokenizer("This is the <|end|>. My only friend, the <|end|>") - XCTAssertEqual(inputIds, [1, 910, 338, 278, 29871, 32007, 29889, 1619, 871, 5121, 29892, 278, 29871, 32007]) + XCTAssertEqual(inputIds, [910, 338, 278, 29871, 32007, 29889, 1619, 871, 5121, 29892, 278, 29871, 32007]) let decoded = tokenizer.decode(tokens: inputIds) - XCTAssertEqual(decoded, " This is the <|end|>. My only friend, the <|end|>") + XCTAssertEqual(decoded, "This is the <|end|>. 
My only friend, the <|end|>") + } + + func testGemmaAddedTokens() async throws { + let tokenizer = try await AutoTokenizer.from(pretrained: "pcuenq/gemma-tokenizer") + let inputIds = tokenizer("This\n\nis\na\ntest.") + XCTAssertEqual(inputIds, [2, 1596, 109, 502, 108, 235250, 108, 2195, 235265]) + + let decoded = tokenizer.decode(tokens: inputIds) + XCTAssertEqual(decoded, "This\n\nis\na\ntest.") } func testSplitWithCaptureGroups() { diff --git a/Tests/TokenizersTests/Resources/gemma_encoded.json b/Tests/TokenizersTests/Resources/gemma_encoded.json new file mode 100644 index 0000000..5647f59 --- /dev/null +++ b/Tests/TokenizersTests/Resources/gemma_encoded.json @@ -0,0 +1 @@ +{"text": "Fatouville-Grestain est une commune du Nord-Ouest du d\u00e9partement de l'Eure situ\u00e9e au \nbord de l'estuaire de la Seine et \u00e0 proximit\u00e9 du d\u00e9partement du Calvados. Selon l'atlas des paysages \nde Haute-Normandie, elle appartient \u00e0 la r\u00e9gion naturelle du Lieuvin. Toutefois, l'Agreste, le service \nde la statistique et de la prospective du minist\u00e8re de l'Agriculture, de l'Agroalimentaire et de la For\u00eat, \nla classe au sein du pays d'Auge (en tant que r\u00e9gion agricole).La commune est \u00e0 moins de dix kilom\u00e8tres \u00e0 \nl'est de Honfleur, \u00e0 autant de Beuzeville et \u00e0 environ dix-sept kilom\u00e8tres de Pont-Audemer.", "bpe_tokens": ["Fat", "ou", "ville", "-", "G", "rest", "ain", "\u2581est", "\u2581une", "\u2581commune", "\u2581du", "\u2581Nord", "-", "Ouest", "\u2581du", "\u2581d\u00e9partement", "\u2581de", "\u2581l", "'", "Eure", "\u2581situ\u00e9e", "\u2581au", "\u2581", "\n", "bord", "\u2581de", "\u2581l", "'", "est", "uaire", "\u2581de", "\u2581la", "\u2581Seine", "\u2581et", "\u2581\u00e0", "\u2581proximit\u00e9", "\u2581du", "\u2581d\u00e9partement", "\u2581du", "\u2581Cal", "vados", ".", "\u2581Selon", "\u2581l", "'", "atlas", "\u2581des", "\u2581paysages", "\u2581", "\n", "de", "\u2581Haute", "-", "Norman", "die", ",", "\u2581elle", "\u2581appartient", "\u2581\u00e0", "\u2581la", "\u2581r\u00e9gion", "\u2581naturelle", "\u2581du", "\u2581Lieu", "vin", ".", "\u2581Toutefois", ",", "\u2581l", "'", "Ag", "reste", ",", "\u2581le", "\u2581service", "\u2581", "\n", "de", "\u2581la", "\u2581statistique", "\u2581et", "\u2581de", "\u2581la", "\u2581prospective", "\u2581du", "\u2581minist\u00e8re", "\u2581de", "\u2581l", "'", "Agriculture", ",", "\u2581de", "\u2581l", "'", "Agro", "alimenta", "ire", "\u2581et", "\u2581de", "\u2581la", "\u2581For", "\u00eat", ",", "\u2581", "\n", "la", "\u2581classe", "\u2581au", "\u2581sein", "\u2581du", "\u2581pays", "\u2581d", "'", "Au", "ge", "\u2581(", "en", "\u2581tant", "\u2581que", "\u2581r\u00e9gion", "\u2581agricole", ").", "La", "\u2581commune", "\u2581est", "\u2581\u00e0", "\u2581moins", "\u2581de", "\u2581dix", "\u2581kilom\u00e8tres", "\u2581\u00e0", "\u2581", "\n", "l", "'", "est", "\u2581de", "\u2581Hon", "fleur", ",", "\u2581\u00e0", "\u2581autant", "\u2581de", "\u2581Be", "uze", "ville", "\u2581et", "\u2581\u00e0", "\u2581environ", "\u2581dix", "-", "sept", "\u2581kilom\u00e8tres", "\u2581de", "\u2581Pont", "-", "Au", "de", "mer", "."], "token_ids": [2, 33690, 507, 5259, 235290, 235319, 4803, 985, 1455, 2360, 34960, 1344, 14852, 235290, 101323, 1344, 57781, 581, 533, 235303, 128985, 80493, 992, 235248, 108, 51123, 581, 533, 235303, 644, 106910, 581, 683, 53876, 1008, 1305, 72883, 1344, 57781, 1344, 2659, 119613, 235265, 86721, 533, 235303, 64117, 848, 141362, 235248, 108, 495, 70628, 235290, 74906, 
3917, 235269, 11340, 133635, 1305, 683, 33927, 72277, 1344, 174959, 2964, 235265, 145673, 235269, 533, 235303, 6665, 62423, 235269, 709, 2566, 235248, 108, 495, 683, 160719, 1008, 581, 683, 40675, 1344, 85986, 581, 533, 235303, 79742, 235269, 581, 533, 235303, 166317, 104544, 844, 1008, 581, 683, 1699, 19941, 235269, 235248, 108, 522, 30739, 992, 8399, 1344, 11928, 499, 235303, 2159, 541, 591, 479, 21482, 907, 33927, 113917, 846, 2841, 34960, 1455, 1305, 15006, 581, 51102, 118516, 1305, 235248, 108, 235257, 235303, 644, 581, 9073, 129564, 235269, 1305, 54409, 581, 2065, 52172, 5259, 1008, 1305, 15265, 51102, 235290, 91012, 118516, 581, 52291, 235290, 2159, 495, 977, 235265], "decoded_text": "Fatouville-Grestain est une commune du Nord-Ouest du d\u00e9partement de l'Eure situ\u00e9e au \nbord de l'estuaire de la Seine et \u00e0 proximit\u00e9 du d\u00e9partement du Calvados. Selon l'atlas des paysages \nde Haute-Normandie, elle appartient \u00e0 la r\u00e9gion naturelle du Lieuvin. Toutefois, l'Agreste, le service \nde la statistique et de la prospective du minist\u00e8re de l'Agriculture, de l'Agroalimentaire et de la For\u00eat, \nla classe au sein du pays d'Auge (en tant que r\u00e9gion agricole).La commune est \u00e0 moins de dix kilom\u00e8tres \u00e0 \nl'est de Honfleur, \u00e0 autant de Beuzeville et \u00e0 environ dix-sept kilom\u00e8tres de Pont-Audemer."} \ No newline at end of file diff --git a/Tests/TokenizersTests/Resources/tokenizer_tests.json b/Tests/TokenizersTests/Resources/tokenizer_tests.json index 18742a7..e392476 100644 --- a/Tests/TokenizersTests/Resources/tokenizer_tests.json +++ b/Tests/TokenizersTests/Resources/tokenizer_tests.json @@ -1 +1 @@ -{"bert-base-uncased": [{"input": "hello world", "encoded": {"input_ids": [101, 7592, 2088, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] hello world [SEP]", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [101, 7592, 2088, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] hello world [SEP]", "decoded_without_special": "hello world"}, {"input": "How are you doing?", "encoded": {"input_ids": [101, 2129, 2024, 2017, 2725, 1029, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] how are you doing? [SEP]", "decoded_without_special": "how are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [101, 2017, 2323, 1005, 2310, 2589, 2023, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] you should've done this [SEP]", "decoded_without_special": "you should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [101, 1037, 1005, 2222, 999, 999, 2000, 1029, 1005, 1040, 1005, 1005, 1040, 1997, 1010, 2064, 1005, 1056, 1012, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] a'll!! to?'d'' d of, can't. [SEP]", "decoded_without_special": "a'll!! 
to?'d'' d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [101, 13366, 2364, 1006, 1007, 1024, 3413, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] def main ( ) : pass [SEP]", "decoded_without_special": "def main ( ) : pass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [101, 2023, 2003, 1037, 3231, 1012, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] this is a test. [SEP]", "decoded_without_special": "this is a test."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [101, 2292, 1037, 1027, 27885, 3501, 1012, 2000, 3367, 4892, 1006, 1007, 1025, 2000, 3367, 4892, 1006, 1007, 1025, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] let a = obj. tostring ( ) ; tostring ( ) ; [SEP]", "decoded_without_special": "let a = obj. tostring ( ) ; tostring ( ) ;"}, {"input": "Hi Hello", "encoded": {"input_ids": [101, 7632, 7592, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] hi hello [SEP]", "decoded_without_special": "hi hello"}, {"input": "trailing space ", "encoded": {"input_ids": [101, 12542, 2686, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] trailing space [SEP]", "decoded_without_special": "trailing space"}, {"input": " leading space", "encoded": {"input_ids": [101, 2877, 2686, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] leading space [SEP]", "decoded_without_special": "leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [101, 1910, 100, 1916, 1921, 100, 100, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] \u751f [UNK] \u7684 \u771f [UNK] [UNK] [SEP]", "decoded_without_special": "\u751f \u7684 \u771f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [101, 1996, 2194, 2001, 2631, 1999, 2355, 1012, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] the company was founded in 2016. [SEP]", "decoded_without_special": "the company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [101, 3231, 1002, 1015, 1054, 2475, 1001, 1017, 1574, 2549, 27813, 1071, 2575, 100, 1576, 2620, 1575, 2683, 3231, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] test $ 1 r2 # 3 \u20ac4 \u00a35 \u00a56 [UNK] \u20b98 \u20b19 test [SEP]", "decoded_without_special": "test $ 1 r2 # 3 \u20ac4 \u00a35 \u00a56 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [101, 1045, 4149, 2019, 6207, 2005, 1002, 1015, 1012, 4002, 2012, 1996, 3573, 1012, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] i bought an apple for $ 1. 00 at the store. [SEP]", "decoded_without_special": "i bought an apple for $ 1. 
00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [101, 2017, 1529, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] you \u2026 [SEP]", "decoded_without_special": "you \u2026"}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [101, 2017, 1529, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] you \u2026 [SEP]", "decoded_without_special": "you \u2026"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [101, 2017, 1529, 2017, 1529, 102], "token_type_ids": [0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] you \u2026 you \u2026 [SEP]", "decoded_without_special": "you \u2026 you \u2026"}], "distilgpt2": [{"input": "hello world", "encoded": {"input_ids": [31373, 995], "attention_mask": [1, 1]}, "decoded_with_special": "hello world", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [15496, 2159], "attention_mask": [1, 1]}, "decoded_with_special": "Hello World", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [2437, 389, 345, 1804, 30], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "How are you doing?", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [1639, 815, 1053, 1760, 428], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "You should've done this", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [32, 198, 1183, 37867, 1462, 8348, 67, 7061, 67, 286, 11, 460, 470, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "A\n'll!!to?'d''d of, can't.", "decoded_without_special": "A\n'll!!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [4299, 1388, 33529, 198, 197, 6603], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "def main():\n\tpass", "decoded_without_special": "def main():\n\tpass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [1212, 198, 198, 271, 198, 64, 198, 9288, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "This\n\nis\na\ntest.", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [1616, 257, 796, 26181, 13, 1462, 10100, 9783, 198, 1462, 10100, 9783], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "let a = obj.toString();\ntoString();", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [17250, 220, 18435], "attention_mask": [1, 1, 1]}, "decoded_with_special": "Hi Hello", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [9535, 4386, 2272, 220, 220, 220], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "trailing space ", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [220, 220, 3756, 2272], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " leading space", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [37955, 162, 112, 119, 21410, 40367, 253, 164, 108, 249, 42468], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, 
"decoded_with_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [464, 1664, 373, 9393, 287, 1584, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "The company was founded in 2016.", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [9288, 720, 16, 371, 17, 1303, 18, 10432, 19, 4248, 20, 38221, 21, 2343, 224, 96, 22, 2343, 224, 117, 23, 2343, 224, 109, 24, 1332], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [40, 5839, 281, 17180, 329, 720, 16, 13, 405, 379, 262, 3650, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "I bought an apple for $1.00 at the store.", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [5832, 1399, 220, 220], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "you\u2026 ", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [5832, 1399, 4603], "attention_mask": [1, 1, 1]}, "decoded_with_special": "you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [5832, 1399, 1849, 1849, 5832, 1399, 4603], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}], "coreml-projects/Llama-2-7b-chat-coreml": [{"input": "hello world", "encoded": {"input_ids": [1, 22172, 3186], "attention_mask": [1, 1, 1]}, "decoded_with_special": " hello world", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [1, 15043, 2787], "attention_mask": [1, 1, 1]}, "decoded_with_special": " Hello World", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [1, 1128, 526, 366, 2599, 29973], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": " How are you doing?", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [1, 887, 881, 29915, 345, 2309, 445], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " You should've done this", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [1, 319, 13, 29915, 645, 21443, 517, 17901, 29881, 4907, 29881, 310, 29892, 508, 29915, 29873, 29889], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " A\n'll !!to?'d''d of, can't.", "decoded_without_special": "A\n'll !!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [1, 822, 1667, 7295, 13, 12, 3364], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " def main():\n\tpass", "decoded_without_special": "def main():\n\tpass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [1, 910, 13, 13, 275, 
13, 29874, 13, 1688, 29889], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " This\n\nis\na\ntest.", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [1, 1235, 263, 353, 5446, 29889, 7711, 890, 13, 7711, 890], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " let a = obj.toString();\ntoString();", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [1, 6324, 29871, 15043], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " Hi Hello", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [1, 25053, 2913, 1678], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " trailing space ", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [1, 1678, 8236, 2913], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " leading space", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " \u751f\u6d3b\u7684\u771f\u8c1b\u662f", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [1, 450, 5001, 471, 11091, 297, 29871, 29906, 29900, 29896, 29953, 29889], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " The company was founded in 2016.", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [1, 1243, 395, 29896, 390, 29906, 396, 29941, 25540, 29946, 15151, 29945, 29871, 30563, 29953, 29871, 229, 133, 166, 29955, 29871, 30620, 29947, 29871, 229, 133, 180, 29929, 1243], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [1, 306, 18093, 385, 26163, 363, 395, 29896, 29889, 29900, 29900, 472, 278, 3787, 29889], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " I bought an apple for $1.00 at the store.", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [1, 366, 30098, 259], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " you\u2026 ", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [1, 366, 30098, 8655], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [1, 366, 30098, 8655, 6293, 30098, 8655], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}], "tiiuae/falcon-7b": [{"input": "hello world", "encoded": {"input_ids": [30835, 1079], "token_type_ids": [0, 0], "attention_mask": [1, 1]}, 
"decoded_with_special": "hello world", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [9856, 2889], "token_type_ids": [0, 0], "attention_mask": [1, 1]}, "decoded_with_special": "Hello World", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [1830, 362, 299, 1836, 42], "token_type_ids": [0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "How are you doing?", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [1357, 808, 18, 298, 1782, 414], "token_type_ids": [0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "You should've done this", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [44, 193, 18, 567, 204, 1409, 534, 12493, 79, 7544, 79, 275, 23, 418, 18, 95, 25], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "A\n'll!!to?'d''d of, can't.", "decoded_without_special": "A\n'll!!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [3071, 1316, 13160, 193, 192, 5412], "token_type_ids": [0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "def main():\n\tpass", "decoded_without_special": "def main():\n\tpass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [1182, 193, 193, 259, 193, 76, 193, 4780, 25], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "This\n\nis\na\ntest.", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [1025, 241, 204, 40, 13756, 25, 19409, 2032, 193, 19409, 2032], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "let a = obj.toString();\ntoString();", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [5516, 204, 23090], "token_type_ids": [0, 0, 0], "attention_mask": [1, 1, 1]}, "decoded_with_special": "Hi Hello", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [9172, 4447, 2151, 466], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "trailing space ", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [258, 3736, 2151], "token_type_ids": [0, 0, 0], "attention_mask": [1, 1, 1]}, "decoded_with_special": " leading space", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [32725, 1105, 15498, 8061, 233, 2364], "token_type_ids": [0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [487, 1438, 398, 9923, 272, 204, 626, 33, 25], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "The company was founded in 2016.", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 
test", "encoded": {"input_ids": [4780, 204, 15, 28, 382, 29, 204, 14, 30, 6471, 31, 5131, 32, 3068, 110, 33, 25631, 108, 34, 25631, 129, 35, 25631, 121, 36, 1318], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [52, 5659, 267, 12381, 312, 204, 15, 28, 25, 527, 388, 248, 2946, 25], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "I bought an apple for $1.00 at the store.", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [5667, 898, 258], "token_type_ids": [0, 0, 0], "attention_mask": [1, 1, 1]}, "decoded_with_special": "you\u2026 ", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [5667, 898, 60482], "token_type_ids": [0, 0, 0], "attention_mask": [1, 1, 1]}, "decoded_with_special": "you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [5667, 898, 4381, 4381, 5667, 898, 60482], "token_type_ids": [0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}, {"input": "12 and 123 and 1234", "encoded": {"input_ids": [928, 273, 204, 10963, 273, 204, 10963, 31], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "12 and 123 and 1234", "decoded_without_special": "12 and 123 and 1234"}]} +{"bert-base-uncased": [{"input": "hello world", "encoded": {"input_ids": [101, 7592, 2088, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] hello world [SEP]", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [101, 7592, 2088, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] hello world [SEP]", "decoded_without_special": "hello world"}, {"input": "How are you doing?", "encoded": {"input_ids": [101, 2129, 2024, 2017, 2725, 1029, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] how are you doing? [SEP]", "decoded_without_special": "how are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [101, 2017, 2323, 1005, 2310, 2589, 2023, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] you should've done this [SEP]", "decoded_without_special": "you should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [101, 1037, 1005, 2222, 999, 999, 2000, 1029, 1005, 1040, 1005, 1005, 1040, 1997, 1010, 2064, 1005, 1056, 1012, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] a'll!! 
to?'d'' d of, can't. [SEP]", "decoded_without_special": "a'll!! to?'d'' d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [101, 13366, 2364, 1006, 1007, 1024, 3413, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] def main ( ) : pass [SEP]", "decoded_without_special": "def main ( ) : pass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [101, 2023, 2003, 1037, 3231, 1012, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] this is a test. [SEP]", "decoded_without_special": "this is a test."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [101, 2292, 1037, 1027, 27885, 3501, 1012, 2000, 3367, 4892, 1006, 1007, 1025, 2000, 3367, 4892, 1006, 1007, 1025, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] let a = obj. tostring ( ) ; tostring ( ) ; [SEP]", "decoded_without_special": "let a = obj. tostring ( ) ; tostring ( ) ;"}, {"input": "Hi Hello", "encoded": {"input_ids": [101, 7632, 7592, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] hi hello [SEP]", "decoded_without_special": "hi hello"}, {"input": "trailing space ", "encoded": {"input_ids": [101, 12542, 2686, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] trailing space [SEP]", "decoded_without_special": "trailing space"}, {"input": " leading space", "encoded": {"input_ids": [101, 2877, 2686, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] leading space [SEP]", "decoded_without_special": "leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [101, 1910, 100, 1916, 1921, 100, 100, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] \u751f [UNK] \u7684 \u771f [UNK] [UNK] [SEP]", "decoded_without_special": "\u751f \u7684 \u771f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [101, 1996, 2194, 2001, 2631, 1999, 2355, 1012, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] the company was founded in 2016. [SEP]", "decoded_without_special": "the company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [101, 3231, 1002, 1015, 1054, 2475, 1001, 1017, 1574, 2549, 27813, 1071, 2575, 100, 1576, 2620, 1575, 2683, 3231, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] test $ 1 r2 # 3 \u20ac4 \u00a35 \u00a56 [UNK] \u20b98 \u20b19 test [SEP]", "decoded_without_special": "test $ 1 r2 # 3 \u20ac4 \u00a35 \u00a56 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [101, 1045, 4149, 2019, 6207, 2005, 1002, 1015, 1012, 4002, 2012, 1996, 3573, 1012, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] i bought an apple for $ 1. 
00 at the store. [SEP]", "decoded_without_special": "i bought an apple for $ 1. 00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [101, 2017, 1529, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] you \u2026 [SEP]", "decoded_without_special": "you \u2026"}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [101, 2017, 1529, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] you \u2026 [SEP]", "decoded_without_special": "you \u2026"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [101, 2017, 1529, 2017, 1529, 102], "token_type_ids": [0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] you \u2026 you \u2026 [SEP]", "decoded_without_special": "you \u2026 you \u2026"}], "distilgpt2": [{"input": "hello world", "encoded": {"input_ids": [31373, 995], "attention_mask": [1, 1]}, "decoded_with_special": "hello world", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [15496, 2159], "attention_mask": [1, 1]}, "decoded_with_special": "Hello World", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [2437, 389, 345, 1804, 30], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "How are you doing?", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [1639, 815, 1053, 1760, 428], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "You should've done this", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [32, 198, 1183, 37867, 1462, 8348, 67, 7061, 67, 286, 11, 460, 470, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "A\n'll!!to?'d''d of, can't.", "decoded_without_special": "A\n'll!!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [4299, 1388, 33529, 198, 197, 6603], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "def main():\n\tpass", "decoded_without_special": "def main():\n\tpass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [1212, 198, 198, 271, 198, 64, 198, 9288, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "This\n\nis\na\ntest.", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [1616, 257, 796, 26181, 13, 1462, 10100, 9783, 198, 1462, 10100, 9783], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "let a = obj.toString();\ntoString();", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [17250, 220, 18435], "attention_mask": [1, 1, 1]}, "decoded_with_special": "Hi Hello", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [9535, 4386, 2272, 220, 220, 220], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "trailing space ", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [220, 220, 3756, 2272], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " leading space", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [37955, 162, 112, 119, 21410, 40367, 253, 164, 108, 249, 
42468], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [464, 1664, 373, 9393, 287, 1584, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "The company was founded in 2016.", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [9288, 720, 16, 371, 17, 1303, 18, 10432, 19, 4248, 20, 38221, 21, 2343, 224, 96, 22, 2343, 224, 117, 23, 2343, 224, 109, 24, 1332], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [40, 5839, 281, 17180, 329, 720, 16, 13, 405, 379, 262, 3650, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "I bought an apple for $1.00 at the store.", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [5832, 1399, 220, 220], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "you\u2026 ", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [5832, 1399, 4603], "attention_mask": [1, 1, 1]}, "decoded_with_special": "you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [5832, 1399, 1849, 1849, 5832, 1399, 4603], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}], "coreml-projects/Llama-2-7b-chat-coreml": [{"input": "hello world", "encoded": {"input_ids": [1, 22172, 3186], "attention_mask": [1, 1, 1]}, "decoded_with_special": " hello world", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [1, 15043, 2787], "attention_mask": [1, 1, 1]}, "decoded_with_special": " Hello World", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [1, 1128, 526, 366, 2599, 29973], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": " How are you doing?", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [1, 887, 881, 29915, 345, 2309, 445], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " You should've done this", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [1, 319, 13, 29915, 645, 21443, 517, 17901, 29881, 4907, 29881, 310, 29892, 508, 29915, 29873, 29889], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " A\n'll !!to?'d''d of, can't.", "decoded_without_special": "A\n'll !!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [1, 822, 1667, 7295, 13, 12, 3364], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " def main():\n\tpass", "decoded_without_special": "def main():\n\tpass"}, {"input": 
"This\n\nis\na\ntest.", "encoded": {"input_ids": [1, 910, 13, 13, 275, 13, 29874, 13, 1688, 29889], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " This\n\nis\na\ntest.", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [1, 1235, 263, 353, 5446, 29889, 7711, 890, 13, 7711, 890], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " let a = obj.toString();\ntoString();", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [1, 6324, 29871, 15043], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " Hi Hello", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [1, 25053, 2913, 1678], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " trailing space ", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [1, 1678, 8236, 2913], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " leading space", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " \u751f\u6d3b\u7684\u771f\u8c1b\u662f", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [1, 450, 5001, 471, 11091, 297, 29871, 29906, 29900, 29896, 29953, 29889], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " The company was founded in 2016.", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [1, 1243, 395, 29896, 390, 29906, 396, 29941, 25540, 29946, 15151, 29945, 29871, 30563, 29953, 29871, 229, 133, 166, 29955, 29871, 30620, 29947, 29871, 229, 133, 180, 29929, 1243], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [1, 306, 18093, 385, 26163, 363, 395, 29896, 29889, 29900, 29900, 472, 278, 3787, 29889], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " I bought an apple for $1.00 at the store.", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [1, 366, 30098, 259], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " you\u2026 ", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [1, 366, 30098, 8655], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [1, 366, 30098, 8655, 6293, 30098, 8655], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}], "tiiuae/falcon-7b": [{"input": "hello world", "encoded": 
{"input_ids": [30835, 1079], "token_type_ids": [0, 0], "attention_mask": [1, 1]}, "decoded_with_special": "hello world", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [9856, 2889], "token_type_ids": [0, 0], "attention_mask": [1, 1]}, "decoded_with_special": "Hello World", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [1830, 362, 299, 1836, 42], "token_type_ids": [0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "How are you doing?", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [1357, 808, 18, 298, 1782, 414], "token_type_ids": [0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "You should've done this", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [44, 193, 18, 567, 204, 1409, 534, 12493, 79, 7544, 79, 275, 23, 418, 18, 95, 25], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "A\n'll!!to?'d''d of, can't.", "decoded_without_special": "A\n'll!!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [3071, 1316, 13160, 193, 192, 5412], "token_type_ids": [0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "def main():\n\tpass", "decoded_without_special": "def main():\n\tpass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [1182, 193, 193, 259, 193, 76, 193, 4780, 25], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "This\n\nis\na\ntest.", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [1025, 241, 204, 40, 13756, 25, 19409, 2032, 193, 19409, 2032], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "let a = obj.toString();\ntoString();", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [5516, 204, 23090], "token_type_ids": [0, 0, 0], "attention_mask": [1, 1, 1]}, "decoded_with_special": "Hi Hello", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [9172, 4447, 2151, 466], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "trailing space ", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [258, 3736, 2151], "token_type_ids": [0, 0, 0], "attention_mask": [1, 1, 1]}, "decoded_with_special": " leading space", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [32725, 1105, 15498, 8061, 233, 2364], "token_type_ids": [0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [487, 1438, 398, 9923, 272, 204, 626, 33, 25], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "The company was founded in 2016.", "decoded_without_special": "The company was founded in 
2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [4780, 204, 15, 28, 382, 29, 204, 14, 30, 6471, 31, 5131, 32, 3068, 110, 33, 25631, 108, 34, 25631, 129, 35, 25631, 121, 36, 1318], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [52, 5659, 267, 12381, 312, 204, 15, 28, 25, 527, 388, 248, 2946, 25], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "I bought an apple for $1.00 at the store.", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [5667, 898, 258], "token_type_ids": [0, 0, 0], "attention_mask": [1, 1, 1]}, "decoded_with_special": "you\u2026 ", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [5667, 898, 60482], "token_type_ids": [0, 0, 0], "attention_mask": [1, 1, 1]}, "decoded_with_special": "you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [5667, 898, 4381, 4381, 5667, 898, 60482], "token_type_ids": [0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}, {"input": "12 and 123 and 1234", "encoded": {"input_ids": [928, 273, 204, 10963, 273, 204, 10963, 31], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "12 and 123 and 1234", "decoded_without_special": "12 and 123 and 1234"}], "google/gemma-2-2b-it": [{"input": "hello world", "encoded": {"input_ids": [2, 17534, 2134], "attention_mask": [1, 1, 1]}, "decoded_with_special": "hello world", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [2, 4521, 3855], "attention_mask": [1, 1, 1]}, "decoded_with_special": "Hello World", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [2, 2299, 708, 692, 3900, 235336], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "How are you doing?", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [2, 2045, 1412, 235303, 524, 3015, 736], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "You should've done this", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [2, 235280, 108, 235303, 529, 9063, 511, 18016, 235258, 3404, 235258, 576, 235269, 798, 235303, 235251, 235265], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "A\n'll !!to?'d''d of, can't.", "decoded_without_special": "A\n'll !!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [2, 1293, 1872, 4409, 108, 226, 3095], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "def 
main():\n\tpass", "decoded_without_special": "def main():\n\tpass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [2, 1596, 109, 502, 108, 235250, 108, 2195, 235265], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "This\n\nis\na\ntest.", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [2, 1243, 476, 589, 6555, 235265, 7114, 821, 108, 7114, 821], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "let a = obj.toString();\ntoString();", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [2, 2151, 139, 4521], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "Hi Hello", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [2, 100504, 3641, 140], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "trailing space ", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [2, 140, 26650, 3641], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " leading space", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [2, 122182, 235710, 245467, 235427], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [2, 651, 3277, 729, 18942, 575, 235248, 235284, 235276, 235274, 235318, 235265], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "The company was founded in 2016.", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [2, 2195, 697, 235274, 625, 235284, 1700, 235304, 8296, 235310, 5955, 235308, 74393, 235318, 235248, 252058, 235324, 56712, 235321, 235248, 243132, 235315, 2121], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [2, 235285, 8989, 671, 15491, 604, 697, 235274, 235265, 235276, 235276, 696, 573, 4659, 235265], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "I bought an apple for $1.00 at the store.", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [2, 4747, 235417, 139], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "you\u2026 ", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [2, 4747, 235417, 25445], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [2, 4747, 235417, 25445, 4747, 235417, 25445], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}], "openai/whisper-tiny.en": [{"input": "hello world", 
"encoded": {"input_ids": [50257, 50362, 31373, 995, 50256], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>hello world<|endoftext|>", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [50257, 50362, 15496, 2159, 50256], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>Hello World<|endoftext|>", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [50257, 50362, 2437, 389, 345, 1804, 30, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>How are you doing?<|endoftext|>", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [50257, 50362, 1639, 815, 1053, 1760, 428, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>You should've done this<|endoftext|>", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [50257, 50362, 32, 198, 1183, 37867, 1462, 8348, 67, 7061, 67, 286, 11, 460, 470, 13, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>A\n'll!!to?'d''d of, can't.<|endoftext|>", "decoded_without_special": "A\n'll!!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [50257, 50362, 4299, 1388, 33529, 198, 197, 6603, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>def main():\n\tpass<|endoftext|>", "decoded_without_special": "def main():\n\tpass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [50257, 50362, 1212, 198, 198, 271, 198, 64, 198, 9288, 13, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>This\n\nis\na\ntest.<|endoftext|>", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [50257, 50362, 1616, 257, 796, 26181, 13, 1462, 10100, 9783, 198, 1462, 10100, 9783, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>let a = obj.toString();\ntoString();<|endoftext|>", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [50257, 50362, 17250, 220, 18435, 50256], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>Hi Hello<|endoftext|>", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [50257, 50362, 9535, 4386, 2272, 220, 220, 220, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>trailing space <|endoftext|>", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [50257, 50362, 220, 220, 3756, 2272, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|> leading space<|endoftext|>", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [50257, 50362, 37955, 162, 112, 119, 21410, 40367, 253, 164, 108, 249, 42468, 50256], 
"attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>\u751f\u6d3b\u7684\u771f\u8c1b\u662f<|endoftext|>", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [50257, 50362, 464, 1664, 373, 9393, 287, 1584, 13, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>The company was founded in 2016.<|endoftext|>", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [50257, 50362, 9288, 720, 16, 371, 17, 1303, 18, 10432, 19, 4248, 20, 38221, 21, 2343, 224, 96, 22, 2343, 224, 117, 23, 2343, 224, 109, 24, 1332, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test<|endoftext|>", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [50257, 50362, 40, 5839, 281, 17180, 329, 720, 16, 13, 405, 379, 262, 3650, 13, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>I bought an apple for $1.00 at the store.<|endoftext|>", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [50257, 50362, 5832, 1399, 220, 220, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>you\u2026 <|endoftext|>", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [50257, 50362, 5832, 1399, 4603, 50256], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>you\u2026\u00a0\u00a0<|endoftext|>", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [50257, 50362, 5832, 1399, 1849, 1849, 5832, 1399, 4603, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0<|endoftext|>", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}], "openai/whisper-large-v2": [{"input": "hello world", "encoded": {"input_ids": [50258, 50363, 675, 1913, 1002, 50257], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>hello world<|endoftext|>", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [50258, 50363, 15947, 3937, 50257], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>Hello World<|endoftext|>", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [50258, 50363, 6462, 366, 291, 884, 30, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>How are you doing?<|endoftext|>", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [50258, 50363, 3223, 820, 600, 1096, 341, 50257], 
"attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>You should've done this<|endoftext|>", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [50258, 50363, 32, 198, 603, 15138, 1353, 8569, 67, 15025, 67, 295, 11, 393, 380, 13, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>A\n'll!!to?'d''d of, can't.<|endoftext|>", "decoded_without_special": "A\n'll!!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [50258, 50363, 20595, 2135, 7, 4507, 198, 197, 9216, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>def main():\n\tpass<|endoftext|>", "decoded_without_special": "def main():\n\tpass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [50258, 50363, 5723, 198, 198, 271, 198, 64, 198, 31636, 13, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>This\n\nis\na\ntest.<|endoftext|>", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [50258, 50363, 2631, 257, 6585, 1111, 73, 13, 1353, 4520, 2937, 7, 34446, 198, 1353, 4520, 2937, 7, 34446, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>let a = obj.toString();\ntoString();<|endoftext|>", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [50258, 50363, 17155, 220, 2425, 50257], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>Hi Hello<|endoftext|>", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [50258, 50363, 17227, 4883, 1901, 220, 220, 220, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>trailing space <|endoftext|>", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [50258, 50363, 220, 220, 5775, 1901, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|> leading space<|endoftext|>", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [50258, 50363, 49958, 1546, 6303, 8897, 249, 1541, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>\u751f\u6d3b\u7684\u771f\u8c1b\u662f<|endoftext|>", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [50258, 50363, 2278, 2237, 390, 13234, 294, 6549, 13, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>The company was founded in 2016.<|endoftext|>", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [50258, 50363, 31636, 1848, 16, 497, 17, 3536, 18, 17450, 19, 14378, 20, 1815, 98, 21, 672, 224, 96, 22, 672, 224, 117, 23, 672, 224, 109, 24, 1500, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test<|endoftext|>", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [50258, 50363, 40, 4243, 364, 10606, 337, 1848, 16, 13, 628, 412, 264, 3531, 13, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>I bought an apple for $1.00 at the store.<|endoftext|>", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [50258, 50363, 5616, 1260, 220, 220, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>you\u2026 <|endoftext|>", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [50258, 50363, 5616, 1260, 126, 254, 126, 254, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>you\u2026\u00a0\u00a0<|endoftext|>", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [50258, 50363, 5616, 1260, 126, 254, 126, 254, 5616, 1260, 126, 254, 126, 254, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0<|endoftext|>", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}]} \ No newline at end of file diff --git a/Tests/TokenizersTests/TokenizerTests.swift b/Tests/TokenizersTests/TokenizerTests.swift index 4b4b496..684d807 100644 --- a/Tests/TokenizersTests/TokenizerTests.swift +++ b/Tests/TokenizersTests/TokenizerTests.swift @@ -28,6 +28,13 @@ class LlamaTokenizerTests: TokenizerTests { override class var hubModelName: String? { "coreml-projects/Llama-2-7b-chat-coreml" } override class var encodedSamplesFilename: String? { "llama_encoded" } override class var unknownTokenId: Int? { 0 } + + func testHexaEncode() async { + if let tester = Self._tester { + let tokenized = await tester.tokenizer?.tokenize(text: "\n") + XCTAssertEqual(tokenized, ["▁", "<0x0A>"]) + } + } } class WhisperLargeTokenizerTests: TokenizerTests { @@ -48,6 +55,41 @@ class T5TokenizerTests: TokenizerTests { override class var unknownTokenId: Int? { 2 } } +class GemmaTokenizerTests: TokenizerTests { + override class var hubModelName: String? { "pcuenq/gemma-tokenizer" } + override class var encodedSamplesFilename: String? { "gemma_encoded" } + override class var unknownTokenId: Int? { 3 } + + func testUnicodeEdgeCase() async { + guard let tester = Self._tester else { + XCTFail() + return + } + + // These are two different characters + let cases = ["à" /* 0x61 0x300 */, "à" /* 0xe0 */] + let expected = [217138, 1305] + + // These are different characters + for (s, expected) in zip(cases, expected) { + let encoded = await tester.tokenizer?.encode(text: " " + s) + XCTAssertEqual(encoded, [2, expected]) + } + } +} + +class GemmaUnicodeTests: XCTestCase { + func testGemmaVocab() async throws { + guard let tokenizer = try await AutoTokenizer.from(pretrained: "pcuenq/gemma-tokenizer") as? 
PreTrainedTokenizer else {
+            XCTFail()
+            return
+        }
+
+        // FIXME: This should be 256_000, I believe
+        XCTAssertEqual((tokenizer.model as? BPETokenizer)?.vocabCount, 255994)
+    }
+}
+
 struct EncodedTokenizerSamplesDataset: Decodable {
     let text: String
@@ -156,7 +198,10 @@ class TokenizerTester {
     /// Test encode and decode for a few edge cases
     func testEdgeCases() async {
-        guard let edgeCases = edgeCases else { return }
+        guard let edgeCases = edgeCases else {
+            print("Edge cases test ignored")
+            return
+        }
         guard let tokenizer = await tokenizer else { return }
         for edgeCase in edgeCases {
             print("Testing \(edgeCase.input)")
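
Note on the Unicode edge case exercised by GemmaTokenizerTests.testUnicodeEdgeCase above: Swift's String compares by Unicode canonical equivalence, so the decomposed sequence U+0061 U+0300 and the precomposed U+00E0 are equal and would collide as [String: Int] vocabulary keys, whereas NSString compares UTF-16 code units and keeps both entries distinct. The sketch below is illustrative only and is not part of this patch; the token ids are simply the values the test above expects for the two spellings.

    import Foundation

    // Two canonically equivalent spellings of "à".
    let decomposed = "a\u{0300}"    // U+0061 followed by combining grave accent U+0300
    let precomposed = "\u{00E0}"    // single scalar U+00E0

    // Swift String equality is canonical-equivalence aware: prints "true".
    print(decomposed == precomposed)

    // As String keys the two spellings collide; the second assignment overwrites the first.
    var stringVocab: [String: Int] = [:]
    stringVocab[decomposed] = 217138
    stringVocab[precomposed] = 1305
    print(stringVocab.count)    // 1

    // As NSString keys both entries survive, because NSString compares code units literally.
    var nsVocab: [NSString: Int] = [:]
    nsVocab[decomposed as NSString] = 217138
    nsVocab[precomposed as NSString] = 1305
    print(nsVocab.count)        // 2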