From c5bff790693e675eed7fb4a0b8cc88707941f482 Mon Sep 17 00:00:00 2001 From: keighbee Date: Tue, 28 Jan 2025 09:28:48 -0800 Subject: [PATCH 1/5] package.swift test --- Package.swift | 1 + 1 file changed, 1 insertion(+) diff --git a/Package.swift b/Package.swift index ac6a853..2b12214 100644 --- a/Package.swift +++ b/Package.swift @@ -8,6 +8,7 @@ let package = Package( platforms: [.iOS(.v16), .macOS(.v13)], products: [ .library(name: "Transformers", targets: ["Tokenizers", "Generation", "Models"]), + .library(name: "Hub", targets: ["Hub"]), .executable(name: "transformers", targets: ["TransformersCLI"]), .executable(name: "hub-cli", targets: ["HubCLI"]), ], From 19d9ada92dfffa2beb23351b8df61b01265a1c8d Mon Sep 17 00:00:00 2001 From: keighbee Date: Tue, 28 Jan 2025 11:24:57 -0800 Subject: [PATCH 2/5] added tokenizers as a lib --- Package.swift | 1 + 1 file changed, 1 insertion(+) diff --git a/Package.swift b/Package.swift index 2b12214..e6b3d19 100644 --- a/Package.swift +++ b/Package.swift @@ -9,6 +9,7 @@ let package = Package( products: [ .library(name: "Transformers", targets: ["Tokenizers", "Generation", "Models"]), .library(name: "Hub", targets: ["Hub"]), + .library(name: "Tokenizers", targets: ["Tokenizers"]), .executable(name: "transformers", targets: ["TransformersCLI"]), .executable(name: "hub-cli", targets: ["HubCLI"]), ], From 46327cd2949f0d9778f114e915786afd7f61ab3e Mon Sep 17 00:00:00 2001 From: keighbee Date: Tue, 28 Jan 2025 11:29:32 -0800 Subject: [PATCH 3/5] added necessary target to tokenizers --- Package.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Package.swift b/Package.swift index e6b3d19..d00ebed 100644 --- a/Package.swift +++ b/Package.swift @@ -9,7 +9,7 @@ let package = Package( products: [ .library(name: "Transformers", targets: ["Tokenizers", "Generation", "Models"]), .library(name: "Hub", targets: ["Hub"]), - .library(name: "Tokenizers", targets: ["Tokenizers"]), + .library(name: "Tokenizers", targets: 
["Tokenizers", "Models"]), .executable(name: "transformers", targets: ["TransformersCLI"]), .executable(name: "hub-cli", targets: ["HubCLI"]), ], From c80bb71d5338247e77263e8f32c41325d95863ba Mon Sep 17 00:00:00 2001 From: Pedro Cuenca Date: Thu, 30 Jan 2025 11:50:33 +0100 Subject: [PATCH 4/5] Experimental: tokenizers with and without templates --- Package.swift | 18 +- Sources/Tokenizers/Tokenizer.swift | 213 +++--------------- .../TokenizersTemplates.swift | 115 ++++++++++ .../TokenizersWrapper/TokenizersWrapper.swift | 83 +++++++ 4 files changed, 247 insertions(+), 182 deletions(-) create mode 100644 Sources/TokenizersTemplates/TokenizersTemplates.swift create mode 100644 Sources/TokenizersWrapper/TokenizersWrapper.swift diff --git a/Package.swift b/Package.swift index d00ebed..175cc06 100644 --- a/Package.swift +++ b/Package.swift @@ -7,25 +7,33 @@ let package = Package( name: "swift-transformers", platforms: [.iOS(.v16), .macOS(.v13)], products: [ - .library(name: "Transformers", targets: ["Tokenizers", "Generation", "Models"]), .library(name: "Hub", targets: ["Hub"]), - .library(name: "Tokenizers", targets: ["Tokenizers", "Models"]), + // ^ Hub client library + .library(name: "Tokenizers", targets: ["Tokenizers"]), + // ^ Tokenizers. 
Includes `Hub` to download config files + .library(name: "TokenizersTemplates", targets: ["TokenizersTemplates"]), + // ^ Optionally depend on this to add chat template support to Tokenizers + .library(name: "Transformers", targets: ["Tokenizers", "Generation", "Models"]), + // ^ Everything, including Core ML inference .executable(name: "transformers", targets: ["TransformersCLI"]), .executable(name: "hub-cli", targets: ["HubCLI"]), ], dependencies: [ .package(url: "https://github.com/apple/swift-argument-parser.git", from: "1.4.0"), - .package(url: "https://github.com/johnmai-dev/Jinja", from: "1.1.0") + .package(url: "https://github.com/johnmai-dev/Jinja", from: "1.1.0"), ], targets: [ .executableTarget( name: "TransformersCLI", dependencies: [ - "Models", "Generation", "Tokenizers", + "Models", "Generation", "TokenizersTemplates", .product(name: "ArgumentParser", package: "swift-argument-parser")]), .executableTarget(name: "HubCLI", dependencies: ["Hub", .product(name: "ArgumentParser", package: "swift-argument-parser")]), .target(name: "Hub", resources: [.process("FallbackConfigs")]), - .target(name: "Tokenizers", dependencies: ["Hub", .product(name: "Jinja", package: "Jinja")]), + .target(name: "TokenizersCore", dependencies: ["Hub"], path: "Sources/Tokenizers"), + .target(name: "TokenizersTemplates", dependencies: ["TokenizersCore", .product(name: "Jinja", package: "Jinja")]), + .target(name: "Tokenizers", dependencies: ["TokenizersCore", .product(name: "Jinja", package: "Jinja")], path: "Sources/TokenizersWrapper"), + // ^ This is just a wrapper or façade against TokenizersCore, but adds templates if available .target(name: "TensorUtils"), .target(name: "Generation", dependencies: ["Tokenizers", "TensorUtils"]), .target(name: "Models", dependencies: ["Tokenizers", "Generation", "TensorUtils"]), diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift index cabfd76..35a3fb0 100644 --- a/Sources/Tokenizers/Tokenizer.swift +++ 
b/Sources/Tokenizers/Tokenizer.swift @@ -7,9 +7,8 @@ import Hub import Foundation -import Jinja -enum TokenizerError: Error { +public enum TokenizerError: Error { case missingConfig case missingTokenizerClassInConfig case unsupportedTokenizer(String) @@ -43,7 +42,7 @@ public protocol TokenizingModel { } // Helper - possibly to be moved somewhere else -func addedTokenAsString(_ addedToken: Config?) -> String? { +public func addedTokenAsString(_ addedToken: Config?) -> String? { guard let addedToken = addedToken else { return nil } if let stringValue = addedToken.stringValue { return stringValue @@ -161,6 +160,20 @@ public protocol Tokenizer { ) throws -> [Int] } +extension Tokenizer { + public func applyChatTemplate(messages: [[String: String]]) throws -> [Int] { + try applyChatTemplate(messages: messages, chatTemplate: nil, addGenerationPrompt: true, truncation: false, maxLength: nil, tools: nil) + } + + public func applyChatTemplate(messages: [[String: String]], chatTemplate: ChatTemplateArgument) throws -> [Int] { + try applyChatTemplate(messages: messages, chatTemplate: chatTemplate, addGenerationPrompt: true, truncation: false, maxLength: nil, tools: nil) + } + + public func applyChatTemplate(messages: [[String: String]], chatTemplate: String) throws -> [Int] { + try applyChatTemplate(messages: messages, chatTemplate: .literal(chatTemplate), addGenerationPrompt: true, truncation: false, maxLength: nil, tools: nil) + } +} + public extension Tokenizer { func callAsFunction(_ text: String, addSpecialTokens: Bool = true) -> [Int] { encode(text: text, addSpecialTokens: addSpecialTokens) @@ -179,18 +192,8 @@ public extension Tokenizer { } } -let specialTokenAttributes: [String] = [ - "bos_token", - "eos_token", - "unk_token", - "sep_token", - "pad_token", - "cls_token", - "mask_token", - "additional_special_tokens" -] - -public class PreTrainedTokenizer: Tokenizer { +// open because we have to subclass from `TokenizersTemplates` +open class PreTrainedTokenizer: 
Tokenizer { let model: TokenizingModel public var bosToken: String? { model.bosToken } @@ -201,17 +204,17 @@ public class PreTrainedTokenizer: Tokenizer { public var unknownTokenId: Int? { model.unknownTokenId } public var fuseUnknownTokens: Bool { model.fuseUnknownTokens } - private let addedTokens: Set - private let specialTokens: [String: Int] - private let addedTokensRegex: NSRegularExpression? + let addedTokens: Set + let specialTokens: [String: Int] + let addedTokensRegex: NSRegularExpression? - private let preTokenizer: PreTokenizer? - private let normalizer: Normalizer? - private let postProcessor: PostProcessor? - private let decoder: Decoder? - private let tokenizerConfig: Config + let preTokenizer: PreTokenizer? + let normalizer: Normalizer? + let postProcessor: PostProcessor? + let decoder: Decoder? + public let tokenizerConfig: Config - private let cleanUpTokenizationSpaces: Bool + let cleanUpTokenizationSpaces: Bool required public init(tokenizerConfig: Config, tokenizerData: Config) throws { var addedTokens: [String : Int] = [:] @@ -359,19 +362,19 @@ public class PreTrainedTokenizer: Tokenizer { model.convertIdToToken(id) } - public func applyChatTemplate(messages: [[String: String]]) throws -> [Int] { + open func applyChatTemplate(messages: [[String: String]]) throws -> [Int] { try applyChatTemplate(messages: messages, addGenerationPrompt: true) } - public func applyChatTemplate(messages: [[String: String]], chatTemplate: ChatTemplateArgument) throws -> [Int] { + open func applyChatTemplate(messages: [[String: String]], chatTemplate: ChatTemplateArgument) throws -> [Int] { try applyChatTemplate(messages: messages, chatTemplate: chatTemplate, addGenerationPrompt: true) } - public func applyChatTemplate(messages: [[String: String]], chatTemplate: String) throws -> [Int] { + open func applyChatTemplate(messages: [[String: String]], chatTemplate: String) throws -> [Int] { try applyChatTemplate(messages: messages, chatTemplate: .literal(chatTemplate), 
addGenerationPrompt: true) } - public func applyChatTemplate( + open func applyChatTemplate( messages: [[String: String]], chatTemplate: ChatTemplateArgument? = nil, addGenerationPrompt: Bool = false, @@ -385,130 +388,7 @@ public class PreTrainedTokenizer: Tokenizer { /// Note: tool calling is not supported yet, it will be available in a future update. tools: [[String: Any]]? = nil ) throws -> [Int] { - var selectedChatTemplate: String? - if let chatTemplate, case .literal(let template) = chatTemplate { - // Use chat template from argument - selectedChatTemplate = template - } else if let valueFromConfig = tokenizerConfig.chatTemplate { - if let arrayValue = valueFromConfig.arrayValue { - // If the config specifies a list of chat templates, convert them to a dictionary - let templateDict = Dictionary(uniqueKeysWithValues: arrayValue.compactMap { item in - guard let name = item.name?.stringValue, let template = item.template?.stringValue else { - return nil - } - return (name, template) - }) - if let chatTemplate, case .name(let name) = chatTemplate { - // Select chat template from config by name - if let matchingDictEntry = templateDict[name] { - selectedChatTemplate = matchingDictEntry - } else { - throw TokenizerError.chatTemplate("No chat template named \"\(name)\" was found in the tokenizer config") - } - } else if let tools, !tools.isEmpty, let toolUseTemplate = templateDict["tool_use"] { - // Use tool use chat template from config - selectedChatTemplate = toolUseTemplate - } else if let defaultChatTemplate = templateDict["default"] { - // Use default chat template from config - selectedChatTemplate = defaultChatTemplate - } - } else if let stringValue = valueFromConfig.stringValue { - // Use chat template from config - selectedChatTemplate = stringValue - } - } - - guard let selectedChatTemplate else { - throw TokenizerError.chatTemplate("No chat template was specified") - } - - let template = try Template(selectedChatTemplate) - var context: [String: Any] = 
[ - "messages": messages, - "add_generation_prompt": addGenerationPrompt - // TODO: Add `tools` entry when support is added in Jinja - // "tools": tools - ] - - // TODO: maybe keep NSString here - for (key, value) in tokenizerConfig.dictionary as [String : Any] { - if specialTokenAttributes.contains(key), !(value is NSNull) { - if let stringValue = value as? String { - context[key] = stringValue - } else if let dictionary = value as? [NSString:Any] { - context[key] = addedTokenAsString(Config(dictionary)) - } else { - context[key] = value - } - } - } - - let rendered = try template.render(context) - var encodedTokens = encode(text: rendered, addSpecialTokens: false) - var maxLength = maxLength ?? encodedTokens.count - maxLength = min(maxLength, tokenizerConfig.modelMaxLength?.intValue ?? maxLength) - if encodedTokens.count > maxLength { - if truncation { - encodedTokens = Array(encodedTokens.prefix(maxLength)) - } - } - - return encodedTokens - } -} - -// MARK: - Building - -public struct AutoTokenizer {} - -struct PreTrainedTokenizerClasses { - /// Class overrides for custom behaviour - /// Not to be confused with the TokenizerModel classes defined in TokenizerModel - static let tokenizerClasses: [String : PreTrainedTokenizer.Type] = [ - "LlamaTokenizer": LlamaPreTrainedTokenizer.self - ] -} - -extension AutoTokenizer { - static func tokenizerClass(for tokenizerConfig: Config) -> PreTrainedTokenizer.Type { - guard let tokenizerClassName = tokenizerConfig.tokenizerClass?.stringValue else { - return PreTrainedTokenizer.self - } - - // Some tokenizer_class entries use a Fast suffix - let tokenizerName = tokenizerClassName.replacingOccurrences(of: "Fast", with: "") - if let tokenizerClass = PreTrainedTokenizerClasses.tokenizerClasses[tokenizerName] { - return tokenizerClass - } - - return PreTrainedTokenizer.self - } - - public static func from(tokenizerConfig: Config, tokenizerData: Config) throws -> Tokenizer { - let tokenizerClass = tokenizerClass(for: 
tokenizerConfig) - return try tokenizerClass.init(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData) - } - - public static func from( - pretrained model: String, - hubApi: HubApi = .shared - ) async throws -> Tokenizer { - let config = LanguageModelConfigurationFromHub(modelName: model, hubApi: hubApi) - guard let tokenizerConfig = try await config.tokenizerConfig else { throw TokenizerError.missingConfig } - let tokenizerData = try await config.tokenizerData - - return try AutoTokenizer.from(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData) - } - - public static func from( - modelFolder: URL, - hubApi: HubApi = .shared - ) async throws -> Tokenizer { - let config = LanguageModelConfigurationFromHub(modelFolder: modelFolder, hubApi: hubApi) - guard let tokenizerConfig = try await config.tokenizerConfig else { throw TokenizerError.missingConfig } - let tokenizerData = try await config.tokenizerData - - return try PreTrainedTokenizer(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData) + throw TokenizerError.chatTemplate("Not implemented, you may want to use the `TokenizersTemplates` target.") // TokenizersCore has no Jinja dependency; template rendering lives in the TokenizersTemplates target } } @@ -529,12 +409,13 @@ class T5Tokenizer : UnigramTokenizer {} // MARK: - PreTrainedTokenizer classes -let sentencePieceUnderline = "▁" +// These need to be public to be visible from the wrapper factory + +public let sentencePieceUnderline = "▁" // Hack for Llama tokenizers, see https://github.com/huggingface/transformers/blob/bcb841f0073fcd7a4fb88ea8064313c17dcab04a/src/transformers/models/llama/tokenization_llama_fast.py#L181 // Return updated config, or nil -func maybeUpdatePostProcessor(tokenizerConfig: Config, processorConfig: Config?) throws -> Config? { - +public func maybeUpdatePostProcessor(tokenizerConfig: Config, processorConfig: Config?) throws -> Config? 
{ // If it's already a Template processor (instead of a ByteLevel one), assume it's correct let postProcessor = PostProcessorFactory.fromConfig(config: processorConfig) guard !(postProcessor is TemplateProcessing) else { return nil } @@ -573,25 +454,3 @@ func maybeUpdatePostProcessor(tokenizerConfig: Config, processorConfig: Config?) let postProcessorConfig = Config(["type": PostProcessorType.TemplateProcessing.rawValue, "single": single, "pair": pair]) return postProcessorConfig } - -// See https://github.com/xenova/transformers.js/blob/1a9964fb09b8f54fcbeac46dc6aae8d76795809d/src/tokenizers.js#L3203 for these exceptions -class LlamaPreTrainedTokenizer: PreTrainedTokenizer { - let isLegacy: Bool - - required init(tokenizerConfig: Config, tokenizerData: Config) throws { - isLegacy = tokenizerConfig.legacy?.boolValue ?? true - var configDictionary = tokenizerData.dictionary - if !isLegacy { - configDictionary.removeValue(forKey: "normalizer") - configDictionary["pre_tokenizer"] = ["type": "Metaspace", "replacement": sentencePieceUnderline, "add_prefix_space": true, "prepend_scheme": "first"] - } - - if let postProcessorConfig = try maybeUpdatePostProcessor(tokenizerConfig: tokenizerConfig, processorConfig: tokenizerData.postProcessor) { - configDictionary["post_processor"] = postProcessorConfig.dictionary - } - - let updatedData = Config(configDictionary) - try super.init(tokenizerConfig: tokenizerConfig, tokenizerData: updatedData) - } -} - diff --git a/Sources/TokenizersTemplates/TokenizersTemplates.swift b/Sources/TokenizersTemplates/TokenizersTemplates.swift new file mode 100644 index 0000000..6801807 --- /dev/null +++ b/Sources/TokenizersTemplates/TokenizersTemplates.swift @@ -0,0 +1,115 @@ +import Hub +import TokenizersCore +import Jinja +import Foundation + +let specialTokenAttributes: [String] = [ + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + "additional_special_tokens" +] + +open class 
PreTrainedTokenizerWithTemplates : PreTrainedTokenizer { + // I don't know why these need to be here. They are implemented in the protocol, **and** in the superclass. + public override func applyChatTemplate(messages: [[String: String]]) throws -> [Int] { + try applyChatTemplate(messages: messages, addGenerationPrompt: true) + } + + public override func applyChatTemplate(messages: [[String: String]], chatTemplate: ChatTemplateArgument) throws -> [Int] { + try applyChatTemplate(messages: messages, chatTemplate: chatTemplate, addGenerationPrompt: true) + } + + public override func applyChatTemplate(messages: [[String: String]], chatTemplate: String) throws -> [Int] { + try applyChatTemplate(messages: messages, chatTemplate: .literal(chatTemplate), addGenerationPrompt: true) + } + + public override func applyChatTemplate( + messages: [[String: String]], + chatTemplate: ChatTemplateArgument? = nil, + addGenerationPrompt: Bool = false, + truncation: Bool = false, + maxLength: Int? = nil, + /// A list of tools (callable functions) that will be accessible to the model. If the template does not + /// support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema, + /// giving the name, description and argument types for the tool. See the + /// [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use) + /// for more information. + /// Note: tool calling is not supported yet, it will be available in a future update. + tools: [[String: Any]]? = nil + ) throws -> [Int] { + var selectedChatTemplate: String? 
+ if let chatTemplate, case .literal(let template) = chatTemplate { + // Use chat template from argument + selectedChatTemplate = template + } else if let valueFromConfig = tokenizerConfig.chatTemplate { + if let arrayValue = valueFromConfig.arrayValue { + // If the config specifies a list of chat templates, convert them to a dictionary + let templateDict: [String: String] = Dictionary(arrayValue.compactMap { item in + guard let name = item.name?.stringValue, let template = item.template?.stringValue else { + return nil + } + return (name, template) + }, uniquingKeysWith: { _, last in last }) // Duplicate template names: keep the last entry instead of trapping on config data + if let chatTemplate, case .name(let name) = chatTemplate { + // Select chat template from config by name + if let matchingDictEntry = templateDict[name] { + selectedChatTemplate = matchingDictEntry + } else { + throw TokenizerError.chatTemplate("No chat template named \"\(name)\" was found in the tokenizer config") + } + } else if let tools, !tools.isEmpty, let toolUseTemplate = templateDict["tool_use"] { + // Use tool use chat template from config + selectedChatTemplate = toolUseTemplate + } else if let defaultChatTemplate = templateDict["default"] { + // Use default chat template from config + selectedChatTemplate = defaultChatTemplate + } + } else if let stringValue = valueFromConfig.stringValue { + // Use chat template from config + selectedChatTemplate = stringValue + } + } + + guard let selectedChatTemplate else { + throw TokenizerError.chatTemplate("No chat template was specified") + } + + let template = try Template(selectedChatTemplate) + var context: [String: Any] = [ + "messages": messages, + "add_generation_prompt": addGenerationPrompt + // TODO: Add `tools` entry when support is added in Jinja + // "tools": tools + ] + + // TODO: maybe keep NSString here + for (key, value) in tokenizerConfig.dictionary as [String : Any] { + if specialTokenAttributes.contains(key), !(value is NSNull) { + if let stringValue = value as? String { + context[key] = stringValue + } else if let dictionary = value as? 
[NSString:Any] { + context[key] = addedTokenAsString(Config(dictionary)) + } else { + context[key] = value + } + } + } + + let rendered = try template.render(context) + var encodedTokens = encode(text: rendered, addSpecialTokens: false) + var maxLength = maxLength ?? encodedTokens.count + maxLength = min(maxLength, tokenizerConfig.modelMaxLength?.intValue ?? maxLength) + if encodedTokens.count > maxLength { + if truncation { + encodedTokens = Array(encodedTokens.prefix(maxLength)) + } + } + + return encodedTokens + } +} diff --git a/Sources/TokenizersWrapper/TokenizersWrapper.swift b/Sources/TokenizersWrapper/TokenizersWrapper.swift new file mode 100644 index 0000000..3de0ce6 --- /dev/null +++ b/Sources/TokenizersWrapper/TokenizersWrapper.swift @@ -0,0 +1,83 @@ +import Foundation +import Hub + +@_exported import TokenizersCore + +#if canImport(TokenizersTemplates) +import TokenizersTemplates +public typealias PreTrainedTokenizer = PreTrainedTokenizerWithTemplates +#endif + +public struct AutoTokenizer {} + +struct PreTrainedTokenizerClasses { + /// Class overrides for custom behaviour + /// Not to be confused with the TokenizerModel classes defined in TokenizerModel + static let tokenizerClasses: [String : PreTrainedTokenizer.Type] = [ + "LlamaTokenizer": LlamaPreTrainedTokenizer.self + ] +} + +extension AutoTokenizer { + static func tokenizerClass(for tokenizerConfig: Config) -> PreTrainedTokenizer.Type { + guard let tokenizerClassName = tokenizerConfig.tokenizerClass?.stringValue else { + return PreTrainedTokenizer.self + } + + // Some tokenizer_class entries use a Fast suffix + let tokenizerName = tokenizerClassName.replacingOccurrences(of: "Fast", with: "") + if let tokenizerClass = PreTrainedTokenizerClasses.tokenizerClasses[tokenizerName] { + return tokenizerClass + } + + return PreTrainedTokenizer.self + } + + public static func from(tokenizerConfig: Config, tokenizerData: Config) throws -> Tokenizer { + let tokenizerClass = tokenizerClass(for: 
tokenizerConfig) + return try tokenizerClass.init(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData) + } + + public static func from( + pretrained model: String, + hubApi: HubApi = .shared + ) async throws -> Tokenizer { + let config = LanguageModelConfigurationFromHub(modelName: model, hubApi: hubApi) + guard let tokenizerConfig = try await config.tokenizerConfig else { throw TokenizerError.missingConfig } + let tokenizerData = try await config.tokenizerData + + return try AutoTokenizer.from(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData) + } + + public static func from( + modelFolder: URL, + hubApi: HubApi = .shared + ) async throws -> Tokenizer { + let config = LanguageModelConfigurationFromHub(modelFolder: modelFolder, hubApi: hubApi) + guard let tokenizerConfig = try await config.tokenizerConfig else { throw TokenizerError.missingConfig } + let tokenizerData = try await config.tokenizerData + + return try AutoTokenizer.from(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData) // Go through the factory so tokenizer-class overrides (e.g. LlamaTokenizer) also apply to local folders + } +} + +// See https://github.com/xenova/transformers.js/blob/1a9964fb09b8f54fcbeac46dc6aae8d76795809d/src/tokenizers.js#L3203 for these exceptions +class LlamaPreTrainedTokenizer: PreTrainedTokenizer { + let isLegacy: Bool + + required init(tokenizerConfig: Config, tokenizerData: Config) throws { + isLegacy = tokenizerConfig.legacy?.boolValue ?? 
true + var configDictionary = tokenizerData.dictionary + if !isLegacy { + configDictionary.removeValue(forKey: "normalizer") + configDictionary["pre_tokenizer"] = ["type": "Metaspace", "replacement": sentencePieceUnderline, "add_prefix_space": true, "prepend_scheme": "first"] + } + + if let postProcessorConfig = try maybeUpdatePostProcessor(tokenizerConfig: tokenizerConfig, processorConfig: tokenizerData.postProcessor) { + configDictionary["post_processor"] = postProcessorConfig.dictionary + } + + let updatedData = Config(configDictionary) + try super.init(tokenizerConfig: tokenizerConfig, tokenizerData: updatedData) + } +} From 2682309e27490c9a566417f861ca3afe89d19a54 Mon Sep 17 00:00:00 2001 From: Pedro Cuenca Date: Thu, 30 Jan 2025 12:13:14 +0100 Subject: [PATCH 5/5] Simplify --- Package.swift | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Package.swift b/Package.swift index 175cc06..e517d77 100644 --- a/Package.swift +++ b/Package.swift @@ -25,9 +25,7 @@ let package = Package( targets: [ .executableTarget( name: "TransformersCLI", - dependencies: [ - "Models", "Generation", "TokenizersTemplates", - .product(name: "ArgumentParser", package: "swift-argument-parser")]), + dependencies: [ "Models", .product(name: "ArgumentParser", package: "swift-argument-parser")]), .executableTarget(name: "HubCLI", dependencies: ["Hub", .product(name: "ArgumentParser", package: "swift-argument-parser")]), .target(name: "Hub", resources: [.process("FallbackConfigs")]), .target(name: "TokenizersCore", dependencies: ["Hub"], path: "Sources/Tokenizers"),