Merge pull request #651 from epfml/646-tokenizer-julien
Add tokenization and prompting API to GPT models
JulienVig authored Apr 3, 2024
2 parents 5d282bf + 1ba5f60 commit 7c282e7
Showing 25 changed files with 653 additions and 234 deletions.
1 change: 1 addition & 0 deletions discojs/discojs-core/package.json
@@ -22,6 +22,7 @@
   "dependencies": {
     "@tensorflow/tfjs": "4",
     "@types/msgpack-lite": "0.1",
+    "@xenova/transformers": "2",
     "axios": "1",
     "gpt3-tokenizer": "1",
     "immutable": "4",
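The new @xenova/transformers dependency provides the tokenizer used throughout this PR. A minimal sketch of loading and calling one (the model name matches the wikitext task below; the return_tensor option is the one used in text.ts further down):

import { AutoTokenizer } from '@xenova/transformers'

// Download (and cache) the GPT-2 tokenizer, then encode a string into token ids.
const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2')
const { input_ids } = tokenizer('Disco trains models collaboratively', { return_tensor: false })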
31 changes: 14 additions & 17 deletions discojs/discojs-core/src/dataset/data/data.ts
@@ -64,40 +64,37 @@ export abstract class Data {
    * functions in a series. The preprocessing functions are chained according to their defined
    * priority.
    */
-  get preprocessing (): (entry: tf.TensorContainer) => tf.TensorContainer {
+  get preprocessing (): (entry: tf.TensorContainer) => Promise<tf.TensorContainer> {
     const params = this.task.trainingInformation
     const taskPreprocessing = params.preprocessingFunctions
 
     if (
       taskPreprocessing === undefined ||
       taskPreprocessing.length === 0 ||
       this.availablePreprocessing === undefined ||
       this.availablePreprocessing.size === 0
-    ) {
-      return (x) => x
-    }
-
+    ) {
+      return x => Promise.resolve(x)
+    }
     const applyPreprocessing = this.availablePreprocessing
-      .filter((e) => e.type in taskPreprocessing)
-      .map((e) => e.apply)
-
+      .filter((e) => e.type in taskPreprocessing)
+      .map((e) => e.apply)
     if (applyPreprocessing.size === 0) {
-      return (x) => x
+      return x => Promise.resolve(x)
     }
-
     const preprocessingChain = applyPreprocessing.reduce((acc, fn) =>
-      (x: tf.TensorContainer) => fn(acc(x), this.task),
-      (x: tf.TensorContainer) => x,
-    )
-
-    return (x: tf.TensorContainer) => preprocessingChain(x)
+      x => fn(acc(x), this.task), (x: Promise<tf.TensorContainer>) => x)
 
+    return x => preprocessingChain(Promise.resolve(x))
   }
 
   /**
    * The TF.js dataset preprocessing according to the set of preprocessing functions and the task's
    * parameters.
    */
   get preprocessedDataset (): Dataset {
-    return this.dataset.map(this.preprocessing)
+    return this.dataset.mapAsync(this.preprocessing)
   }
 }
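The reduce above folds all selected preprocessing functions into one async pipeline, seeded with the identity on promises. A standalone sketch of the same pattern with illustrative types (not the library's):

type Step = (x: Promise<number>) => Promise<number>

// Fold async steps into a single function; each step awaits its predecessor.
const steps: Step[] = [async x => await x + 1, async x => await x * 2]
const chain = steps.reduce<Step>((acc, fn) => x => fn(acc(x)), x => x)

void chain(Promise.resolve(3)).then(console.log) // prints 8, i.e. (3 + 1) * 2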
@@ -15,5 +15,5 @@ export type Preprocessing = ImagePreprocessing | TextPreprocessing | TabularPreprocessing
  */
 export interface PreprocessingFunction {
   type: Preprocessing
-  apply: (x: tf.TensorContainer, task: Task) => tf.TensorContainer
+  apply: (x: Promise<tf.TensorContainer>, task: Task) => Promise<tf.TensorContainer>
 }
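With this new signature each step awaits its (possibly still pending) input, which is what lets tokenization call the async tokenizer loader. A hedged example of writing a custom step against the interface (the lowercasing step and the import paths are made up for illustration):

import * as tf from '@tensorflow/tfjs'
import type { Task } from '../../../index.js'
import type { PreprocessingFunction } from './base.js'
import { TextPreprocessing } from './text.js'

// Illustrative only: lowercase raw text entries before they are tokenized.
const lowercase: PreprocessingFunction = {
  type: TextPreprocessing.Tokenize, // reusing an existing enum member for the sketch
  apply: async (x: Promise<tf.TensorContainer>, _task: Task): Promise<tf.TensorContainer> => {
    const entry = await x
    return typeof entry === 'string' ? entry.toLowerCase() : entry
  }
}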
@@ -19,8 +19,8 @@ interface ImageEntry extends tf.TensorContainerObject {
 
 const resize: PreprocessingFunction = {
   type: ImagePreprocessing.Resize,
-  apply: (entry: tf.TensorContainer, task: Task): tf.TensorContainer => {
-    const { xs, ys } = entry as ImageEntry
+  apply: async (entry: Promise<tf.TensorContainer>, task: Task): Promise<tf.TensorContainer> => {
+    const { xs, ys } = await entry as ImageEntry
     const params = task.trainingInformation
     return {
       xs: params.IMAGE_W !== undefined && params.IMAGE_H !== undefined
@@ -33,8 +33,8 @@ const resize: PreprocessingFunction = {
 
 const normalize: PreprocessingFunction = {
   type: ImagePreprocessing.Normalize,
-  apply: (entry: tf.TensorContainer): tf.TensorContainer => {
-    const { xs, ys } = entry as ImageEntry
+  apply: async (entry: Promise<tf.TensorContainer>): Promise<tf.TensorContainer> => {
+    const { xs, ys } = await entry as ImageEntry
     return {
       xs: xs.div(tf.scalar(255)),
       ys
@@ -18,13 +18,13 @@ interface TabularEntry extends tf.TensorContainerObject {
 
 const sanitize: PreprocessingFunction = {
   type: TabularPreprocessing.Sanitize,
-  apply: (entry: tf.TensorContainer): tf.TensorContainer => {
+  apply: async (entry: Promise<tf.TensorContainer>) => {
     // if preprocessing a dataset without labels, then the entry is an array of numbers
     if (Array.isArray(entry)) {
-      return entry.map(i => i ?? 0)
+      return entry.map((i: number) => i ?? 0)
     // otherwise it is an object with feature and labels
     } else {
-      const { xs, ys } = entry as TabularEntry
+      const { xs, ys } = await entry as TabularEntry
       return {
         xs: xs.map(i => i ?? 0),
         ys
@@ -1,57 +1,80 @@
 import { List } from 'immutable'
 import * as tf from '@tensorflow/tfjs'
 
 import type { Task } from '../../../index.js'
 import type { PreprocessingFunction } from './base.js'
+import { models } from '../../../index.js'
 
 /**
  * Available text preprocessing types.
  */
 export enum TextPreprocessing {
   Tokenize,
-  Padding
-}
-
-interface TextEntry extends tf.TensorContainerObject {
-  xs: string[]
-  ys: number[]
+  LeftPadding
 }
 
 interface TokenizedEntry extends tf.TensorContainerObject {
   xs: tf.Tensor1D
-  ys: tf.Tensor1D
 }
 
-// TODO that'll fail everytime
-const gpt3Tokenizer = null as unknown as { encode: (_: string) => { bpe: number[]; text: string[] } }
+/**
+ * We are currently only implementing left padding for text generation
+ * https://huggingface.co/docs/transformers/en/llm_tutorial#wrong-padding-side
+ * The function can easily be extended to support right padding once the need arises
+ */
+const leftPadding: PreprocessingFunction = {
+  type: TextPreprocessing.LeftPadding,
+  apply: async (x: Promise<tf.TensorContainer>, task: Task): Promise<tf.TensorContainer> => {
+    let { xs } = await x as TokenizedEntry
+    if (xs === undefined || !(xs instanceof tf.Tensor) || xs.rankType !== tf.Rank.R1) {
+      throw new Error("The leftPadding preprocessing expects a 1D tensor named 'xs' as input")
+    }
+    const tokenizer = await models.getTaskTokenizer(task)
 
-const padding: PreprocessingFunction = {
-  type: TextPreprocessing.Padding,
-  apply: (x: tf.TensorContainer) => {
-    const { xs, ys } = x as TokenizedEntry
-    // TODO: add to task definition
-    const maxLength = 64
-    if (maxLength === undefined) {
-      return { xs, ys }
+    const maxLength = task.trainingInformation.maxSequenceLength ?? tokenizer.model_max_length as number
+    // Should never happen because tokenization truncates inputs
+    if (xs.size > maxLength) {
+      xs = xs.slice([0], [maxLength])
+    } else if (xs.size < maxLength) {
+      const paddingToken = tokenizer.pad_token_id
+      xs = xs.pad([[Math.max(0, maxLength - xs.size), 0]], paddingToken)
     }
+    // if xs.size == maxLength we can leave it as it is
     return {
-      xs: xs
-        .pad([[0, Math.max(0, maxLength - xs.size)]])
-        .slice([0], [maxLength]),
-      ys
+      xs,
+      ys: tf.oneHot(xs, tokenizer.model.vocab.length + 1) // gpt-tfjs expects a one-hot encoded token label
     }
   }
 }
 
+interface TokenizerOutput {
+  input_ids: number[]
+}
+/**
+ * Tokenizes and truncates input strings
+ */
 const tokenize: PreprocessingFunction = {
   type: TextPreprocessing.Tokenize,
-  apply: (x: tf.TensorContainer) => {
-    const { xs, ys } = x as TextEntry
-
-    const tokenized = gpt3Tokenizer.encode(xs[0]).bpe
+  apply: async (x: Promise<tf.TensorContainer>, task: Task): Promise<tf.TensorContainer> => {
+    const xs = await x // tf.TextLineDataset yields strings
+    if (typeof xs !== 'string') {
+      throw new Error('The tokenize preprocessing expects a string as input')
+    }
+    const tokenizer = await models.getTaskTokenizer(task)
+    const maxLength = task.trainingInformation.maxSequenceLength ?? tokenizer.model_max_length as number
 
+    const { input_ids: tokens } = tokenizer(xs, {
+      // Transformers.js currently only supports right padding while we need left for text generation
+      // Right padding should be supported in the future, once it is, we can directly pad while tokenizing
+      // https://github.com/xenova/transformers.js/blob/8804c36591d11d8456788d1bb4b16489121b3be2/src/tokenizers.js#L2517
      padding: false,
       truncation: true,
       return_tensor: false,
       max_length: maxLength,
     }) as TokenizerOutput
     return {
-      xs: tf.tensor(tokenized),
-      ys: tf.tensor(ys)
+      xs: tf.tensor(tokens, undefined, 'int32') // cast tokens from float to int for gpt-tfjs
     }
   }
 }
@@ -61,5 +84,5 @@ const tokenize: PreprocessingFunction = {
  */
 export const AVAILABLE_PREPROCESSING = List.of(
   tokenize,
-  padding
+  leftPadding
 ).sortBy((e) => e.type)
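Sorted by enum order, tokenize runs before leftPadding, turning a raw line into a truncated token tensor and then into a padded tensor plus one-hot labels. A hedged end-to-end sketch, assuming task is the wikitext task defined in this commit:

// tf.TextLineDataset yields strings; tokenize maps each to { xs: tf.Tensor1D },
// then leftPadding pads xs to maxSequenceLength and one-hot encodes the labels.
const line = 'Anarchism is a political philosophy'
const tokenized = await tokenize.apply(Promise.resolve(line), task)
const padded = await leftPadding.apply(Promise.resolve(tokenized), task) // { xs, ys }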
12 changes: 7 additions & 5 deletions discojs/discojs-core/src/dataset/data_loader/text_loader.ts
@@ -3,7 +3,7 @@ import type { Task } from '../../index.js'
 import type { DataSplit, Dataset } from '../index.js'
 import { TextData } from '../index.js'
 
-import { DataLoader } from './index.js'
+import { DataLoader, DataConfig } from './index.js'
 
 /**
  * Text data loader whose instantiable implementation is delegated by the platform-dependent Disco subprojects, namely,
@@ -18,13 +18,15 @@ export abstract class TextLoader<S> extends DataLoader<S> {
 
   abstract loadDatasetFrom (source: S): Promise<Dataset>
 
-  async load (source: S): Promise<Dataset> {
-    return await this.loadDatasetFrom(source)
+  async load (source: S, config?: DataConfig): Promise<Dataset> {
+    const dataset = await this.loadDatasetFrom(source)
+    // 1st arg: Stream shuffling buffer size
+    return (config?.shuffle === undefined || config?.shuffle) ? dataset.shuffle(1000, undefined, true) : dataset
   }
 
-  async loadAll (sources: S[]): Promise<DataSplit> {
+  async loadAll (sources: S[], config?: DataConfig): Promise<DataSplit> {
     const concatenated =
-      (await Promise.all(sources.map(async (src) => await this.load(src))))
+      (await Promise.all(sources.map(async (src) => await this.load(src, config))))
       .reduce((acc, dataset) => acc.concatenate(dataset))
 
     return {
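A hypothetical call site: NodeTextLoader stands in for whichever platform-specific subclass implements loadDatasetFrom, and shuffle is the new DataConfig flag read above.

// Shuffling is on by default (1000-element buffer); disable it to keep file order,
// e.g. for a validation split.
const loader = new NodeTextLoader(task)
const ordered = await loader.loadAll(['wiki.valid.tokens'], { shuffle: false })
const shuffled = await loader.loadAll(['wiki.train.tokens'])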
13 changes: 7 additions & 6 deletions discojs/discojs-core/src/default_tasks/wikitext.ts
@@ -1,5 +1,5 @@
 import type { Model, Task, TaskProvider } from '../index.js'
-import { models } from '../index.js'
+import { data, models } from '../index.js'
 
 export const wikitext: TaskProvider = {
   getTask (): Task {
@@ -19,17 +19,18 @@ export const wikitext: TaskProvider = {
       trainingInformation: {
         dataType: 'text',
         modelID: 'wikitext-103-raw-model',
+        preprocessingFunctions: [data.TextPreprocessing.Tokenize, data.TextPreprocessing.LeftPadding],
         validationSplit: 0.2, // TODO: is this used somewhere? because train, eval and test are already split in dataset
-        epochs: 10,
-        // constructing a batch is taken care automatically in the dataset to make things faster
-        // so we fake a batch size of 1
-        batchSize: 1,
+        epochs: 5,
         scheme: 'federated',
         noiseScale: undefined,
         decentralizedSecure: true,
        minimumReadyPeers: 3,
         maxShareValue: 100,
-        roundDuration: 10
+        roundDuration: 10,
+        batchSize: 16,
+        tokenizer: 'Xenova/gpt2',
+        maxSequenceLength: 128
       }
     }
  },
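The new tokenizer and maxSequenceLength fields are what the preprocessing steps read back via models.getTaskTokenizer. That helper is added elsewhere in this commit; a plausible sketch of its shape, assuming it simply resolves the task's tokenizer name through Transformers.js:

import { AutoTokenizer } from '@xenova/transformers'
import type { Task } from '../index.js'

// Assumed implementation: look up the task's tokenizer name and load it.
export async function getTaskTokenizer (task: Task) {
  const name = task.trainingInformation.tokenizer
  if (typeof name !== 'string') {
    throw new Error('no tokenizer defined in the task training information')
  }
  return await AutoTokenizer.from_pretrained(name) // e.g. 'Xenova/gpt2'
}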
9 changes: 7 additions & 2 deletions discojs/discojs-core/src/logging/trainer_logger.ts
@@ -41,15 +41,20 @@ export class TrainerLogger extends ConsoleLogger {
     }
 
     // console output
-    const msg = `Epoch: ${epoch}\nTrain: ${logs?.acc ?? 'undefined'}\nValidation:${logs?.val_acc ?? 'undefined'}\nLoss:${logs?.loss ?? 'undefined'}`
+    let msg = `Epoch: ${epoch}\n`
+    if (logs !== undefined) {
+      for (const [key, value] of Object.entries(logs)) {
+        msg += `${key}: ${value}\n`
+      }
+    }
     this.success(`On epoch end:\n${msg}\n`)
   }
 
   /**
    * Display ram usage
    */
   ramUsage (): void {
-    this.success(`Training RAM usage is = ${tf.memory().numBytes * 0.000001} MB`)
+    this.success(`Training RAM usage is = ${tf.memory().numBytes / 1024 / 1024} MB`)
+    this.success(`Number of allocated tensors = ${tf.memory().numTensors}`)
   }
 }
4 changes: 0 additions & 4 deletions discojs/discojs-core/src/models/gpt/config.ts
@@ -15,13 +15,11 @@ export interface ModelSize {
 
 export interface GPTConfig {
   lr: number
-  batchSize: number
   blockSize: number
   vocabSize: number
   evaluate?: boolean
   maxEvalBatches?: number
   evaluateEvery?: number
-  epochs?: number
   maxIter?: number
   weightDecay?: number
   verbose?: 0 | 1
@@ -38,8 +36,6 @@ export interface GPTConfig {
 export const DEFAULT_CONFIG: Required<GPTConfig> = {
   lr: 0.001,
   weightDecay: 0,
-  batchSize: 2,
-  epochs: 9999,
   maxIter: 10_000,
   verbose: 0,
   modelType: 'gpt-nano',
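With batchSize and epochs removed, GPTConfig only carries model and optimizer settings; batching now comes from the task's trainingInformation. A sketch of merging partial overrides with the defaults (resolveConfig is a hypothetical helper, not part of this diff):

import { DEFAULT_CONFIG, type GPTConfig } from './config.js'

// Hypothetical helper: fill any unspecified field from DEFAULT_CONFIG.
function resolveConfig (overrides: Partial<GPTConfig>): Required<GPTConfig> {
  return { ...DEFAULT_CONFIG, ...overrides }
}

const config = resolveConfig({ lr: 3e-4, maxIter: 1_000 })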
21 changes: 13 additions & 8 deletions discojs/discojs-core/src/models/gpt/evaluate.ts
@@ -1,26 +1,33 @@
 import * as tf from '@tensorflow/tfjs'
 
+interface DataPoint extends tf.TensorContainerObject {
+  xs: tf.Tensor2D,
+  ys: tf.Tensor3D,
+}
+
 export default async function evaluate (
   model: tf.LayersModel,
-  dataset: tf.data.Dataset<{ xs: tf.Tensor, ys: tf.Tensor }>
+  dataset: tf.data.Dataset<DataPoint>,
+  maxEvalBatches: number
 ): Promise<Record<'acc' | 'val_acc' | 'val_loss' | 'val_perplexity', number>> {
   let datasetSize = 0
   let totalLoss = 0
   const acc: [number, number] = [0, 0]
 
-  await dataset.map(({ xs, ys }) => {
+  await dataset.take(maxEvalBatches).map(({ xs, ys }) => {
     const logits = model.apply(xs)
     if (Array.isArray(logits)) {
-      throw new Error('model outputed many tensor')
+      throw new Error('model output too many tensors')
     }
     if (logits instanceof tf.SymbolicTensor) {
-      throw new Error('model outputed symbolic tensor')
+      throw new Error('model output a symbolic tensor')
     }
     xs.dispose()
 
     return { logits, ys }
   }).mapAsync(async ({ logits, ys }) => {
-    const loss = (await tf.losses.softmaxCrossEntropy(ys, logits).array())
+    const lossTensor = tf.losses.softmaxCrossEntropy(ys, logits)
+    const loss = await lossTensor.array()
     if (typeof loss !== 'number') {
       throw new Error('got multiple loss')
     }
@@ -33,8 +40,7 @@ export default async function evaluate (
       throw new Error('got multiple accuracy sum')
     }
 
-    tf.dispose([ys, logits, accTensor, accSum])
-
+    tf.dispose([ys, logits, accTensor, accSum, lossTensor])
     return { loss, accSummed, accSize }
   }).forEachAsync(({ loss, accSummed, accSize }) => {
     datasetSize += 1
@@ -44,7 +50,6 @@ export default async function evaluate (
   })
 
   const loss = totalLoss / datasetSize
-
   return {
     val_loss: loss,
     val_perplexity: Math.exp(loss),
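A hedged usage sketch: model and a preprocessed, batched dataset are assumed to exist, and the new maxEvalBatches argument caps evaluation at the first few batches.

// Returns acc, val_acc, val_loss and val_perplexity over at most 8 batches.
const metrics = await evaluate(model, batchedDataset, 8)
console.log(`val_loss=${metrics.val_loss}, perplexity=${metrics.val_perplexity}`)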
