Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add down sampling before luis publish #2629

Merged
merged 21 commits into from
Apr 23, 2020
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
0fae30c
add bootstrap sampling before publish
lei9444 Apr 13, 2020
cf4b621
Merge branch 'master' into downsampling
lei9444 Apr 13, 2020
bff8fe5
Merge branch 'master' of https://github.com/lei9444/BotFramework-Comp…
lei9444 Apr 13, 2020
d994eb0
add reservoir sample if the utterances' number > 15000
lei9444 Apr 13, 2020
0ac7000
Merge branch 'downsampling' of https://github.com/lei9444/BotFramewor…
lei9444 Apr 13, 2020
944aafa
Merge branch 'master' of https://github.com/microsoft/BotFramework-Co…
lei9444 Apr 15, 2020
c52c00b
update the sample logic
lei9444 Apr 15, 2020
ddea83e
add unit test for sampler
lei9444 Apr 15, 2020
1e8a4b2
Merge branch 'master' into downsampling
lei9444 Apr 16, 2020
0797ced
Merge branch 'master' into downsampling
cwhitten Apr 16, 2020
9ffae39
Merge branch 'master' into downsampling
boydc2014 Apr 17, 2020
2dfbad1
add downsampling config to bot
lei9444 Apr 17, 2020
c1f2d1b
update the type
lei9444 Apr 17, 2020
248cebe
fix unit test
lei9444 Apr 17, 2020
44d2e56
Merge branch 'master' into downsampling
lei9444 Apr 20, 2020
e134377
don't do sample for the ratio is ok
lei9444 Apr 21, 2020
8a7f1dc
Merge branch 'master' of https://github.com/lei9444/BotFramework-Comp…
lei9444 Apr 21, 2020
d927924
Merge branch 'master' into downsampling
lei9444 Apr 21, 2020
c24845c
Merge branch 'master' of https://github.com/lei9444/BotFramework-Comp…
lei9444 Apr 22, 2020
80e2359
Merge branch 'master' into downsampling
cwhitten Apr 23, 2020
fc5c1bd
Merge branch 'master' into downsampling
cwhitten Apr 23, 2020
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,10 @@ describe('setting operation', () => {
endpointkey: '',
hostname: '',
},
downsampling: {
maxImbalanceRatio: 10,
maxUtteranceAllowed: 15000,
},
};
let projectId = '';
beforeEach(async () => {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
import { ComposerBootstrapSampler } from './../../../../src/models/bot/sampler/BootstrapSampler';

// Checks how many utterances of an over-represented intent survive bootstrap
// sampling for two different imbalance ratios.
describe('BootstrapSampler', () => {
  it('balance the utterances ratio in intents after bootstrap sampling', () => {
    // Fixture: intent '0' has 2 utterances, intent '1' has 5.
    const utterances = [
      { intent: '0', text: '1', entities: [] },
      { intent: '0', text: '2', entities: [] },
      { intent: '1', text: '3', entities: [] },
      { intent: '1', text: '4', entities: [] },
      { intent: '1', text: '5', entities: [] },
      { intent: '1', text: '6', entities: [] },
      { intent: '1', text: '7', entities: [] },
    ];

    // Ratio 2: the cap is 2 * 2 = 4, so intent '1' (5 utterances) is over the
    // cap and gets re-sampled. The count is an integer, so assert it exactly
    // (the previous `2 / intent1 ≈ 0.5` check is only satisfied by 4).
    const sampler = new ComposerBootstrapSampler(utterances, 2);
    const result = sampler.getSampledUtterances();
    const intent1 = result.filter(e => e.intent === '1').length;
    expect(intent1).toBe(4);

    // Ratio 5: the cap is 5 * 2 = 10, so intent '1' is within the cap and all
    // 5 of its utterances are kept as-is.
    const sampler1 = new ComposerBootstrapSampler(utterances, 5);
    const result1 = sampler1.getSampledUtterances();
    const intent11 = result1.filter(e => e.intent === '1').length;
    expect(intent11).toBe(5);
  });
});
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

import { ComposerReservoirSampler } from './../../../../src/models/bot/sampler/ReservoirSampler';

// Checks that ComposerReservoirSampler trims the utterance list down to the
// requested maximum and leaves it untouched when it is already small enough.
describe('ReservoirSampler', () => {
  it('down size the number of utterances reservoir sampling', () => {
    // Fixture: 15 utterances in total (1 for intent '0', 14 for intent '1').
    const utterances = [
      { intent: '0', text: '1', entities: [] },
      { intent: '1', text: '2', entities: [] },
      { intent: '1', text: '3', entities: [] },
      { intent: '1', text: '4', entities: [] },
      { intent: '1', text: '5', entities: [] },
      { intent: '1', text: '6', entities: [] },
      { intent: '1', text: '7', entities: [] },
      { intent: '1', text: '8', entities: [] },
      { intent: '1', text: '9', entities: [] },
      { intent: '1', text: '10', entities: [] },
      { intent: '1', text: '11', entities: [] },
      { intent: '1', text: '12', entities: [] },
      { intent: '1', text: '13', entities: [] },
      { intent: '1', text: '14', entities: [] },
      { intent: '1', text: '15', entities: [] },
    ];

    // Limits below the fixture size: the result is cut down to the limit.
    const sampler = new ComposerReservoirSampler(utterances, 10);
    expect(sampler.getSampledUtterances().length).toBe(10);
    const sampler1 = new ComposerReservoirSampler(utterances, 11);
    expect(sampler1.getSampledUtterances().length).toBe(11);
    const sampler2 = new ComposerReservoirSampler(utterances, 12);
    expect(sampler2.getSampledUtterances().length).toBe(12);

    // Limit above the fixture size: nothing is dropped.
    const sampler3 = new ComposerReservoirSampler(utterances, 18);
    expect(sampler3.getSampledUtterances().length).toBe(15);
  });
});
2 changes: 2 additions & 0 deletions Composer/packages/server/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
"@bfc/lu-languageserver": "*",
"@bfc/plugin-loader": "*",
"@bfc/shared": "*",
"@microsoft/bf-dispatcher": "4.9.0-preview.121555",
"@microsoft/bf-lu": "4.9.0-preview.121555",
"archiver": "^3.0.0",
"axios": "^0.18.0",
Expand All @@ -88,6 +89,7 @@
"morgan": "^1.9.1",
"passport": "^0.4.1",
"path-to-regexp": "^6.1.0",
"ts-md5": "^1.2.7",
"vscode-languageserver": "^5.3.0-next",
"vscode-ws-jsonrpc": "^0.1.1",
"ws": "^5.0.0"
Expand Down
15 changes: 7 additions & 8 deletions Composer/packages/server/src/models/bot/botProject.ts
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,6 @@ export class BotProject {
this.settings = await this.getEnvSettings('', false);
this.skills = await extractSkillManifestUrl(this.settings?.skill || []);
this.files = await this._getFiles();
if (this.settings) {
this.luPublisher.setLuisConfig(this.settings.luis);
}
};

public getProject = () => {
Expand Down Expand Up @@ -132,7 +129,6 @@ export class BotProject {
// create or update dialog settings
public updateEnvSettings = async (slot: string, config: DialogSetting) => {
await this.settingManager.set(slot, config);
this.luPublisher.setLuisConfig(config.luis);
};

// update skill in settings
Expand Down Expand Up @@ -255,15 +251,18 @@ export class BotProject {
return await this._createFile(relativePath, content);
};

public publishLuis = async (authoringKey: string, fileIds: string[], crossTrainConfig: ICrossTrainConfig) => {
this.luPublisher.setAuthoringKey(authoringKey);
if (fileIds.length) {
public publishLuis = async (authoringKey: string, fileIds: string[] = [], crossTrainConfig: ICrossTrainConfig) => {
if (fileIds.length && this.settings) {
const map = fileIds.reduce((result, id) => {
result[id] = true;
return result;
}, {});
const files = this.files.filter(file => map[Path.basename(file.name, '.lu')]);
this.luPublisher.setCrossTrainConfig(crossTrainConfig);
this.luPublisher.setPublishConfig(
{ ...this.settings.luis, authoringKey },
crossTrainConfig,
this.settings.downsampling
);
await this.luPublisher.publish(files);
}
};
Expand Down
75 changes: 54 additions & 21 deletions Composer/packages/server/src/models/bot/luPublisher.ts
Original file line number Diff line number Diff line change
@@ -1,18 +1,24 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

import isEqual from 'lodash/isEqual';
import { FileInfo } from '@bfc/shared';

import { Path } from './../../utility/path';
import { IFileStorage } from './../storage/interface';
import { Path } from '../../utility/path';
import { IFileStorage } from '../storage/interface';
import log from '../../logger';

import { ComposerReservoirSampler } from './sampler/ReservoirSampler';
import { ComposerBootstrapSampler } from './sampler/BootstrapSampler';
import { ILuisConfig } from './interface';
import log from './../../logger';

// eslint-disable-next-line @typescript-eslint/no-var-requires
const crossTrainer = require('@microsoft/bf-lu/lib/parser/cross-train/crossTrainer.js');
// eslint-disable-next-line @typescript-eslint/no-var-requires
const luBuild = require('@microsoft/bf-lu/lib/parser/lubuild/builder.js');
// eslint-disable-next-line @typescript-eslint/no-var-requires
const LuisBuilder = require('@microsoft/bf-lu/lib/parser/luis/luisBuilder');
// eslint-disable-next-line @typescript-eslint/no-var-requires
const luisToLuContent = require('@microsoft/bf-lu/lib/parser/luis/luConverter');

const GENERATEDFOLDER = 'generated';
const INTERUPTION = 'interuption';
Expand All @@ -24,13 +30,20 @@ export interface ICrossTrainConfig {
verbose: boolean;
}

/**
 * Settings that control how utterances are down-sampled by LuPublisher.
 * A falsy value (e.g. 0) for either field makes the matching sampler return
 * its input unchanged.
 */
export interface IDownSamplingConfig {
// Handed to ComposerBootstrapSampler: an intent holding more utterances than
// (this ratio * the size of the smallest intent) is re-sampled.
maxImbalanceRatio: number;
// Handed to ComposerReservoirSampler: when the number of utterances exceeds
// this value, the list is sampled down to it.
maxUtteranceAllowed: number;
}

export class LuPublisher {
public botDir: string;
public dialogsDir: string;
public generatedFolderPath: string;
public interuptionFolderPath: string;
public storage: IFileStorage;
public config: ILuisConfig | null = null;
public downSamplingConfig: IDownSamplingConfig = { maxImbalanceRatio: 0, maxUtteranceAllowed: 0 };

public crossTrainConfig: ICrossTrainConfig = {
rootIds: [],
triggerRules: {},
Expand Down Expand Up @@ -65,23 +78,15 @@ export class LuPublisher {
}
};

public getLuisConfig = () => this.config;

public setLuisConfig = (config: ILuisConfig) => {
if (!isEqual(config, this.config)) {
this.config = config;
}
};

public setAuthoringKey = (key: string) => {
if (this.config) {
this.config.authoringKey = key;
}
};

public setCrossTrainConfig = (crossTrainConfig: ICrossTrainConfig) => {
if (crossTrainConfig) this.crossTrainConfig = crossTrainConfig;
};
/**
 * Stores the LUIS, cross-train and down-sampling settings on this publisher.
 *
 * @param luisConfig LUIS settings; always replaces the current value.
 * @param crossTrainConfig Cross-train settings; ignored when falsy so the
 * current value (initially the class default) is kept.
 * @param downSamplingConfig Down-sampling limits; ignored when falsy so the
 * current value is kept. The class default has both limits at 0, which the
 * samplers treat as "do not sample".
 */
public setPublishConfig(
  luisConfig: ILuisConfig,
  crossTrainConfig: ICrossTrainConfig,
  downSamplingConfig: IDownSamplingConfig
) {
  this.config = luisConfig;
  // Never overwrite a valid config with undefined/null: _doDownSampling reads
  // this.downSamplingConfig.maxImbalanceRatio unconditionally and would throw.
  if (crossTrainConfig) {
    this.crossTrainConfig = crossTrainConfig;
  }
  if (downSamplingConfig) {
    this.downSamplingConfig = downSamplingConfig;
  }
}

private async _createGeneratedDir() {
// clear previous folder
Expand All @@ -104,6 +109,33 @@ export class LuPublisher {
await this._writeFiles(result.luResult);
}

/**
 * Runs the two samplers, in order, over `luObject.utterances` and writes the
 * result back onto the same object (the argument is mutated and returned).
 *
 * @param luObject Object exposing an `utterances` array.
 * @returns The same `luObject`, with `utterances` replaced by the sampled list.
 */
private _doDownSampling(luObject: any) {
  // Step 1 - bootstrap sampling: rebalance intents according to the
  // configured maxImbalanceRatio.
  luObject.utterances = new ComposerBootstrapSampler(
    luObject.utterances,
    this.downSamplingConfig.maxImbalanceRatio
  ).getSampledUtterances();

  // Step 2 - reservoir sampling: if there are still more utterances than the
  // configured maxUtteranceAllowed, cut the list down to that size.
  luObject.utterances = new ComposerReservoirSampler(
    luObject.utterances,
    this.downSamplingConfig.maxUtteranceAllowed
  ).getSampledUtterances();

  return luObject;
}

/**
 * Down-samples every entry of `luContents`.
 *
 * Each entry's `content` is parsed with LuisBuilder.fromLUAsync, passed
 * through _doDownSampling, converted back with luisToLuContent, and returned
 * as a copy of the entry carrying the new `content`.
 *
 * @param luContents Array-like collection of objects with a `content` field.
 * @returns A promise of the rewritten entries, in the same order.
 */
private async _downSizeUtterances(luContents: any) {
  const downSizeOne = async (luContent: any) => {
    const parsed = await LuisBuilder.fromLUAsync(luContent.content);
    const content = luisToLuContent(this._doDownSampling(parsed));
    return { ...luContent, content };
  };

  // Entries are independent of each other, so process them concurrently.
  const pending = luContents.map(downSizeOne);
  return await Promise.all(pending);
}

private async _writeFiles(crossTrainResult) {
if (!(await this.storage.exists(this.interuptionFolderPath))) {
await this.storage.mkDir(this.interuptionFolderPath);
Expand All @@ -121,6 +153,7 @@ export class LuPublisher {
throw new Error('No luis file exist');
}
const loadResult = await this._loadLuConatents(config.models);
loadResult.luContents = await this._downSizeUtterances(loadResult.luContents);
const buildResult = await this.builder.build(
loadResult.luContents,
loadResult.recognizers,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

import { BootstrapSampler } from '@microsoft/bf-dispatcher/lib/mathematics/sampler/BootstrapSampler';
import { Utility } from '@microsoft/bf-dispatcher/lib/utility/Utility';

const SAMPLE_SIZE_CONFIGURATION = 2;
lei9444 marked this conversation as resolved.
Show resolved Hide resolved

/**
 * One utterance as handled by the samplers.
 */
export interface IUtterance {
// Utterance text; not read by ComposerBootstrapSampler itself.
text: string;
// Intent name; used as the label when the utterance is registered with the sampler.
intent: string;
// Entities attached to the utterance; carried along untouched by the sampler.
entities: any[];
}

/**
 * Bootstrap sampler over a list of utterances, grouped by intent.
 *
 * Every utterance is registered with the base sampler under its intent, using
 * its position in the input array as the instance value. Intents whose size
 * exceeds `maxImbalanceRatio * (size of the smallest intent)` are re-sampled
 * with random draws; all other intents are kept as they are.
 */
export class ComposerBootstrapSampler extends BootstrapSampler<number> {
  private _maxImbalanceRatio: number;
  private _utterances: IUtterance[] = [];

  /**
   * @param utterances Utterances to sample from; the array is kept by reference.
   * @param maxImbalanceRatio Multiplier applied to the smallest intent's size
   * to get the per-intent cap. A falsy value (e.g. 0) disables sampling.
   */
  public constructor(utterances: IUtterance[], maxImbalanceRatio: number) {
    // NOTE(review): argument meanings are defined by bf-dispatcher's
    // BootstrapSampler constructor - confirm against that package.
    super({}, true, SAMPLE_SIZE_CONFIGURATION);
    this._utterances = utterances;
    this._maxImbalanceRatio = maxImbalanceRatio;
    utterances.forEach((e, index) => {
      const { intent } = e;
      this.addInstance(intent, index);
    });
  }

  /**
   * @returns `maxImbalanceRatio` times the number of instances of the
   * smallest label found in `this.labels`.
   */
  public computeMaxBalanceNumber(): number {
    const numberInstancesPerLabelReduce: number = this.labels.reduce(
      (mini: number, key: string) => (this.instances[key].length < mini ? this.instances[key].length : mini),
      Number.MAX_SAFE_INTEGER
    );

    return this._maxImbalanceRatio * numberInstancesPerLabelReduce;
  }

  /**
   * @param label Unused here; the number of draws is the same for every label.
   * @returns How many random draws are made for a label that is over the cap.
   */
  public computeSamplingNumberInstancesPerLabel(label = ''): number {
    return this.computeMaxBalanceNumber() * SAMPLE_SIZE_CONFIGURATION;
  }

  /**
   * @returns The sampled utterances. Sampled indexes are de-duplicated, so an
   * utterance appears at most once; when `maxImbalanceRatio` is falsy the
   * original array is returned untouched.
   */
  public getSampledUtterances() {
    if (this._maxImbalanceRatio) {
      // presumably refreshes this.labels from the registered instances - confirm in bf-dispatcher
      this.resetLabelsAndMap();

      const sampledIndexes = this.sampleInstances();

      // Draws may repeat an index; keep each index only once.
      const set = new Set([...sampledIndexes]);

      return Array.from(set).map(index => this._utterances[index]);
    } else {
      return this._utterances;
    }
  }

  /**
   * Yields utterance indexes label by label: random draws for labels whose
   * size is higher than the cap, every index once for the others.
   */
  public *sampleInstances() {
    // The cap depends only on the labels and the ratio, not on the label being
    // visited, so compute it once instead of once per label.
    const maxBalanceNumber: number = this.computeMaxBalanceNumber();
    for (const key in this.instances) {
      const instanceArray: number[] = this.instances[key];
      const numberInstancesPerLabel: number = instanceArray.length;
      if (numberInstancesPerLabel > maxBalanceNumber) {
        const numberSamplingInstancesPerLabel: number = this.computeSamplingNumberInstancesPerLabel(key);
        for (let i = 0; i < numberSamplingInstancesPerLabel; i++) {
          const indexRandom = Utility.getRandomInt(numberInstancesPerLabel);
          yield instanceArray[indexRandom];
        }
      } else {
        for (let i = 0; i < numberInstancesPerLabel; i++) {
          yield instanceArray[i];
        }
      }
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
import { ReservoirSampler } from '@microsoft/bf-dispatcher/lib/mathematics/sampler/ReservoirSampler';

import { IUtterance } from './BootstrapSampler';

/**
 * Reservoir sampler over a list of utterances.
 *
 * Utterances are registered with the base sampler under their intent, using
 * their position in the input array as the instance value.
 */
export class ComposerReservoirSampler extends ReservoirSampler<number> {
  private _utterances: IUtterance[] = [];
  private _maxUtteranceAllowed: number;

  /**
   * @param utterances Utterances to sample from; the array is kept by reference.
   * @param maxUtteranceAllowed Upper bound on the number of utterances to
   * return. A falsy value (e.g. 0) disables sampling.
   */
  public constructor(utterances: IUtterance[], maxUtteranceAllowed: number) {
    super({});
    this._utterances = utterances;
    this._maxUtteranceAllowed = maxUtteranceAllowed;
    utterances.forEach(({ intent }, position) => {
      this.addInstance(intent, position);
    });
  }

  /**
   * @returns The original array when no limit is set or the limit is not
   * exceeded; otherwise the utterances at the (de-duplicated) indexes
   * produced by the base sampler for the configured limit.
   */
  public getSampledUtterances() {
    const limit = this._maxUtteranceAllowed;

    // Nothing to trim: no limit configured, or the list already fits.
    if (!limit || this._utterances.length <= limit) {
      return this._utterances;
    }

    this.resetLabelsAndMap();

    const uniqueIndexes = new Set([...this.sampleInstances(limit)]);
    return Array.from(uniqueIndexes).map(position => this._utterances[position]);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,22 @@ export class DefaultSettingManager extends FileSettingManager {
endpointkey: '',
hostname: '',
},
downsampling: {
maxImbalanceRatio: 10,
maxUtteranceAllowed: 15000,
},
};
};

/**
 * Loads the settings through the parent manager and makes sure they carry a
 * `downsampling` section.
 *
 * @param slot Forwarded to the parent `get`.
 * @param obfuscate Forwarded to the parent `get`.
 * @returns The loaded settings; when they have no `downsampling` value, the
 * one from createDefaultSettings() is attached to the same object.
 */
public async get(slot = '', obfuscate = false): Promise<any> {
  const settings = await super.get(slot, obfuscate);

  if (settings.downsampling) {
    return settings;
  }

  // Older bots were saved before this section existed: fall back to defaults.
  const { downsampling } = this.createDefaultSettings();
  settings.downsampling = downsampling;
  return settings;
}

private filterOutSensitiveValue = (obj: any) => {
if (obj && typeof obj === 'object') {
return omit(obj, SensitiveProperties);
Expand Down
Loading