Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add down sampling before luis publish #2629

Merged
merged 21 commits into from
Apr 23, 2020
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
0fae30c
add bootstrap sampling before publish
lei9444 Apr 13, 2020
cf4b621
Merge branch 'master' into downsampling
lei9444 Apr 13, 2020
bff8fe5
Merge branch 'master' of https://github.com/lei9444/BotFramework-Comp…
lei9444 Apr 13, 2020
d994eb0
add reservoir sample if the utterances' number > 15000
lei9444 Apr 13, 2020
0ac7000
Merge branch 'downsampling' of https://github.com/lei9444/BotFramewor…
lei9444 Apr 13, 2020
944aafa
Merge branch 'master' of https://github.com/microsoft/BotFramework-Co…
lei9444 Apr 15, 2020
c52c00b
update the sample logic
lei9444 Apr 15, 2020
ddea83e
add unit test for sampler
lei9444 Apr 15, 2020
1e8a4b2
Merge branch 'master' into downsampling
lei9444 Apr 16, 2020
0797ced
Merge branch 'master' into downsampling
cwhitten Apr 16, 2020
9ffae39
Merge branch 'master' into downsampling
boydc2014 Apr 17, 2020
2dfbad1
add downsampling config to bot
lei9444 Apr 17, 2020
c1f2d1b
update the type
lei9444 Apr 17, 2020
248cebe
fix unit test
lei9444 Apr 17, 2020
44d2e56
Merge branch 'master' into downsampling
lei9444 Apr 20, 2020
e134377
don't do sample for the ratio is ok
lei9444 Apr 21, 2020
8a7f1dc
Merge branch 'master' of https://github.com/lei9444/BotFramework-Comp…
lei9444 Apr 21, 2020
d927924
Merge branch 'master' into downsampling
lei9444 Apr 21, 2020
c24845c
Merge branch 'master' of https://github.com/lei9444/BotFramework-Comp…
lei9444 Apr 22, 2020
80e2359
Merge branch 'master' into downsampling
cwhitten Apr 23, 2020
fc5c1bd
Merge branch 'master' into downsampling
cwhitten Apr 23, 2020
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Composer/packages/client/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
"@bfc/ui-plugin-select-skill-dialog": "*",
"@bfc/visual-designer": "*",
"@emotion/core": "^10.0.7",
"@microsoft/bf-lu": "^4.9.0-preview.115707",
"@microsoft/bf-lu": "4.9.0-preview.115707",
"@reach/router": "^1.2.1",
"@uifabric/fluent-theme": "^7.1.13",
"@uifabric/icons": "^7.3.4",
Expand Down
2 changes: 1 addition & 1 deletion Composer/packages/lib/indexers/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
},
"dependencies": {
"@bfc/shared": "*",
"@microsoft/bf-lu": "^4.9.0-preview.115707",
"@microsoft/bf-lu": "4.9.0-preview.115707",
"adaptive-expressions": "^4.8.0-preview-110700",
"botbuilder-lg": "^4.9.0-preview-117748",
"lodash": "^4.17.15"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
import { ComposerBootstrapSampler } from './../../../../src/models/bot/sampler/BootstrapSampler';

// Checks the intent balance produced by ComposerBootstrapSampler on a skewed data set.
describe('BootstrapSampler', () => {
  it('balence the utterances ratio in intents after bootstrap sampling', async () => {
    // A single utterance for intent '0' ...
    const minority = [{ intent: '0', text: '1', entities: [] }];
    // ... followed by thirteen utterances (texts '3' through '15') for intent '1'.
    const majority = Array.from({ length: 13 }, (_, offset) => ({
      intent: '1',
      text: String(offset + 3),
      entities: [],
    }));

    const sampled = new ComposerBootstrapSampler([...minority, ...majority]).getSampledUtterances();

    // Everything that is not intent '1' in the sample belongs to intent '0'.
    const majorityCount = sampled.filter(({ intent }) => intent === '1').length;
    const minorityCount = sampled.length - majorityCount;

    expect(minorityCount / majorityCount).toBeCloseTo(0.1, 2);
  });
});
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

import { ComposerReservoirSampler } from './../../../../src/models/bot/sampler/ReservoirSampler';

// Suite for ComposerReservoirSampler. It was previously labelled 'BootstrapSampler'
// (copied from the sibling test file), which made the two suites indistinguishable
// in test reports.
describe('ReservoirSampler', () => {
  // The callback is synchronous: nothing in the body awaits.
  it('down size the number of utterances reservoir sampling', () => {
    // Fifteen utterances in total: one for intent '0', fourteen for intent '1'.
    const utterances = [
      { intent: '0', text: '1', entities: [] },
      { intent: '1', text: '2', entities: [] },
      { intent: '1', text: '3', entities: [] },
      { intent: '1', text: '4', entities: [] },
      { intent: '1', text: '5', entities: [] },
      { intent: '1', text: '6', entities: [] },
      { intent: '1', text: '7', entities: [] },
      { intent: '1', text: '8', entities: [] },
      { intent: '1', text: '9', entities: [] },
      { intent: '1', text: '10', entities: [] },
      { intent: '1', text: '11', entities: [] },
      { intent: '1', text: '12', entities: [] },
      { intent: '1', text: '13', entities: [] },
      { intent: '1', text: '14', entities: [] },
      { intent: '1', text: '15', entities: [] },
    ];
    const sampler = new ComposerReservoirSampler(utterances);

    // A sample size below the input length shrinks the result to exactly that size.
    sampler.sampleSize = 10;
    expect(sampler.getSampledUtterances().length).toBe(10);
    sampler.sampleSize = 11;
    expect(sampler.getSampledUtterances().length).toBe(11);
    sampler.sampleSize = 12;
    expect(sampler.getSampledUtterances().length).toBe(12);

    // A sample size above the input length leaves all 15 utterances in place.
    sampler.sampleSize = 16;
    expect(sampler.getSampledUtterances().length).toBe(15);
  });
});
6 changes: 4 additions & 2 deletions Composer/packages/server/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,13 @@
},
"dependencies": {
"@azure/ms-rest-js": "^1.8.7",
"@bfc/indexers": "*",
"@bfc/client": "*",
"@bfc/indexers": "*",
"@bfc/lg-languageserver": "*",
"@bfc/lu-languageserver": "*",
"@bfc/shared": "*",
"@microsoft/bf-lu": "^4.9.0-preview.115707",
"@microsoft/bf-dispatcher": "https://botbuilder.myget.org/F/botframework-cli/npm/@microsoft/bf-dispatcher/-/@microsoft/bf-dispatcher-4.9.0-preview.115707.tgz",
"@microsoft/bf-lu": "4.9.0-preview.115707",
"archiver": "^3.0.0",
"axios": "^0.18.0",
"azure-storage": "^2.10.3",
Expand All @@ -86,6 +87,7 @@
"morgan": "^1.9.1",
"passport": "^0.4.1",
"path-to-regexp": "^6.1.0",
"ts-md5": "^1.2.7",
"vscode-languageserver": "^5.3.0-next",
"vscode-ws-jsonrpc": "^0.1.1",
"ws": "^5.0.0"
Expand Down
37 changes: 33 additions & 4 deletions Composer/packages/server/src/models/bot/luPublisher.ts
Original file line number Diff line number Diff line change
@@ -1,18 +1,25 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

import isEqual from 'lodash/isEqual';
import { FileInfo } from '@bfc/shared';
import isEqual from 'lodash/isEqual';

import { Path } from '../../utility/path';
import { IFileStorage } from '../storage/interface';
import log from '../../logger';

import { Path } from './../../utility/path';
import { IFileStorage } from './../storage/interface';
import { ComposerReservoirSampler } from './sampler/ReservoirSampler';
import { ComposerBootstrapSampler } from './sampler/BootstrapSampler';
import { ILuisConfig } from './interface';
import log from './../../logger';

// eslint-disable-next-line @typescript-eslint/no-var-requires
const crossTrainer = require('@microsoft/bf-lu/lib/parser/cross-train/crossTrainer.js');
// eslint-disable-next-line @typescript-eslint/no-var-requires
const luBuild = require('@microsoft/bf-lu/lib/parser/lubuild/builder.js');
// eslint-disable-next-line @typescript-eslint/no-var-requires
const LuisBuilder = require('@microsoft/bf-lu/lib/parser/luis/luisBuilder');
// eslint-disable-next-line @typescript-eslint/no-var-requires
const luisToLuContent = require('@microsoft/bf-lu/lib/parser/luis/luConverter');

// Folder-name constants for this module.
// NOTE(review): their use sites are outside this view — presumably path segments for
// the generated / cross-trained LU output folders; confirm against the class body.
// The spelling 'interuption' is a runtime value (it ends up in paths), so it must not
// be "corrected" without migrating existing folders.
const GENERATEDFOLDER = 'generated';
const INTERUPTION = 'interuption';
Expand Down Expand Up @@ -104,6 +111,27 @@ export class LuPublisher {
await this._writeFiles(result.luResult);
}

/**
 * Down-samples the utterances of a parsed LU object in two passes.
 *
 * The object is modified in place (its `utterances` property is reassigned after
 * each pass) and the same object is returned.
 *
 * @param luObject parsed LU object exposing an `utterances` array
 * @returns the same `luObject`, now holding the sampled utterances
 */
private _doDownSampling(luObject: any) {
  // Pass 1: bootstrap sampling, intended to bring the utterance-count ratio
  // between intents to roughly 1:10.
  const balanced = new ComposerBootstrapSampler(luObject.utterances).getSampledUtterances();
  luObject.utterances = balanced;

  // Pass 2: reservoir sampling, which only shrinks the list when it is larger than
  // the reservoir sampler's configured sample size (15000 by default).
  const capped = new ComposerReservoirSampler(luObject.utterances).getSampledUtterances();
  luObject.utterances = capped;

  return luObject;
}

/**
 * Applies down-sampling to every LU content entry.
 *
 * Each entry's `content` is parsed with `LuisBuilder.fromLUAsync`, down-sampled via
 * `_doDownSampling`, and converted back with `luisToLuContent`. All entries are
 * processed concurrently; the result rejects if any single entry fails.
 *
 * @param luContents list of entries, each carrying a `content` property
 * @returns copies of the entries with `content` replaced by the down-sampled content
 */
private async _downSizeUtterances(luContents: any) {
  const downSizeOne = async luContent => {
    const parsed = await LuisBuilder.fromLUAsync(luContent.content);
    const content = luisToLuContent(this._doDownSampling(parsed));
    // Keep every other field of the entry untouched.
    return { ...luContent, content };
  };

  return await Promise.all(luContents.map(entry => downSizeOne(entry)));
}

private async _writeFiles(crossTrainResult) {
if (!(await this.storage.exists(this.interuptionFolderPath))) {
await this.storage.mkDir(this.interuptionFolderPath);
Expand All @@ -121,6 +149,7 @@ export class LuPublisher {
throw new Error('No luis file exist');
}
const loadResult = await this._loadLuConatents(config.models);
loadResult.luContents = await this._downSizeUtterances(loadResult.luContents);
const buildResult = await this.builder.build(
loadResult.luContents,
loadResult.recognizers,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

import { BootstrapSampler } from '@microsoft/bf-dispatcher/lib/mathematics/sampler/BootstrapSampler';

// Sample-size multiplier: passed as the third constructor argument to the base
// BootstrapSampler and also applied as a factor when computing the number of draws
// per label in ComposerBootstrapSampler.
const SAMPLE_SIZE_CONFIGURATION = 2;
lei9444 marked this conversation as resolved.
Show resolved Hide resolved

/**
 * A single utterance as consumed by the samplers in this folder.
 */
export interface IUtterance {
  // The utterance text.
  text: string;
  // Intent name; the samplers use it as the label when registering the utterance.
  intent: string;
  // Entity annotations; the element shape is not constrained here (any[]).
  entities: any[];
}

/**
 * Bootstrap sampler over a list of utterances, grouped by intent.
 *
 * Every utterance is registered with the base sampler under its intent, using the
 * utterance's position in the input array as the instance value. Sampling therefore
 * yields array indexes, which are mapped back to utterances at the end.
 */
export class ComposerBootstrapSampler extends BootstrapSampler<number> {
  // Multiplier applied to the size of the smallest intent when computing the number
  // of draws per label (see computeSamplingNumberInstancesPerLabel).
  private _times = 10;
  private _utterances: IUtterance[] = [];

  /**
   * @param utterances utterances to sample from; the array is kept by reference
   */
  public constructor(utterances: IUtterance[]) {
    super({}, true, SAMPLE_SIZE_CONFIGURATION);
    this._utterances = utterances;
    utterances.forEach((e, index) => {
      const { intent } = e;
      this.addInstance(intent, index);
    });
  }

  /** Overrides the multiplier applied to the smallest intent's size (default 10). */
  public set times(v: number) {
    this._times = v;
  }

  /**
   * Number of draws to take for a label: `times * (size of the smallest label) *
   * SAMPLE_SIZE_CONFIGURATION`. The result is the same for every label.
   *
   * @param label unused; kept so the signature stays compatible with the base class
   * @returns the number of draws, or 0 when no labels are registered
   */
  public computeSamplingNumberInstancesPerLabel(label = ''): number {
    // Without labels the reduce below would return its seed, and the method would
    // report a meaningless count beyond Number.MAX_SAFE_INTEGER.
    if (this.labels.length === 0) {
      return 0;
    }

    const smallestLabelSize: number = this.labels.reduce(
      (mini: number, key: string) => (this.instances[key].length < mini ? this.instances[key].length : mini),
      Number.MAX_SAFE_INTEGER
    );

    return this._times * smallestLabelSize * SAMPLE_SIZE_CONFIGURATION;
  }

  /**
   * Runs the bootstrap sampling and returns the distinct utterances that were drawn.
   *
   * @returns the sampled utterances, each appearing at most once, in first-drawn order
   */
  public getSampledUtterances() {
    this.resetLabelsAndMap();

    // Draws can repeat an index; the Set keeps each index once and consumes the
    // sampled indexes directly, without an intermediate array copy.
    const uniqueIndexes = new Set(this.sampleInstances());

    return Array.from(uniqueIndexes).map(index => this._utterances[index]);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
import { ReservoirSampler } from '@microsoft/bf-dispatcher/lib/mathematics/sampler/ReservoirSampler';

import { IUtterance } from './BootstrapSampler';

// Default sample size for ComposerReservoirSampler. Despite the name, the class uses
// it as an upper bound: sampling only happens when there are more utterances than
// this value, and the sample is then drawn with this size.
const MIN_SAMPLE_SIZE = 15000;
lei9444 marked this conversation as resolved.
Show resolved Hide resolved

/**
 * Reservoir sampler over a list of utterances.
 *
 * Utterances are registered with the base sampler under their intent, using their
 * position in the input array as the instance value, so sampling yields array indexes.
 */
export class ComposerReservoirSampler extends ReservoirSampler<number> {
  private _utterances: IUtterance[] = [];
  // Threshold above which sampling kicks in; also the size of the drawn sample.
  private _sampleSize = MIN_SAMPLE_SIZE;

  /**
   * @param utterances utterances to sample from; the array is kept by reference
   */
  public constructor(utterances: IUtterance[]) {
    super({});
    this._utterances = utterances;
    utterances.forEach(({ intent }, position) => {
      this.addInstance(intent, position);
    });
  }

  /** Overrides the sample size (defaults to MIN_SAMPLE_SIZE). */
  public set sampleSize(v: number) {
    this._sampleSize = v;
  }

  /**
   * Returns the utterances, down-sized to the configured sample size when there are
   * more of them than that size.
   *
   * @returns the original array itself when no sampling is needed, otherwise a new
   *          array holding the sampled utterances
   */
  public getSampledUtterances() {
    this.resetLabelsAndMap();

    const limit = this._sampleSize;
    // Nothing to shrink: hand back the stored array as-is.
    if (!(this._utterances.length > limit)) {
      return this._utterances;
    }

    const pickedIndexes = new Set([...this.sampleInstances(limit)]);
    return Array.from(pickedIndexes).map(position => this._utterances[position]);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
},
"dependencies": {
"@microsoft/bf-cli-command": "https://botbuilder.myget.org/F/botbuilder-declarative/npm/@microsoft/bf-cli-command/-/@microsoft/bf-cli-command-1.0.1.tgz",
"@microsoft/bf-lu": "^4.9.0-preview.115707",
"@microsoft/bf-lu": "4.9.0-preview.115707",
"@types/node": "^12.0.4",
"express": "^4.15.2",
"monaco-languageclient": "^0.10.0",
Expand Down
19 changes: 17 additions & 2 deletions Composer/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2944,6 +2944,16 @@
fs-extra "^7.0.1"
tslib "~1.10.0"

"@microsoft/bf-dispatcher@https://botbuilder.myget.org/F/botframework-cli/npm/@microsoft/bf-dispatcher/-/@microsoft/bf-dispatcher-4.9.0-preview.115707.tgz":
version "4.9.0-preview.115707"
resolved "https://botbuilder.myget.org/F/botframework-cli/npm/@microsoft/bf-dispatcher/-/@microsoft/bf-dispatcher-4.9.0-preview.115707.tgz#bd6563aeb10e2197a61f2725471c348ef4e7899e"
dependencies:
"@microsoft/bf-lu" "4.9.0-preview.115707"
"@oclif/command" "~1.5.19"
"@oclif/config" "~1.13.3"
argparse "~1.0.10"
tslib "^1.10.0"

"@microsoft/bf-lu@4.8.0", "@microsoft/bf-lu@^4.8.0":
version "4.8.0"
resolved "https://registry.yarnpkg.com/@microsoft/bf-lu/-/bf-lu-4.8.0.tgz#83a260f286836a4b06671a2572009c72bf1283ef"
Expand All @@ -2966,7 +2976,7 @@
semver "^5.5.1"
tslib "^1.10.0"

"@microsoft/bf-lu@^4.9.0-preview.115707":
"@microsoft/bf-lu@4.9.0-preview.115707":
version "4.9.0-preview.115707"
resolved "https://botbuilder.myget.org/F/botframework-cli/npm/@microsoft/bf-lu/-/@microsoft/bf-lu-4.9.0-preview.115707.tgz#6c5546217b8884c10f85c3e91248e0466ea57f2d"
integrity sha1-bFVGIXuIhMEPhcPpEkjgRm6lfy0=
Expand Down Expand Up @@ -4654,7 +4664,7 @@ arg@^4.1.0:
resolved "https://registry.yarnpkg.com/arg/-/arg-4.1.1.tgz#485f8e7c390ce4c5f78257dbea80d4be11feda4c"
integrity sha512-SlmP3fEA88MBv0PypnXZ8ZfJhwmDeIE3SP71j37AiXQBXYosPV0x6uISAaHYSlSVhmHOVkomen0tbGk6Anlebw==

argparse@^1.0.7:
argparse@^1.0.7, argparse@~1.0.10:
version "1.0.10"
resolved "https://registry.yarnpkg.com/argparse/-/argparse-1.0.10.tgz#bcd6791ea5ae09725e17e5ad988134cd40b3d911"
integrity sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==
Expand Down Expand Up @@ -18114,6 +18124,11 @@ ts-loader@^6.1.0, ts-loader@^6.2.1:
micromatch "^4.0.0"
semver "^6.0.0"

ts-md5@^1.2.7:
version "1.2.7"
resolved "https://registry.yarnpkg.com/ts-md5/-/ts-md5-1.2.7.tgz#b76471fc2fd38f0502441f6c3b9494ed04537401"
integrity sha512-emODogvKGWi1KO1l9c6YxLMBn6CEH3VrH5mVPIyOtxBG52BvV4jP3GWz6bOZCz61nLgBc3ffQYE4+EHfCD+V7w==

ts-node@^8.3.0:
version "8.5.4"
resolved "https://registry.yarnpkg.com/ts-node/-/ts-node-8.5.4.tgz#a152add11fa19c221d0b48962c210cf467262ab2"
Expand Down