From eff4d719c7408c23250f6667fb07e02c42fcfe2a Mon Sep 17 00:00:00 2001 From: ObservedObserver <270001151@qq.com> Date: Wed, 4 Mar 2020 19:10:35 +0800 Subject: [PATCH 01/10] feat: init a pipeline structure --- .../visual-insights/src/insights/impurity.ts | 47 ++++++++++ .../visual-insights/src/insights/subspaces.ts | 25 +++++- .../visual-insights/src/pipeline/index.ts | 42 +++++++++ packages/visual-insights/src/pipeline/node.ts | 89 +++++++++++++++++++ packages/visual-insights/src/pipeline/test.ts | 37 ++++++++ 5 files changed, 239 insertions(+), 1 deletion(-) create mode 100644 packages/visual-insights/src/pipeline/index.ts create mode 100644 packages/visual-insights/src/pipeline/node.ts create mode 100644 packages/visual-insights/src/pipeline/test.ts diff --git a/packages/visual-insights/src/insights/impurity.ts b/packages/visual-insights/src/insights/impurity.ts index 8d657dbb..9c3f04e8 100644 --- a/packages/visual-insights/src/insights/impurity.ts +++ b/packages/visual-insights/src/insights/impurity.ts @@ -79,3 +79,50 @@ export function insightExtraction(dataSource: DataSource, dimensions: string[], } return impurityList } +interface InsightSpace { + dimensions: string[]; + type: 'entropy' | 'trend' | 'outlier'; + order: 'desc' | 'asc'; + score: { + [meaName: string]: number; + }; + correlationMatrix: number[][]; +} +export function multiInsightExtraction(dataSource: DataSource, dimensions: string[], measures: string[]): InsightSpace[] { + let impurityList: FieldsFeature[] = []; + let dimSet = subspaceSearching(dataSource, dimensions, true); + let correlationMatrix = measures.map(i => measures.map(j => 0)); + for (let i = 0; i < measures.length; i++) { + correlationMatrix[i][i] = 1; + for (let j = i + 1; j < measures.length; j++) { + let r = pearsonCC(dataSource, measures[i], measures[j]); + correlationMatrix[j][i] = correlationMatrix[i][j] = r; + } + } + + for (let dset of dimSet) { + let impurity = {}; + let trend = {}; + let outlier = {}; + let aggData = 
aggregate({ + dataSource, + dimensions: dset, + measures, + asFields: measures, + operator: operator || 'sum'//: operator as + }); + // let fList = aggData.map(r => r) + for (let mea of measures) { + // fl = frequency list, pL = probability list + let fL = aggData.map(r => r[mea]); + let pL = normalize(linearMapPositive(fL)); + let value = entropy(pL); + impurity[mea] = value; + } + for (let mea of measures) { + + } + impurityList.push([dset, impurity, correlationMatrix]); + } + return impurityList +} \ No newline at end of file diff --git a/packages/visual-insights/src/insights/subspaces.ts b/packages/visual-insights/src/insights/subspaces.ts index f9152b4c..171cec49 100644 --- a/packages/visual-insights/src/insights/subspaces.ts +++ b/packages/visual-insights/src/insights/subspaces.ts @@ -3,7 +3,7 @@ import aggregate from 'cube-core'; import { entropy, normalize } from '../statistics/index'; import { DataSource, OperatorType } from '../commonTypes'; import { crammersV, getCombination, pearsonCC, linearMapPositive } from '../statistics/index'; -import { CrammersVThreshold } from './config'; +import { CrammersVThreshold, PearsonCorrelation } from './config'; import { Cluster } from '../ml/index'; import { CHANNEL } from '../constant'; // insights like outlier and trend both request high impurity of dimension. 
@@ -19,6 +19,18 @@ function getDimCorrelationMatrix(dataSource: DataSource, dimensions: string[]): return matrix; } +function getMeaCorrelationMatrix(dataSource: DataSource, measures: string[]): number[][] { + let matrix = measures.map(i => measures.map(j => 0)); + for (let i = 0; i < measures.length; i++) { + matrix[i][i] = 1; + for (let j = i + 1; j < measures.length; j++) { + let r = pearsonCC(dataSource, measures[i], measures[j]); + matrix[j][i] = matrix[i][j] = r; + } + } + return matrix; +} + export function getDimSetsBasedOnClusterGroups(dataSource: DataSource, dimensions: string[]): string[][] { const maxDimNumberInView = 4; let dimSets: string[][] = []; @@ -38,6 +50,17 @@ export function getDimSetsBasedOnClusterGroups(dataSource: DataSource, dimension return dimSets; } +export function getMeaSetsBasedOnClusterGroups(dataSource: DataSource, measures: string[], maxFieldNumberInView: number = 3): string[][] { + let correlationMatrix: number[][] = getMeaCorrelationMatrix(dataSource, measures); + let groups: string[][] = Cluster.kruskal({ + matrix: correlationMatrix, + measures: measures, + groupMaxSize: Math.round(measures.length / maxFieldNumberInView), + threshold: PearsonCorrelation.strong + }); + return groups; +} + export function subspaceSearching(dataSource: DataSource, dimensions: string[], shouldDimensionsCorrelated: boolean | undefined = true): string[][] { if (shouldDimensionsCorrelated) { return getDimSetsBasedOnClusterGroups(dataSource, dimensions); diff --git a/packages/visual-insights/src/pipeline/index.ts b/packages/visual-insights/src/pipeline/index.ts new file mode 100644 index 00000000..8ba74d74 --- /dev/null +++ b/packages/visual-insights/src/pipeline/index.ts @@ -0,0 +1,42 @@ +import { PipeLineNode, PipeLineNodeInterface } from './node'; +interface PipeLineInterface { + nodes: PipeLineNodeInterface[] +} +export class PipeLine { + nodes: PipeLineNode[]; + runningIndex: number; + constructor (props: PipeLineInterface) { + this.nodes = []; + 
for (let i = 0; i < props.nodes.length; i++) { + let node = props.nodes[i]; + let pipeNode = new PipeLineNode(node); + pipeNode.openChannel({ + type: 'consumer', + operator: (state) => { + console.log('consumer', state.returns) + return state.returns; + } + }) + if (i > 0) { + pipeNode.openChannel({ + type: 'producer', + operator: (state, injection, params) => { + let lastNode = this.nodes[i - 1]; + // let target = lastNode.channels.find(c => c.type === 'consumer'); + injection(draft => { + console.log('last', lastNode.state.returns) + draft.source = lastNode.state.returns; + }) + } + }) + } + + this.nodes.push(pipeNode); + } + } + public run (startIndex: number = 0): void { + for (let i = startIndex; i < this.nodes.length; i++) { + this.nodes[i].run(); + } + } +} \ No newline at end of file diff --git a/packages/visual-insights/src/pipeline/node.ts b/packages/visual-insights/src/pipeline/node.ts new file mode 100644 index 00000000..15fd510a --- /dev/null +++ b/packages/visual-insights/src/pipeline/node.ts @@ -0,0 +1,89 @@ +import produce from 'immer'; + +type NucleiFunction = (params: P) => R; + +interface StateBase

{ + startTime: number; + endTime: number; + params: P; + returns: R | null +} + +type Injection

= (updater: (cytoplasmParams: P) => void) => void; + +type ChannelOperator, Signal> = (cytoplasmState: S, inject?: Injection, params?: any) => Signal; + +interface Channel, Signal = any> { + type: 'consumer' | 'producer'; + operator: ChannelOperator +} +export interface PipeLineNodeInterface

{ + nuclei: NucleiFunction; + initParams: P +} +export class PipeLineNode

{ + /** + * kernal function responsable for specific task of current node in pipeline. + */ + public nuclei: NucleiFunction; + /** + * which is the cytoplasm, contains node status, outside control params and returns of nuclei function + */ + public state: StateBase; + /** + * service plugins / cytoplasm consumers + */ + public channels: Channel>[]; + constructor (props: PipeLineNodeInterface) { + this.nuclei = props.nuclei; + this.state = { + startTime: 0, + endTime: 0, + params: props.initParams, + returns: null + } + this.channels = []; + } + /** + * injection is used by channel to update the state.params in "cytoplasm". + */ + public injection: Injection

= (updater) => { + let nextParams = produce(this.state.params, updater); + this.state.params = nextParams; + } + /** + * register a channel (or service plugin, or cytoplasm consumer) + * @param channel + */ + public openChannel (channel: Channel>) { + this.channels.push(channel); + } + // public absorbProps(params: P) { + // this.state.params = params; + // } + + private beforeRun() { + this.state.startTime = new Date().getTime(); + for (let channel of this.channels) { + if (channel.type === 'producer') { + channel.operator(this.state, this.injection); + } + } + } + private afterRun(returns: R) { + this.state.endTime = new Date().getTime(); + this.state.returns = returns; + for (let channel of this.channels) { + // todo: consumer type should not get injection ? + if (channel.type === 'consumer') { + channel.operator(this.state); + } + } + } + public run (): R { + this.beforeRun(); + let returns: R = this.nuclei(this.state.params); + this.afterRun(returns); + return returns; + } +} \ No newline at end of file diff --git a/packages/visual-insights/src/pipeline/test.ts b/packages/visual-insights/src/pipeline/test.ts new file mode 100644 index 00000000..986b6de3 --- /dev/null +++ b/packages/visual-insights/src/pipeline/test.ts @@ -0,0 +1,37 @@ +import { PipeLine } from './index'; + +// array: x => x + 1 +// array: x => x / index +// array => sum(array) + +let pipe = new PipeLine({ + nodes: [ + { + initParams: { + source: [10, 20, 30] + }, + nuclei: (params) => params.source.map(n => n + 1) + }, + { + initParams: { + source: [] + }, + nuclei: (params) => params.source.map((n, i) => n / (i + 1)) + }, + { + initParams: { + source: [] + }, + nuclei: (params) => { + let sum = 0; + params.source.forEach(n => { + sum += n; + }) + console.log(sum) + return sum; + } + } + ] +}); + +pipe.run(); From a94b4489c05d46aacfc0606857d3fdcf0e301fb6 Mon Sep 17 00:00:00 2001 From: ObservedObserver <270001151@qq.com> Date: Wed, 4 Mar 2020 23:02:25 +0800 Subject: [PATCH 02/10] feat: a work 
demo --- .../visual-insights/src/pipeline/index.ts | 74 +++++++++++++------ packages/visual-insights/src/pipeline/node.ts | 72 +++++++++--------- packages/visual-insights/src/pipeline/test.ts | 59 +++++++++++++-- 3 files changed, 142 insertions(+), 63 deletions(-) diff --git a/packages/visual-insights/src/pipeline/index.ts b/packages/visual-insights/src/pipeline/index.ts index 8ba74d74..75114897 100644 --- a/packages/visual-insights/src/pipeline/index.ts +++ b/packages/visual-insights/src/pipeline/index.ts @@ -1,38 +1,66 @@ +import produce from 'immer'; import { PipeLineNode, PipeLineNodeInterface } from './node'; + interface PipeLineInterface { nodes: PipeLineNodeInterface[] } -export class PipeLine { - nodes: PipeLineNode[]; + +export class PipeLine { + nodes: PipeLineNode[]; runningIndex: number; - constructor (props: PipeLineInterface) { + constructor (props: { nodes: NODES }) { + // this.nodes = [] as NODES; this.nodes = []; - for (let i = 0; i < props.nodes.length; i++) { - let node = props.nodes[i]; - let pipeNode = new PipeLineNode(node); - pipeNode.openChannel({ - type: 'consumer', - operator: (state) => { - console.log('consumer', state.returns) - return state.returns; - } - }) - if (i > 0) { - pipeNode.openChannel({ - type: 'producer', - operator: (state, injection, params) => { - let lastNode = this.nodes[i - 1]; - // let target = lastNode.channels.find(c => c.type === 'consumer'); + props.nodes.forEach((node, index) => { + if (index > 0) { + let newNode = produce(node, draft => { + if (typeof draft.channels === 'undefined') { + draft.channels = { + injection: {}, + release: {} + } + } + draft.channels.injection['source'] = (injection) => { injection(draft => { - console.log('last', lastNode.state.returns) - draft.source = lastNode.state.returns; + draft.source = this.nodes[index - 1].state.returns; }) } + // this.nodes[index - 1].state.returns; }) + let pipeNode = new PipeLineNode(newNode); + this.nodes.push(pipeNode); + } else { + let pipeNode = new 
PipeLineNode(node); + this.nodes.push(pipeNode) } + + }) + // for (let i = 0; i < props.nodes.length; i++) { + // let node = props.nodes[i]; + // let pipeNode = new PipeLineNode(node); + // pipeNode.openChannel({ + // type: 'consumer', + // operator: (state) => { + // console.log('consumer', state.returns) + // return state.returns; + // } + // }) + // if (i > 0) { + // pipeNode.openChannel({ + // type: 'producer', + // operator: (state, injection, params) => { + // let lastNode = this.nodes[i - 1]; + // // let target = lastNode.channels.find(c => c.type === 'consumer'); + // injection(draft => { + // console.log('last', lastNode.state.returns) + // draft.source = lastNode.state.returns; + // }) + // } + // }) + // } - this.nodes.push(pipeNode); - } + // this.nodes.push(pipeNode); + // } } public run (startIndex: number = 0): void { for (let i = startIndex; i < this.nodes.length; i++) { diff --git a/packages/visual-insights/src/pipeline/node.ts b/packages/visual-insights/src/pipeline/node.ts index 15fd510a..13a58319 100644 --- a/packages/visual-insights/src/pipeline/node.ts +++ b/packages/visual-insights/src/pipeline/node.ts @@ -2,7 +2,7 @@ import produce from 'immer'; type NucleiFunction = (params: P) => R; -interface StateBase

{ +export interface StateBase

{ startTime: number; endTime: number; params: P; @@ -11,17 +11,26 @@ interface StateBase

{ type Injection

= (updater: (cytoplasmParams: P) => void) => void; -type ChannelOperator, Signal> = (cytoplasmState: S, inject?: Injection, params?: any) => Signal; +export type ReleaseChannel, Signal = any> = (cytoplasmState: S, inject?: Injection) => Signal; +export type InjectChannel, T> = (inject: Injection, params?: T) => any; -interface Channel, Signal = any> { - type: 'consumer' | 'producer'; - operator: ChannelOperator -} -export interface PipeLineNodeInterface

{ +interface BaseChannel

{ + injection: { + source?: InjectChannel, any>, + [key: string]: InjectChannel, any> + }; + release: { + [key: string]: ReleaseChannel, any> + } +}; + +export interface PipeLineNodeInterface

= BaseChannel> { nuclei: NucleiFunction; - initParams: P -} -export class PipeLineNode

{ + initParams: P; + channels?: C +}; + +export class PipeLineNode

= BaseChannel> { /** * kernal function responsable for specific task of current node in pipeline. */ @@ -33,16 +42,25 @@ export class PipeLineNode

{ /** * service plugins / cytoplasm consumers */ - public channels: Channel>[]; - constructor (props: PipeLineNodeInterface) { - this.nuclei = props.nuclei; + public channels: C; + + constructor (props: PipeLineNodeInterface) { + const { nuclei, initParams, channels } = props; + this.nuclei = nuclei; this.state = { startTime: 0, endTime: 0, - params: props.initParams, + params: initParams, returns: null } - this.channels = []; + if (channels) { + this.channels = channels + } else { + this.channels = { + injection: {}, + release: {} + } as C; + } } /** * injection is used by channel to update the state.params in "cytoplasm". @@ -51,33 +69,19 @@ export class PipeLineNode

{ let nextParams = produce(this.state.params, updater); this.state.params = nextParams; } - /** - * register a channel (or service plugin, or cytoplasm consumer) - * @param channel - */ - public openChannel (channel: Channel>) { - this.channels.push(channel); - } - // public absorbProps(params: P) { - // this.state.params = params; - // } private beforeRun() { - this.state.startTime = new Date().getTime(); - for (let channel of this.channels) { - if (channel.type === 'producer') { - channel.operator(this.state, this.injection); - } + if (this.channels.injection.source) { + this.channels.injection.source(this.injection) } + this.state.startTime = new Date().getTime(); } private afterRun(returns: R) { this.state.endTime = new Date().getTime(); this.state.returns = returns; - for (let channel of this.channels) { + for (let channelName in this.channels.release) { // todo: consumer type should not get injection ? - if (channel.type === 'consumer') { - channel.operator(this.state); - } + this.channels.release[channelName](this.state, this.injection) } } public run (): R { diff --git a/packages/visual-insights/src/pipeline/test.ts b/packages/visual-insights/src/pipeline/test.ts index 986b6de3..fd21d01c 100644 --- a/packages/visual-insights/src/pipeline/test.ts +++ b/packages/visual-insights/src/pipeline/test.ts @@ -1,22 +1,58 @@ import { PipeLine } from './index'; +import { PipeLineNode, PipeLineNodeInterface, InjectChannel, ReleaseChannel, StateBase } from './node'; // array: x => x + 1 // array: x => x / index // array => sum(array) - -let pipe = new PipeLine({ +type Pip = [ + PipeLineNodeInterface<{ source: any[] }, number[]>, + PipeLineNodeInterface<{ source: any[], threshold: number }, number[], { + injection: { + hack: InjectChannel, number> + }; + release: { + console: ReleaseChannel, any> + } + }>, + PipeLineNodeInterface<{ source: any[] }, number> +]; +let pipe = new PipeLine({ nodes: [ { initParams: { - source: [10, 20, 30] + source: [-10, -20, -30, 10, 20, 30] 
}, - nuclei: (params) => params.source.map(n => n + 1) + nuclei: (params) => params.source.map(n => n + 1), + channels: { + injection: {}, + release: { + console (state) { + console.log(state) + } + } + } }, { initParams: { - source: [] + source: [], + threshold: -Infinity }, - nuclei: (params) => params.source.map((n, i) => n / (i + 1)) + nuclei: (params) => params.source.filter(n => n > params.threshold), + channels: { + injection: { + hack (injection, num) { + // console.log('hack is running', nums) + injection(draft => { + draft.threshold = num; + }) + } + }, + release: { + console (state) { + console.log(state) + } + } + } }, { initParams: { @@ -29,9 +65,20 @@ let pipe = new PipeLine({ }) console.log(sum) return sum; + }, + channels: { + injection: {}, + release: { + console (state) { + console.log(state) + } + } } } ] }); pipe.run(); +console.log('================') +pipe.nodes[1].channels.injection.hack(pipe.nodes[1].injection, 2) +pipe.run(1) From f0dd5cbf20b165139cf67497c9cccef592594112 Mon Sep 17 00:00:00 2001 From: ObservedObserver <270001151@qq.com> Date: Thu, 5 Mar 2020 12:29:07 +0800 Subject: [PATCH 03/10] feat: wrap injection params --- packages/visual-insights/src/pipeline/node.ts | 12 ++++++++++++ packages/visual-insights/src/pipeline/test.ts | 3 ++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/packages/visual-insights/src/pipeline/node.ts b/packages/visual-insights/src/pipeline/node.ts index 13a58319..cc4974d0 100644 --- a/packages/visual-insights/src/pipeline/node.ts +++ b/packages/visual-insights/src/pipeline/node.ts @@ -90,4 +90,16 @@ export class PipeLineNode

({ pipe.run(); console.log('================') -pipe.nodes[1].channels.injection.hack(pipe.nodes[1].injection, 2) +// pipe.nodes[1].channels.injection.hack(pipe.nodes[1].injection, 2) +pipe.nodes[1].openChannel('injection.hack', 2) pipe.run(1) From c2eab9c49688f0bb430a88f6b789afe85601c415 Mon Sep 17 00:00:00 2001 From: ObservedObserver <270001151@qq.com> Date: Thu, 5 Mar 2020 19:06:57 +0800 Subject: [PATCH 04/10] feat: new intention insight demo --- packages/visual-insights/src/insights/dev.ts | 171 ++++++++++++++++++ .../visual-insights/src/insights/impurity.ts | 90 ++++----- .../visual-insights/src/insights/subspaces.ts | 10 +- .../src/ml/outlier/isolationForest.ts | 4 +- .../visual-insights/src/pipeline/visTest.ts | 85 +++++++++ packages/visual-insights/test/insights.js | 11 +- .../visual-insights/test/linearRegression.js | 2 +- 7 files changed, 319 insertions(+), 54 deletions(-) create mode 100644 packages/visual-insights/src/insights/dev.ts create mode 100644 packages/visual-insights/src/pipeline/visTest.ts diff --git a/packages/visual-insights/src/insights/dev.ts b/packages/visual-insights/src/insights/dev.ts new file mode 100644 index 00000000..507a67fc --- /dev/null +++ b/packages/visual-insights/src/insights/dev.ts @@ -0,0 +1,171 @@ +import { DataSource, View } from "../commonTypes"; +import { getDimSetsBasedOnClusterGroups, getMeaSetsBasedOnClusterGroups, getDimClusterGroups } from './subspaces'; +import { CrammersVThreshold } from './config'; +import { Cluster, Outier } from '../ml/index'; +import { crammersV, getCombination, pearsonCC, linearMapPositive } from '../statistics/index'; +import { CHANNEL } from '../constant'; +import { entropy, normalize } from '../statistics/index'; +import aggregate, { createCube } from 'cube-core'; +import { momentCube } from "cube-core/built/core"; +import { isFieldContinous, isFieldTime } from '../utils/common'; +import { oneDLinearRegression } from '../statistics/index' +const SPLITER = '=;='; +interface ViewSpace { + 
dimensions: string[]; + measures: string[]; +} +interface InsightSpace { + dimensions: string[]; + measures: string[]; + type: 'general' | 'trend' | 'outlier'; + order: 'desc' | 'asc'; + score: number; + significance: number; +} +function crossGroups(dimensionGroups: string[][], measureGroups: string[][]): ViewSpace[] { + let viewSpaces: ViewSpace[] = []; + for (let dimensions of dimensionGroups) { + for (let measures of measureGroups) { + viewSpaces.push({ + dimensions, + measures + }); + } + } + return viewSpaces; +} + +function getDimSetsFromClusterGroups(groups: string[][]): string[][] { + let dimSets: string[][] = []; + for (let group of groups) { + let combineDimSet: string[][] = getCombination(group, 1, CHANNEL.maxDimensionNumber); + dimSets.push(...combineDimSet); + } + return dimSets; +} + +export function getGeneralIntentionSpaces (cubePool: Map, viewSpaces: ViewSpace[]): InsightSpace[] { + let ansSpace: InsightSpace[] = [] + for (let space of viewSpaces) { + const { dimensions, measures } = space; + let key = dimensions.join(SPLITER); + if (cubePool.has(key)) { + let aggData = cubePool.get(key); + let score = 0; + let significance = 0; + for (let mea of measures) { + let fL = aggData.map(r => r[mea]); + let pL = normalize(linearMapPositive(fL)); + let value = entropy(pL); + score += value; + significance += value / Math.log2(fL.length) + } + score /= measures.length; + significance /= measures.length; + significance = 1 - significance; + let insightSpace: InsightSpace = { + dimensions, + measures, + type: 'general', + score, + significance, + order: 'asc' + } + ansSpace.push(insightSpace); + } + } + return ansSpace; +} + +export function getOutlierIntentionSpaces (cubePool: Map, viewSpaces: ViewSpace[]): InsightSpace[] { + let ansSpace: InsightSpace[] = []; + for (let space of viewSpaces) { + const { dimensions, measures } = space; + let key = dimensions.join(SPLITER); + if (cubePool.has(key)) { + let aggData = cubePool.get(key); + let iForest = new 
Outier.IsolationForest(dimensions, measures, aggData); + iForest.buildIsolationForest(); + let scoreList = iForest.estimateOutierScore(); + let score = Math.max(...scoreList); + let insightSpace: InsightSpace = { + dimensions, + measures, + type: 'outlier', + score, + significance: score, + order: 'desc' + } + ansSpace.push(insightSpace); + } + } + return ansSpace; +} + +export function getTrendIntentionSpaces (cubePool: Map, viewSpaces: ViewSpace[]): InsightSpace[] { + let ansSpace: InsightSpace[] = []; + for (let space of viewSpaces) { + const { dimensions, measures } = space; + let key = dimensions.join(SPLITER); + if (cubePool.has(key)) { + let aggData = cubePool.get(key); + let orderedData = [...aggData]; + orderedData.sort((a, b) => { + if (a[dimensions[0]] > b[dimensions[0]]) return 1; + if (a[dimensions[0]] === b[dimensions[0]]) return 0; + if (a[dimensions[0]] < b[dimensions[0]]) return -1; + }); + let score = 0; + for (let mea of measures) { + let linearModel = new oneDLinearRegression(orderedData, dimensions[0], mea); + linearModel.normalizeDimensions(dimensions); + score += linearModel.significance(); + } + score /= measures.length; + let insightSpace: InsightSpace = { + dimensions, + measures, + type: 'trend', + score, + significance: score, + order: 'desc' + } + ansSpace.push(insightSpace); + } + } + return ansSpace; +} + +export function getVisSpaces (dataSource: DataSource, dimensions: string[], measures: string[]): InsightSpace[] { + // 1. get dimension cluster groups. + // 2. get measure cluster groups. + // 3. get dimension groups * measure groups = subspaces + aggregate + // 4. calculate each subspace intention score (entropy, outlier, trend for temporal & oridinal field) + // 5. filter each intend subspaces with threadshold + // 6.manage those spaces / order them. 
+ let ansSpace: InsightSpace[] = []; + let dimensionGroups = getDimClusterGroups(dataSource, dimensions); + let dimensionSets = getDimSetsFromClusterGroups(dimensionGroups); + let measureGroups = getMeaSetsBasedOnClusterGroups(dataSource, measures); + let viewSpaces = crossGroups(dimensionSets, measureGroups); + let cubePool: Map = new Map(); + for (let group of dimensionGroups) { + let key = group.join(SPLITER); + let aggData = aggregate({ + dataSource, + dimensions: group, + measures, + asFields: measures, + operator: 'sum' + }); + cubePool.set(key, aggData); + } + ansSpace.push(...getGeneralIntentionSpaces(cubePool, viewSpaces)); + ansSpace.push(...getOutlierIntentionSpaces(cubePool, viewSpaces)); + let trendSpaces = viewSpaces.filter(space => space.dimensions.length === 1) + // .filter(space => { + // return isFieldContinous(dataSource, space.dimensions[0]) || isFieldTime(dataSource, space.dimensions[0]) + // }) + ansSpace.push(...getTrendIntentionSpaces(cubePool, trendSpaces)); + return ansSpace; +} \ No newline at end of file diff --git a/packages/visual-insights/src/insights/impurity.ts b/packages/visual-insights/src/insights/impurity.ts index 9c3f04e8..2d64f760 100644 --- a/packages/visual-insights/src/insights/impurity.ts +++ b/packages/visual-insights/src/insights/impurity.ts @@ -79,50 +79,50 @@ export function insightExtraction(dataSource: DataSource, dimensions: string[], } return impurityList } -interface InsightSpace { - dimensions: string[]; - type: 'entropy' | 'trend' | 'outlier'; - order: 'desc' | 'asc'; - score: { - [meaName: string]: number; - }; - correlationMatrix: number[][]; -} -export function multiInsightExtraction(dataSource: DataSource, dimensions: string[], measures: string[]): InsightSpace[] { - let impurityList: FieldsFeature[] = []; - let dimSet = subspaceSearching(dataSource, dimensions, true); - let correlationMatrix = measures.map(i => measures.map(j => 0)); - for (let i = 0; i < measures.length; i++) { - correlationMatrix[i][i] = 
1; - for (let j = i + 1; j < measures.length; j++) { - let r = pearsonCC(dataSource, measures[i], measures[j]); - correlationMatrix[j][i] = correlationMatrix[i][j] = r; - } - } +// interface InsightSpace { +// dimensions: string[]; +// type: 'entropy' | 'trend' | 'outlier'; +// order: 'desc' | 'asc'; +// score: { +// [meaName: string]: number; +// }; +// correlationMatrix: number[][]; +// } +// export function multiInsightExtraction(dataSource: DataSource, dimensions: string[], measures: string[]): InsightSpace[] { +// let impurityList: FieldsFeature[] = []; +// let dimSet = subspaceSearching(dataSource, dimensions, true); +// let correlationMatrix = measures.map(i => measures.map(j => 0)); +// for (let i = 0; i < measures.length; i++) { +// correlationMatrix[i][i] = 1; +// for (let j = i + 1; j < measures.length; j++) { +// let r = pearsonCC(dataSource, measures[i], measures[j]); +// correlationMatrix[j][i] = correlationMatrix[i][j] = r; +// } +// } - for (let dset of dimSet) { - let impurity = {}; - let trend = {}; - let outlier = {}; - let aggData = aggregate({ - dataSource, - dimensions: dset, - measures, - asFields: measures, - operator: operator || 'sum'//: operator as - }); - // let fList = aggData.map(r => r) - for (let mea of measures) { - // fl = frequency list, pL = probability list - let fL = aggData.map(r => r[mea]); - let pL = normalize(linearMapPositive(fL)); - let value = entropy(pL); - impurity[mea] = value; - } - for (let mea of measures) { +// for (let dset of dimSet) { +// let impurity = {}; +// let trend = {}; +// let outlier = {}; +// let aggData = aggregate({ +// dataSource, +// dimensions: dset, +// measures, +// asFields: measures, +// operator: operator || 'sum'//: operator as +// }); +// // let fList = aggData.map(r => r) +// for (let mea of measures) { +// // fl = frequency list, pL = probability list +// let fL = aggData.map(r => r[mea]); +// let pL = normalize(linearMapPositive(fL)); +// let value = entropy(pL); +// impurity[mea] = 
value; +// } +// for (let mea of measures) { - } - impurityList.push([dset, impurity, correlationMatrix]); - } - return impurityList -} \ No newline at end of file +// } +// impurityList.push([dset, impurity, correlationMatrix]); +// } +// return impurityList +// } \ No newline at end of file diff --git a/packages/visual-insights/src/insights/subspaces.ts b/packages/visual-insights/src/insights/subspaces.ts index 171cec49..f09a8a73 100644 --- a/packages/visual-insights/src/insights/subspaces.ts +++ b/packages/visual-insights/src/insights/subspaces.ts @@ -31,9 +31,8 @@ function getMeaCorrelationMatrix(dataSource: DataSource, measures: string[]): nu return matrix; } -export function getDimSetsBasedOnClusterGroups(dataSource: DataSource, dimensions: string[]): string[][] { +export function getDimClusterGroups(dataSource: DataSource, dimensions: string[]): string[][] { const maxDimNumberInView = 4; - let dimSets: string[][] = []; let dimCorrelationMatrix = getDimCorrelationMatrix(dataSource, dimensions); // groupMaxSize here means group number. let groups: string[][] = Cluster.kruskal({ @@ -42,7 +41,12 @@ export function getDimSetsBasedOnClusterGroups(dataSource: DataSource, dimension groupMaxSize: Math.round(dimensions.length / maxDimNumberInView), threshold: CrammersVThreshold }); - // todo: maybe a threhold would be better ? 
+ return groups; +} + +export function getDimSetsBasedOnClusterGroups(dataSource: DataSource, dimensions: string[]): string[][] { + let dimSets: string[][] = []; + let groups = getDimClusterGroups(dataSource, dimensions); for (let group of groups) { let combineDimSet: string[][] = getCombination(group, 1, CHANNEL.maxDimensionNumber); dimSets.push(...combineDimSet); diff --git a/packages/visual-insights/src/ml/outlier/isolationForest.ts b/packages/visual-insights/src/ml/outlier/isolationForest.ts index 1071c3b3..c45bd0b0 100644 --- a/packages/visual-insights/src/ml/outlier/isolationForest.ts +++ b/packages/visual-insights/src/ml/outlier/isolationForest.ts @@ -24,8 +24,8 @@ export class IsolationForest { this.measures = measures; this.dataSource = dataSource; if (dataSource.length < Psi) { - this.treeNumber = 20; - this.sampleSize = dataSource.length / 5; + this.treeNumber = Math.max(1, Math.ceil(Psi / 50)); + this.sampleSize = Math.max(2, Math.floor(dataSource.length / 2)); } else { this.treeNumber = treeNumber; this.sampleSize = Psi; diff --git a/packages/visual-insights/src/pipeline/visTest.ts b/packages/visual-insights/src/pipeline/visTest.ts new file mode 100644 index 00000000..e2712b64 --- /dev/null +++ b/packages/visual-insights/src/pipeline/visTest.ts @@ -0,0 +1,85 @@ +import { PipeLine } from './index'; +import { PipeLineNode, PipeLineNodeInterface, InjectChannel, ReleaseChannel, StateBase } from './node'; + +// array: x => x + 1 +// array: x => x / index +// array => sum(array) +type Pip = [ + PipeLineNodeInterface<{ source: any[] }, number[]>, + PipeLineNodeInterface<{ source: any[], threshold: number }, number[], { + injection: { + hack: InjectChannel, number> + }; + release: { + console: ReleaseChannel, any> + } + }>, + PipeLineNodeInterface<{ source: any[] }, number> +]; +let pipe = new PipeLine({ + nodes: [ + { + initParams: { + source: [-10, -20, -30, 10, 20, 30] + }, + nuclei: (params) => params.source.map(n => n + 1), + channels: { + injection: {}, 
+ release: { + console (state) { + console.log(state) + } + } + } + }, + { + initParams: { + source: [], + threshold: -Infinity + }, + nuclei: (params) => params.source.filter(n => n > params.threshold), + channels: { + injection: { + hack (injection, num) { + // console.log('hack is running', nums) + injection(draft => { + draft.threshold = num; + }) + } + }, + release: { + console (state) { + console.log(state) + } + } + } + }, + { + initParams: { + source: [] + }, + nuclei: (params) => { + let sum = 0; + params.source.forEach(n => { + sum += n; + }) + console.log(sum) + return sum; + }, + channels: { + injection: {}, + release: { + console (state) { + console.log(state) + } + } + } + } + ] +}); + +pipe.run(); +console.log('================') +// pipe.nodes[1].channels.injection.hack(pipe.nodes[1].injection, 2) +pipe.nodes[1].openChannel('injection.hack', 2) +pipe.run(1) diff --git a/packages/visual-insights/test/insights.js b/packages/visual-insights/test/insights.js index c327a36e..9461c38a 100644 --- a/packages/visual-insights/test/insights.js +++ b/packages/visual-insights/test/insights.js @@ -3,7 +3,7 @@ const assert = require('assert'); const path = require('path'); const { Insight, Cleaner, Statistics, Sampling } = require('../build/cjs/index'); - +const { getVisSpaces } = require('../build/cjs/insights/dev'); const datasetPath = path.resolve(__dirname, './dataset/airbnb.json'); const dataset = JSON.parse(fs.readFileSync(datasetPath).toString()); const { @@ -20,17 +20,22 @@ describe('insights test', function () { const result = Insight.insightExtraction(cleanData, dimensions, measures); assert.equal(result.length > 0, true); }) + + it('print(dev pipeline)', function () { + const result = getVisSpaces(cleanData, dimensions, measures); + console.log('new pipeline result', result) + assert.equal(result.length > 0, true); + }) it('print(getCombination)', function () { let result = Statistics.getCombination([1, 2, 3, 4, 5, 6]); - console.log(result) 
assert.equal(result.length, Math.pow(2, 6) - 1) }) it('print(clusterCombination vs. combination)', function () { let result = Insight.subspaceSearching(cleanData, dimensions, true); let unClusterResult = Statistics.getCombination(dimensions); - console.log(result.length, unClusterResult.length, result) + console.log(result.length, unClusterResult.length) assert.equal(result.length <= unClusterResult.length, true); }) }) diff --git a/packages/visual-insights/test/linearRegression.js b/packages/visual-insights/test/linearRegression.js index 0ce7907c..810c1e52 100644 --- a/packages/visual-insights/test/linearRegression.js +++ b/packages/visual-insights/test/linearRegression.js @@ -29,7 +29,7 @@ describe('linear regression', function () { }) it('p-value', function () { let p = linearModel.pValue(); - assert.equal(p < 1, 0) + assert.equal(p < 1, true) }) }) }) \ No newline at end of file From ac3b827e5621feae310a6fcbdb9c6ccc5721d960 Mon Sep 17 00:00:00 2001 From: ObservedObserver <270001151@qq.com> Date: Thu, 5 Mar 2020 23:40:58 +0800 Subject: [PATCH 05/10] feat: outier good demo --- packages/frontend/src/App.tsx | 7 +- packages/frontend/src/pages/dev/index.tsx | 201 ++++++++++++++++++ packages/frontend/src/service.ts | 28 ++- packages/frontend/src/workers/dev.worker.js | 22 ++ packages/visual-insights/src/insights/dev.ts | 27 ++- .../src/ml/outlier/isolationForest.ts | 39 +++- 6 files changed, 310 insertions(+), 14 deletions(-) create mode 100644 packages/frontend/src/pages/dev/index.tsx create mode 100644 packages/frontend/src/workers/dev.worker.js diff --git a/packages/frontend/src/App.tsx b/packages/frontend/src/App.tsx index b0191baf..67f952dd 100644 --- a/packages/frontend/src/App.tsx +++ b/packages/frontend/src/App.tsx @@ -10,13 +10,15 @@ import Gallery from "./pages/gallery/index"; import NoteBook from "./pages/notebook/index"; import DataSourceBoard from "./pages/dataSource/index"; import DashBoardPage from './pages/dashBoard/index'; +import DevPage from 
'./pages/dev'; import UserSettings from './components/userSettings'; const pivotList = [ 'DataSource', 'NoteBook', 'Explore', - 'DashBoard' + 'DashBoard', + 'Dev' ].map((page, index) => { return { title: page, itemKey: 'pivot-' + (index + 1)} }); @@ -121,6 +123,9 @@ function App() { { pageStatus.current.pivotKey === 'pivot-4' && } + { + pageStatus.current.pivotKey === 'pivot-5' && + } ); } diff --git a/packages/frontend/src/pages/dev/index.tsx b/packages/frontend/src/pages/dev/index.tsx new file mode 100644 index 00000000..3fdc484a --- /dev/null +++ b/packages/frontend/src/pages/dev/index.tsx @@ -0,0 +1,201 @@ +import React, { useState, useEffect, useMemo } from 'react'; +import styled from 'styled-components'; +import { InsightSpace } from 'visual-insights/build/esm/insights/dev'; +import { specification } from "visual-insights"; +import { getInsightViewSpace } from '../../service'; +import { PrimaryButton, SpinButton, Slider, ProgressIndicator } from "office-ui-fabric-react"; +import PreferencePanel, { + PreferencePanelConfig +} from "../../components/preference"; +import BaseChart from "../../visBuilder/vegaBase"; +import { Position } from "office-ui-fabric-react/lib/utilities/positioning"; + +import { useGlobalState } from "../../state"; +import { useComposeState } from '../../utils'; + +const Tag = styled.div` + display: inline-block; + padding: 0.2em 0.5em; + margin: 0.2em; + border-radius: 3px; + color: #fff; + font-size: 12px; + background-color: ${props => props.color}; +`; + +const ColorMap: { + [key in InsightSpace['type']]: string +} = { + 'outlier': '#cf1322', + 'trend': '#7cb305', + 'general': '#08979c' +} + +const DevPage: React.FC = props => { + const [insightViewSpace, setInsightViewSpace] = useState([]); + const [sigThreshold, setSigThreshold] = useState(0.6); + const [loading, setLoading] = useState(false); + const [visualConfig, setVisualConfig] = useState({ + aggregator: "sum", + defaultAggregated: true, + defaultStack: true + }); + const 
[pageStatus, setPageStatus] = useComposeState<{ show: { configPanel: boolean }}>({ + show: { + configPanel: false + } + }); + const [state, , dispatch, getters] = useGlobalState(); + const [chartIndex, setChartIndex] = useState(0); + const { + cookedDataSource, + cookedDimensions, + cookedMeasures + } = state; + const { dimScores } = getters; + + const viewSpaceList = useMemo(() => { + return insightViewSpace.filter(s => s.significance >= sigThreshold); + }, [insightViewSpace, sigThreshold]) + + const dataView = useMemo(() => { + if (viewSpaceList.length === 0) return null; + const { dimensions, measures } = viewSpaceList[chartIndex]; + const fieldScores = dimScores.filter(field => { + return dimensions.includes(field[0]) || measures.includes(field[0]); + }); + let { schema } = specification( + fieldScores, + cookedDataSource, + dimensions, + measures + ); + console.log(viewSpaceList[chartIndex].description) + return { + schema, + fieldFeatures: fieldScores.map(f => f[3]), + dimensions, + measures + } + }, [viewSpaceList, chartIndex, cookedDataSource]) + useEffect(() => { + if (dataView === null) return; + const { schema } = dataView; + if ( + schema.geomType && + (schema.geomType.includes("point") || + schema.geomType.includes("density")) + ) { + setVisualConfig(config => { + return { + ...config, + defaultAggregated: false + }; + }); + } else { + setVisualConfig(config => { + return { + ...config, + defaultAggregated: true + }; + }); + } + }, [dataView]) + return

+ { + setVisualConfig(config); + setPageStatus(draft => { + draft.show.configPanel = false; + }); + }} + onClose={() => { + setPageStatus(draft => { + draft.show.configPanel = false; + }); + }} + /> +
+ { + setLoading(true); + getInsightViewSpace(cookedDataSource, cookedDimensions, cookedMeasures).then(res => { + setInsightViewSpace(res); + setLoading(false); + }) + }} /> + { loading && } +
+ `${value}%`} + showValue={true} + onChange={(value: number) => { + setSigThreshold(value / 100); + setChartIndex(0); + }} + /> +
+
+ { + setChartIndex((Number(value) - 1) % viewSpaceList.length); + }} + onIncrement={() => { + setChartIndex((chartIndex + 1) % viewSpaceList.length); + }} + onDecrement={() => { + setChartIndex( + (chartIndex - 1 + viewSpaceList.length) % + viewSpaceList.length + ); + }} + incrementButtonAriaLabel={"Increase value by 1"} + decrementButtonAriaLabel={"Decrease value by 1"} + /> +
+

+ There are {viewSpaceList.length} of views of which insight significance is no less than {(sigThreshold * 100).toFixed(2)} % +

+ { + viewSpaceList[chartIndex] &&

+ Dimensions are {viewSpaceList[chartIndex].dimensions}, and measures are {viewSpaceList[chartIndex].measures}.
+ There is a significance of {(viewSpaceList[chartIndex].significance * 100).toFixed(2)}% that there exits a {viewSpaceList[chartIndex].type} in the graph.
+ { JSON.stringify(viewSpaceList[chartIndex].description) } +

+ } +
+ { + viewSpaceList[chartIndex] && {viewSpaceList[chartIndex].type} + } +
+ { + viewSpaceList.length > 0 && dataView !== null &&
+ +
+ } +
+
+} + +export default DevPage; diff --git a/packages/frontend/src/service.ts b/packages/frontend/src/service.ts index d08f456e..ecc3c07a 100644 --- a/packages/frontend/src/service.ts +++ b/packages/frontend/src/service.ts @@ -19,6 +19,11 @@ import fieldsSummaryWorker from './workers/fieldsSummary.worker'; // @ts-ignore // eslint-disable-next-line import groupFieldsWorker from './workers/groupFields.worker'; +/* eslint import/no-webpack-loader-syntax:0 */ +// @ts-ignore +// eslint-disable-next-line +import InsightViewWorker from './workers/dev.worker'; +import { InsightSpace } from 'visual-insights/build/esm/insights/dev'; let server = '//lobay.moe:8443'; @@ -303,4 +308,25 @@ export async function generateDashBoard (dataSource: DataSource, dimensions: str } return dashBoardList; -} \ No newline at end of file +} + +export async function getInsightViewSpace (dataSource: DataSource, dimensions: string[], measures: string[]): Promise { + let ansSpace: InsightSpace[] = []; + try { + const worker = new InsightViewWorker(); + const result = await workerService(worker, { + dataSource, + dimensions, + measures + }); + if (result.success) { + ansSpace = result.data; + } else { + throw new Error('[getInsightViewSpace]' + result.message); + } + worker.terminate(); + } catch (error) { + console.error(error); + } + return ansSpace; +} \ No newline at end of file diff --git a/packages/frontend/src/workers/dev.worker.js b/packages/frontend/src/workers/dev.worker.js new file mode 100644 index 00000000..e9f516c4 --- /dev/null +++ b/packages/frontend/src/workers/dev.worker.js @@ -0,0 +1,22 @@ +/* eslint no-restricted-globals: 0 */ +import { getVisSpaces } from 'visual-insights/build/esm/insights/dev'; +import { timer } from './timer'; + +const generateDashBoard = (e) => { + const { dataSource, dimensions, measures } = e.data; + try { + let ansSpace = getVisSpaces(dataSource, dimensions, measures); + ansSpace.sort((a, b) => b.significance - a.significance); + self.postMessage({ + 
success: true, + data: ansSpace + }) + } catch (error) { + self.postMessage({ + success: false, + message: error + }) + } +} + +self.addEventListener('message', timer(generateDashBoard), false); \ No newline at end of file diff --git a/packages/visual-insights/src/insights/dev.ts b/packages/visual-insights/src/insights/dev.ts index 507a67fc..892c5212 100644 --- a/packages/visual-insights/src/insights/dev.ts +++ b/packages/visual-insights/src/insights/dev.ts @@ -14,13 +14,14 @@ interface ViewSpace { dimensions: string[]; measures: string[]; } -interface InsightSpace { +export interface InsightSpace { dimensions: string[]; measures: string[]; type: 'general' | 'trend' | 'outlier'; order: 'desc' | 'asc'; score: number; significance: number; + description?: any } function crossGroups(dimensionGroups: string[][], measureGroups: string[][]): ViewSpace[] { let viewSpaces: ViewSpace[] = []; @@ -81,20 +82,35 @@ export function getOutlierIntentionSpaces (cubePool: Map, vi let ansSpace: InsightSpace[] = []; for (let space of viewSpaces) { const { dimensions, measures } = space; - let key = dimensions.join(SPLITER); + let key = measures.length >= 2 ? 
'*' : dimensions.join(SPLITER); if (cubePool.has(key)) { let aggData = cubePool.get(key); - let iForest = new Outier.IsolationForest(dimensions, measures, aggData); + let iForest = new Outier.IsolationForest([], measures, aggData); iForest.buildIsolationForest(); let scoreList = iForest.estimateOutierScore(); - let score = Math.max(...scoreList); + // let rankScoreList = scoreList.map((s, i) => ({ + // score: s, + // index: i + // })); + // rankScoreList.sort((a, b) => b.score - a.score); + let maxIndex = 0; + let score = 0; + for (let i = 0; i < scoreList.length; i++) { + if (scoreList[i] > score) { + score = scoreList[i]; + maxIndex = i; + } + } + let des: {[key: string]: any} = {}; + dimensions.concat(measures).forEach(mea => { des[mea] = aggData[maxIndex][mea]; }) let insightSpace: InsightSpace = { dimensions, measures, type: 'outlier', score, significance: score, - order: 'desc' + order: 'desc', + description: des//rankScoreList.slice(0, 10).map(s => aggData[s.index]) } ansSpace.push(insightSpace); } @@ -160,6 +176,7 @@ export function getVisSpaces (dataSource: DataSource, dimensions: string[], meas }); cubePool.set(key, aggData); } + cubePool.set('*', dataSource); ansSpace.push(...getGeneralIntentionSpaces(cubePool, viewSpaces)); ansSpace.push(...getOutlierIntentionSpaces(cubePool, viewSpaces)); let trendSpaces = viewSpaces.filter(space => space.dimensions.length === 1) diff --git a/packages/visual-insights/src/ml/outlier/isolationForest.ts b/packages/visual-insights/src/ml/outlier/isolationForest.ts index c45bd0b0..212ce9c9 100644 --- a/packages/visual-insights/src/ml/outlier/isolationForest.ts +++ b/packages/visual-insights/src/ml/outlier/isolationForest.ts @@ -18,6 +18,7 @@ export class IsolationForest { public readonly limitHeight: number; public recordScoreList: number[]; private valueSets: Array >; + private ranges: Array<[number, number]>; private iForest: ITree[]; constructor (dimensions: string[], measures: string[], dataSource: DataSource, 
treeNumber: number = 100, Psi: number = 256) { this.dimensions = dimensions; @@ -37,9 +38,13 @@ export class IsolationForest { private normalizeDimensions(): NormalizedRecord[] { this.normalizedDataSource = []; this.valueSets = []; + this.ranges = []; this.dimensions.forEach(() => { this.valueSets.push(new Map()); }) + this.measures.forEach(() => { + this.ranges.push([Infinity, -Infinity]) + }) this.dataSource.forEach(record => { this.dimensions.forEach((dim, index) => { let value = (record[dim] || 'others').toString(); @@ -47,6 +52,13 @@ export class IsolationForest { this.valueSets[index].set(value, this.valueSets[index].size); } }) + this.measures.forEach((mea, index) => { + let value = record[mea]; + if (typeof value === 'number') { + this.ranges[index][0] = Math.min(this.ranges[index][0], value); + this.ranges[index][1] = Math.max(this.ranges[index][1], value); + } + }) }) this.dataSource.forEach(record => { let normalizedRecord = this.normalizeRecord(record); @@ -65,12 +77,26 @@ export class IsolationForest { }) return normalizedRecord; } - public buildIsolationTree (fields: string[], normalizedSampleData: NormalizedRecord[], depth: number): ITree { + public buildIsolationTree (normalizedSampleData: NormalizedRecord[], depth: number): ITree { if (depth >= this.limitHeight || normalizedSampleData.length <= 1) { return null; } else { - let randField = fields[Math.floor(Math.random() * fields.length) % fields.length]; - let randValue = normalizedSampleData[Math.floor(Math.random() * normalizedSampleData.length) % normalizedSampleData.length][randField]; + let rand: number = Math.random(); + let randField: string = this.measures[0] || this.dimensions[0]; + let dimLength = this.dimensions.length; + let meaLength = this.measures.length; + let randValue: number = 0; + if (rand >= dimLength / (dimLength + meaLength)) { + let index = Math.floor(Math.random() * meaLength) % meaLength + randField = this.measures[index]; + randValue = this.ranges[index][0] + 
(this.ranges[index][1] - this.ranges[index][0]) * Math.random(); + } else { + let index = Math.floor(Math.random() * dimLength) % dimLength; + randField = this.dimensions[index]; + randValue = Math.floor(this.valueSets[index].size * Math.random()) % this.valueSets[index].size; + } + // random in range not in distribution. + // let randValue = normalizedSampleData[Math.floor(Math.random() * normalizedSampleData.length) % normalizedSampleData.length][randField]; let leftSubData: DataSource = []; let rightSubData: DataSource = []; for (let record of normalizedSampleData) { @@ -85,8 +111,8 @@ export class IsolationForest { value: randValue, size: normalizedSampleData.length } - node.left = this.buildIsolationTree(fields, leftSubData, depth + 1); - node.right = this.buildIsolationTree(fields, rightSubData, depth + 1); + node.left = this.buildIsolationTree(leftSubData, depth + 1); + node.right = this.buildIsolationTree(rightSubData, depth + 1); return node; } } @@ -114,10 +140,9 @@ export class IsolationForest { public buildIsolationForest (): ITree[] { this.iForest = []; - let fields = this.dimensions.concat(this.measures); for (let i = 0; i < this.treeNumber; i++) { let samples = uniformSampling(this.normalizedDataSource, this.sampleSize); - let iTree = this.buildIsolationTree(fields, samples, 0); + let iTree = this.buildIsolationTree(samples, 0); this.iForest.push(iTree); } return this.iForest; From 0f8ba7b2b538b79bfcef13e5e789ef8400d2091a Mon Sep 17 00:00:00 2001 From: ObservedObserver <270001151@qq.com> Date: Fri, 6 Mar 2020 16:28:21 +0800 Subject: [PATCH 06/10] feat: adjust dev vis board --- .../frontend/src/components/radarChart.tsx | 234 ++++++++++++++++++ .../frontend/src/components/simpleTick.tsx | 63 +++++ packages/frontend/src/pages/dev/index.tsx | 148 +++++++---- 3 files changed, 399 insertions(+), 46 deletions(-) create mode 100644 packages/frontend/src/components/radarChart.tsx create mode 100644 packages/frontend/src/components/simpleTick.tsx diff --git 
a/packages/frontend/src/components/radarChart.tsx b/packages/frontend/src/components/radarChart.tsx new file mode 100644 index 00000000..f7b68d62 --- /dev/null +++ b/packages/frontend/src/components/radarChart.tsx @@ -0,0 +1,234 @@ +import React, { useRef, useEffect, useMemo } from 'react'; +import embed from 'vega-embed'; +import { scheme } from 'vega'; + +scheme('threshold', ['#1890ff', '#ffccc7']); + +interface RadarChartProps { + keyField: string; + valueField: string; + dataSource: any[]; + threshold: number; +} +interface DataRecord { + key: string; + value: number; + category: 0 | 1; +} +const RadarChart: React.FC = props => { + const { keyField, valueField, dataSource = [], threshold } = props; + const container = useRef(null); + const viewData = useMemo(() => { + let data: DataRecord[] = dataSource.map(record => { + return { + key: record[keyField], + value: Math.round(record[valueField] * 1000) / 1000, + category: 0 + } + }); + ['outlier', 'trend', 'general'].forEach(type => { + if (!data.find(d => d.key === type)) { + data.push({ + key: type, + value: 0, + category: 0 + }) + } + }) + let ruleData: DataRecord[] = data.map(record => { + return { + key: record.key, + value: threshold, + category: 1 + } + }) + return data.concat(ruleData); + }, [keyField, valueField, dataSource, threshold]) + useEffect(() => { + if (container.current) { + embed(container.current, { + width: 280, + height: 280, + padding: 50, + autosize: { type: "none", contains: "padding" }, + + signals: [{ name: "radius", update: "width / 2" }], + + data: [ + { + name: "table", + values: viewData + }, + { + name: "keys", + source: "table", + transform: [ + { + type: "aggregate", + groupby: ["key"] + } + ] + } + ], + + scales: [ + { + name: "angular", + type: "point", + range: { signal: "[-PI, PI]" }, + padding: 0.5, + domain: { data: "table", field: "key" } + }, + { + name: "radial", + type: "linear", + range: { signal: "[0, radius]" }, + zero: true, + nice: false, + domain: { data: 
"table", field: "value" }, + domainMin: 0 + }, + { + name: "color", + type: "ordinal", + domain: { data: "table", field: "category" }, + range: { scheme: "threshold" } + } + ], + + encode: { + enter: { + x: { signal: "radius" }, + y: { signal: "radius" } + } + }, + + marks: [ + { + type: "group", + name: "categories", + zindex: 1, + from: { + facet: { data: "table", name: "facet", groupby: ["category"] } + }, + marks: [ + { + type: "line", + name: "category-line", + from: { data: "facet" }, + encode: { + enter: { + interpolate: { value: "linear-closed" }, + x: { + signal: + "scale('radial', datum.value) * cos(scale('angular', datum.key))" + }, + y: { + signal: + "scale('radial', datum.value) * sin(scale('angular', datum.key))" + }, + stroke: { scale: "color", field: "category" }, + strokeWidth: { value: 1 }, + fill: { scale: "color", field: "category" }, + fillOpacity: { value: 0.1 } + } + } + }, + { + type: "text", + name: "value-text", + from: { data: "category-line" }, + encode: { + enter: { + x: { signal: "datum.x" }, + y: { signal: "datum.y" }, + text: { signal: "datum.datum.value" }, + align: { value: "center" }, + baseline: { value: "middle" }, + fill: { value: "black" } + } + } + } + ] + }, + { + type: "rule", + name: "radial-grid", + from: { data: "keys" }, + zindex: 0, + encode: { + enter: { + x: { value: 0 }, + y: { value: 0 }, + x2: { signal: "radius * cos(scale('angular', datum.key))" }, + y2: { signal: "radius * sin(scale('angular', datum.key))" }, + stroke: { value: "lightgray" }, + strokeWidth: { value: 1 } + } + } + }, + { + type: "text", + name: "key-label", + from: { data: "keys" }, + zindex: 1, + encode: { + enter: { + x: { + signal: "(radius + 5) * cos(scale('angular', datum.key))" + }, + y: { + signal: "(radius + 5) * sin(scale('angular', datum.key))" + }, + text: { field: "key" }, + align: [ + { + test: "abs(scale('angular', datum.key)) > PI / 2", + value: "right" + }, + { + value: "left" + } + ], + baseline: [ + { + test: "scale('angular', 
datum.key) > 0", + value: "top" + }, + { + test: "scale('angular', datum.key) == 0", + value: "middle" + }, + { + value: "bottom" + } + ], + fill: { value: "black" }, + fontWeight: { value: "bold" } + } + } + }, + { + type: "line", + name: "outer-line", + from: { data: "radial-grid" }, + encode: { + enter: { + interpolate: { value: "linear-closed" }, + x: { field: "x2" }, + y: { field: "y2" }, + stroke: { value: "lightgray" }, + strokeWidth: { value: 1 } + } + } + } + ] + } as any, { + actions: false + }); + } + }, [viewData]); + return
+} + +export default RadarChart; diff --git a/packages/frontend/src/components/simpleTick.tsx b/packages/frontend/src/components/simpleTick.tsx new file mode 100644 index 00000000..bbf0a238 --- /dev/null +++ b/packages/frontend/src/components/simpleTick.tsx @@ -0,0 +1,63 @@ +import React, { useMemo } from 'react'; +import { DataSource } from '../global'; +import ReactVega from './react-vega'; + +interface SimpleTickProps { + x: string; + y: string; + threshold: number; + dataSource: DataSource +} +const SimpleTick: React.FC = props => { + const { x, y, threshold, dataSource = [] } = props; + const spec = useMemo(() => { + return { + width: 180, + height: 200, + data: { + name: 'dataSource' + }, + transform: [ + { calculate: threshold.toString(), as: 'threshold' } + ], + layer: [ + { + mark: 'point', + encoding: { + x: { + field: x, + type: 'nominal', + scale: { + domain: ['outlier', 'trend', 'general'] + } + }, + y: { + field: y, + type: 'quantitative', + scale: { + domain: [0, 1] + } + } + } + }, + { + mark: 'rule', + encoding: { + y: { + field: 'threshold', + type: 'quantitative' + }, + color: { + value: 'red' + } + } + } + ] + } + }, [x, y, threshold]) + return
+ +
; +} + +export default SimpleTick; \ No newline at end of file diff --git a/packages/frontend/src/pages/dev/index.tsx b/packages/frontend/src/pages/dev/index.tsx index 3fdc484a..230dc13d 100644 --- a/packages/frontend/src/pages/dev/index.tsx +++ b/packages/frontend/src/pages/dev/index.tsx @@ -12,10 +12,12 @@ import { Position } from "office-ui-fabric-react/lib/utilities/positioning"; import { useGlobalState } from "../../state"; import { useComposeState } from '../../utils'; +import SimpleTick from '../../components/simpleTick'; +import RadarChart from '../../components/radarChart'; const Tag = styled.div` display: inline-block; - padding: 0.2em 0.5em; + padding: 0.1em 0.3em; margin: 0.2em; border-radius: 3px; color: #fff; @@ -23,6 +25,20 @@ const Tag = styled.div` background-color: ${props => props.color}; `; +const DashBoard = styled.div` + display: flex; + div.left{ + flex-basis: 300px; + flex-grow: 1; + border-right: 1px solid #f0f0f0; + } + div.right{ + margin-left: 1em; + flex-grow: 8; + } + padding: 1em 0em; +` + const ColorMap: { [key in InsightSpace['type']]: string } = { @@ -31,6 +47,18 @@ const ColorMap: { 'general': '#08979c' } +function arrEqual (arr1: any[], arr2: any[]): boolean { + if (arr1.length !== arr2.length) { + return false; + } + for (let i = 0; i < arr1.length; i++) { + if (arr1[i] !== arr2[i]) { + return false; + } + } + return true; +} + const DevPage: React.FC = props => { const [insightViewSpace, setInsightViewSpace] = useState([]); const [sigThreshold, setSigThreshold] = useState(0.6); @@ -53,7 +81,11 @@ const DevPage: React.FC = props => { cookedMeasures } = state; const { dimScores } = getters; - + console.log({ + cookedDataSource, + cookedDimensions, + cookedMeasures + }, getters) const viewSpaceList = useMemo(() => { return insightViewSpace.filter(s => s.significance >= sigThreshold); }, [insightViewSpace, sigThreshold]) @@ -70,7 +102,6 @@ const DevPage: React.FC = props => { dimensions, measures ); - 
console.log(viewSpaceList[chartIndex].description) return { schema, fieldFeatures: fieldScores.map(f => f[3]), @@ -78,6 +109,19 @@ const DevPage: React.FC = props => { measures } }, [viewSpaceList, chartIndex, cookedDataSource]) + const relatedViews = useMemo(() => { + if (dataView !== null) { + const { dimensions, measures } = dataView; + return insightViewSpace.filter(f => { + if (arrEqual(dimensions, f.dimensions) && arrEqual(measures, f.measures)) { + return true + } + return false + }) + } + return [] + }, [insightViewSpace, dataView]) + console.log(relatedViews) useEffect(() => { if (dataView === null) return; const { schema } = dataView; @@ -126,8 +170,11 @@ const DevPage: React.FC = props => { }) }} /> { loading && } -
- +
+ + { setChartIndex(0); }} /> -
-
- { - setChartIndex((Number(value) - 1) % viewSpaceList.length); - }} - onIncrement={() => { - setChartIndex((chartIndex + 1) % viewSpaceList.length); - }} - onDecrement={() => { - setChartIndex( - (chartIndex - 1 + viewSpaceList.length) % - viewSpaceList.length - ); - }} - incrementButtonAriaLabel={"Increase value by 1"} - decrementButtonAriaLabel={"Decrease value by 1"} - /> -
-

- There are {viewSpaceList.length} of views of which insight significance is no less than {(sigThreshold * 100).toFixed(2)} % -

- { - viewSpaceList[chartIndex] &&

- Dimensions are {viewSpaceList[chartIndex].dimensions}, and measures are {viewSpaceList[chartIndex].measures}.
- There is a significance of {(viewSpaceList[chartIndex].significance * 100).toFixed(2)}% that there exits a {viewSpaceList[chartIndex].type} in the graph.
- { JSON.stringify(viewSpaceList[chartIndex].description) } +

+ There are {viewSpaceList.length} of views of which insight significance is no less than {(sigThreshold * 100).toFixed(2)} %

- } -
- { - viewSpaceList[chartIndex] && {viewSpaceList[chartIndex].type} - } -
+
+
+
+ { + setChartIndex((Number(value) - 1) % viewSpaceList.length); + }} + onIncrement={() => { + setChartIndex((chartIndex + 1) % viewSpaceList.length); + }} + onDecrement={() => { + setChartIndex( + (chartIndex - 1 + viewSpaceList.length) % + viewSpaceList.length + ); + }} + incrementButtonAriaLabel={"Increase value by 1"} + decrementButtonAriaLabel={"Decrease value by 1"} + /> +
+
+ +
+ { + viewSpaceList[chartIndex] && {viewSpaceList[chartIndex].type} + } + { + viewSpaceList[chartIndex] &&

+ Dimensions are {viewSpaceList[chartIndex].dimensions}, and measures are {viewSpaceList[chartIndex].measures}.
+ There is a significance of {(viewSpaceList[chartIndex].significance * 100).toFixed(2)}% that there exits a {viewSpaceList[chartIndex].type} in the graph.
+ { JSON.stringify(viewSpaceList[chartIndex].description) } +

+ } +
+
+ +
+
+
+ { viewSpaceList.length > 0 && dataView !== null &&
Date: Fri, 6 Mar 2020 22:20:13 +0800 Subject: [PATCH 07/10] feat: group intend and dev panel ui adjust --- .../frontend/src/components/radarChart.tsx | 6 +- .../frontend/src/components/simpleTick.tsx | 2 +- packages/frontend/src/pages/dev/index.tsx | 200 ++++++++++-------- packages/visual-insights/src/index.ts | 5 +- packages/visual-insights/src/insights/dev.ts | 33 ++- .../src/insights/intention/groups.ts | 33 +++ .../src/ml/classification/knn.ts | 155 ++++++++++++++ packages/visual-insights/src/ml/index.ts | 8 +- packages/visual-insights/test/insights.js | 2 +- packages/visual-insights/test/ml.js | 19 +- 10 files changed, 367 insertions(+), 96 deletions(-) create mode 100644 packages/visual-insights/src/insights/intention/groups.ts create mode 100644 packages/visual-insights/src/ml/classification/knn.ts diff --git a/packages/frontend/src/components/radarChart.tsx b/packages/frontend/src/components/radarChart.tsx index f7b68d62..c4a0efb6 100644 --- a/packages/frontend/src/components/radarChart.tsx +++ b/packages/frontend/src/components/radarChart.tsx @@ -26,7 +26,7 @@ const RadarChart: React.FC = props => { category: 0 } }); - ['outlier', 'trend', 'general'].forEach(type => { + ['outlier', 'trend', 'general', 'group'].forEach(type => { if (!data.find(d => d.key === type)) { data.push({ key: type, @@ -85,7 +85,7 @@ const RadarChart: React.FC = props => { range: { signal: "[0, radius]" }, zero: true, nice: false, - domain: { data: "table", field: "value" }, + domain: [0, 1], domainMin: 0 }, { @@ -145,7 +145,7 @@ const RadarChart: React.FC = props => { text: { signal: "datum.datum.value" }, align: { value: "center" }, baseline: { value: "middle" }, - fill: { value: "black" } + fill: { value: "#262626" } } } } diff --git a/packages/frontend/src/components/simpleTick.tsx b/packages/frontend/src/components/simpleTick.tsx index bbf0a238..d5d031ba 100644 --- a/packages/frontend/src/components/simpleTick.tsx +++ b/packages/frontend/src/components/simpleTick.tsx @@ -28,7 
+28,7 @@ const SimpleTick: React.FC = props => { field: x, type: 'nominal', scale: { - domain: ['outlier', 'trend', 'general'] + domain: ['outlier', 'trend', 'general', 'group'] } }, y: { diff --git a/packages/frontend/src/pages/dev/index.tsx b/packages/frontend/src/pages/dev/index.tsx index 230dc13d..33d869a5 100644 --- a/packages/frontend/src/pages/dev/index.tsx +++ b/packages/frontend/src/pages/dev/index.tsx @@ -44,7 +44,8 @@ const ColorMap: { } = { 'outlier': '#cf1322', 'trend': '#7cb305', - 'general': '#08979c' + 'general': '#08979c', + 'group': '#c41d7f' } function arrEqual (arr1: any[], arr2: any[]): boolean { @@ -145,8 +146,9 @@ const DevPage: React.FC = props => { }); } }, [dataView]) - return
- + { @@ -161,97 +163,123 @@ const DevPage: React.FC = props => { }); }} /> -
- { - setLoading(true); - getInsightViewSpace(cookedDataSource, cookedDimensions, cookedMeasures).then(res => { - setInsightViewSpace(res); - setLoading(false); - }) - }} /> - { loading && } - - -
- - `${value}%`} - showValue={true} - onChange={(value: number) => { - setSigThreshold(value / 100); - setChartIndex(0); +
+ { + setLoading(true); + getInsightViewSpace( + cookedDataSource, + cookedDimensions, + cookedMeasures + ).then(res => { + setInsightViewSpace(res); + setLoading(false); + }); }} /> -

- There are {viewSpaceList.length} of views of which insight significance is no less than {(sigThreshold * 100).toFixed(2)} % -

-
-
-
- { - setChartIndex((Number(value) - 1) % viewSpaceList.length); - }} - onIncrement={() => { - setChartIndex((chartIndex + 1) % viewSpaceList.length); - }} - onDecrement={() => { - setChartIndex( - (chartIndex - 1 + viewSpaceList.length) % - viewSpaceList.length - ); - }} - incrementButtonAriaLabel={"Increase value by 1"} - decrementButtonAriaLabel={"Decrease value by 1"} - /> + {loading && } + + +
+ + `${value}%`} + showValue={true} + onChange={(value: number) => { + setSigThreshold(value / 100); + setChartIndex(0); + }} + /> +

+ There are {viewSpaceList.length} of views of which insight + significance is no less than {(sigThreshold * 100).toFixed(2)} % +

-
- -
- { - viewSpaceList[chartIndex] && {viewSpaceList[chartIndex].type} - } - { - viewSpaceList[chartIndex] &&

- Dimensions are {viewSpaceList[chartIndex].dimensions}, and measures are {viewSpaceList[chartIndex].measures}.
- There is a significance of {(viewSpaceList[chartIndex].significance * 100).toFixed(2)}% that there exits a {viewSpaceList[chartIndex].type} in the graph.
- { JSON.stringify(viewSpaceList[chartIndex].description) } -

- } +
+
+ { + setChartIndex((Number(value) - 1) % viewSpaceList.length); + }} + onIncrement={() => { + setChartIndex((chartIndex + 1) % viewSpaceList.length); + }} + onDecrement={() => { + setChartIndex( + (chartIndex - 1 + viewSpaceList.length) % + viewSpaceList.length + ); + }} + incrementButtonAriaLabel={"Increase value by 1"} + decrementButtonAriaLabel={"Decrease value by 1"} + /> +
+
+ +
+ { + relatedViews.length > 0 && relatedViews.filter(view => view.significance >= sigThreshold).map(view => ( + + {view.type} + + )) + } + { + viewSpaceList[chartIndex] &&

+ Dimensions are {viewSpaceList[chartIndex].dimensions}, and + measures are {viewSpaceList[chartIndex].measures}.
+ There is a significance of + {(viewSpaceList[chartIndex].significance * 100).toFixed(2)}% + that there exits a {viewSpaceList[chartIndex].type} in the + graph.
+ {JSON.stringify(viewSpaceList[chartIndex].description)} +

+ } +
+ +
- + + {viewSpaceList.length > 0 && dataView !== null && (
+
-
- - { - viewSpaceList.length > 0 && dataView !== null &&
- -
- } + )} +
-
+ ); } export default DevPage; diff --git a/packages/visual-insights/src/index.ts b/packages/visual-insights/src/index.ts index 722b7124..2725aeef 100644 --- a/packages/visual-insights/src/index.ts +++ b/packages/visual-insights/src/index.ts @@ -12,7 +12,7 @@ import * as DashBoard from './dashboard/index'; import * as Sampling from './sampling/index'; import * as Statistics from './statistics/index'; -import { Cluster, Outier } from './ml/index'; +import { Cluster, Outier, Classification } from './ml/index'; export { DashBoard, @@ -25,5 +25,6 @@ export { Cleaner, Insight, Cluster, - Outier + Outier, + Classification } \ No newline at end of file diff --git a/packages/visual-insights/src/insights/dev.ts b/packages/visual-insights/src/insights/dev.ts index 892c5212..a08f1232 100644 --- a/packages/visual-insights/src/insights/dev.ts +++ b/packages/visual-insights/src/insights/dev.ts @@ -9,6 +9,7 @@ import aggregate, { createCube } from 'cube-core'; import { momentCube } from "cube-core/built/core"; import { isFieldContinous, isFieldTime } from '../utils/common'; import { oneDLinearRegression } from '../statistics/index' +import { GroupIntention } from "./intention/groups"; const SPLITER = '=;='; interface ViewSpace { dimensions: string[]; @@ -17,7 +18,7 @@ interface ViewSpace { export interface InsightSpace { dimensions: string[]; measures: string[]; - type: 'general' | 'trend' | 'outlier'; + type: 'general' | 'trend' | 'outlier' | 'group'; order: 'desc' | 'asc'; score: number; significance: number; @@ -152,6 +153,35 @@ export function getTrendIntentionSpaces (cubePool: Map, view return ansSpace; } +export function getGroupIntentionSpaces (cubePool: Map, viewSpaces: ViewSpace[]): InsightSpace[] { + let ansSpace: InsightSpace[] = []; + for (let space of viewSpaces) { + const { dimensions, measures } = space; + let key = dimensions.join(SPLITER); + if (cubePool.has(key)) { + let aggData = cubePool.get(key); + let score = 0; + let groupIntention = new GroupIntention({ + 
dataSource: aggData, + dimensions, + measures, + K: 8 + }); + score = groupIntention.getSignificance(measures.concat(dimensions.slice(0, -1)), dimensions.slice(-1)); + let insightSpace: InsightSpace = { + dimensions, + measures, + type: 'group', + score, + significance: score, + order: 'desc' + } + ansSpace.push(insightSpace); + } + } + return ansSpace; +} + export function getVisSpaces (dataSource: DataSource, dimensions: string[], measures: string[]): InsightSpace[] { // 1. get dimension cluster groups. // 2. get measure cluster groups. @@ -179,6 +209,7 @@ export function getVisSpaces (dataSource: DataSource, dimensions: string[], meas cubePool.set('*', dataSource); ansSpace.push(...getGeneralIntentionSpaces(cubePool, viewSpaces)); ansSpace.push(...getOutlierIntentionSpaces(cubePool, viewSpaces)); + ansSpace.push(...getGroupIntentionSpaces(cubePool, viewSpaces)); let trendSpaces = viewSpaces.filter(space => space.dimensions.length === 1) // .filter(space => { // return isFieldContinous(dataSource, space.dimensions[0]) || isFieldTime(dataSource, space.dimensions[0]) diff --git a/packages/visual-insights/src/insights/intention/groups.ts b/packages/visual-insights/src/insights/intention/groups.ts new file mode 100644 index 00000000..4e703566 --- /dev/null +++ b/packages/visual-insights/src/insights/intention/groups.ts @@ -0,0 +1,33 @@ +import { KNN } from "../../ml/classification/knn"; +import { NormalizedRecord } from "../../commonTypes"; + +export class GroupIntention extends KNN { + public getTargetValuePercent(targets: string[], targetRecord: NormalizedRecord, neighbors: NormalizedRecord[]): any[] { + let ans = []; + targets.forEach((target, index) => { + let sameCount = 0; + neighbors.forEach(nei => { + if (nei[target] === targetRecord[target]) { + sameCount++; + } + }) + ans.push(sameCount / neighbors.length) + }) + return ans; + } + public getSignificance (features: string[], targets: string[]): number { + let ans = 0 + 
this.normalizedDataSource.forEach(record => { + let neighbors = this.getNeighbors(record, features); + let percents = this.getTargetValuePercent(targets, record, neighbors); + let sig = 0; + percents.forEach(per => { + sig += per; + }) + sig /= percents.length; + ans += sig; + }) + ans /= this.normalizedDataSource.length; + return ans; + } +} \ No newline at end of file diff --git a/packages/visual-insights/src/ml/classification/knn.ts b/packages/visual-insights/src/ml/classification/knn.ts new file mode 100644 index 00000000..d9164694 --- /dev/null +++ b/packages/visual-insights/src/ml/classification/knn.ts @@ -0,0 +1,155 @@ +import { DataSource, Record, NormalizedRecord } from "../../commonTypes"; +interface BaseProps { + readonly dataSource: DataSource; + readonly dimensions: string[]; + readonly measures: string[]; +} +class Base { + public readonly dataSource: DataSource; + public readonly dimensions: string[]; + public readonly measures: string[]; + protected valueSets: Array >; + protected valueParser: string[][]; + protected ranges: Array<[number, number]>; + public normalizedDataSource: NormalizedRecord[]; + constructor (props: BaseProps) { + const { dataSource, dimensions, measures } = props; + this.dataSource = dataSource; + this.dimensions = dimensions; + this.measures = measures; + } + public normalize(): NormalizedRecord[] { + this.normalizedDataSource = []; + this.valueSets = []; + this.valueParser = []; + this.ranges = []; + this.dimensions.forEach(dim => { + this.valueSets.push(new Map()); + this.valueParser.push([]); + }) + this.measures.forEach(() => { + this.ranges.push([Infinity, -Infinity]); + }) + this.dataSource.forEach(record => { + this.dimensions.forEach((dim, index) => { + let value = (record[dim] || 'others').toString(); + if (!this.valueSets[index].has(value)) { + this.valueSets[index].set(value, this.valueSets[index].size); + this.valueParser[index].push(value) + } + }) + this.measures.forEach((mea, index) => { + let value = 
record[mea]; + if (typeof value === 'number') { + this.ranges[index][0] = Math.min(this.ranges[index][0], value); + this.ranges[index][1] = Math.max(this.ranges[index][1], value); + } + }) + }) + this.dataSource.forEach(record => { + let normalizedRecord = this.normalizeRecord(record); + this.normalizedDataSource.push(normalizedRecord); + }) + return this.normalizedDataSource; + } + public normalizeRecord (record: Record): NormalizedRecord { + let normalizedRecord: NormalizedRecord = {}; + this.measures.forEach((mea, index) => { + normalizedRecord[mea] = (record[mea] - this.ranges[index][0]) / (this.ranges[index][1] - this.ranges[index][0]); + }) + this.dimensions.forEach((dim, index) => { + let value = (record[dim] || 'others').toString(); + normalizedRecord[dim] = this.valueSets[index].get(value); + }) + return normalizedRecord; + } +} +export interface KNNProps extends BaseProps { + K: number; +} +export class KNN extends Base { + public K: number; + public features: string[]; + public targets: string[]; + constructor (props: KNNProps) { + super(props); + const { K } = props; + this.K = K; + this.normalize(); + } + public getNeighbors(targetRecord: NormalizedRecord, features: string[], weights: number[] | undefined = []): NormalizedRecord[] { + if (weights.length !== features.length) { + features.forEach(f => { + weights.push(1) + }) + } + // let normalizedRecord = this.normalizeRecord(targetRecord); + let dimFeatures: string[] = []; + let meaFeatures: string[] = []; + let dimWeights: number[] = []; + let meaWeights: number[] = []; + let dimSets: Set = new Set(this.dimensions); + for (let i = 0; i < features.length; i ++) { + if (dimSets.has(features[i])) { + dimFeatures.push(features[i]); + dimWeights.push(weights[i]) + } else { + meaFeatures.push(features[i]); + meaWeights.push(weights[i]); + } + } + // let legalFeatures = features.filter(f => this.measures.includes(f)); + let distances: Array<{dis: number, index: number}> = []; + 
this.normalizedDataSource.forEach((record, rIndex) => { + let dis = 0; + meaFeatures.forEach((feature, index) => { + dis += ((record[feature] - targetRecord[feature]) * meaWeights[index]) ** 2; + }) + dimFeatures.forEach((feature, index) => { + if (record[feature] !== targetRecord[feature]) { + dis += dimWeights[index] ** 2; + } + }) + distances.push({ + dis, + index: rIndex + }); + }) + distances.sort((a, b) => { + return a.dis - b.dis; + }); + let ans: NormalizedRecord[] = []; + let len = Math.min(this.K, distances.length); + for (let i = 0; i < len; i++) { + ans.push(this.normalizedDataSource[distances[i].index]) + } + return ans; + } + public getTargetValue(targets: string[], neighbors: NormalizedRecord[]): any[] { + let ans = []; + targets.forEach(tar => { + let votes: Map = new Map(); + neighbors.forEach(nei => { + if (!votes.has(nei[tar])) { + votes.set(nei[tar], 0) + } + votes.set(nei[tar], votes.get(nei[tar]) + 1) + }) + let mostCount = 0; + let mostFeature = 0; + for (let vote of votes) { + if (vote[1] > mostCount) { + mostCount = vote[1]; + mostFeature = vote[0]; + } + } + let dimIndex = this.dimensions.indexOf(tar) + if (dimIndex > -1) { + ans.push(this.valueParser[dimIndex][mostFeature]); + } else { + ans.push(mostFeature) + } + }) + return ans; + } +} \ No newline at end of file diff --git a/packages/visual-insights/src/ml/index.ts b/packages/visual-insights/src/ml/index.ts index bec5cadf..ecd8ea2b 100644 --- a/packages/visual-insights/src/ml/index.ts +++ b/packages/visual-insights/src/ml/index.ts @@ -1,11 +1,17 @@ import * as Cluster from './cluster/index'; +import { KNN } from './classification/knn'; import { IsolationForest } from './outlier/isolationForest'; const Outier = { IsolationForest } +const Classification = { + KNN +} + export { Cluster, - Outier + Outier, + Classification } \ No newline at end of file diff --git a/packages/visual-insights/test/insights.js b/packages/visual-insights/test/insights.js index 9461c38a..e698536b 100644 --- 
a/packages/visual-insights/test/insights.js +++ b/packages/visual-insights/test/insights.js @@ -23,7 +23,7 @@ describe('insights test', function () { it('print(dev pipeline)', function () { const result = getVisSpaces(cleanData, dimensions, measures); - console.log('new pipeline result', result) + // console.log('new pipeline result', result) assert.equal(result.length > 0, true); }) diff --git a/packages/visual-insights/test/ml.js b/packages/visual-insights/test/ml.js index 89780bff..26321289 100644 --- a/packages/visual-insights/test/ml.js +++ b/packages/visual-insights/test/ml.js @@ -2,7 +2,7 @@ const fs = require('fs'); const assert = require('assert'); const path = require('path'); -const { Outier, Cleaner } = require('../build/cjs/index'); +const { Outier, Cleaner, Classification } = require('../build/cjs/index'); const datasetPath = path.resolve(__dirname, './dataset/titanic.json'); const dataset = JSON.parse(fs.readFileSync(datasetPath).toString()); @@ -70,4 +70,21 @@ describe('machine learning algorithms', function () { }) }) }) + describe('classification', function () { + describe('KNN', function () { + it('titanic', function () { + let knn = new Classification.KNN({ + dataSource, + dimensions, + measures, + K: 5 + }) + let sample = dataSource[0]; + let neighbors = knn.getNeighbors(sample, ['Sex', 'Pclass', 'Age', 'Parch']); + // let predict = knn.getTargetValue(['Survived'], neighbors); + // console.log(sample, predict) + assert.equal(neighbors.length, 5); + }) + }) + }) }) \ No newline at end of file From 3a06921542d11af584e056ace0502123984374ec Mon Sep 17 00:00:00 2001 From: ObservedObserver <270001151@qq.com> Date: Mon, 9 Mar 2020 11:43:52 +0800 Subject: [PATCH 08/10] doc: update current pipeline --- README.zh-CN.md | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/README.zh-CN.md b/README.zh-CN.md index 91387e65..3c5939f0 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -90,31 +90,34 @@ 
Rath的主要分析流程在dashboard模块中有每个环节的可视化,可 ![](https://cdn.nlark.com/yuque/0/2019/png/171008/1570692438037-b2ce208d-bd1d-4b38-be27-9251bbb171d2.png) ### Univariate summary - -好累,翻译不动了,改日再来。欢迎pr... -For the first step, visual-insights analyze all the fields in the dataset independently. It gets the fields' distributions and calculate its entropy. Besides, it will define a semantic type (`quantitative`, `ordinal`, `temporal`, `nominal`) for each field. More details of the field will be displayed when hover your mouse on the fields. +For the first step, Rath analyzes all the fields in the dataset independently. It gets the fields' distributions and calculates their entropy. Besides, it will define a semantic type (`quantitative`, `ordinal`, `temporal`, `nominal`) for each field. More details of a field will be displayed when you hover your mouse over it. ![](https://cdn.nlark.com/yuque/0/2019/jpeg/171008/1570614609678-33d5f2c1-e51e-4bcd-8343-271a041f7519.jpeg) Then, it will find the fields with high entropy and try to reduce it by grouping the field (for example). Only dimensions participates this process. ### Subspaces -In this step, visual insights search the combination of fields, and calculate the entropy of each measure with some aggregation operators. +In this step, visual-insights searches the combinations of fields. Visual-insights supposes that any two fields that appear in a view should be correlated with each other; otherwise they should be displayed in separate views. Visual-insights now uses Cramér's V and Pearson's correlation coefficient for different types of fields' correlation. ![](https://chspace.oss-cn-hongkong.aliyuncs.com/visual-insights/subspaces.svg) -### Correlation -After one subspace is specified (try to click one row of subsapce in notebook), it will analyze the correlation of measures in the space. 
+#### Correlation + +For example, the correlation of measures: ![](https://chspace.oss-cn-hongkong.aliyuncs.com/visual-insights/correlation.svg) -### Clustering +#### Clustering It helps you to cluster all the measures based on their correlation. It puts all the variables who are strongly related together to make a specific view (with specified dimenions). -Click one group of measures and visualization will be shown at the bottom of the page. ![](https://chspace.oss-cn-hongkong.aliyuncs.com/visual-insights/clustering.svg) +### Insight Extraction +After we get many subspaces, we can check the insight significance of each space. Currently, visual-insights supports trend, outlier, and group (whether different groups of data behave differently for specific measures) insights. + +![](https://chspace.oss-cn-hongkong.aliyuncs.com/visual-insights/rath-demo.jpg) + ### Specification & Visualization From 1f7627072e04cd377f3eebeef0c6da4c1d72c005 Mon Sep 17 00:00:00 2001 From: ObservedObserver <270001151@qq.com> Date: Mon, 9 Mar 2020 11:46:24 +0800 Subject: [PATCH 09/10] Update README.md --- README.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index e2b4bdb9..0ece218b 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,7 @@ npm i visual-insights --save` ## How does it work The working process are visualized in notebook board in the application. *** Main process of the algorithm is shown in the `notebook` board. *** Here shows how rath use visual-insights to make a analytic pipeline. -![](https://cdn.nlark.com/yuque/0/2019/png/171008/1570692438037-b2ce208d-bd1d-4b38-be27-9251bbb171d2.png) +![](https://chspace.oss-cn-hongkong.aliyuncs.com/visual-insights/rath-arc.png) ### Univariate summary For the first step, rath analyze all the fields in the dataset independently. It gets the fields' distributions and calculate its entropy. Besides, it will define a semantic type (`quantitative`, `ordinal`, `temporal`, `nominal`) for each field. 
More details of the field will be displayed when hover your mouse on the fields. @@ -96,22 +96,26 @@ For the first step, rath analyze all the fields in the dataset independently. It Then, it will find the fields with high entropy and try to reduce it by grouping the field (for example). Only dimensions participates this process. ### Subspaces -In this step, visual insights search the combination of fields, and calculate the entropy of each measure with some aggregation operators. +In this step, visual-insights searches the combinations of fields. Visual-insights supposes that any two fields that appear in a view should be correlated with each other; otherwise they should be displayed in separate views. Visual-insights now uses Cramér's V and Pearson's correlation coefficient for different types of fields' correlation. ![](https://chspace.oss-cn-hongkong.aliyuncs.com/visual-insights/subspaces.svg) -### Correlation -After one subspace is specified (try to click one row of subsapce in notebook), it will analyze the correlation of measures in the space. +#### Correlation + +For example, the correlation of measures: ![](https://chspace.oss-cn-hongkong.aliyuncs.com/visual-insights/correlation.svg) -### Clustering +#### Clustering It helps you to cluster all the measures based on their correlation. It puts all the variables who are strongly related together to make a specific view (with specified dimenions). -Click one group of measures and visualization will be shown at the bottom of the page. ![](https://chspace.oss-cn-hongkong.aliyuncs.com/visual-insights/clustering.svg) +### Insight Extraction +After we get many subspaces, we can check the insight significance of each space. 
Currently, visual-insights supports trend, outlier, and group (whether different groups of data behave differently for specific measures) insights. + +![](https://chspace.oss-cn-hongkong.aliyuncs.com/visual-insights/rath-demo.jpg) ### Specification & Visualization From 51809e9e45aaf72bdb991cb43fdc3a46f9670980 Mon Sep 17 00:00:00 2001 From: ObservedObserver <270001151@qq.com> Date: Mon, 9 Mar 2020 12:01:26 +0800 Subject: [PATCH 10/10] fix: add hint for no cooked data cases. --- packages/frontend/src/pages/dev/index.tsx | 237 +++++++++++----------- 1 file changed, 122 insertions(+), 115 deletions(-) diff --git a/packages/frontend/src/pages/dev/index.tsx b/packages/frontend/src/pages/dev/index.tsx index 33d869a5..94a52c60 100644 --- a/packages/frontend/src/pages/dev/index.tsx +++ b/packages/frontend/src/pages/dev/index.tsx @@ -82,11 +82,6 @@ const DevPage: React.FC = props => { cookedMeasures } = state; const { dimScores } = getters; - console.log({ - cookedDataSource, - cookedDimensions, - cookedMeasures - }, getters) const viewSpaceList = useMemo(() => { return insightViewSpace.filter(s => s.significance >= sigThreshold); }, [insightViewSpace, sigThreshold]) @@ -122,7 +117,7 @@ const DevPage: React.FC = props => { } return [] }, [insightViewSpace, dataView]) - console.log(relatedViews) + useEffect(() => { if (dataView === null) return; const { schema } = dataView; @@ -163,121 +158,133 @@ const DevPage: React.FC = props => { }); }} /> -
- { - setLoading(true); - getInsightViewSpace( - cookedDataSource, - cookedDimensions, - cookedMeasures - ).then(res => { - setInsightViewSpace(res); - setLoading(false); - }); - }} - /> - {loading && } + { + cookedDataSource.length > 0 &&
+ { + setLoading(true); + getInsightViewSpace( + cookedDataSource, + cookedDimensions, + cookedMeasures + ).then(res => { + setInsightViewSpace(res); + setLoading(false); + }); + }} + /> + {loading && } - -
- - `${value}%`} - showValue={true} - onChange={(value: number) => { - setSigThreshold(value / 100); - setChartIndex(0); - }} - /> -

- There are {viewSpaceList.length} of views of which insight - significance is no less than {(sigThreshold * 100).toFixed(2)} % -

-
-
-
- { - setChartIndex((Number(value) - 1) % viewSpaceList.length); - }} - onIncrement={() => { - setChartIndex((chartIndex + 1) % viewSpaceList.length); - }} - onDecrement={() => { - setChartIndex( - (chartIndex - 1 + viewSpaceList.length) % - viewSpaceList.length - ); + +
+ + `${value}%`} + showValue={true} + onChange={(value: number) => { + setSigThreshold(value / 100); + setChartIndex(0); }} - incrementButtonAriaLabel={"Increase value by 1"} - decrementButtonAriaLabel={"Decrease value by 1"} /> +

+ There are {viewSpaceList.length} of views of which insight + significance is no less than {(sigThreshold * 100).toFixed(2)} % +

-
- -
- { - relatedViews.length > 0 && relatedViews.filter(view => view.significance >= sigThreshold).map(view => ( - - {view.type} - - )) - } - { - viewSpaceList[chartIndex] &&

- Dimensions are {viewSpaceList[chartIndex].dimensions}, and - measures are {viewSpaceList[chartIndex].measures}.
- There is a significance of - {(viewSpaceList[chartIndex].significance * 100).toFixed(2)}% - that there exits a {viewSpaceList[chartIndex].type} in the - graph.
- {JSON.stringify(viewSpaceList[chartIndex].description)} -

- } +
+
+ { + setChartIndex((Number(value) - 1) % viewSpaceList.length); + }} + onIncrement={() => { + setChartIndex((chartIndex + 1) % viewSpaceList.length); + }} + onDecrement={() => { + setChartIndex( + (chartIndex - 1 + viewSpaceList.length) % + viewSpaceList.length + ); + }} + incrementButtonAriaLabel={"Increase value by 1"} + decrementButtonAriaLabel={"Decrease value by 1"} + /> +
+
+ +
+ { + relatedViews.length > 0 && relatedViews.filter(view => view.significance >= sigThreshold).map(view => ( + + {view.type} + + )) + } + { + viewSpaceList[chartIndex] &&

+ Dimensions are {viewSpaceList[chartIndex].dimensions}, and + measures are {viewSpaceList[chartIndex].measures}.
+ There is a significance of + {(viewSpaceList[chartIndex].significance * 100).toFixed(2)}% + that there exits a {viewSpaceList[chartIndex].type} in the + graph.
+ {JSON.stringify(viewSpaceList[chartIndex].description)} +

+ } +
-
-
-
- - {viewSpaceList.length > 0 && dataView !== null && ( -
- -
- )} -
+
+
+ + {viewSpaceList.length > 0 && dataView !== null && ( +
+ +
+ )} +
+ } + { + cookedDataSource.length === 0 &&
+

+ Dev Page now is testing for different types of insight worker. +
+ If you see this hint, it means you have not upload dataSource or not click the 'extract insights' button + in dataSource page which will produce a cooked dataSource for dev page. +

+
+ }
); }