Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding LDA Transform #377

Merged
merged 43 commits into from
Jun 26, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
ec890d6
GetSummaryDataView() implementation for Pca and Linear Predictors
ganik May 18, 2018
9394cdd
typo
ganik May 18, 2018
b508e68
Implement ICanGetSummaryAsIRow instead of ICanGetSummaryAsIDataView o…
ganik May 25, 2018
e072546
Merge branch 'master' of https://github.com/ganik/machinelearning
ganik May 25, 2018
ec8200a
Add vector column for top rank eigenvectors
ganik May 27, 2018
cf37f62
remove unnecessary code
ganik May 27, 2018
9943586
remove space
ganik May 27, 2018
45cd5f2
fix build
ganik May 28, 2018
083645f
no need for slot names
ganik May 29, 2018
58d0f31
Merge pull request #1 from dotnet/master
ganik May 30, 2018
c5c0173
Enable back PCA anomaly tests
ganik May 30, 2018
695abc5
fix PCA Anomaly tests baseline
ganik May 30, 2018
eea9c69
fix unit tests for Release
ganik May 30, 2018
637b325
Add PCA azure test data
ganik May 30, 2018
bdec903
Added entrypoint summary test for LinearPredictor
ganik May 30, 2018
b942537
added PCA summary test
ganik May 30, 2018
678633c
use using() { } on IDisposable
ganik May 31, 2018
3031a4c
remove slot names
ganik Jun 4, 2018
b0c2e49
remove not needed datasets
ganik Jun 5, 2018
eb3df32
Merge pull request #2 from dotnet/master
ganik Jun 15, 2018
7cb7bc8
Added LDANative project
ganik Jun 19, 2018
0690012
Merge pull request #3 from dotnet/master
ganik Jun 19, 2018
9ee8703
remove malloc.h
ganik Jun 19, 2018
1c4310f
Merge branch 'master' of https://github.com/ganik/machinelearning
ganik Jun 19, 2018
1574123
no affinity for MAC for now
ganik Jun 20, 2018
341be9d
disable affinity for MAC for now
ganik Jun 20, 2018
179d6cf
adding LdaTransform
ganik Jun 20, 2018
8be2c79
Fix header
ganik Jun 21, 2018
c79589c
Fix name case
ganik Jun 21, 2018
b35d4cf
Update Entrypoint catalog test and CSharpApi
ganik Jun 21, 2018
e1e0421
reference lda native in tests
ganik Jun 21, 2018
1e9b0d3
add lda native reference to test project
ganik Jun 21, 2018
c41dc50
address comments
ganik Jun 25, 2018
7c2449f
remove aliases
ganik Jun 25, 2018
6e6cefb
rename LDANative to LdaNative folder
ganik Jun 25, 2018
545ae86
remove unicode char
ganik Jun 25, 2018
336ddb0
address comments
ganik Jun 25, 2018
98c15c6
remove aliases
ganik Jun 25, 2018
34eedf1
Add unit tests for "direct" API
ganik Jun 25, 2018
9b4dcab
add LdaNative reference to test proj
ganik Jun 25, 2018
6205510
thread affinity on OSX
ganik Jun 25, 2018
b72d194
include <mach/mach_types.h>
ganik Jun 25, 2018
1ce124b
include <mach/thread_act.h>
ganik Jun 25, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -92,5 +92,20 @@ public static CommonOutputs.TransformOutput CharTokenize(IHostEnvironment env, C
OutputData = view
};
}

/// <summary>
/// Entry point registered as "Transforms.LightLda": validates the arguments, builds an
/// <see cref="LdaTransform"/> over <c>input.Data</c>, and returns it both as the output
/// data view and wrapped in a <see cref="TransformModel"/>.
/// </summary>
/// <param name="env">Host environment; must not be null.</param>
/// <param name="input">LDA transform arguments, including the input data; must not be null.</param>
/// <returns>The transform output holding the model and the transformed view.</returns>
[TlcModule.EntryPoint(Name = "Transforms.LightLda", Desc = LdaTransform.Summary, UserName = LdaTransform.UserName, ShortName = LdaTransform.ShortName)]
public static CommonOutputs.TransformOutput LightLda(IHostEnvironment env, LdaTransform.Arguments input)
{
    // The environment is validated first because it is used to validate everything else.
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(input, nameof(input));

    var host = EntryPointUtils.CheckArgsAndCreateHost(env, "LightLda", input);
    var ldaView = new LdaTransform(host, input, input.Data);

    var output = new CommonOutputs.TransformOutput();
    output.Model = new TransformModel(host, ldaView, input.Data);
    output.OutputData = ldaView;
    return output;
}
}
}
357 changes: 357 additions & 0 deletions src/Microsoft.ML.Transforms/Text/LdaSingleBox.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,357 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.InteropServices;
using System.Security;

namespace Microsoft.ML.Runtime.TextAnalytics
{

/// <summary>
/// P/Invoke declarations for the native library named by <see cref="NativeDll"/>.
/// Every call takes an <see cref="LdaEngine"/> handle obtained from <see cref="CreateEngine"/>.
/// All declarations carry <c>SuppressUnmanagedCodeSecurity</c>.
/// </summary>
internal static class LdaInterface
{
/// <summary>
/// Handle to a native engine instance; wraps a single pointer-sized value and is
/// passed by value to every native call.
/// </summary>
public struct LdaEngine
{
public IntPtr Ptr;
}

// Name of the native library that DllImport resolves for all declarations below.
private const string NativeDll = "LdaNative";
/// <summary>Creates a native engine and returns its handle. Release it with <see cref="DestroyEngine"/>.</summary>
[DllImport(NativeDll), SuppressUnmanagedCodeSecurity]
internal static extern LdaEngine CreateEngine(int numTopic, int numVocab, float alphaSum, float beta, int numIter,
int likelihoodInterval, int numThread, int mhstep, int maxDocToken);

/// <summary>Native call that allocates model storage for the given sizes.</summary>
// NOTE(review): the managed wrapper LdaSingleBox.AllocateModelMemory passes numVocab before
// numTopic, i.e. the reverse of the parameter names declared here — confirm against the
// native export which order it actually expects.
[DllImport(NativeDll), SuppressUnmanagedCodeSecurity]
internal static extern void AllocateModelMemory(LdaEngine engine, int numTopic, int numVocab, long tableSize, long aliasTableSize);

/// <summary>Native call that allocates storage for <paramref name="docNum"/> documents and <paramref name="corpusSize"/> corpus entries.</summary>
[DllImport(NativeDll), SuppressUnmanagedCodeSecurity]
internal static extern void AllocateDataMemory(LdaEngine engine, int docNum, long corpusSize);

/// <summary>Runs native training. <paramref name="trainOutput"/> is marshalled as an ANSI string and may be null.</summary>
[DllImport(NativeDll, CharSet = CharSet.Ansi), SuppressUnmanagedCodeSecurity]
internal static extern void Train(LdaEngine engine, string trainOutput);

/// <summary>Retrieves two size statistics of the model through the out parameters.</summary>
[DllImport(NativeDll), SuppressUnmanagedCodeSecurity]
internal static extern void GetModelStat(LdaEngine engine, out long memBlockSize, out long aliasMemBlockSize);

/// <summary>Runs the native test routine; <paramref name="pLogLikelihood"/> is a caller-allocated buffer.</summary>
// presumably the native side writes one value per burn-in iteration into pLogLikelihood — verify.
[DllImport(NativeDll), SuppressUnmanagedCodeSecurity]
internal static extern void Test(LdaEngine engine, int numBurninIter, float[] pLogLikelihood);

/// <summary>Native cleanup of data previously fed to the engine.</summary>
[DllImport(NativeDll), SuppressUnmanagedCodeSecurity]
internal static extern void CleanData(LdaEngine engine);

/// <summary>Native cleanup of the engine's model.</summary>
[DllImport(NativeDll), SuppressUnmanagedCodeSecurity]
internal static extern void CleanModel(LdaEngine engine);

/// <summary>Destroys the native engine behind <paramref name="engine"/>; the handle must not be used afterwards.</summary>
[DllImport(NativeDll), SuppressUnmanagedCodeSecurity]
internal static extern void DestroyEngine(LdaEngine engine);

/// <summary>
/// Reads the topic entries of word <paramref name="wordId"/> into the caller-allocated arrays.
/// <paramref name="length"/> is passed by reference: callers in this file pass the array
/// capacity in and use the value after the call as the number of entries written.
/// </summary>
[DllImport(NativeDll), SuppressUnmanagedCodeSecurity]
internal static extern void GetWordTopic(LdaEngine engine, int wordId, int[] pTopic, int[] pProb, ref int length);

/// <summary>Writes <paramref name="length"/> topic entries for word <paramref name="wordId"/> into the engine.</summary>
[DllImport(NativeDll), SuppressUnmanagedCodeSecurity]
internal static extern void SetWordTopic(LdaEngine engine, int wordId, int[] pTopic, int[] pProb, int length);

/// <summary>Native setter for the alpha sum; the argument is named as an average document length.</summary>
[DllImport(NativeDll), SuppressUnmanagedCodeSecurity]
internal static extern void SetAlphaSum(LdaEngine engine, float avgDocLength);

/// <summary>Feeds one document given as parallel term-id / term-frequency arrays; returns an int from the native side.</summary>
[DllImport(NativeDll), SuppressUnmanagedCodeSecurity]
internal static extern int FeedInData(LdaEngine engine, int[] termId, int[] termFreq, int termNum, int numVocab);

/// <summary>Feeds one document given as a term-frequency array only; returns an int from the native side.</summary>
[DllImport(NativeDll), SuppressUnmanagedCodeSecurity]
internal static extern int FeedInDataDense(LdaEngine engine, int[] termFreq, int termNum, int numVocab);

/// <summary>
/// Reads the topic entries of document <paramref name="docId"/> into the caller-allocated arrays;
/// <paramref name="numTopicReturn"/> is passed by reference and read back by the caller as the entry count.
/// </summary>
[DllImport(NativeDll), SuppressUnmanagedCodeSecurity]
internal static extern void GetDocTopic(LdaEngine engine, int docId, int[] pTopic, int[] pProb, ref int numTopicReturn);

/// <summary>
/// Reads the summary of topic <paramref name="topicId"/> into the caller-allocated word and probability arrays.
/// Despite its name, the by-ref <paramref name="numTopicReturn"/> is used by the caller in this file
/// as the number of summary terms (capacity in, count out).
/// </summary>
[DllImport(NativeDll), SuppressUnmanagedCodeSecurity]
internal static extern void GetTopicSummary(LdaEngine engine, int topicId, int[] pWords, float[] pProb, ref int numTopicReturn);

/// <summary>
/// Runs inference on a single document given as parallel term-id / term-frequency arrays and writes
/// the resulting topic entries into <paramref name="pTopics"/> / <paramref name="pProbs"/>;
/// <paramref name="numTopicsMax"/> is passed by reference and read back as the entry count.
/// </summary>
// NOTE(review): `bool reset` uses default P/Invoke marshalling (a 4-byte Win32 BOOL). If the native
// parameter is a 1-byte C++ bool, it would need [MarshalAs(UnmanagedType.U1)] — confirm.
[DllImport(NativeDll), SuppressUnmanagedCodeSecurity]
internal static extern void TestOneDoc(LdaEngine engine, int[] termId, int[] termFreq, int termNum, int[] pTopics, int[] pProbs, ref int numTopicsMax, int numBurnIter, bool reset);

/// <summary>Same as <see cref="TestOneDoc"/> but the document is given as a term-frequency array only.</summary>
// NOTE(review): same default `bool` marshalling caveat as TestOneDoc — confirm against the native signature.
[DllImport(NativeDll), SuppressUnmanagedCodeSecurity]
internal static extern void TestOneDocDense(LdaEngine engine, int[] termFreq, int termNum, int[] pTopics, int[] pProbs, ref int numTopicsMax, int numBurninIter, bool reset);

/// <summary>Native initialization step; by its name, meant to be called before training.</summary>
[DllImport(NativeDll), SuppressUnmanagedCodeSecurity]
internal static extern void InitializeBeforeTrain(LdaEngine engine);

/// <summary>Native initialization step; by its name, meant to be called before testing.</summary>
[DllImport(NativeDll), SuppressUnmanagedCodeSecurity]
internal static extern void InitializeBeforeTest(LdaEngine engine);
}

/// <summary>
/// Managed wrapper that owns one native LDA engine handle and forwards training,
/// inference and model-access calls to it through <see cref="LdaInterface"/>.
/// </summary>
/// <remarks>
/// <see cref="CopyModel"/>, <see cref="GetDocTopicVector"/>, <see cref="GetModel"/>,
/// <see cref="GetTopicSummary"/> and <see cref="SetModel"/> reuse instance-level scratch
/// buffers, so concurrent calls to them on the same instance would share those buffers.
/// The native engine is released only by <see cref="Dispose"/>; there is no finalizer.
/// </remarks>
internal sealed class LdaSingleBox : IDisposable
{
    private LdaInterface.LdaEngine _engine;
    private bool _isDisposed;

    // Scratch buffers of length NumTopic used to exchange (topic, value) pairs with native code.
    private readonly int[] _topics;
    private readonly int[] _probabilities;

    // Scratch buffers of length _numSummaryTerms used by GetTopicSummary.
    private readonly int[] _summaryTerm;
    private readonly float[] _summaryTermProb;

    private readonly int _likelihoodInterval;
    private readonly float _alpha;
    private readonly float _beta;
    private readonly int _mhStep;
    private readonly int _numThread;
    private readonly int _numSummaryTerms;
    private readonly bool _denseOutput;

    public readonly int NumTopic;
    public readonly int NumVocab;

    /// <summary>
    /// Stores the configuration, allocates the scratch buffers and creates the native engine.
    /// </summary>
    /// <param name="numTopic">Number of topics; also the length of the topic scratch buffers.</param>
    /// <param name="numVocab">Vocabulary size forwarded to the native engine.</param>
    /// <param name="alpha">Forwarded to the native engine as its alpha sum; also used as the
    /// smoothing value added to every topic when <paramref name="denseOutput"/> is true.</param>
    /// <param name="beta">Forwarded to the native engine.</param>
    /// <param name="numIter">Forwarded to the native engine.</param>
    /// <param name="likelihoodInterval">Forwarded to the native engine.</param>
    /// <param name="numThread">Forwarded to the native engine.</param>
    /// <param name="mhstep">Forwarded to the native engine.</param>
    /// <param name="numSummaryTerms">Capacity of the buffers used by <see cref="GetTopicSummary"/>.</param>
    /// <param name="denseOutput">Whether <see cref="GetDocTopicVector"/> emits an entry for every topic.</param>
    /// <param name="maxDocToken">Forwarded to the native engine.</param>
    public LdaSingleBox(int numTopic, int numVocab, float alpha,
        float beta, int numIter, int likelihoodInterval, int numThread,
        int mhstep, int numSummaryTerms, bool denseOutput, int maxDocToken)
    {
        NumTopic = numTopic;
        NumVocab = numVocab;
        _alpha = alpha;
        _beta = beta;
        _mhStep = mhstep;
        _numSummaryTerms = numSummaryTerms;
        _denseOutput = denseOutput;
        _likelihoodInterval = likelihoodInterval;
        _numThread = numThread;

        _topics = new int[numTopic];
        _probabilities = new int[numTopic];

        _summaryTerm = new int[_numSummaryTerms];
        _summaryTermProb = new float[_numSummaryTerms];

        _engine = LdaInterface.CreateEngine(numTopic, numVocab, alpha, beta, numIter, likelihoodInterval, numThread, mhstep, maxDocToken);
    }

    /// <summary>
    /// Validates that all sizes are non-negative and asks the native engine to allocate model memory.
    /// </summary>
    public void AllocateModelMemory(int numTopic, int numVocab, long tableSize, long aliasTableSize)
    {
        Contracts.Check(numTopic >= 0);
        Contracts.Check(numVocab >= 0);
        Contracts.Check(tableSize >= 0);
        Contracts.Check(aliasTableSize >= 0);

        // NOTE(review): numVocab is passed before numTopic, the reverse of the parameter names on the
        // managed extern declaration. The order is deliberately left as it was because the order the
        // native implementation expects is not visible from here — confirm before changing.
        LdaInterface.AllocateModelMemory(_engine, numVocab, numTopic, tableSize, aliasTableSize);
    }

    /// <summary>
    /// Validates that both sizes are non-negative and asks the native engine to allocate data memory.
    /// </summary>
    public void AllocateDataMemory(int docNum, long corpusSize)
    {
        Contracts.Check(docNum >= 0);
        Contracts.Check(corpusSize >= 0);
        LdaInterface.AllocateDataMemory(_engine, docNum, corpusSize);
    }

    /// <summary>
    /// Runs native training. A null, empty or whitespace-only <paramref name="trainOutput"/>
    /// is passed to the native side as null.
    /// </summary>
    public void Train(string trainOutput)
    {
        LdaInterface.Train(_engine, string.IsNullOrWhiteSpace(trainOutput) ? null : trainOutput);
    }

    /// <summary>
    /// Retrieves the two model size statistics reported by the native engine.
    /// </summary>
    public void GetModelStat(out long memBlockSize, out long aliasMemBlockSize)
    {
        LdaInterface.GetModelStat(_engine, out memBlockSize, out aliasMemBlockSize);
    }

    /// <summary>
    /// Runs the native test routine with a buffer of <paramref name="numBurninIter"/> values and
    /// copies the values written by the native side into <paramref name="logLikelihood"/>.
    /// </summary>
    /// <param name="numBurninIter">Number of burn-in iterations; must be non-negative.</param>
    /// <param name="logLikelihood">Destination for the results. May be null, in which case the
    /// results are discarded. If it is shorter than <paramref name="numBurninIter"/>, only the
    /// values that fit are copied.</param>
    public void Test(int numBurninIter, float[] logLikelihood)
    {
        Contracts.Check(numBurninIter >= 0);
        var pLogLikelihood = new float[numBurninIter];
        LdaInterface.Test(_engine, numBurninIter, pLogLikelihood);

        // The parameter is passed by value, so re-assigning it (as this method used to do) never
        // reached the caller. Copy into the caller's array instead so the results are observable.
        if (logLikelihood != null)
            Array.Copy(pLogLikelihood, logLikelihood, Math.Min(pLogLikelihood.Length, logLikelihood.Length));
    }

    /// <summary>Forwards to the native data cleanup routine.</summary>
    public void CleanData()
    {
        LdaInterface.CleanData(_engine);
    }

    /// <summary>Forwards to the native model cleanup routine.</summary>
    public void CleanModel()
    {
        LdaInterface.CleanModel(_engine);
    }

    /// <summary>
    /// Copies the topic entries of word <paramref name="wordId"/> from <paramref name="trainer"/>'s
    /// engine into this instance's engine, using this instance's scratch buffers.
    /// </summary>
    public void CopyModel(LdaSingleBox trainer, int wordId)
    {
        int length = NumTopic;
        LdaInterface.GetWordTopic(trainer._engine, wordId, _topics, _probabilities, ref length);
        LdaInterface.SetWordTopic(_engine, wordId, _topics, _probabilities, length);
    }

    /// <summary>Forwards <paramref name="averageDocLength"/> to the native alpha-sum setter.</summary>
    public void SetAlphaSum(float averageDocLength)
    {
        LdaInterface.SetAlphaSum(_engine, averageDocLength);
    }

    /// <summary>
    /// Feeds one document, given as parallel term-id / term-value arrays, to the native engine.
    /// </summary>
    /// <param name="termID">Term ids; the first <paramref name="termNum"/> entries are copied.</param>
    /// <param name="termVal">Term values; every entry is truncated to an int.</param>
    /// <param name="termNum">Number of terms; must be positive and not exceed either array length.</param>
    /// <param name="numVocab">Must equal <see cref="NumVocab"/>.</param>
    /// <returns>The int returned by the native call.</returns>
    public int LoadDoc(int[] termID, double[] termVal, int termNum, int numVocab)
    {
        Contracts.Check(numVocab == NumVocab);
        Contracts.Check(termNum > 0);
        Contracts.Check(termID.Length >= termNum);
        Contracts.Check(termVal.Length >= termNum);

        int[] pID = new int[termNum];
        Array.Copy(termID, pID, termNum);
        int[] pVal = ToCounts(termVal);
        return LdaInterface.FeedInData(_engine, pID, pVal, termNum, NumVocab);
    }

    /// <summary>
    /// Feeds one document, given as a term-value array only, to the native engine.
    /// </summary>
    /// <param name="termVal">Term values; every entry is truncated to an int.</param>
    /// <param name="termNum">Must be positive and not exceed the length of <paramref name="termVal"/>.</param>
    /// <param name="numVocab">Must equal <see cref="NumVocab"/>.</param>
    /// <returns>The int returned by the native call.</returns>
    public int LoadDocDense(double[] termVal, int termNum, int numVocab)
    {
        Contracts.Check(numVocab == NumVocab);
        Contracts.Check(termNum > 0);
        Contracts.Check(termVal.Length >= termNum);

        // The whole array is converted (not just termNum entries), as before.
        int[] pVal = ToCounts(termVal);
        return LdaInterface.FeedInDataDense(_engine, pVal, termNum, NumVocab);
    }

    /// <summary>
    /// Reads the topic entries of document <paramref name="docID"/> from the native engine.
    /// </summary>
    /// <returns>
    /// With dense output enabled: one entry per topic id in [0, NumTopic), where each value is the
    /// native value plus alpha, or alpha alone for topics the native side did not report.
    /// Otherwise: only the entries reported by the native side, unmodified.
    /// </returns>
    public List<KeyValuePair<int, float>> GetDocTopicVector(int docID)
    {
        int numTopicReturn = NumTopic;
        LdaInterface.GetDocTopic(_engine, docID, _topics, _probabilities, ref numTopicReturn);

        // Guard against the native side reporting more entries than the buffers can hold.
        Contracts.Check(numTopicReturn <= NumTopic);

        if (!_denseOutput)
            return ToTopicList(_topics, _probabilities, numTopicReturn);

        var topicRet = new List<KeyValuePair<int, float>>(NumTopic);
        int currentTopic = 0;
        for (int i = 0; i < numTopicReturn; i++)
        {
            // Use a value to smooth the count so that we get dense output on each topic;
            // the smooth value is usually set to 0.1. Gaps before the next reported topic
            // are filled with that value alone.
            while (currentTopic < _topics[i])
            {
                topicRet.Add(new KeyValuePair<int, float>(currentTopic, _alpha));
                currentTopic++;
            }
            topicRet.Add(new KeyValuePair<int, float>(_topics[i], _probabilities[i] + _alpha));
            currentTopic++;
        }

        // Fill the tail after the last reported topic.
        while (currentTopic < NumTopic)
        {
            topicRet.Add(new KeyValuePair<int, float>(currentTopic, _alpha));
            currentTopic++;
        }
        return topicRet;
    }

    /// <summary>
    /// Runs native inference on one document given as parallel term-id / term-value arrays.
    /// </summary>
    /// <param name="termID">Term ids; the first <paramref name="termNum"/> entries are copied.</param>
    /// <param name="termVal">Term values; every entry is truncated to an int.</param>
    /// <param name="termNum">Number of terms; must be positive and not exceed either array length.</param>
    /// <param name="numBurninIter">Forwarded to the native call.</param>
    /// <param name="reset">Forwarded to the native call.</param>
    /// <returns>The (topic, value) entries reported by the native side.</returns>
    public List<KeyValuePair<int, float>> TestDoc(int[] termID, double[] termVal, int termNum, int numBurninIter, bool reset)
    {
        Contracts.Check(termNum > 0);
        Contracts.Check(termVal.Length >= termNum);
        Contracts.Check(termID.Length >= termNum);

        int[] pID = new int[termNum];
        Array.Copy(termID, pID, termNum);
        int[] pVal = ToCounts(termVal);
        int[] pTopic = new int[NumTopic];
        int[] pProb = new int[NumTopic];
        int numTopicReturn = NumTopic;

        LdaInterface.TestOneDoc(_engine, pID, pVal, termNum, pTopic, pProb, ref numTopicReturn, numBurninIter, reset);

        // The native call can change numTopicReturn; fail rather than read past the buffers.
        Contracts.Check(numTopicReturn <= NumTopic);
        return ToTopicList(pTopic, pProb, numTopicReturn);
    }

    /// <summary>
    /// Runs native inference on one document given as a term-value array only.
    /// </summary>
    /// <param name="termVal">Term values; every entry is truncated to an int.</param>
    /// <param name="termNum">Must be positive and not exceed the length of <paramref name="termVal"/>.</param>
    /// <param name="numBurninIter">Must be positive; forwarded to the native call.</param>
    /// <param name="reset">Forwarded to the native call.</param>
    /// <returns>The (topic, value) entries reported by the native side.</returns>
    public List<KeyValuePair<int, float>> TestDocDense(double[] termVal, int termNum, int numBurninIter, bool reset)
    {
        Contracts.Check(termNum > 0);
        Contracts.Check(numBurninIter > 0);
        Contracts.Check(termVal.Length >= termNum);

        int[] pVal = ToCounts(termVal);
        int[] pTopic = new int[NumTopic];
        int[] pProb = new int[NumTopic];
        int numTopicReturn = NumTopic;

        // NOTE(review): an earlier comment here described a separate "restart" variant that resets
        // the internal random number generator so that the same input yields reproducible results;
        // presumably the reset flag now selects that behavior — confirm against the native code.
        LdaInterface.TestOneDocDense(_engine, pVal, termNum, pTopic, pProb, ref numTopicReturn, numBurninIter, reset);

        // The native call can change numTopicReturn; fail rather than read past the buffers.
        Contracts.Check(numTopicReturn <= NumTopic);
        return ToTopicList(pTopic, pProb, numTopicReturn);
    }

    /// <summary>Forwards to the native pre-training initialization.</summary>
    public void InitializeBeforeTrain()
    {
        LdaInterface.InitializeBeforeTrain(_engine);
    }

    /// <summary>Forwards to the native pre-test initialization.</summary>
    public void InitializeBeforeTest()
    {
        LdaInterface.InitializeBeforeTest(_engine);
    }

    /// <summary>
    /// Reads the topic entries of word <paramref name="wordId"/> from the native engine.
    /// </summary>
    /// <returns>A new array holding the (topic, value) pairs reported by the native side.</returns>
    public KeyValuePair<int, int>[] GetModel(int wordId)
    {
        int length = NumTopic;
        LdaInterface.GetWordTopic(_engine, wordId, _topics, _probabilities, ref length);

        // Guard against the native side reporting a count the scratch buffers cannot hold.
        Contracts.Check(0 <= length && length <= NumTopic);

        var wordTopicVector = new KeyValuePair<int, int>[length];
        for (int i = 0; i < length; i++)
            wordTopicVector[i] = new KeyValuePair<int, int>(_topics[i], _probabilities[i]);
        return wordTopicVector;
    }

    /// <summary>
    /// Reads the summary of topic <paramref name="topicId"/> from the native engine.
    /// </summary>
    /// <returns>A new array holding the (term, value) pairs reported by the native side,
    /// at most as many as the summary-term capacity given at construction.</returns>
    public KeyValuePair<int, float>[] GetTopicSummary(int topicId)
    {
        int length = _numSummaryTerms;
        LdaInterface.GetTopicSummary(_engine, topicId, _summaryTerm, _summaryTermProb, ref length);

        // Guard against the native side reporting a count the scratch buffers cannot hold.
        Contracts.Check(0 <= length && length <= _numSummaryTerms);

        var topicSummary = new KeyValuePair<int, float>[length];
        for (int i = 0; i < length; i++)
            topicSummary[i] = new KeyValuePair<int, float>(_summaryTerm[i], _summaryTermProb[i]);
        return topicSummary;
    }

    /// <summary>
    /// Writes <paramref name="topicNum"/> topic entries for term <paramref name="termID"/> into the
    /// native engine, staging them through the scratch buffers.
    /// </summary>
    /// <param name="termID">Term id; must be non-negative.</param>
    /// <param name="topicID">Topic ids; the first <paramref name="topicNum"/> entries are used.</param>
    /// <param name="topicProb">Topic values; the first <paramref name="topicNum"/> entries are used.</param>
    /// <param name="topicNum">Number of entries; must not exceed <see cref="NumTopic"/>.</param>
    public void SetModel(int termID, int[] topicID, int[] topicProb, int topicNum)
    {
        Contracts.Check(termID >= 0);
        Contracts.Check(topicNum <= NumTopic);
        Array.Copy(topicID, _topics, topicNum);
        Array.Copy(topicProb, _probabilities, topicNum);
        LdaInterface.SetWordTopic(_engine, termID, _topics, _probabilities, topicNum);
    }

    /// <summary>
    /// Destroys the native engine and clears the handle. Safe to call more than once;
    /// only the first call has an effect.
    /// </summary>
    public void Dispose()
    {
        if (_isDisposed)
            return;
        _isDisposed = true;
        LdaInterface.DestroyEngine(_engine);
        _engine.Ptr = IntPtr.Zero;
    }

    // Truncates every value to an int, producing an array of the same length as the input.
    private static int[] ToCounts(double[] values)
    {
        return Array.ConvertAll(values, value => (int)value);
    }

    // Builds the (topic, value) list from the first `count` entries of two parallel buffers.
    // A non-positive count yields an empty list.
    private static List<KeyValuePair<int, float>> ToTopicList(int[] topics, int[] values, int count)
    {
        var result = new List<KeyValuePair<int, float>>(Math.Max(count, 0));
        for (int i = 0; i < count; i++)
            result.Add(new KeyValuePair<int, float>(topics[i], (float)values[i]));
        return result;
    }
}
}
Loading