-
Notifications
You must be signed in to change notification settings - Fork 1
/
model.h
executable file
·157 lines (133 loc) · 5.26 KB
/
model.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#ifndef _MODEL_H
#define _MODEL_H
#include "constants.h"
#include "dataset.h"
#include "utils.h"
#include "database.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <time.h>
#include <boost/math/special_functions/digamma.hpp>
using namespace std;
using namespace boost::math;
// LDA model
/**
 * State and entry points of the LDA model: file locations, hyperparameters,
 * Gibbs-sampling count tables, and the estimation / inference / ranking /
 * classification operations.
 *
 * Only the default constructor is defined in this header; every other member
 * function is merely declared here.
 *
 * NOTE(review): the class holds raw pointers and declares a destructor, but
 * no copy constructor or copy assignment. If ~model() releases those
 * pointers, copying a model object would free them twice -- confirm against
 * the destructor's definition before copying instances.
 */
class model {
public:
    // ---------------- files and locations ----------------

    // NOTE(review): not documented here; presumably (class name, id/count)
    // pairs for the movie classes -- confirm against the implementation.
    vector<pair<string, int> > movie_classes;

    // File containing movie classes.
    string classes_trn_file;
    // NOTE(review): not documented here; presumably the counterpart of
    // classes_trn_file for another phase -- confirm.
    string classes_pre_file;

    // File holding the word map [string -> integer id].
    string wordmapfile;
    // Training log file.
    string trainlogfile;

    // Suffix of the topic-assignment file.
    string tassign_suffix;
    // Suffix of the theta file.
    string theta_suffix;
    // Suffix of the phi file.
    string phi_suffix;
    // Suffix of the file holding the other model parameters.
    string others_suffix;
    // Suffix of the file holding the words-per-topic listing.
    string twords_suffix;

    // Model directory.
    string dir;
    // Data file.
    string dfile;
    // Model name.
    string model_name;
    // Directory that needs to be preprocessed.
    string originDir;

    // NOTE(review): meaning of the values is not documented in this header.
    int file_type;

    // Current mode of the model. One of:
    //   MODEL_STATUS_UNKNOWN     unknown status
    //   MODEL_STATUS_EST         estimating from scratch
    //   MODEL_STATUS_ESTC        continue estimating from a previous model
    //   MODEL_STATUS_INF         do inference
    //   MODEL_STATUS_PREPROCESS  preprocess texts
    //   MODEL_STATUS_RANKING     ranking
    //   MODEL_STATUS_CLASSIFIER
    //   MODEL_STATUS_SERVER
    int model_status;

    // ---------------- datasets ----------------

    // Pointer to the training dataset object.
    dataset* ptrndata;
    // Pointer to the new dataset object.
    dataset* pnewdata;

    // Word map [int => string].
    mapid2word id2word;

    // ---------------- model parameters and variables ----------------

    // Dataset size (i.e., number of docs).
    int M;
    // Vocabulary size.
    int V;
    // Number of topics.
    int K;

    // LDA hyperparameters.
    double alpha;
    double beta;

    // Number of Gibbs sampling iterations.
    int niters;
    // Iteration at which the model was saved.
    int liter;
    // Saving period.
    int savestep;
    // Print out top words per each topic.
    int twords;

    // NOTE(review): the next three settings are not documented in this header.
    int withrawstrs;
    int disp;
    int rank_num;

    // Temporary variable used for sampling.
    double* p;

    // Topic assignments for words, size M x doc.size().
    int** z;
    // nw[i][j]: number of instances of word/term i assigned to topic j,
    // size V x K.
    int** nw;
    // nd[i][j]: number of words in document i assigned to topic j,
    // size M x K.
    int** nd;
    // nwsum[j]: total number of words assigned to topic j, size K.
    int* nwsum;
    // ndsum[i]: total number of words in document i, size M.
    int* ndsum;

    // Document-topic distributions, size M x K.
    double** theta;
    // Topic-word distributions, size K x V.
    double** phi;

    // NOTE(review): beta0 and sumbeta are not documented in this header.
    double* beta0;
    double sumbeta;

    // ---------------- inference only ----------------

    int inf_liter;
    int newM;
    int newV;
    int** newz;
    int** newnw;
    int** newnd;
    int* newnwsum;
    int* newndsum;
    double** newtheta;
    double** newphi;

    // NOTE(review): purpose of a, b, c and d is not documented in this header.
    double a;
    double b;
    double c;
    double d;

    // ---------------- construction / destruction ----------------

    // A freshly constructed model starts from set_default_values().
    model() { set_default_values(); }

    ~model();

    // Set default values for the variables.
    void set_default_values();

    // Initialize the model.
    int init();

    int textpreprocessor();

    // ---------------- loading and saving ----------------

    // Load an LDA model, either to continue estimating or to do inference.
    int load_model(string model_name);

    // Save the LDA model to files:
    //   model_name.tassign  topic assignments for words in docs
    //   model_name.theta    document-topic distributions
    //   model_name.phi      topic-word distributions
    //   model_name.others   other parameters of the model
    //                       (alpha, beta, M, V, K)
    int save_model(string model_name);
    int save_model_tassign(string filename);
    int save_model_theta(string filename);
    int save_model_phi(string filename);
    int save_model_others(string filename);
    int save_model_twords(string filename);

    // Save the outputs of inference.
    int save_inf_model(string model_name);
    int save_inf_model_tassign(string filename);
    int save_inf_model_newtheta(string filename);
    int save_inf_model_newphi(string filename);
    int save_inf_model_others(string filename);
    int save_inf_model_twords(string filename);

    // ---------------- estimation ----------------

    // Initialization for estimation.
    int init_est();
    int init_estc();

    // Estimate the LDA model using Gibbs sampling.
    void estimate();
    int sampling(int m, int n);
    void compute_theta();
    void compute_phi();
    void compute_alpha();
    void compute_beta();

    // ---------------- inference ----------------

    // Initialization for inference.
    int init_inf();

    // Inference for new (unseen) data based on the estimated LDA model.
    void inference();
    int inf_sampling(int m, int n);
    void compute_newtheta();
    void compute_newphi();

    // ---------------- preprocessing, ranking, classification ----------------

    void preprocess();

    int init_ranking();
    void ranking();
    vector<int> ranking(vector<int> candidate);
    vector<int> ranking(vector<int> candidate, string type, string category);

    void classification();
};
#endif