# ISR_classification_SparkR.R
library(SparkR)
library(rjson)
library(RTextTools)
# Start a SparkR context against the "local" master.
sc <- sparkR.init("local")
# Register rjson and RTextTools with the Spark context. The package names are
# passed as bare symbols, not strings.
# NOTE(review): presumably this makes both packages available inside the
# functions mapped over the data below -- confirm against the SparkR docs.
includePackage(sc, rjson)
includePackage(sc, RTextTools)
# Load the article dump from a hard-coded local path.
# NOTE(review): each element of `input` is handed to fromJSON() on its own
# below, so the file presumably holds one JSON record per line -- confirm.
input <- textFile(sc, "file:///home/zhicheng/ISR/articles_3364.json")
# Debugging aid: count(input)
# Parse every element of `input` with fromJSON.
articles <- lapply(input, fromJSON)
# Debugging aids: count(articles); first(articles)
# Seed phrases for the "governance" class.
# Result: a two-column character matrix without dimnames; column 1 holds the
# phrase and column 2 the class label (cbind() recycles the single label down
# the whole column).
gov_terms <- cbind(
  c(
    "Audit Committee Independence",
    "Compensation Committee Independence",
    "Nomination Committee Independence",
    "Board Meeting Attendance Average",
    "Board Gender Diversity",
    "Compensation Policy",
    "Sustainability Compensation Incentives",
    "Senior Executive Total Compensation",
    "Board Member Total Compensation",
    "Experienced Board",
    "Highest Remuneration Package",
    "Anti Takeover Devices",
    "Compensation Controversies",
    "Non-Executive Board Member Total",
    "Board Structure",
    "Board Functions",
    "Shareholders Rights",
    "Vision and Strategy",
    "Implementation",
    "Quality and consistency",
    "corporate disclosure obligations",
    "litigation costs",
    "independent directors",
    "sustainability initiatives and networks",
    "employee incentives",
    "core business decisions",
    "Executive compensation",
    "blow-ups",
    "financial surprises",
    "Poison pills",
    "Takeover defenses",
    "Staggered Boards",
    "Say on pay",
    "Majority voting",
    "Dual-class",
    "Share Structure",
    "Cumulative voting",
    "pay equity",
    "shareholder",
    "Audit integrity",
    "internal control risks",
    "Executive performance",
    "Board leadership",
    "Business Ethics",
    "Reputation",
    "ethical risks",
    "fundamental risk",
    "Board accountability",
    "Accountability",
    "Stockholders"
  ),
  "governance"
)
# Seed phrases for the "social" class.
# Result: a two-column character matrix without dimnames; column 1 holds the
# phrase and column 2 the class label (cbind() recycles the single label down
# the whole column).
#
# Two phrases were misspelled in the original list ("Responibility",
# "develoment"). Because these phrases are the classifier's training text, the
# misspellings added tokens that can never match a correctly spelled title and
# kept "development" out of the vocabulary; both are corrected here.
#
# Some phrases occur twice with different capitalisation (e.g.
# "Community relations" / "community relations", "Animal Welfare" /
# "Animal welfare"); they are kept so the number of training rows is unchanged.
soc_terms <- cbind(
  c(
    "% of salary paid during sick leave",
    "bottom of the pyramid",
    "Amount of social investment",
    "adherence to labor standards",
    "avoiding employee churn",
    "employee health",
    "ILO labor standards",
    "Poverty and community impact",
    "Supply Chain Management",
    "fundamental human rights",
    "Safe Labor practices",
    "Bio Capacity",
    "Corporate Social Responsibility",
    "Business relationships",
    "Gender Equality",
    "Health Insurance Cards",
    "Customer satisfaction",
    "Customer Loyalty",
    "Business Units",
    "Human Rights",
    "Animal Welfare",
    "Supply chain",
    "Ethical Investments",
    "Social Responsibility",
    "Equal Pay",
    "Women Rights",
    "Human capital management",
    "Labour relations",
    "Hiring rate trend",
    "Career development training",
    "Working conditions",
    "Employee absenteeism",
    "Emerging technology",
    "Community relations",
    "Responsible lending",
    "Corporate philanthropy",
    "brand loyalty",
    "worker rights",
    "child labor",
    "community relations",
    "indigenous rights",
    "Animal welfare",
    "social risks",
    "Genetically modified organisms",
    "Living wage disputes",
    "Predatory lending",
    "Sexual harassment",
    "Slave labor",
    "Political risk",
    "Political contributions"
  ),
  "social"
)
# Seed phrases for the "ecological" class.
# Result: a two-column character matrix without dimnames; column 1 holds the
# phrase and column 2 the class label (cbind() recycles the single label down
# the whole column).
# The phrases are reproduced exactly as listed, including repeated entries and
# the trailing space in the NOx / SOx rows.
eco_terms <- cbind(
  c(
    "Biodiversity & Ecosystem",
    "Electricity Purchased",
    "Water Withdrawal Total",
    "Water Recycled",
    "CO2 Equivalents Emissions",
    "NOx Emissions ",
    "SOx Emissions ",
    "VOC Emissions",
    "Waste Total",
    "Waste Recycled Total",
    "Hazardous Waste",
    "Environmental Management System Certified System",
    "Spills and Pollution Controversies",
    "Resource Reduction",
    "Emission Reduction",
    "Product Innovation",
    "Responsible Investment",
    "Energy Efficiency",
    "Breakdown of energy costs",
    "Breakdown of carbon costs",
    "off-grid electricity",
    "industrial processes",
    "fugitive emissions",
    "energy-related R&D",
    "% of renewable energy",
    "Green House Gas emissions",
    "Water Footprint",
    "Water Footprint Network",
    "Biodiversity Hotspots",
    "ecosystem efficient product",
    "resource-efficient",
    "recyclable products",
    "Innovation in environmentfriendly products and services",
    "Biodiversity & Ecosystem",
    "carbon regulation",
    "climate change policy",
    "water-stressed area",
    "access to sanitation",
    "long-term water resource",
    "biodiversity losses",
    "OECD-level regulation",
    "environmental problems",
    "Natural resource conservation",
    "Animal treatment",
    "Environmental risk",
    "Biodiversity & Ecosystem",
    "Hazardous waste",
    "Toxic emissions",
    "Biodiversity & Ecosystem",
    "Waste Management"
  ),
  "ecological"
)
# Stack the three phrase tables into a single labelled training set
# (column 1 = phrase, column 2 = class label).
tweets <- rbind(gov_terms, soc_terms, eco_terms)
train.size <- nrow(tweets)
labels <- as.factor(tweets[, 2])
features <- tweets[, 1]

# Document-term matrix over the training phrases; stop words are kept, numbers
# are dropped and no stemming is applied.
matrix <- create_matrix(
  features,
  language = "english",
  removeStopwords = FALSE,
  removeNumbers = TRUE,
  stemWords = FALSE
)

# Vocabulary of the training set; ClassifyISR() uses it to discard unseen words.
terms <- matrix$dimnames$Terms

# Every row is training data; no test rows are supplied at this stage.
train.data <- create_container(
  matrix,
  labels,
  trainSize = seq_len(train.size),
  testSize = NULL,
  virgin = TRUE
)
model <- train_model(train.data, algorithm = "SVM")
# Classify a single parsed article by its title.
#
# Args:
#   article: one parsed JSON record; expected to be a list whose `title`
#            element holds the text to classify.
#
# Returns:
#   The string "non-ISR" when the record has no usable title (not a list,
#   title missing / NULL / NA / blank / non-atomic) or when none of the title's
#   words occur in the training vocabulary. Otherwise the value returned by
#   classify_model() for the one appended test row.
#   NOTE(review): the two return paths have different types (string vs.
#   classify_model() result); callers must handle both.
#
# Uses the top-level objects `terms`, `features`, `labels`, `train.size` and
# `model` built by the training code earlier in this file.
ClassifyISR <- function(article) {
  # `[[` matches the element name exactly; `$title` would also partially match
  # other keys that merely start with "title".
  title <- if (is.list(article)) article[["title"]] else NULL

  # Guard against records without a usable title. Without this a single bad
  # record makes create_matrix() fail and aborts the whole map over the data.
  if (length(title) == 0L || !is.atomic(title)) {
    return("non-ISR")
  }
  title <- as.character(title)
  title <- title[!is.na(title) & grepl("[^[:space:]]", title)]
  if (length(title) == 0L) {
    return("non-ISR")
  }

  title_matrix <- create_matrix(
    title,
    language = "english",
    removeStopwords = FALSE,
    removeNumbers = TRUE,
    stemWords = FALSE
  )

  # Exclude terms not in the training set.
  known_terms <- intersect(title_matrix$dimnames$Terms, terms)
  if (length(known_terms) == 0L) {
    return("non-ISR")
  }
  feature <- paste0(known_terms, collapse = " ")

  # Rebuild the matrix with the filtered title appended as row train.size + 1,
  # then mark only that row as the (unlabelled) test row.
  combined_matrix <- create_matrix(
    c(features, feature),
    language = "english",
    removeStopwords = FALSE,
    removeNumbers = TRUE,
    stemWords = FALSE
  )
  container <- create_container(
    combined_matrix,
    labels,
    trainSize = 1:train.size,
    testSize = train.size + 1,
    virgin = TRUE
  )
  classify_model(container, model)
}
# Apply ClassifyISR to every parsed article.
results <- lapply(articles, ClassifyISR)
# Bring the per-article results back into the local R session.
output <- collect(results)
# Preview the first few results.
head(output)