-
Notifications
You must be signed in to change notification settings - Fork 0
/
WordCountSameRanking.java
141 lines (119 loc) · 5.44 KB
/
WordCountSameRanking.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.*;
public class WordCountSameRanking {
private static final String OUTPUT_PATH = "/data/intermediate_output";
/**
 * First-stage mapper: for every whitespace-delimited token of the input that is exactly
 * one of the tracked keywords, emits a {@code (file name, keyword)} pair.
 *
 * <p>Tokens are compared verbatim: matching is case-sensitive and punctuation attached
 * to a token is not stripped, so {@code "Sports"} or {@code "sports,"} do not match.
 */
public static class TokenizerMapper extends Mapper<Object, Text, Text, Text> {
    /** Keywords whose occurrences are reported per input file. */
    private static final Set<String> KEY_WORDS = Collections.unmodifiableSet(
            new HashSet<>(Arrays.asList("education", "politics", "sports", "agriculture")));

    /** Reused output value holding the keyword currently being emitted. */
    private final Text word = new Text();
    /** Output key: last path component of the file backing this task's split. */
    private final Text filename = new Text();

    /**
     * Records the name of the file this task reads, once per task.
     *
     * @param context task context providing the input split
     * @throws ClassCastException if the input split is not a {@link FileSplit}
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // The cast means this mapper only works with input formats that hand out FileSplits.
        FileSplit split = (FileSplit) context.getInputSplit();
        filename.set(split.getPath().getName());
    }

    /**
     * Emits {@code (file name, keyword)} for each keyword occurrence in one input line.
     *
     * @param key     input key supplied by the input format; not used
     * @param value   one line of input text
     * @param context sink for the emitted pairs
     */
    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // Normalization (lower-casing and punctuation stripping) is currently disabled:
        // String line = value.toString().toLowerCase().replaceAll("[_|$#<>\\^=\\[\\]\\*/\\\\,;,.\\-:()?!\"']", " ");
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            String token = itr.nextToken();
            if (KEY_WORDS.contains(token)) {
                word.set(token);
                context.write(filename, word);
            }
        }
    }
}
public static class IntSumReducer extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values,
Context context
) throws IOException, InterruptedException {
java.util.Map<String, Integer> countMap = new HashMap<>();
for (Text val : values) {
String currWord = val.toString();
if (!countMap.containsKey(currWord)) {
countMap.put(currWord, 1);
} else {
countMap.put(currWord, countMap.get(currWord) + 1);
}
}
Map<String, Integer> sortedMap = new LinkedHashMap<>();
countMap.entrySet().stream()
.sorted((Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2)
-> o2.getValue().compareTo(o1.getValue()))
.forEachOrdered(x -> sortedMap.put(x.getKey(), x.getValue()));
// Constructs the ranking list delimited by -
String wordRanking = "";
for (String keyWord : sortedMap.keySet()) {
wordRanking += keyWord + "-";
}
wordRanking = wordRanking.substring(0, wordRanking.length() - 1);
context.write(key, new Text(wordRanking));
}
}
public static class Mapper2 extends Mapper<LongWritable, Text, Text, Text> {
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer itr = new StringTokenizer(line);
while (itr.hasMoreTokens()) {
Text state = new Text(itr.nextToken());
Text ranking = new Text(itr.nextToken());
context.write(ranking, state);
}
}
}
public static class Reducer2 extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values, Context context
) throws IOException, InterruptedException {
String listOfStates = "";
for (Text val : values) {
listOfStates += val + ", ";
}
listOfStates = listOfStates.substring(0, listOfStates.length() - 2);
context.write(key, new Text(listOfStates));
}
}
/**
 * Runs the two chained MapReduce jobs.
 *
 * <ol>
 *   <li>Job 1 reads {@code args[0]} and writes one keyword ranking per input file to
 *       {@code OUTPUT_PATH}.</li>
 *   <li>Job 2 reads {@code OUTPUT_PATH} and writes, for each distinct ranking, the list
 *       of files that share it to {@code args[1]}.</li>
 * </ol>
 *
 * <p>Exits with status 0 when both jobs succeed, 1 when either job fails, and 2 when the
 * arguments are missing. Job 2 is not started if job 1 fails.
 *
 * @param args {@code args[0]} input path, {@code args[1]} final output path
 * @throws Exception if a job cannot be configured or submitted
 */
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCountSameRanking <input path> <output path>");
        System.exit(2);
    }

    Configuration conf = new Configuration();

    // Job 1
    Job job = Job.getInstance(conf, "Job 1");
    job.setJarByClass(WordCountSameRanking.class);
    job.setJar("WordCountSameRanking.jar");
    job.setMapperClass(TokenizerMapper.class);
    // No combiner here: IntSumReducer turns keywords into a ranking string, so running it
    // on partial map output would make the final reduce count ranking strings as words.
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));
    if (!job.waitForCompletion(true)) {
        // Without job 1's output there is nothing valid for job 2 to read.
        System.exit(1);
    }

    // Job 2
    Job job2 = Job.getInstance(conf, "Job 2");
    job2.setJarByClass(WordCountSameRanking.class);
    job2.setJar("WordCountSameRanking.jar");
    job2.setMapperClass(Mapper2.class);
    // Reducer2 only joins values with ", ", which stays correct when applied to
    // partial lists, so it is also used as the combiner.
    job2.setCombinerClass(Reducer2.class);
    job2.setReducerClass(Reducer2.class);
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job2, new Path(OUTPUT_PATH));
    FileOutputFormat.setOutputPath(job2, new Path(args[1]));
    System.exit(job2.waitForCompletion(true) ? 0 : 1);
}
}