Skip to content

Commit

Permalink
NUTCH-3014 Standardize Job names
Browse files: browse the repository at this point in the history
  • Loading branch information
lewismc committed Oct 22, 2023
1 parent a5d0b06 commit d7a3132
Show file tree
Hide file tree
Showing 12 changed files with 21 additions and 36 deletions.
3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/crawl/CrawlDbReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -995,8 +995,7 @@ public void processTopNJob(String crawlDb, long topN, float min,
}

LOG.info("CrawlDb topN: collecting topN scores.");
job = NutchJob.getInstance(config);
job.setJobName("topN collect " + crawlDb);
job = Job.getInstance(config, "Nutch CrawlDbReader: topN collect " + crawlDb);
job.getConfiguration().setLong("db.reader.topn", topN);

FileInputFormat.addInputPath(job, tempDir);
Expand Down
2 changes: 1 addition & 1 deletion src/java/org/apache/nutch/crawl/DeduplicationJob.java
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ public int run(String[] args) throws IOException {
Path tempDir = new Path(crawlDb, "dedup-temp-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

Job job = NutchJob.getInstance(getConf(), "Nutch DeduplicationJob: " + crawlDb);
Job job = Job.getInstance(getConf(), "Nutch DeduplicationJob: " + crawlDb);
Configuration conf = job.getConfiguration();
conf.set(DEDUPLICATION_GROUP_MODE, group);
conf.set(DEDUPLICATION_COMPARE_ORDER, compareOrder);
Expand Down
3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/crawl/Generator.java
Original file line number Diff line number Diff line change
Expand Up @@ -941,8 +941,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
Path tempDir2 = new Path(dbDir,
"generate-temp-" + java.util.UUID.randomUUID().toString());

job = NutchJob.getInstance(getConf());
job.setJobName("generate: updatedb " + dbDir);
job = Job.getInstance(getConf(), "Nutch Generator: updatedb " + dbDir);
job.getConfiguration().setLong(Nutch.GENERATE_TIME_KEY, generateTime);
for (Path segmpaths : generatedSegments) {
Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
Expand Down
6 changes: 2 additions & 4 deletions src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
Original file line number Diff line number Diff line change
Expand Up @@ -341,8 +341,7 @@ public void dumpLinks(Path webGraphDb) throws IOException,
// run the inverter job
Path tempInverted = new Path(webGraphDb, "inverted-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
Job inverter = NutchJob.getInstance(conf);
inverter.setJobName("LinkDumper: inverter");
Job inverter = Job.getInstance(conf, "Nutch LinkDumper: invert " + webGraphDb);
FileInputFormat.addInputPath(inverter, nodeDb);
FileInputFormat.addInputPath(inverter, outlinkDb);
inverter.setInputFormatClass(SequenceFileInputFormat.class);
Expand Down Expand Up @@ -372,8 +371,7 @@ public void dumpLinks(Path webGraphDb) throws IOException,
}

// run the merger job
Job merger = NutchJob.getInstance(conf);
merger.setJobName("LinkDumper: merger");
Job merger = Job.getInstance(conf, "Nutch LinkDumper: merge " + tempInverted);
FileInputFormat.addInputPath(merger, tempInverted);
merger.setJarByClass(Merger.class);
merger.setInputFormatClass(SequenceFileInputFormat.class);
Expand Down
15 changes: 6 additions & 9 deletions src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,8 @@ private int runCounter(FileSystem fs, Path webGraphDb) throws IOException,
// configure the counter job
Path numLinksPath = new Path(webGraphDb, NUM_NODES);
Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
Job counter = NutchJob.getInstance(getConf());
Job counter = Job.getInstance(getConf(), "Nutch LinkRank: counter " + webGraphDb);
Configuration conf = counter.getConfiguration();
counter.setJobName("LinkRank Counter");
FileInputFormat.addInputPath(counter, nodeDb);
FileOutputFormat.setOutputPath(counter, numLinksPath);
counter.setInputFormatClass(SequenceFileInputFormat.class);
Expand Down Expand Up @@ -194,9 +193,8 @@ private void runInitializer(Path nodeDb, Path output) throws IOException,
InterruptedException, ClassNotFoundException {

// configure the initializer
Job initializer = NutchJob.getInstance(getConf());
Job initializer = Job.getInstance(getConf(), "Nutch LinkRank: initializer " + nodeDb);
Configuration conf = initializer.getConfiguration();
initializer.setJobName("LinkAnalysis Initializer");
FileInputFormat.addInputPath(initializer, nodeDb);
FileOutputFormat.setOutputPath(initializer, output);
initializer.setJarByClass(Initializer.class);
Expand Down Expand Up @@ -245,9 +243,9 @@ private void runInverter(Path nodeDb, Path outlinkDb, Path output)
throws IOException, InterruptedException, ClassNotFoundException {

// configure the inverter
Job inverter = NutchJob.getInstance(getConf());
Job inverter = Job.getInstance(getConf(),
"Nutch Linkrank: inverter nodedb: " + nodeDb + " outlinkdb: " + outlinkDb);
Configuration conf = inverter.getConfiguration();
inverter.setJobName("LinkAnalysis Inverter");
FileInputFormat.addInputPath(inverter, nodeDb);
FileInputFormat.addInputPath(inverter, outlinkDb);
FileOutputFormat.setOutputPath(inverter, output);
Expand Down Expand Up @@ -305,11 +303,10 @@ private void runAnalysis(Path nodeDb, Path inverted, Path output,
int iteration, int numIterations, float rankOne)
throws IOException, InterruptedException, ClassNotFoundException {

Job analyzer = NutchJob.getInstance(getConf());
Job analyzer = Job.getInstance(getConf(),
"Nutch LinkRank: analysis iteration" + (iteration + 1) + " of " + numIterations);
Configuration conf = analyzer.getConfiguration();
conf.set("link.analyze.iteration", String.valueOf(iteration + 1));
analyzer.setJobName("LinkAnalysis Analyzer, iteration " + (iteration + 1)
+ " of " + numIterations);
FileInputFormat.addInputPath(analyzer, nodeDb);
FileInputFormat.addInputPath(analyzer, inverted);
FileOutputFormat.setOutputPath(analyzer, output);
Expand Down
3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
Original file line number Diff line number Diff line change
Expand Up @@ -298,9 +298,8 @@ public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output,
LOG.info("NodeDumper: starting");
Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);

Job dumper = NutchJob.getInstance(getConf());
Job dumper = Job.getInstance(getConf(), "Nutch NodeDumper: " + webGraphDb);
Configuration conf = dumper.getConfiguration();
dumper.setJobName("NodeDumper: " + webGraphDb);
FileInputFormat.addInputPath(dumper, nodeDb);
dumper.setInputFormatClass(SequenceFileInputFormat.class);

Expand Down
3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,7 @@ public void update(Path crawlDb, Path webGraphDb) throws IOException,
.nextInt(Integer.MAX_VALUE)));

// run the updater job outputting to the temp crawl database
Job updater = NutchJob.getInstance(conf);
updater.setJobName("Update CrawlDb from WebGraph");
Job updater = Job.getInstance(conf, "Nutch ScoreUpdater: " + crawlDb);
FileInputFormat.addInputPath(updater, crawlDbCurrent);
FileInputFormat.addInputPath(updater, nodeDb);
FileOutputFormat.setOutputPath(updater, newCrawlDb);
Expand Down
9 changes: 3 additions & 6 deletions src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
Original file line number Diff line number Diff line change
Expand Up @@ -545,9 +545,8 @@ public void createWebGraph(Path webGraphDb, Path[] segments,

Path tempOutlinkDb = new Path(outlinkDb + "-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
Job outlinkJob = NutchJob.getInstance(getConf());
Job outlinkJob = Job.getInstance(getConf(), "Nutch WebGraph: outlinkdb " + outlinkDb);
Configuration outlinkJobConf = outlinkJob.getConfiguration();
outlinkJob.setJobName("Outlinkdb: " + outlinkDb);

boolean deleteGone = outlinkJobConf.getBoolean("link.delete.gone", false);
boolean preserveBackup = outlinkJobConf.getBoolean("db.preserve.backup", true);
Expand Down Expand Up @@ -625,9 +624,8 @@ public void createWebGraph(Path webGraphDb, Path[] segments,
Path tempInlinkDb = new Path(inlinkDb + "-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

Job inlinkJob = NutchJob.getInstance(getConf());
Job inlinkJob = Job.getInstance(getConf(), "Nutch WebGraph: inlinkdb " + inlinkDb);
Configuration inlinkJobConf = inlinkJob.getConfiguration();
inlinkJob.setJobName("Inlinkdb " + inlinkDb);
LOG.info("InlinkDb: adding input: " + outlinkDb);
FileInputFormat.addInputPath(inlinkJob, outlinkDb);
inlinkJob.setInputFormatClass(SequenceFileInputFormat.class);
Expand Down Expand Up @@ -669,9 +667,8 @@ public void createWebGraph(Path webGraphDb, Path[] segments,
Path tempNodeDb = new Path(nodeDb + "-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

Job nodeJob = NutchJob.getInstance(getConf());
Job nodeJob = Job.getInstance(getConf(), "Nutch WebGraph: nodedb " + nodeDb);
Configuration nodeJobConf = nodeJob.getConfiguration();
nodeJob.setJobName("NodeDb " + nodeDb);
LOG.info("NodeDb: adding input: " + outlinkDb);
LOG.info("NodeDb: adding input: " + inlinkDb);
FileInputFormat.addInputPath(nodeJob, outlinkDb);
Expand Down
2 changes: 1 addition & 1 deletion src/java/org/apache/nutch/segment/SegmentMerger.java
Original file line number Diff line number Diff line change
Expand Up @@ -625,7 +625,7 @@ public void merge(Path out, Path[] segs, boolean filter, boolean normalize,
long slice) throws IOException, ClassNotFoundException, InterruptedException {
String segmentName = Generator.generateSegmentName();
LOG.info("Merging {} segments to {}/{}", segs.length, out, segmentName);
Job job = NutchJob.getInstance(getConf(), "Nutch SegmentMerger: " + out + "/" + segmentName);
Job job = Job.getInstance(getConf(), "Nutch SegmentMerger: " + out + "/" + segmentName);
Configuration conf = job.getConfiguration();
conf.setBoolean("segment.merger.filter", filter);
conf.setBoolean("segment.merger.normalizer", normalize);
Expand Down
3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/tools/warc/WARCExporter.java
Original file line number Diff line number Diff line change
Expand Up @@ -433,8 +433,7 @@ public int generateWARC(String output, List<Path> segments,
stopWatch.start();
LOG.info("WARCExporter: starting");

final Job job = NutchJob.getInstance(getConf());
job.setJobName("warc-exporter " + output);
final Job job = Job.getInstance(getConf(), "Nutch WARCExporter: " + output);

job.getConfiguration().setBoolean(ONLY_SUCCESSFUL_RESPONSES,
onlySuccessfulResponses);
Expand Down
3 changes: 1 addition & 2 deletions src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
import org.apache.nutch.util.NutchJob;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
Expand Down Expand Up @@ -94,7 +93,7 @@ public void testUrl404Purging() throws Exception {
conf.setBoolean(CrawlDbFilter.URL_NORMALIZING, true);
conf.setBoolean(CrawlDbFilter.URL_FILTERING, false);
conf.setInt("urlnormalizer.loop.count", 2);
Job job = NutchJob.getInstance(conf);
Job job = Job.getInstance(conf);
job.setJobName("Test CrawlDbFilter");
Path current = new Path(dbDir, "current");
if (FileSystem.get(conf).exists(current)) {
Expand Down
5 changes: 2 additions & 3 deletions src/test/org/apache/nutch/plugin/TestPluginSystem.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
Expand Down Expand Up @@ -102,7 +101,7 @@ public void testLoadPlugins() {
public void testRepositoryCache() throws IOException {
Configuration config = NutchConfiguration.create();
PluginRepository repo = PluginRepository.get(config);
Job job = NutchJob.getInstance(config);
Job job = Job.getInstance(config);
config = job.getConfiguration();
PluginRepository repo1 = PluginRepository.get(config);
Assert.assertTrue(repo == repo1);
Expand All @@ -111,7 +110,7 @@ public void testRepositoryCache() throws IOException {
config.addResource("nutch-default.xml");
config.addResource("nutch-site.xml");
repo = PluginRepository.get(config);
job = NutchJob.getInstance(config);
job = Job.getInstance(config);
config = job.getConfiguration();
repo1 = PluginRepository.get(config);
Assert.assertTrue(repo1 != repo);
Expand Down

0 comments on commit d7a3132

Please sign in to comment.