From d7a3132bbbb30c4ae46d63224dc2d3d43aed433c Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sun, 22 Oct 2023 12:13:19 -0700 Subject: [PATCH] NUTCH-3014 Standardize Job names --- .../org/apache/nutch/crawl/CrawlDbReader.java | 3 +-- .../org/apache/nutch/crawl/DeduplicationJob.java | 2 +- src/java/org/apache/nutch/crawl/Generator.java | 3 +-- .../apache/nutch/scoring/webgraph/LinkDumper.java | 6 ++---- .../apache/nutch/scoring/webgraph/LinkRank.java | 15 ++++++--------- .../apache/nutch/scoring/webgraph/NodeDumper.java | 3 +-- .../nutch/scoring/webgraph/ScoreUpdater.java | 3 +-- .../apache/nutch/scoring/webgraph/WebGraph.java | 9 +++------ .../org/apache/nutch/segment/SegmentMerger.java | 2 +- .../org/apache/nutch/tools/warc/WARCExporter.java | 3 +-- .../org/apache/nutch/crawl/TestCrawlDbFilter.java | 3 +-- .../org/apache/nutch/plugin/TestPluginSystem.java | 5 ++--- 12 files changed, 21 insertions(+), 36 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java index 181244779d..6be1c37738 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java @@ -995,8 +995,7 @@ public void processTopNJob(String crawlDb, long topN, float min, } LOG.info("CrawlDb topN: collecting topN scores."); - job = NutchJob.getInstance(config); - job.setJobName("topN collect " + crawlDb); + job = Job.getInstance(config, "Nutch CrawlDbReader: topN collect " + crawlDb); job.getConfiguration().setLong("db.reader.topn", topN); FileInputFormat.addInputPath(job, tempDir); diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java b/src/java/org/apache/nutch/crawl/DeduplicationJob.java index 1e0649ff8a..e370013546 100644 --- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java +++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java @@ -305,7 +305,7 @@ public int run(String[] args) throws IOException { Path tempDir = new Path(crawlDb, "dedup-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); - Job job = NutchJob.getInstance(getConf(), "Nutch DeduplicationJob: " + crawlDb); + Job job = Job.getInstance(getConf(), "Nutch DeduplicationJob: " + crawlDb); Configuration conf = job.getConfiguration(); conf.set(DEDUPLICATION_GROUP_MODE, group); conf.set(DEDUPLICATION_COMPARE_ORDER, compareOrder); diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java index 1c6b48516b..33f743a37a 100644 --- a/src/java/org/apache/nutch/crawl/Generator.java +++ b/src/java/org/apache/nutch/crawl/Generator.java @@ -941,8 +941,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN, Path tempDir2 = new Path(dbDir, "generate-temp-" + java.util.UUID.randomUUID().toString()); - job = NutchJob.getInstance(getConf()); - job.setJobName("generate: updatedb " + dbDir); + job = Job.getInstance(getConf(), "Nutch Generator: updatedb " + dbDir); job.getConfiguration().setLong(Nutch.GENERATE_TIME_KEY, generateTime); for (Path segmpaths : generatedSegments) { Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME); diff --git a/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java b/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java index 4831d73f38..439d7438c4 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java +++ b/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java @@ -341,8 +341,7 @@ public void dumpLinks(Path webGraphDb) throws IOException, // run the inverter job Path tempInverted = new Path(webGraphDb, "inverted-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); - Job inverter = NutchJob.getInstance(conf); - inverter.setJobName("LinkDumper: inverter"); + Job inverter = Job.getInstance(conf, "Nutch LinkDumper: invert " + webGraphDb); FileInputFormat.addInputPath(inverter, nodeDb); FileInputFormat.addInputPath(inverter, outlinkDb); inverter.setInputFormatClass(SequenceFileInputFormat.class); @@ -372,8 +371,7 @@ public void dumpLinks(Path webGraphDb) throws IOException, } // run the merger job - Job merger = NutchJob.getInstance(conf); - merger.setJobName("LinkDumper: merger"); + Job merger = Job.getInstance(conf, "Nutch LinkDumper: merge " + tempInverted); FileInputFormat.addInputPath(merger, tempInverted); merger.setJarByClass(Merger.class); merger.setInputFormatClass(SequenceFileInputFormat.class); diff --git a/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java b/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java index c226ad130b..e48f04acdf 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java +++ b/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java @@ -93,9 +93,8 @@ private int runCounter(FileSystem fs, Path webGraphDb) throws IOException, // configure the counter job Path numLinksPath = new Path(webGraphDb, NUM_NODES); Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR); - Job counter = NutchJob.getInstance(getConf()); + Job counter = Job.getInstance(getConf(), "Nutch LinkRank: counter " + webGraphDb); Configuration conf = counter.getConfiguration(); - counter.setJobName("LinkRank Counter"); FileInputFormat.addInputPath(counter, nodeDb); FileOutputFormat.setOutputPath(counter, numLinksPath); counter.setInputFormatClass(SequenceFileInputFormat.class); @@ -194,9 +193,8 @@ private void runInitializer(Path nodeDb, Path output) throws IOException, InterruptedException, ClassNotFoundException { // configure the initializer - Job initializer = NutchJob.getInstance(getConf()); + Job initializer = Job.getInstance(getConf(), "Nutch LinkRank: initializer " + nodeDb); Configuration conf = initializer.getConfiguration(); - initializer.setJobName("LinkAnalysis Initializer"); FileInputFormat.addInputPath(initializer, nodeDb); FileOutputFormat.setOutputPath(initializer, output); initializer.setJarByClass(Initializer.class); @@ -245,9 +243,9 @@ private void runInverter(Path nodeDb, Path outlinkDb, Path output) throws IOException, InterruptedException, ClassNotFoundException { // configure the inverter - Job inverter = NutchJob.getInstance(getConf()); + Job inverter = Job.getInstance(getConf(), + "Nutch Linkrank: inverter nodedb: " + nodeDb + " outlinkdb: " + outlinkDb); Configuration conf = inverter.getConfiguration(); - inverter.setJobName("LinkAnalysis Inverter"); FileInputFormat.addInputPath(inverter, nodeDb); FileInputFormat.addInputPath(inverter, outlinkDb); FileOutputFormat.setOutputPath(inverter, output); @@ -305,11 +303,10 @@ private void runAnalysis(Path nodeDb, Path inverted, Path output, int iteration, int numIterations, float rankOne) throws IOException, InterruptedException, ClassNotFoundException { - Job analyzer = NutchJob.getInstance(getConf()); + Job analyzer = Job.getInstance(getConf(), + "Nutch LinkRank: analysis iteration" + (iteration + 1) + " of " + numIterations); Configuration conf = analyzer.getConfiguration(); conf.set("link.analyze.iteration", String.valueOf(iteration + 1)); - analyzer.setJobName("LinkAnalysis Analyzer, iteration " + (iteration + 1) - + " of " + numIterations); FileInputFormat.addInputPath(analyzer, nodeDb); FileInputFormat.addInputPath(analyzer, inverted); FileOutputFormat.setOutputPath(analyzer, output); diff --git a/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java b/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java index dfccccc19e..9277df8f66 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java +++ b/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java @@ -298,9 +298,8 @@ public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, LOG.info("NodeDumper: starting"); Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR); - Job dumper = NutchJob.getInstance(getConf()); + Job dumper = Job.getInstance(getConf(), "Nutch NodeDumper: " + webGraphDb); Configuration conf = dumper.getConfiguration(); - dumper.setJobName("NodeDumper: " + webGraphDb); FileInputFormat.addInputPath(dumper, nodeDb); dumper.setInputFormatClass(SequenceFileInputFormat.class); diff --git a/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java b/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java index c10a6e37b0..bcd5342743 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java +++ b/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java @@ -170,8 +170,7 @@ public void update(Path crawlDb, Path webGraphDb) throws IOException, .nextInt(Integer.MAX_VALUE))); // run the updater job outputting to the temp crawl database - Job updater = NutchJob.getInstance(conf); - updater.setJobName("Update CrawlDb from WebGraph"); + Job updater = Job.getInstance(conf, "Nutch ScoreUpdater: " + crawlDb); FileInputFormat.addInputPath(updater, crawlDbCurrent); FileInputFormat.addInputPath(updater, nodeDb); FileOutputFormat.setOutputPath(updater, newCrawlDb); diff --git a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java index b98329d1e0..25e3cf2304 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java +++ b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java @@ -545,9 +545,8 @@ public void createWebGraph(Path webGraphDb, Path[] segments, Path tempOutlinkDb = new Path(outlinkDb + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); - Job outlinkJob = NutchJob.getInstance(getConf()); + Job outlinkJob = Job.getInstance(getConf(), "Nutch WebGraph: outlinkdb " + outlinkDb); Configuration outlinkJobConf = outlinkJob.getConfiguration(); - outlinkJob.setJobName("Outlinkdb: " + outlinkDb); boolean deleteGone = outlinkJobConf.getBoolean("link.delete.gone", false); boolean preserveBackup = outlinkJobConf.getBoolean("db.preserve.backup", true); @@ -625,9 +624,8 @@ public void createWebGraph(Path webGraphDb, Path[] segments, Path tempInlinkDb = new Path(inlinkDb + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); - Job inlinkJob = NutchJob.getInstance(getConf()); + Job inlinkJob = Job.getInstance(getConf(), "Nutch WebGraph: inlinkdb " + inlinkDb); Configuration inlinkJobConf = inlinkJob.getConfiguration(); - inlinkJob.setJobName("Inlinkdb " + inlinkDb); LOG.info("InlinkDb: adding input: " + outlinkDb); FileInputFormat.addInputPath(inlinkJob, outlinkDb); inlinkJob.setInputFormatClass(SequenceFileInputFormat.class); @@ -669,9 +667,8 @@ public void createWebGraph(Path webGraphDb, Path[] segments, Path tempNodeDb = new Path(nodeDb + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); - Job nodeJob = NutchJob.getInstance(getConf()); + Job nodeJob = Job.getInstance(getConf(), "Nutch WebGraph: nodedb " + nodeDb); Configuration nodeJobConf = nodeJob.getConfiguration(); - nodeJob.setJobName("NodeDb " + nodeDb); LOG.info("NodeDb: adding input: " + outlinkDb); LOG.info("NodeDb: adding input: " + inlinkDb); FileInputFormat.addInputPath(nodeJob, outlinkDb); diff --git a/src/java/org/apache/nutch/segment/SegmentMerger.java b/src/java/org/apache/nutch/segment/SegmentMerger.java index 9443468634..53bdee22eb 100644 --- a/src/java/org/apache/nutch/segment/SegmentMerger.java +++ b/src/java/org/apache/nutch/segment/SegmentMerger.java @@ -625,7 +625,7 @@ public void merge(Path out, Path[] segs, boolean filter, boolean normalize, long slice) throws IOException, ClassNotFoundException, InterruptedException { String segmentName = Generator.generateSegmentName(); LOG.info("Merging {} segments to {}/{}", segs.length, out, segmentName); - Job job = NutchJob.getInstance(getConf(), "Nutch SegmentMerger: " + out + "/" + segmentName); + Job job = Job.getInstance(getConf(), "Nutch SegmentMerger: " + out + "/" + segmentName); Configuration conf = job.getConfiguration(); conf.setBoolean("segment.merger.filter", filter); conf.setBoolean("segment.merger.normalizer", normalize); diff --git a/src/java/org/apache/nutch/tools/warc/WARCExporter.java b/src/java/org/apache/nutch/tools/warc/WARCExporter.java index 6d8a385572..4e80aac5f6 100644 --- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java +++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java @@ -433,8 +433,7 @@ public int generateWARC(String output, List segments, stopWatch.start(); LOG.info("WARCExporter: starting"); - final Job job = NutchJob.getInstance(getConf()); - job.setJobName("warc-exporter " + output); + final Job job = Job.getInstance(getConf(), "Nutch WARCExporter: " + output); job.getConfiguration().setBoolean(ONLY_SUCCESSFUL_RESPONSES, onlySuccessfulResponses); diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java b/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java index 82fefaf164..812d4a6a8f 100644 --- a/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java +++ b/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java @@ -31,7 +31,6 @@ import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum; -import org.apache.nutch.util.NutchJob; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -94,7 +93,7 @@ public void testUrl404Purging() throws Exception { conf.setBoolean(CrawlDbFilter.URL_NORMALIZING, true); conf.setBoolean(CrawlDbFilter.URL_FILTERING, false); conf.setInt("urlnormalizer.loop.count", 2); - Job job = NutchJob.getInstance(conf); + Job job = Job.getInstance(conf); job.setJobName("Test CrawlDbFilter"); Path current = new Path(dbDir, "current"); if (FileSystem.get(conf).exists(current)) { diff --git a/src/test/org/apache/nutch/plugin/TestPluginSystem.java b/src/test/org/apache/nutch/plugin/TestPluginSystem.java index dba7c66066..7c1362aa56 100644 --- a/src/test/org/apache/nutch/plugin/TestPluginSystem.java +++ b/src/test/org/apache/nutch/plugin/TestPluginSystem.java @@ -28,7 +28,6 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.Job; import org.apache.nutch.util.NutchConfiguration; -import org.apache.nutch.util.NutchJob; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -102,7 +101,7 @@ public void testLoadPlugins() { public void testRepositoryCache() throws IOException { Configuration config = NutchConfiguration.create(); PluginRepository repo = PluginRepository.get(config); - Job job = NutchJob.getInstance(config); + Job job = Job.getInstance(config); config = job.getConfiguration(); PluginRepository repo1 = PluginRepository.get(config); Assert.assertTrue(repo == repo1); @@ -111,7 +110,7 @@ public void testRepositoryCache() throws IOException { config.addResource("nutch-default.xml"); config.addResource("nutch-site.xml"); repo = PluginRepository.get(config); - job = NutchJob.getInstance(config); + job = Job.getInstance(config); config = job.getConfiguration(); repo1 = PluginRepository.get(config); Assert.assertTrue(repo1 != repo);