From 6bff1237a28f64972700d4d2432e7bf030c5d240 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 27 Nov 2024 13:54:43 +0100 Subject: [PATCH 1/6] NUTCH-3095 Update .gitignore to ignore Hadoop native libraries --- .gitignore | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.gitignore b/.gitignore index 9cac3379c..f977de333 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,10 @@ ivy/dependency-check-ant/* .gradle* ivy/apache-rat-* .vscode +# native Hadoop libraries, see lib/native/README.txt +lib/native/libhadoop.* +lib/native/libhadooppipes.* +lib/native/libhadooputils.* +lib/native/libhdfs.* +lib/native/libhdfspp.* +lib/native/libnativetask.* From 68c1a7dd4b1cdf79398a546187bd51d6f2957a40 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 27 Nov 2024 12:09:52 +0100 Subject: [PATCH 2/6] NUTCH-3094 Github tests to run if build configuration changes --- .github/workflows/master-build.yml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml index 02176a51d..f975db44f 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -82,11 +82,18 @@ jobs: - 'src/testresources/**' plugins: - 'src/plugin/**' + buildconf: + - 'build.xml' + - 'ivy/ivy.xml' + # run if the build configuration or both 'core' and 'plugins' files were changed + - name: test all + if: ${{ steps.filter.outputs.buildconf == 'true' || ( steps.filter.outputs.core == 'true' && steps.filter.outputs.plugin == 'true' ) }} + run: ant clean test -buildfile build.xml # run only if 'core' files were changed - name: test core - if: steps.filter.outputs.core == 'true' + if: ${{ steps.filter.outputs.core == 'true' && steps.filter.outputs.plugins == 'false' && steps.filter.outputs.buildconf == 'false' }} run: ant clean test-core -buildfile build.xml # run only if 'plugins' files were changed - name: test plugins - if: steps.filter.outputs.plugins == 'true' + if: ${{ 
steps.filter.outputs.plugins == 'true' && steps.filter.outputs.core == 'false' && steps.filter.outputs.buildconf == 'false' }} run: ant clean test-plugins -buildfile build.xml From 5a01834030ef32ae8672687b8856b6be3c944dfd Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 3 Dec 2024 13:45:13 +0100 Subject: [PATCH 3/6] NUTCH-3094 Github tests to run if build configuration changes - fix typo in Github workflow definition --- .github/workflows/master-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml index f975db44f..f0802a607 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -87,7 +87,7 @@ jobs: - 'ivy/ivy.xml' # run if the build configuration or both 'core' and 'plugins' files were changed - name: test all - if: ${{ steps.filter.outputs.buildconf == 'true' || ( steps.filter.outputs.core == 'true' && steps.filter.outputs.plugin == 'true' ) }} + if: ${{ steps.filter.outputs.buildconf == 'true' || ( steps.filter.outputs.core == 'true' && steps.filter.outputs.plugins == 'true' ) }} run: ant clean test -buildfile build.xml # run only if 'core' files were changed - name: test core From e2a29d022ff7d7de17315e02a944b877a8a574c7 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 26 Nov 2024 13:57:14 +0100 Subject: [PATCH 4/6] NUTCH-3092 Replace all imports of commons-lang by commons-lang3 --- src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java | 2 +- .../org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java | 2 +- src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java | 2 +- src/java/org/apache/nutch/protocol/ProtocolFactory.java | 2 +- src/java/org/apache/nutch/service/impl/ConfManagerImpl.java | 2 +- src/java/org/apache/nutch/service/impl/JobManagerImpl.java | 2 +- .../apache/nutch/service/impl/NutchServerPoolExecutor.java | 2 +- .../org/apache/nutch/tools/AbstractCommonCrawlFormat.java | 2 +- 
src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java | 4 ++-- src/java/org/apache/nutch/tools/warc/WARCExporter.java | 2 +- src/java/org/apache/nutch/util/DumpFileUtil.java | 2 +- src/java/org/apache/nutch/util/JexlUtil.java | 2 +- src/java/org/apache/nutch/util/TableUtil.java | 2 +- .../org/apache/nutch/indexer/more/MoreIndexingFilter.java | 4 ++-- .../nutch/indexwriter/cloudsearch/CloudSearchIndexWriter.java | 2 +- .../apache/nutch/indexwriter/elastic/ElasticIndexWriter.java | 2 +- .../org/apache/nutch/indexwriter/kafka/KafkaIndexWriter.java | 2 +- .../apache/nutch/protocol/http/api/HttpRobotRulesParser.java | 2 +- .../src/java/org/apache/nutch/parse/tika/TikaParser.java | 2 +- .../org/apache/nutch/parsefilter/regex/RegexParseFilter.java | 2 +- .../src/java/org/apache/nutch/protocol/httpclient/Http.java | 2 +- .../org/apache/nutch/urlfilter/domain/DomainURLFilter.java | 2 +- .../urlfilter/domaindenylist/DomainDenylistURLFilter.java | 2 +- .../java/org/apache/nutch/urlfilter/fast/FastURLFilter.java | 2 +- .../nutch/net/urlnormalizer/host/HostURLNormalizer.java | 2 +- .../net/urlnormalizer/protocol/ProtocolURLNormalizer.java | 2 +- .../urlnormalizer/querystring/QuerystringURLNormalizer.java | 2 +- .../nutch/net/urlnormalizer/slash/SlashURLNormalizer.java | 2 +- 28 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java index d6272c598..8b61f6696 100644 --- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java @@ -21,7 +21,7 @@ import org.apache.hadoop.io.FloatWritable; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.util.NutchConfiguration; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git 
a/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java index 21022d46e..6a7f13e85 100644 --- a/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java @@ -22,7 +22,7 @@ import java.lang.invoke.MethodHandles; import java.util.HashMap; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.nutch.metadata.HttpHeaders; diff --git a/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java b/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java index be161440e..b56f2cc1b 100644 --- a/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java +++ b/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java @@ -21,7 +21,7 @@ import java.util.HashMap; import java.util.Map; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; /** * A decorator to Metadata that adds spellchecking capabilities to property diff --git a/src/java/org/apache/nutch/protocol/ProtocolFactory.java b/src/java/org/apache/nutch/protocol/ProtocolFactory.java index dc274b7e1..d5f8eff54 100644 --- a/src/java/org/apache/nutch/protocol/ProtocolFactory.java +++ b/src/java/org/apache/nutch/protocol/ProtocolFactory.java @@ -31,7 +31,7 @@ import org.apache.nutch.util.ObjectCache; import org.apache.nutch.util.URLUtil; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; diff --git a/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java b/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java index 7afe030f8..ebcc575ac 100644 --- a/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java +++ b/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java @@ -24,7 +24,7 @@ import 
java.util.concurrent.atomic.AtomicInteger; import org.apache.commons.collections.MapUtils; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.service.ConfManager; import org.apache.nutch.service.model.request.NutchConfig; diff --git a/src/java/org/apache/nutch/service/impl/JobManagerImpl.java b/src/java/org/apache/nutch/service/impl/JobManagerImpl.java index aae40b460..3bcb7dde9 100644 --- a/src/java/org/apache/nutch/service/impl/JobManagerImpl.java +++ b/src/java/org/apache/nutch/service/impl/JobManagerImpl.java @@ -18,7 +18,7 @@ import java.util.Collection; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.service.ConfManager; import org.apache.nutch.service.JobManager; diff --git a/src/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java b/src/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java index 529e1907b..473062217 100644 --- a/src/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java +++ b/src/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java @@ -24,7 +24,7 @@ import java.util.concurrent.TimeUnit; import org.apache.commons.collections.CollectionUtils; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.nutch.service.model.response.JobInfo; import com.google.common.collect.Lists; diff --git a/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java b/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java index 7ba945863..db0383611 100644 --- a/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java +++ b/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java @@ -25,7 +25,7 @@ import org.apache.commons.httpclient.URIException; import org.apache.commons.httpclient.util.URIUtil; -import 
org.apache.commons.lang.NotImplementedException; +import org.apache.commons.lang3.NotImplementedException; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.ParseData; diff --git a/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java b/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java index 9dcb72976..899c714e9 100644 --- a/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java +++ b/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java @@ -27,8 +27,8 @@ import java.util.List; import java.util.concurrent.atomic.AtomicInteger; -import org.apache.commons.lang.NotImplementedException; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.NotImplementedException; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.ParseData; diff --git a/src/java/org/apache/nutch/tools/warc/WARCExporter.java b/src/java/org/apache/nutch/tools/warc/WARCExporter.java index 4e80aac5f..bf824f9b3 100644 --- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java +++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java @@ -31,7 +31,7 @@ import java.util.UUID; import java.util.concurrent.TimeUnit; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; diff --git a/src/java/org/apache/nutch/util/DumpFileUtil.java b/src/java/org/apache/nutch/util/DumpFileUtil.java index a9ad19545..c7aacfe8f 100644 --- a/src/java/org/apache/nutch/util/DumpFileUtil.java +++ b/src/java/org/apache/nutch/util/DumpFileUtil.java @@ -18,7 +18,7 @@ import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.io.FileUtils; -import org.apache.commons.lang.StringUtils; +import 
org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.MD5Hash; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/src/java/org/apache/nutch/util/JexlUtil.java b/src/java/org/apache/nutch/util/JexlUtil.java index 370ba7aa1..549aebc41 100644 --- a/src/java/org/apache/nutch/util/JexlUtil.java +++ b/src/java/org/apache/nutch/util/JexlUtil.java @@ -24,7 +24,7 @@ import org.apache.commons.jexl3.JexlBuilder; import org.apache.commons.jexl3.JexlEngine; import org.apache.commons.jexl3.JexlScript; -import org.apache.commons.lang.time.DateUtils; +import org.apache.commons.lang3.time.DateUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/src/java/org/apache/nutch/util/TableUtil.java b/src/java/org/apache/nutch/util/TableUtil.java index 7b0b5ce7d..aab01543c 100644 --- a/src/java/org/apache/nutch/util/TableUtil.java +++ b/src/java/org/apache/nutch/util/TableUtil.java @@ -16,7 +16,7 @@ */ package org.apache.nutch.util; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import java.net.MalformedURLException; import java.net.URL; diff --git a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java index 6f403594e..5cc9fcfb8 100644 --- a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java +++ b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java @@ -51,8 +51,8 @@ import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; -import org.apache.commons.lang.StringUtils; -import org.apache.commons.lang.time.DateUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.time.DateUtils; import java.io.File; import java.net.URL; diff --git a/src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchIndexWriter.java 
b/src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchIndexWriter.java index 389157265..ecff85743 100644 --- a/src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchIndexWriter.java +++ b/src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchIndexWriter.java @@ -33,7 +33,7 @@ import java.util.Map.Entry; import org.apache.commons.io.FileUtils; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.indexer.IndexWriter; import org.apache.nutch.indexer.IndexWriterParams; diff --git a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java index 0cb267463..dc024d50b 100644 --- a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java +++ b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java @@ -27,7 +27,7 @@ import javax.net.ssl.SSLContext; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.http.HttpHost; import org.apache.http.auth.AuthScope; diff --git a/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/KafkaIndexWriter.java b/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/KafkaIndexWriter.java index 1702004fd..2fcf6de87 100644 --- a/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/KafkaIndexWriter.java +++ b/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/KafkaIndexWriter.java @@ -16,7 +16,7 @@ */ package org.apache.nutch.indexwriter.kafka; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import 
org.apache.commons.lang3.exception.ExceptionUtils; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.indexer.IndexWriter; diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java index ec5e77e43..9818c9a4a 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java @@ -25,7 +25,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.net.protocols.Response; diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java index 26472d141..85cd1fae0 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java @@ -27,7 +27,7 @@ import java.util.Map; import java.util.Set; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.html.dom.HTMLDocumentImpl; import org.apache.nutch.metadata.Nutch; diff --git a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java index bc17eb079..cc3d12561 100644 --- a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java +++ b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java @@ -36,7 +36,7 @@ import 
org.apache.nutch.plugin.PluginRepository; import org.apache.nutch.protocol.Content; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java index 5942486f2..894ce6ef8 100644 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java +++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java @@ -48,7 +48,7 @@ import org.apache.commons.httpclient.protocol.Protocol; import org.apache.commons.httpclient.protocol.ProtocolSocketFactory; import org.apache.commons.httpclient.protocol.SSLProtocolSocketFactory; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.net.protocols.Response; import org.apache.nutch.protocol.ProtocolException; diff --git a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java index 9b0e9776d..e9ba6a8c6 100644 --- a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java +++ b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java @@ -25,7 +25,7 @@ import java.util.LinkedHashSet; import java.util.Set; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; diff --git a/src/plugin/urlfilter-domaindenylist/src/java/org/apache/nutch/urlfilter/domaindenylist/DomainDenylistURLFilter.java 
b/src/plugin/urlfilter-domaindenylist/src/java/org/apache/nutch/urlfilter/domaindenylist/DomainDenylistURLFilter.java index 1e86426c7..d61d59722 100644 --- a/src/plugin/urlfilter-domaindenylist/src/java/org/apache/nutch/urlfilter/domaindenylist/DomainDenylistURLFilter.java +++ b/src/plugin/urlfilter-domaindenylist/src/java/org/apache/nutch/urlfilter/domaindenylist/DomainDenylistURLFilter.java @@ -25,7 +25,7 @@ import java.util.LinkedHashSet; import java.util.Set; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; diff --git a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java index cbc08f4c3..246dc8110 100644 --- a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java +++ b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java @@ -18,7 +18,7 @@ import com.google.common.collect.LinkedHashMultimap; import com.google.common.collect.Multimap; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.compress.CompressionCodec; diff --git a/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java b/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java index 537868b5e..e0394befe 100644 --- a/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java +++ b/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java @@ -26,7 +26,7 @@ import java.net.URL; import java.util.HashMap; -import org.apache.commons.lang.StringUtils; +import 
org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; diff --git a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java index cec27760e..abe531291 100644 --- a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java +++ b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java @@ -29,7 +29,7 @@ import java.util.TreeMap; import java.util.regex.Pattern; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java b/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java index f8a547bc0..115376bfc 100644 --- a/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java +++ b/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java @@ -22,7 +22,7 @@ import java.util.Collections; import java.util.List; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.net.URLNormalizer; diff --git a/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java b/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java index 5e5884ea2..2da831041 100644 --- 
a/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java +++ b/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java @@ -27,7 +27,7 @@ import java.util.HashMap; import java.util.Map; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; From 5263b7cbea0a50bf0bb3324f139f2ad3030f6875 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 4 Dec 2024 16:11:11 +0100 Subject: [PATCH 5/6] NUTCH-3096 HostDB ResolverThread can create too many job counters (patch contributed by Markus Jelsma) --- .../apache/nutch/hostdb/ResolverThread.java | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/java/org/apache/nutch/hostdb/ResolverThread.java b/src/java/org/apache/nutch/hostdb/ResolverThread.java index 434e7bb31..c0a4f124b 100644 --- a/src/java/org/apache/nutch/hostdb/ResolverThread.java +++ b/src/java/org/apache/nutch/hostdb/ResolverThread.java @@ -114,15 +114,32 @@ public void run() { } } - context.getCounter("UpdateHostDb", - Long.toString(datum.numFailures()) + "_times_failed").increment(1); + context.getCounter("UpdateHostDb", createFailureCounterLabel(datum)).increment(1); } catch (Exception ioe) { LOG.warn(StringUtils.stringifyException(ioe)); } } catch (Exception e) { LOG.warn(StringUtils.stringifyException(e)); } - + context.getCounter("UpdateHostDb", "checked_hosts").increment(1); } + + private String createFailureCounterLabel(HostDatum datum) { + // Hadoop will allow no more than 120 distinct counters. With one counter per + // distinct failure count we could exceed that limit; Hadoop would complain and + // the job would fail. So numFailures is grouped into buckets: 0 to 3 each keep + // their own label, 4 to 10 share one label, and anything above 10 shares another. 
NUTCH-3096 + String label = null; + long n = datum.numFailures(); + if (n < 4) { + label = Long.toString(n); + } else if (n > 3 && n < 11) { + label = "4-10"; + } else { + label = ">10"; + } + + return label + "_times_failed"; + } } From b481f912cee9ddf985886491b1e1ce695af4d23d Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sun, 27 Oct 2024 12:42:14 +0100 Subject: [PATCH 6/6] NUTCH-3083 Add RobotRulesParser to bin/nutch Add command *robotsparser* to bin/nutch, invoking the main method of org.apache.nutch.protocol.RobotRulesParser --- src/bin/nutch | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/bin/nutch b/src/bin/nutch index 0b55388c6..257059deb 100755 --- a/src/bin/nutch +++ b/src/bin/nutch @@ -86,6 +86,7 @@ if [ $# = 0 ]; then echo " indexchecker check the indexing filters for a given url" echo " filterchecker check url filters for a given url" echo " normalizerchecker check url normalizers for a given url" + echo " robotsparser parse a robots.txt file and check whether urls are allowed or not" echo " domainstats calculate domain statistics from crawldb" echo " protocolstats calculate protocol status code stats from crawldb" echo " crawlcomplete calculate crawl completion stats from crawldb" @@ -268,6 +269,8 @@ elif [ "$COMMAND" = "filterchecker" ] ; then CLASS=org.apache.nutch.net.URLFilterChecker elif [ "$COMMAND" = "normalizerchecker" ] ; then CLASS=org.apache.nutch.net.URLNormalizerChecker +elif [ "$COMMAND" = "robotsparser" ] ; then + CLASS=org.apache.nutch.protocol.RobotRulesParser elif [ "$COMMAND" = "domainstats" ] ; then CLASS=org.apache.nutch.util.DomainStatistics elif [ "$COMMAND" = "protocolstats" ] ; then