Skip to content

Commit

Permalink
Merge pull request #776 from tballison/NUTCH-2959
Browse files Browse the repository at this point in the history
NUTCH-2959 -- upgrade Tika to 2.9.0
  • Loading branch information
tballison authored Oct 20, 2023
2 parents a74b57b + 9aabc45 commit 97eb0b5
Show file tree
Hide file tree
Showing 11 changed files with 54 additions and 132 deletions.
19 changes: 11 additions & 8 deletions ivy/ivy.xml
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,14 @@
<dependency org="org.apache.logging.log4j" name="log4j-slf4j2-impl" rev="2.20.0" conf="*->master" />
<dependency org="org.slf4j" name="slf4j-api" rev="2.0.7" conf="*->master" />

<dependency org="org.apache.commons" name="commons-lang3" rev="3.12.0" conf="*->default" />
<dependency org="org.apache.commons" name="commons-lang3" rev="3.13.0" conf="*->default" />
<dependency org="org.apache.commons" name="commons-collections4" rev="4.4" conf="*->master" />
<dependency org="org.apache.httpcomponents" name="httpclient" rev="4.5.13" conf="*->master" />
<dependency org="commons-codec" name="commons-codec" rev="1.15" conf="*->default" />
<dependency org="org.apache.commons" name="commons-compress" rev="1.21" conf="*->default" />
<dependency org="commons-codec" name="commons-codec" rev="1.16.0" conf="*->default" />
<!-- Hadoop 3.4.0 should ship commons-io 2.11.0; until then, Tika is broken in distributed mode
unless this version is pinned here; see https://github.com/apache/nutch/pull/776 -->
<dependency org="commons-io" name="commons-io" rev="2.11.0" conf="*->default" />
<dependency org="org.apache.commons" name="commons-compress" rev="1.23.0" conf="*->default" />
<dependency org="org.apache.commons" name="commons-jexl3" rev="3.2.1" conf="*->default" />
<dependency org="com.tdunning" name="t-digest" rev="3.3" />

Expand All @@ -67,7 +70,7 @@
<exclude org="org.slf4j" name="*" />
</dependency><!-- End of Hadoop Dependencies -->

<dependency org="org.apache.tika" name="tika-core" rev="2.3.0" />
<dependency org="org.tallison.tika" name="tika-core-shaded" rev="2.9.0.0" conf="*->default" transitive="false"/>

<dependency org="xml-apis" name="xml-apis" rev="1.4.01" /><!-- force this version as it is required by Tika -->
<dependency org="xerces" name="xercesImpl" rev="2.12.2" />
Expand All @@ -88,10 +91,10 @@
<dependency org="org.apache.cxf" name="cxf-rt-transports-http" rev="3.5.3" conf="*->default" />
<dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty" rev="3.5.3" conf="*->default" />
<dependency org="org.apache.cxf" name="cxf-rt-rs-client" rev="3.5.3" conf="test->default" />
<dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.13.3" conf="*->default" />
<dependency org="com.fasterxml.jackson.core" name="jackson-annotations" rev="2.13.3" conf="*->default" />
<dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.13.3" conf="*->default" />
<dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.13.3" conf="*->default" />
<dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.15.2" conf="*->default" />
<dependency org="com.fasterxml.jackson.core" name="jackson-annotations" rev="2.15.2" conf="*->default" />
<dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.15.2" conf="*->default" />
<dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.15.2" conf="*->default" />

<!-- WARC artifacts needed -->
<dependency org="org.netpreserve.commons" name="webarchive-commons" rev="1.1.9" conf="*->default">
Expand Down
7 changes: 7 additions & 0 deletions ivy/ivysettings.xml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,14 @@
<!-- pull in the local repository -->
<include url="${ivy.default.conf.dir}/ivyconf-local.xml"/>
<settings defaultResolver="default"/>
<property name="local-maven2-dir" value="${user.home}/.m2/repository/" />
<resolvers>
<filesystem name="local-maven-2" m2compatible="true">
<artifact
pattern="${local-maven2-dir}/[organisation]/[module]/[revision]/[module]-[revision].[ext]" />
<ivy
pattern="${local-maven2-dir}/[organisation]/[module]/[revision]/[module]-[revision].pom" />
</filesystem>
<ibiblio name="maven2"
root="${repo.maven.org}"
pattern="${maven2.pattern.ext}"
Expand Down
6 changes: 3 additions & 3 deletions src/plugin/indexer-cloudsearch/plugin.xml
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@
<library name="commons-logging-1.1.3.jar"/>
<library name="httpclient-4.3.6.jar"/>
<library name="httpcore-4.3.3.jar"/>
<library name="jackson-annotations-2.5.0.jar"/>
<library name="jackson-core-2.5.3.jar"/>
<library name="jackson-databind-2.5.3.jar"/>
<library name="jackson-annotations-2.15.2.jar"/>
<library name="jackson-core-2.15.2.jar"/>
<library name="jackson-databind-2.15.2.jar"/>
<library name="joda-time-2.8.jar"/>

</runtime>
Expand Down
6 changes: 3 additions & 3 deletions src/plugin/indexer-kafka/plugin.xml
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@
<library name="kafka_2.12-1.1.0.jar"/>
<library name="connect-json-1.1.0.jar"/>
<library name="connect-api-1.1.0.jar"/>
<library name="jackson-annotations-2.9.0.jar"/>
<library name="jackson-core-2.9.4.jar"/>
<library name="jackson-databind-2.9.4.jar"/>
<library name="jackson-annotations-2.15.2.jar"/>
<library name="jackson-core-2.15.2.jar"/>
<library name="jackson-databind-2.15.2.jar"/>
<library name="jopt-simple-5.0.4.jar"/>
<library name="kafka-clients-1.1.0.jar"/>
<library name="lz4-java-1.4.jar"/>
Expand Down
2 changes: 1 addition & 1 deletion src/plugin/indexer-solr/plugin.xml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
<export name="*" />
</library>
<!-- Solr dependencies -->
<library name="commons-io-2.8.0.jar"/>
<library name="commons-io-2.13.0.jar"/>
<library name="commons-lang-2.6.jar"/>
<library name="commons-math3-3.6.1.jar"/>
<library name="http2-client-9.4.44.v20210927.jar"/>
Expand Down
8 changes: 1 addition & 7 deletions src/plugin/language-identifier/ivy.xml
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,7 @@
</publications>

<dependencies>
<dependency org="org.apache.tika" name="tika-langdetect-optimaize" rev="2.3.0" conf="*->default">
<!-- exclusions of dependencies provided in Nutch core (ivy/ivy.xml) -->
<exclude org="org.apache.tika" name="tika-core" />
<exclude org="com.google.guava" name="guava" />
<exclude org="org.slf4j" name="slf4j-api" />
<!-- exclusions of dependencies provided in Nutch core (ivy/ivy.xml) -->
</dependency>
<dependency org="org.tallison.tika" name="tika-langdetect-optimaize-shaded" rev="2.9.0.0" conf="*->default" transitive="false"/>
</dependencies>

</ivy-module>
11 changes: 1 addition & 10 deletions src/plugin/language-identifier/plugin.xml
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,7 @@
<export name="*"/>
</library>
<!-- dependencies of Tika's Optimaize language detector (tika-langdetect-optimaize) -->
<library name="annotations-12.0.jar"/>
<library name="checker-qual-3.12.0.jar"/>
<library name="error_prone_annotations-2.7.1.jar"/>
<library name="failureaccess-1.0.1.jar"/>
<library name="j2objc-annotations-1.3.jar"/>
<library name="jsonic-1.2.11.jar"/>
<library name="jsr305-3.0.2.jar"/>
<library name="language-detector-0.6.jar"/>
<library name="listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar"/>
<library name="tika-langdetect-optimaize-2.3.0.jar"/>
<library name="tika-langdetect-optimaize-shaded-2.9.0.0.jar"/>
</runtime>

<requires>
Expand Down
6 changes: 3 additions & 3 deletions src/plugin/lib-htmlunit/plugin.xml
Original file line number Diff line number Diff line change
Expand Up @@ -50,16 +50,16 @@
<library name="checker-qual-3.12.0.jar">
<export name="*"/>
</library>
<library name="commons-codec-1.15.jar">
<library name="commons-codec-1.16.0.jar">
<export name="*"/>
</library>
<library name="commons-exec-1.3.jar">
<export name="*"/>
</library>
<library name="commons-io-2.10.0.jar">
<library name="commons-io-2.13.0.jar">
<export name="*"/>
</library>
<library name="commons-lang3-3.12.0.jar">
<library name="commons-lang3-3.13.0.jar">
<export name="*"/>
</library>
<library name="commons-logging-1.2.jar">
Expand Down
22 changes: 22 additions & 0 deletions src/plugin/parse-tika/howto_upgrade_tika.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,25 @@
We are currently using a shim (https://github.com/tballison/hadoop-safe-tika)
because of binary conflicts between the commons-io version that Hadoop supports and the more
modern commons-io features that Apache Tika and Apache POI use.

For now, all you have to do is update the fat jar dependencies:

1. tika-core-shaded in ivy/ivy.xml

2. tika-parsers-standard-package-shaded in src/plugin/parse-tika/ivy.xml

3. The library name version for tika-parsers-standard-package-shaded in src/plugin/parse-tika/plugin.xml

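4. Repeat steps 2 and 3 for the language-identifier plugin (tika-langdetect-optimaize-shaded in src/plugin/language-identifier/ivy.xml and src/plugin/language-identifier/plugin.xml)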

5. Build Nutch and run all unit tests:

$ cd ../../../
$ ant clean runtime test

The following directions are what we used to do with thin jars. Hopefully, we'll
be able to get back to these directions once the commons-io versions used by Hadoop and Tika/POI are in harmony.

1. Upgrade Tika dependency (tika-core) in ivy/ivy.xml

2. Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml
Expand Down
18 changes: 1 addition & 17 deletions src/plugin/parse-tika/ivy.xml
Original file line number Diff line number Diff line change
Expand Up @@ -36,23 +36,7 @@
</publications>

<dependencies>
<dependency org="org.apache.tika" name="tika-parsers-standard-package" rev="2.3.0" conf="*->default">
<!-- exclusions of dependencies provided in Nutch core (ivy/ivy.xml) -->
<exclude org="org.apache.tika" name="tika-core" />
<exclude org="org.apache.commons" name="commons-lang3" />
<exclude org="org.apache.commons" name="commons-collections4" />
<exclude org="commons-io" name="commons-io" />
<exclude org="commons-logging" name="commons-logging" />
<exclude org="org.slf4j" name="slf4j-api" />
<!-- exclusion of Xerces and xml-apis is mandatory so that there
are no instances in the child/plugin class loader -->
<exclude org="xerces" name="xercesImpl" />
<exclude org="xml-apis" name="xml-apis" />
<!-- common-codec and commons-compress must be included -->
<!--exclude org="org.apache.commons" name="commons-codec" /-->
<!--exclude org="commons-codec" name="commons-codec" /-->
<!--exclude org="org.apache.commons" name="commons-compress" /-->
</dependency>
<dependency org="org.tallison.tika" name="tika-parsers-standard-package-shaded" rev="2.9.0.0" conf="*->default" transitive="false"/>
</dependencies>

</ivy-module>
81 changes: 1 addition & 80 deletions src/plugin/parse-tika/plugin.xml
Original file line number Diff line number Diff line change
Expand Up @@ -25,86 +25,7 @@
<library name="parse-tika.jar">
<export name="*"/>
</library>
<!-- dependencies of Tika (tika-parsers) -->
<library name="apache-mime4j-core-0.8.4.jar"/>
<library name="apache-mime4j-dom-0.8.4.jar"/>
<library name="asm-9.2.jar"/>
<library name="bcmail-jdk15on-1.70.jar"/>
<library name="bcpkix-jdk15on-1.70.jar"/>
<library name="bcprov-jdk15on-1.70.jar"/>
<library name="bcutil-jdk15on-1.70.jar"/>
<library name="boilerpipe-1.1.0.jar"/>
<library name="commons-codec-1.15.jar"/>
<library name="commons-compress-1.21.jar"/>
<library name="commons-csv-1.9.0.jar"/>
<library name="commons-exec-1.3.jar"/>
<library name="commons-math3-3.6.1.jar"/>
<library name="curvesapi-1.06.jar"/>
<library name="dd-plist-1.23.jar"/>
<library name="dec-0.1.2.jar"/>
<library name="fontbox-2.0.25.jar"/>
<library name="istack-commons-runtime-3.0.12.jar"/>
<library name="jackcess-4.0.1.jar"/>
<library name="jackcess-encrypt-4.0.1.jar"/>
<library name="jai-imageio-core-1.4.0.jar"/>
<library name="jakarta.activation-1.2.2.jar"/>
<library name="jakarta.xml.bind-api-2.3.3.jar"/>
<library name="java-libpst-0.9.3.jar"/>
<library name="jaxb-runtime-2.3.5.jar"/>
<library name="jbig2-imageio-3.0.3.jar"/>
<library name="jcl-over-slf4j-1.7.35.jar"/>
<library name="jdom2-2.0.6.1.jar"/>
<library name="jempbox-1.8.16.jar"/>
<library name="jhighlight-1.0.3.jar"/>
<library name="jmatio-1.5.jar"/>
<library name="juniversalchardet-1.0.3.jar"/>
<library name="junrar-7.4.1.jar"/>
<library name="log4j-api-2.17.1.jar"/>
<library name="metadata-extractor-2.16.0.jar"/>
<library name="parso-2.0.14.jar"/>
<library name="pdfbox-2.0.25.jar"/>
<library name="pdfbox-debugger-2.0.25.jar"/>
<library name="pdfbox-tools-2.0.25.jar"/>
<library name="poi-5.2.0.jar"/>
<library name="poi-ooxml-5.2.0.jar"/>
<library name="poi-ooxml-lite-5.2.0.jar"/>
<library name="poi-scratchpad-5.2.0.jar"/>
<library name="rome-1.18.0.jar"/>
<library name="rome-utils-1.18.0.jar"/>
<library name="SparseBitSet-1.2.jar"/>
<library name="tagsoup-1.2.1.jar"/>
<library name="tika-parser-apple-module-2.3.0.jar"/>
<library name="tika-parser-audiovideo-module-2.3.0.jar"/>
<library name="tika-parser-cad-module-2.3.0.jar"/>
<library name="tika-parser-code-module-2.3.0.jar"/>
<library name="tika-parser-crypto-module-2.3.0.jar"/>
<library name="tika-parser-digest-commons-2.3.0.jar"/>
<library name="tika-parser-font-module-2.3.0.jar"/>
<library name="tika-parser-html-commons-2.3.0.jar"/>
<library name="tika-parser-html-module-2.3.0.jar"/>
<library name="tika-parser-image-module-2.3.0.jar"/>
<library name="tika-parser-mail-commons-2.3.0.jar"/>
<library name="tika-parser-mail-module-2.3.0.jar"/>
<library name="tika-parser-microsoft-module-2.3.0.jar"/>
<library name="tika-parser-miscoffice-module-2.3.0.jar"/>
<library name="tika-parser-news-module-2.3.0.jar"/>
<library name="tika-parser-ocr-module-2.3.0.jar"/>
<library name="tika-parser-pdf-module-2.3.0.jar"/>
<library name="tika-parser-pkg-module-2.3.0.jar"/>
<library name="tika-parsers-standard-package-2.3.0.jar"/>
<library name="tika-parser-text-module-2.3.0.jar"/>
<library name="tika-parser-xml-module-2.3.0.jar"/>
<library name="tika-parser-xmp-commons-2.3.0.jar"/>
<library name="tika-parser-zip-commons-2.3.0.jar"/>
<library name="txw2-2.3.5.jar"/>
<library name="vorbis-java-core-0.8.jar"/>
<library name="vorbis-java-tika-0.8.jar"/>
<library name="xmlbeans-5.0.3.jar"/>
<library name="xmpbox-2.0.25.jar"/>
<library name="xmpcore-6.1.11.jar"/>
<library name="xz-1.9.jar"/>

<!-- end of dependencies of Tika (tika-parsers) -->
<library name="tika-parsers-standard-package-shaded-2.9.0.0.jar"/>
</runtime>

<requires>
Expand Down

0 comments on commit 97eb0b5

Please sign in to comment.