Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ingest Attachment: Upgrade Tika to 1.18 #31252

Merged
merged 10 commits into from
Jun 24, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 11 additions & 7 deletions plugins/ingest-attachment/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ esplugin {
}

versions << [
'tika': '1.17',
'pdfbox': '2.0.8',
'tika': '1.18',
'pdfbox': '2.0.9',
'bouncycastle': '1.55',
'poi': '3.17',
'mime4j': '0.8.1'
Expand All @@ -33,9 +33,10 @@ versions << [
dependencies {
// mandatory for tika
compile "org.apache.tika:tika-core:${versions.tika}"
// built against Jackson 2.9.5, but still works on our current version
compile "org.apache.tika:tika-parsers:${versions.tika}"
compile 'org.tukaani:xz:1.6'
compile 'commons-io:commons-io:2.5'
compile 'org.tukaani:xz:1.8'
compile 'commons-io:commons-io:2.6'
compile "org.slf4j:slf4j-api:${versions.slf4j}"

// character set detection
Expand All @@ -62,7 +63,7 @@ dependencies {
// MS Office
compile "org.apache.poi:poi-scratchpad:${versions.poi}"
// Apple iWork
compile 'org.apache.commons:commons-compress:1.14'
compile 'org.apache.commons:commons-compress:1.16.1'
// Outlook documents
compile "org.apache.james:apache-mime4j-core:${versions.mime4j}"
compile "org.apache.james:apache-mime4j-dom:${versions.mime4j}"
Expand Down Expand Up @@ -118,6 +119,10 @@ thirdPartyAudit.excludes = [
'com.drew.metadata.jpeg.JpegDirectory',
'com.github.junrar.Archive',
'com.github.junrar.rarfile.FileHeader',
'com.github.luben.zstd.ZstdInputStream',
'com.github.luben.zstd.ZstdOutputStream',
'com.github.openjson.JSONArray',
'com.github.openjson.JSONObject',
'com.google.common.reflect.TypeToken',
'com.google.gson.Gson',
'com.googlecode.mp4parser.DataSource',
Expand Down Expand Up @@ -531,6 +536,7 @@ thirdPartyAudit.excludes = [
'org.apache.commons.exec.PumpStreamHandler',
'org.apache.commons.exec.environment.EnvironmentUtils',
'org.apache.commons.lang.StringUtils',
'org.apache.commons.lang.SystemUtils',
'org.apache.ctakes.typesystem.type.refsem.UmlsConcept',
'org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation',
'org.apache.cxf.jaxrs.client.WebClient',
Expand Down Expand Up @@ -635,8 +641,6 @@ thirdPartyAudit.excludes = [
'org.etsi.uri.x01903.v13.impl.UnsignedSignaturePropertiesTypeImpl$1SignatureTimeStampList',
'org.etsi.uri.x01903.v14.ValidationDataType$Factory',
'org.etsi.uri.x01903.v14.ValidationDataType',
'org.json.JSONArray',
'org.json.JSONObject',
'org.json.simple.JSONArray',
'org.json.simple.JSONObject',
'org.json.simple.parser.JSONParser',
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
7b5cdabadb4cf12f5ee0f801399e70635583193f
1 change: 0 additions & 1 deletion plugins/ingest-attachment/licenses/commons-io-2.5.jar.sha1

This file was deleted.

1 change: 1 addition & 0 deletions plugins/ingest-attachment/licenses/commons-io-2.6.jar.sha1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
815893df5f31da2ece4040fe0a12fd44b577afaf
1 change: 0 additions & 1 deletion plugins/ingest-attachment/licenses/fontbox-2.0.8.jar.sha1

This file was deleted.

1 change: 1 addition & 0 deletions plugins/ingest-attachment/licenses/fontbox-2.0.9.jar.sha1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
f961f17ebdbc307e9055e3cf7c0e207f0895ae55
1 change: 0 additions & 1 deletion plugins/ingest-attachment/licenses/pdfbox-2.0.8.jar.sha1

This file was deleted.

1 change: 1 addition & 0 deletions plugins/ingest-attachment/licenses/pdfbox-2.0.9.jar.sha1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
d0425578218624388f2ec84a0b3a11efd55df0f5
1 change: 0 additions & 1 deletion plugins/ingest-attachment/licenses/tika-core-1.17.jar.sha1

This file was deleted.

1 change: 1 addition & 0 deletions plugins/ingest-attachment/licenses/tika-core-1.18.jar.sha1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
69556697de96cf0b22df846e970dafd29866eee0

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
7d9b6dea91d783165f3313d320d3aaaa9a4dfc13
1 change: 0 additions & 1 deletion plugins/ingest-attachment/licenses/xz-1.6.jar.sha1

This file was deleted.

1 change: 1 addition & 0 deletions plugins/ingest-attachment/licenses/xz-1.8.jar.sha1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
c4f7d054303948eb6a4066194253886c8af07128
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ static PermissionCollection getRestrictedPermissions() {
perms.add(new SecurityPermission("putProviderProperty.BC"));
perms.add(new SecurityPermission("insertProvider"));
perms.add(new ReflectPermission("suppressAccessChecks"));
perms.add(new RuntimePermission("accessClassInPackage.sun.java2d.cmm.kcms"));
// xmlbeans, used by POI, needs to get the context classloader
perms.add(new RuntimePermission("getClassLoader"));
// ZipFile needs accessDeclaredMembers on JDK 10; cf. https://bugs.openjdk.java.net/browse/JDK-8187485
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,6 @@ grant {
permission java.lang.RuntimePermission "getClassLoader";
// ZipFile needs accessDeclaredMembers on Java 10
permission java.lang.RuntimePermission "accessDeclaredMembers";
// PDFBox checks for the existence of this class
permission java.lang.RuntimePermission "accessClassInPackage.sun.java2d.cmm.kcms";
};
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,12 @@ public void testAsciidocDocument() throws Exception {
assertThat(attachmentData.get("content_type").toString(), containsString("text/plain"));
}

/**
 * Feeds the {@code bad_tika.zip} fixture to the processor and expects the
 * parse attempt to end with an exception.
 *
 * Background: https://issues.apache.org/jira/browse/COMPRESS-432 describes the
 * issue that causes a zip file to hang in Tika versions prior to 1.18.
 */
public void testZipFileDoesNotHang() {
    expectThrows(Exception.class, () -> {
        parseDocument("bad_tika.zip", processor);
    });
}

public void testParseAsBytesArray() throws Exception {
String path = "/org/elasticsearch/ingest/attachment/test/sample-files/text-in-english.txt";
byte[] bytes;
Expand Down
Binary file not shown.