Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into shardLock
Browse files Browse the repository at this point in the history
  • Loading branch information
Hailong-am committed May 21, 2024
2 parents 0aed43a + 3fe6674 commit 35df6aa
Show file tree
Hide file tree
Showing 70 changed files with 2,877 additions and 582 deletions.
7 changes: 7 additions & 0 deletions .github/workflows/gradle-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,15 @@ jobs:
- name: Setup environment variables (PR)
if: github.event_name == 'pull_request_target'
run: |
echo "event_name=pull_request_target" >> $GITHUB_ENV
echo "branch_name=$(jq --raw-output .pull_request.base.ref $GITHUB_EVENT_PATH)" >> $GITHUB_ENV
echo "pr_from_sha=$(jq --raw-output .pull_request.head.sha $GITHUB_EVENT_PATH)" >> $GITHUB_ENV
echo "pr_from_clone_url=$(jq --raw-output .pull_request.head.repo.clone_url $GITHUB_EVENT_PATH)" >> $GITHUB_ENV
echo "pr_to_clone_url=$(jq --raw-output .pull_request.base.repo.clone_url $GITHUB_EVENT_PATH)" >> $GITHUB_ENV
echo "pr_title=$(jq --raw-output .pull_request.title $GITHUB_EVENT_PATH)" >> $GITHUB_ENV
echo "pr_number=$(jq --raw-output .pull_request.number $GITHUB_EVENT_PATH)" >> $GITHUB_ENV
echo "pr_owner=$(jq --raw-output .pull_request.user.login $GITHUB_EVENT_PATH)" >> $GITHUB_ENV
echo "pr_or_commit_description=$(jq --ascii-output .pull_request.body $GITHUB_EVENT_PATH)" >> $GITHUB_ENV
- name: Setup environment variables (Push)
if: github.event_name == 'push'
Expand All @@ -43,11 +47,14 @@ jobs:
ref_id=$(git rev-parse HEAD)
branch_name=$(git rev-parse --abbrev-ref HEAD)
echo "branch_name=$branch_name" >> $GITHUB_ENV
echo "event_name=push" >> $GITHUB_ENV
echo "pr_from_sha=$ref_id" >> $GITHUB_ENV
echo "pr_from_clone_url=$repo_url" >> $GITHUB_ENV
echo "pr_to_clone_url=$repo_url" >> $GITHUB_ENV
echo "pr_title=Push trigger $branch_name $ref_id $repo_url" >> $GITHUB_ENV
echo "pr_owner=$(jq --raw-output '.commits[0].author.username' $GITHUB_EVENT_PATH)" >> $GITHUB_ENV
echo "pr_number=Null" >> $GITHUB_ENV
echo "pr_or_commit_description=$(jq --ascii-output .head_commit.message $GITHUB_EVENT_PATH)" >> $GITHUB_ENV
- name: Checkout opensearch-build repo
uses: actions/checkout@v4
Expand Down
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

## [Unreleased 2.x]
### Added
- Add latency metrics for instrumenting critical clusterManager code paths ([#12333](https://github.com/opensearch-project/OpenSearch/pull/12333))
- Add support for Azure Managed Identity in repository-azure ([#12423](https://github.com/opensearch-project/OpenSearch/issues/12423))
- Add useCompoundFile index setting ([#13478](https://github.com/opensearch-project/OpenSearch/pull/13478))
- Make outbound side of transport protocol dependent ([#13293](https://github.com/opensearch-project/OpenSearch/pull/13293))
Expand All @@ -17,10 +18,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Bump `com.gradle.enterprise` from 3.17.2 to 3.17.3 ([#13641](https://github.com/opensearch-project/OpenSearch/pull/13641))
- Bump `org.apache.hadoop:hadoop-minicluster` from 3.3.6 to 3.4.0 ([#13642](https://github.com/opensearch-project/OpenSearch/pull/13642))
- Bump `mockito` from 5.11.0 to 5.12.0 ([#13665](https://github.com/opensearch-project/OpenSearch/pull/13665))
- Bump `com.google.code.gson:gson` from 2.10.1 to 2.11.0 ([#13752](https://github.com/opensearch-project/OpenSearch/pull/13752))

### Changed
- Add ability for Boolean and date field queries to run when only doc_values are enabled ([#11650](https://github.com/opensearch-project/OpenSearch/pull/11650))
- Refactor implementations of query phase searcher, allow QueryCollectorContext to have zero collectors ([#13481](https://github.com/opensearch-project/OpenSearch/pull/13481))
- Adds support to inject telemetry instances to plugins ([#13636](https://github.com/opensearch-project/OpenSearch/pull/13636))

### Deprecated

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,54 +32,67 @@

package org.opensearch.ingest.attachment;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.lucene.tests.util.LuceneTestCase.SuppressFileSystems;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.tika.metadata.Metadata;
import org.opensearch.common.io.PathUtils;
import org.opensearch.common.xcontent.XContentHelper;
import org.opensearch.common.xcontent.json.JsonXContent;
import org.opensearch.test.OpenSearchTestCase;

import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Map;

/**
* Evil test-coverage cheat, we parse a bunch of docs from tika
* so that we have a nice grab-bag variety, and assert some content
* comes back and no exception.
* Parse sample tika documents and assert that the contents have not changed according to previously recorded checksums.
* Uncaught changes to tika parsing could potentially pose bwc issues.
* Note: In some cases tika will access a user's locale to inform the parsing of a file.
* The checksums of these files are left empty, and we only validate that parsed content is not null.
*/
@SuppressFileSystems("ExtrasFS") // don't try to parse extraN
public class TikaDocTests extends OpenSearchTestCase {

/** some test files from tika test suite, zipped up */
/** some test files from the apache tika unit test suite with accompanying sha1 checksums */
static final String TIKA_FILES = "/org/opensearch/ingest/attachment/test/tika-files/";
static final String TIKA_CHECKSUMS = "/org/opensearch/ingest/attachment/test/.checksums";

public void testFiles() throws Exception {
Path tmp = createTempDir();
logger.debug("unzipping all tika sample files");
try (DirectoryStream<Path> stream = Files.newDirectoryStream(PathUtils.get(getClass().getResource(TIKA_FILES).toURI()))) {
for (Path doc : stream) {
String filename = doc.getFileName().toString();
TestUtil.unzip(getClass().getResourceAsStream(TIKA_FILES + filename), tmp);
}
}
public void testParseSamples() throws Exception {
String checksumJson = Files.readString(PathUtils.get(getClass().getResource(TIKA_CHECKSUMS).toURI()));
Map<String, Object> checksums = XContentHelper.convertToMap(JsonXContent.jsonXContent, checksumJson, false);
DirectoryStream<Path> stream = Files.newDirectoryStream(unzipToTemp(TIKA_FILES));

try (DirectoryStream<Path> stream = Files.newDirectoryStream(tmp)) {
for (Path doc : stream) {
logger.debug("parsing: {}", doc);
assertParseable(doc);
for (Path doc : stream) {
String parsedContent = tryParse(doc);
assertNotNull(parsedContent);
assertFalse(parsedContent.isEmpty());

String check = checksums.get(doc.getFileName().toString()).toString();
if (!check.isEmpty()) {
assertEquals(check, DigestUtils.sha1Hex(parsedContent));
}
}

stream.close();
}

void assertParseable(Path fileName) throws Exception {
try {
byte bytes[] = Files.readAllBytes(fileName);
String parsedContent = TikaImpl.parse(bytes, new Metadata(), -1);
assertNotNull(parsedContent);
assertFalse(parsedContent.isEmpty());
logger.debug("extracted content: {}", parsedContent);
} catch (Exception e) {
throw new RuntimeException("parsing of filename: " + fileName.getFileName() + " failed", e);
/**
 * Unzips every entry of a classpath resource directory into a newly created temp directory.
 *
 * @param zipDir classpath resource directory holding the zipped samples; each file name is
 *               appended to it directly, so it needs to end with a path separator
 * @return the temp directory that received the unzipped content
 * @throws Exception if the resource directory cannot be listed or an archive cannot be unzipped
 */
private Path unzipToTemp(String zipDir) throws Exception {
    Path tmp = createTempDir();

    // try-with-resources releases the directory handle even when unzip throws part-way through
    try (DirectoryStream<Path> stream = Files.newDirectoryStream(PathUtils.get(getClass().getResource(zipDir).toURI()))) {
        for (Path doc : stream) {
            String filename = doc.getFileName().toString();
            TestUtil.unzip(getClass().getResourceAsStream(zipDir + filename), tmp);
        }
    }

    return tmp;
}

/**
 * Loads the complete file into memory and hands it to {@code TikaImpl.parse}.
 *
 * @param doc path of the sample document to parse
 * @return the string returned by {@code TikaImpl.parse} for the file's bytes
 * @throws Exception if the file cannot be read or the parser fails
 */
private String tryParse(Path doc) throws Exception {
    final byte[] content = Files.readAllBytes(doc);
    final Metadata metadata = new Metadata();
    return TikaImpl.parse(content, metadata, -1);
}
}
Loading

0 comments on commit 35df6aa

Please sign in to comment.