Merge remote-tracking branch 'upstream/main' into 13302
Signed-off-by: Ashish Singh <ssashish@amazon.com>
ashking94 committed Apr 29, 2024
2 parents c0bcbb7 + fc81a90 commit de1f70f
Showing 23 changed files with 1,610 additions and 488 deletions.
7 changes: 2 additions & 5 deletions .github/workflows/assemble.yml
@@ -8,7 +8,7 @@ jobs:
strategy:
matrix:
java: [ 11, 17, 21 ]
os: [ubuntu-latest, windows-latest, macos-latest]
os: [ubuntu-latest, windows-latest, macos-13]
steps:
- uses: actions/checkout@v4
- name: Set up JDK ${{ matrix.java }}
@@ -18,10 +18,7 @@
distribution: temurin
- name: Setup docker (missing on MacOS)
if: runner.os == 'macos'
run: |
brew install docker
colima start
sudo ln -sf $HOME/.colima/default/docker.sock /var/run/docker.sock
uses: douglascamata/setup-docker-macos-action@main
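# The community setup-docker-macos-action installs and starts a Docker engine on the macOS runner, replacing the manual brew/colima bootstrap above.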
- name: Run Gradle (assemble)
run: |
./gradlew assemble --parallel --no-build-cache -PDISABLE_BUILD_CACHE
2 changes: 1 addition & 1 deletion .github/workflows/precommit.yml
@@ -8,7 +8,7 @@ jobs:
strategy:
matrix:
java: [ 11, 17, 21 ]
os: [ubuntu-latest, windows-latest, macos-latest]
os: [ubuntu-latest, windows-latest, macos-13]
steps:
- uses: actions/checkout@v4
- name: Set up JDK ${{ matrix.java }}
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -24,7 +24,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Add an individual setting of rate limiter for segment replication ([#12959](https://github.com/opensearch-project/OpenSearch/pull/12959))
- [Streaming Indexing] Ensure support of the new transport by security plugin ([#13174](https://github.com/opensearch-project/OpenSearch/pull/13174))
- Add cluster setting to dynamically configure the buckets for filter rewrite optimization. ([#13179](https://github.com/opensearch-project/OpenSearch/pull/13179))
- [Tiered Caching] Gate new stats logic behind FeatureFlags.PLUGGABLE_CACHE ([#13238](https://github.com/opensearch-project/OpenSearch/pull/13238))
- [Tiered Caching] Add a dynamic setting to disable/enable disk cache. ([#13373](https://github.com/opensearch-project/OpenSearch/pull/13373))
- [Remote Store] Add capability of doing refresh as determined by the translog ([#12992](https://github.com/opensearch-project/OpenSearch/pull/12992))
- [Tiered caching] Make Indices Request Cache Stale Key Mgmt Threshold setting dynamic ([#12941](https://github.com/opensearch-project/OpenSearch/pull/12941))
- Batch mode for async fetching shard information in GatewayAllocator for unassigned shards ([#8746](https://github.com/opensearch-project/OpenSearch/pull/8746))
- [Remote Store] Add settings for remote path type and hash algorithm ([#13225](https://github.com/opensearch-project/OpenSearch/pull/13225))
- [Remote Store] Upload remote paths during remote enabled index creation ([#13386](https://github.com/opensearch-project/OpenSearch/pull/13386))
@@ -59,6 +62,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Ignoring unavailable shards during search request execution with ignore_available parameter ([#13298](https://github.com/opensearch-project/OpenSearch/pull/13298))
- Refactoring globMatch using simpleMatchWithNormalizedStrings from Regex ([#13104](https://github.com/opensearch-project/OpenSearch/pull/13104))
- [BWC and API enforcement] Reconsider the breaking changes check policy to detect breaking changes against released versions ([#13292](https://github.com/opensearch-project/OpenSearch/pull/13292))
- Switch to macos-13 runner for precommit and assemble GitHub actions because macos-latest is now arm64 ([#13412](https://github.com/opensearch-project/OpenSearch/pull/13412))

### Deprecated

11 changes: 11 additions & 0 deletions CONTRIBUTING.md
@@ -9,6 +9,7 @@
- [Changelog](#changelog)
- [Review Process](#review-process)
- [Tips for Success](#tips)
- [Troubleshooting Failing Builds](#troubleshooting-failing-builds)

# Contributing to OpenSearch

@@ -180,3 +181,13 @@ We have a lot of mechanisms to help expedite towards an accepted PR. Here are so

In general, adding more guardrails to your changes increases the likelihood of swift PR acceptance. We can always relax these guard rails in smaller followup PRs. Reverting a GA feature is much more difficult. Check out the [DEVELOPER_GUIDE](./DEVELOPER_GUIDE.md#submitting-changes) for more useful tips.

## Troubleshooting Failing Builds

The OpenSearch testing framework offers many capabilities but is also quite complex: it does a lot of randomization internally to cover as many edge cases and variations as possible. Unfortunately, this makes it harder to discover important issues/bugs in a straightforward way and may lead to so-called flaky tests, i.e. tests which flip randomly from success to failure without any code changes.

If one or more checks on your pull request report failing tests, please:
- check whether there is an existing [issue](https://github.com/opensearch-project/OpenSearch/issues) reported for the test in question
- if not, make sure the failure is not caused by your changes by running the failing test(s) locally for some time (see the sketch below)
- if you are sure the failure is not related, open a new [bug](https://github.com/opensearch-project/OpenSearch/issues/new?assignees=&labels=bug%2C+untriaged&projects=&template=bug_template.md&title=%5BBUG%5D) with the `flaky-test` label
- add a comment to your pull request referencing the issue(s) or bug report(s) to explain the failing build(s)
- as a bonus, try to contribute by fixing the flaky test(s)
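
A common way to confirm that a failure is genuinely flaky, rather than caused by your changes, is to run the suspect test repeatedly under different random seeds. Below is a minimal sketch assuming the `randomizedtesting` framework that the OpenSearch test suite is built on; `MyFlakyTests` and `testSuspectedFlakyBehavior` are hypothetical placeholders for the real suite and test.

```java
import com.carrotsearch.randomizedtesting.annotations.Repeat;

import org.opensearch.test.OpenSearchTestCase;

public class MyFlakyTests extends OpenSearchTestCase {

    // Run the test 50 times, each iteration with a fresh random seed, to
    // surface failures that appear only under particular randomizations.
    // This is a local debugging aid; remove the annotation before pushing.
    @Repeat(iterations = 50, useConstantSeed = false)
    public void testSuspectedFlakyBehavior() {
        // body of the failing test under investigation
    }
}
```

When an iteration fails, the framework prints a reproduction line whose `-Dtests.seed=...` value can be passed back to Gradle to replay the exact randomization that triggered the failure.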
@@ -425,6 +425,121 @@ public void testWithExplicitCacheClear() throws Exception {
}, 1, TimeUnit.SECONDS);
}

public void testWithDynamicDiskCacheSetting() throws Exception {
int onHeapCacheSizeInBytes = 10; // Keep it low so that all items are cached onto disk.
internalCluster().startNode(
Settings.builder()
.put(defaultSettings(onHeapCacheSizeInBytes + "b"))
.put(INDICES_CACHE_CLEAN_INTERVAL_SETTING.getKey(), new TimeValue(1))
.build()
);
Client client = client();
assertAcked(
client.admin()
.indices()
.prepareCreate("index")
.setMapping("k", "type=keyword")
.setSettings(
Settings.builder()
.put(IndicesRequestCache.INDEX_CACHE_REQUEST_ENABLED_SETTING.getKey(), true)
.put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1)
.put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)
.put("index.refresh_interval", -1)
)
.get()
);
// Update took time policy to zero so that all entries are eligible to be cached on disk.
ClusterUpdateSettingsRequest updateSettingsRequest = new ClusterUpdateSettingsRequest().transientSettings(
Settings.builder()
.put(
TieredSpilloverCacheSettings.TOOK_TIME_POLICY_CONCRETE_SETTINGS_MAP.get(CacheType.INDICES_REQUEST_CACHE).getKey(),
new TimeValue(0, TimeUnit.MILLISECONDS)
)
.build()
);
assertAcked(internalCluster().client().admin().cluster().updateSettings(updateSettingsRequest).get());
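// Transient settings take effect immediately, so from here on every cached entry is eligible for the disk tier.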
int numberOfIndexedItems = randomIntBetween(5, 10);
for (int iterator = 0; iterator < numberOfIndexedItems; iterator++) {
indexRandom(true, client.prepareIndex("index").setSource("k" + iterator, "hello" + iterator));
}
ensureSearchable("index");
refreshAndWaitForReplication();
// Force merge the index to ensure there can be no background merges during the subsequent searches that would invalidate the cache
ForceMergeResponse forceMergeResponse = client.admin().indices().prepareForceMerge("index").setFlush(true).get();
OpenSearchAssertions.assertAllSuccessful(forceMergeResponse);
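// Capture the size of a single cached entry on the first miss; the assertions below multiply it by the query count to compute expected totals.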
long perQuerySizeInCacheInBytes = -1;
// Step 1: Hit some queries. We will see misses and queries will be cached(onto disk cache) for subsequent
// requests.
for (int iterator = 0; iterator < numberOfIndexedItems; iterator++) {
SearchResponse resp = client.prepareSearch("index")
.setRequestCache(true)
.setQuery(QueryBuilders.termQuery("k" + iterator, "hello" + iterator))
.get();
if (perQuerySizeInCacheInBytes == -1) {
RequestCacheStats requestCacheStats = getRequestCacheStats(client, "index");
perQuerySizeInCacheInBytes = requestCacheStats.getMemorySizeInBytes();
}
assertSearchResponse(resp);
}

RequestCacheStats requestCacheStats = getRequestCacheStats(client, "index");
assertEquals(numberOfIndexedItems * perQuerySizeInCacheInBytes, requestCacheStats.getMemorySizeInBytes());
assertEquals(numberOfIndexedItems, requestCacheStats.getMissCount());
assertEquals(0, requestCacheStats.getHitCount());
assertEquals(0, requestCacheStats.getEvictions());

// Step 2: Hit same queries again. We will see hits now.
for (int iterator = 0; iterator < numberOfIndexedItems; iterator++) {
SearchResponse resp = client.prepareSearch("index")
.setRequestCache(true)
.setQuery(QueryBuilders.termQuery("k" + iterator, "hello" + iterator))
.get();
assertSearchResponse(resp);
}
requestCacheStats = getRequestCacheStats(client, "index");
assertEquals(numberOfIndexedItems * perQuerySizeInCacheInBytes, requestCacheStats.getMemorySizeInBytes());
assertEquals(numberOfIndexedItems, requestCacheStats.getMissCount());
assertEquals(numberOfIndexedItems, requestCacheStats.getHitCount());
assertEquals(0, requestCacheStats.getEvictions());
long lastKnownHitCount = requestCacheStats.getHitCount();
long lastKnownMissCount = requestCacheStats.getMissCount();

// Step 3: Turn off disk cache now. And hit same queries again. We should not see hits now as all queries
// were cached onto disk cache.
updateSettingsRequest = new ClusterUpdateSettingsRequest().transientSettings(
Settings.builder()
.put(TieredSpilloverCacheSettings.DISK_CACHE_ENABLED_SETTING_MAP.get(CacheType.INDICES_REQUEST_CACHE).getKey(), false)
.build()
);
assertAcked(internalCluster().client().admin().cluster().updateSettings(updateSettingsRequest).get());

for (int iterator = 0; iterator < numberOfIndexedItems; iterator++) {
SearchResponse resp = client.prepareSearch("index")
.setRequestCache(true)
.setQuery(QueryBuilders.termQuery("k" + iterator, "hello" + iterator))
.get();
assertSearchResponse(resp);
}
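// With the disk tier disabled, lookups only consult the tiny on-heap tier, so every query misses again.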
requestCacheStats = getRequestCacheStats(client, "index");
// Still shows disk cache entries, since an explicit clear or invalidation is required to clean up the disk cache.
assertEquals(numberOfIndexedItems * perQuerySizeInCacheInBytes, requestCacheStats.getMemorySizeInBytes());
assertEquals(lastKnownMissCount + numberOfIndexedItems, requestCacheStats.getMissCount());
assertEquals(0, lastKnownHitCount - requestCacheStats.getHitCount()); // No new hits being seen.
lastKnownMissCount = requestCacheStats.getMissCount();
lastKnownHitCount = requestCacheStats.getHitCount();

// Step 4: Invalidate entries via refresh.
// Explicit refresh would invalidate cache entries.
refreshAndWaitForReplication();
assertBusy(() -> {
// Explicit refresh should clear up cache entries
assertTrue(getRequestCacheStats(client, "index").getMemorySizeInBytes() == 0);
}, 1, TimeUnit.SECONDS);
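// The refresh only evicts cached entries; the cumulative hit and miss counters remain unchanged.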
requestCacheStats = getRequestCacheStats(client, "index");
assertEquals(0, lastKnownMissCount - requestCacheStats.getMissCount());
assertEquals(0, lastKnownHitCount - requestCacheStats.getHitCount());
}

private RequestCacheStats getRequestCacheStats(Client client, String indexName) {
return client.admin().indices().prepareStats(indexName).setRequestCache(true).get().getTotal().getRequestCache();
}