parsa-epfl · BugraEryilmaz · May 20, 2024 · May 20, 2024 · May 20, 2024 · May 20, 2024
diff --git a/.wordlist.txt b/.wordlist.txt
@@ -148,3 +148,5 @@ pre
 latencies
 TLS
 Elgg's
+GiB
+MiB
diff --git a/benchmarks/data-serving/client/setup_tables.txt b/benchmarks/data-serving/client/setup_tables.txt
@@ -13,4 +13,6 @@ create table if not exists usertable (
     field7 varchar,
     field8 varchar,
     field9 varchar);
+ALTER TABLE ycsb.usertable WITH caching = {'keys' : 'ALL', 'rows_per_partition' : 'ALL'};
+ALTER TABLE ycsb.usertable WITH default_time_to_live = 1024000;
 exit;
diff --git a/benchmarks/data-serving/server/Dockerfile b/benchmarks/data-serving/server/Dockerfile
@@ -3,6 +3,7 @@ FROM cloudsuite/cassandra:4.1.0
 RUN apt update && apt install -y --no-install-recommends python3-yaml && rm -rf /var/lib/apt/lists/*
 
 COPY docker-entrypoint.py /
+COPY jvm11-server.options /etc/cassandra/jvm11-server.options
 
 ENTRYPOINT ["/docker-entrypoint.py"]
 

diff --git a/benchmarks/data-serving/server/docker-entrypoint.py b/benchmarks/data-serving/server/docker-entrypoint.py
@@ -24,6 +24,7 @@ def get_ip():
 parser.add_argument("--heap-size", type=int, help="The size of JVM heap in GB. Default is max(min(1/2 ram, 1GB), min(1/4 ram, 8GB)).")
 parser.add_argument("--seed-server-ip", help="The IP address of the seed server. This option is only for multiple-node deployment.")
 parser.add_argument("--affinity", help="The CPU ids (separated by comma) given to Cassandra to set JVM affinity. By default, Cassandra would use all CPU cores.")
+parser.add_argument("--row-cache", help="The size of the row cache. Also specify the unit, example 16GiB or 256MiB. By default, the row cache is disabled.", default="0MiB")
 
 
 args = parser.parse_args()
@@ -42,7 +43,6 @@ def get_ip():
 if not path.exists(f"{CASSANDRA_CONFIG}/jvm-server.options.bak"):
     shutil.copy(f"{CASSANDRA_CONFIG}/jvm-server.options", f"{CASSANDRA_CONFIG}/jvm-server.options.bak")
 
-
 # Now, modify the cassandra.yaml
 with open(f"{CASSANDRA_CONFIG}/cassandra.yaml") as f:
     config = yaml.safe_load(f)
@@ -56,6 +56,8 @@ def get_ip():
 config["concurrent_reads"] = args.reader_count
 config["concurrent_counter_writes"] = args.reader_count
 config["concurrent_writes"] = args.writer_count
+config["row_cache_size"] = args.row_cache
+config["row_cache_save_period"] = "1h"
 
 if args.seed_server_ip:
     config["seed_provider"][0]["parameters"][0]["seeds"] = f"{args.seed_server_ip}:7000"

diff --git a/benchmarks/data-serving/server/jvm11-server.options b/benchmarks/data-serving/server/jvm11-server.options
@@ -0,0 +1,108 @@
+###########################################################################
+#                         jvm11-server.options                            #
+#                                                                         #
+# See jvm-server.options. This file is specific for Java 11 and newer.    #
+###########################################################################
+
+#################
+#  GC SETTINGS  #
+#################
+
+
+
+### CMS Settings
+#-XX:+UseConcMarkSweepGC
+#-XX:+CMSParallelRemarkEnabled
+#-XX:SurvivorRatio=8
+#-XX:MaxTenuringThreshold=1
+#-XX:CMSInitiatingOccupancyFraction=75
+#-XX:+UseCMSInitiatingOccupancyOnly
+#-XX:CMSWaitDuration=10000
+#-XX:+CMSParallelInitialMarkEnabled
+#-XX:+CMSEdenChunksRecordAlways
+## some JVMs will fill up their heap when accessed via JMX, see CASSANDRA-6541
+#-XX:+CMSClassUnloadingEnabled
+
+
+
+### G1 Settings
+## Use the Hotspot garbage-first collector.
+-XX:+UseG1GC
+-XX:+ParallelRefProcEnabled
+
+#
+## Have the JVM do less remembered set work during STW, instead
+## preferring concurrent GC. Reduces p99.9 latency.
+#-XX:G1RSetUpdatingPauseTimePercent=5
+#
+## Main G1GC tunable: lowering the pause target will lower throughput and vice versa.
+## 200ms is the JVM default and lowest viable setting
+## 1000ms increases throughput. Keep it smaller than the timeouts in cassandra.yaml.
+#-XX:MaxGCPauseMillis=500
+
+## Optional G1 Settings
+# Save CPU time on large (>= 16GB) heaps by delaying region scanning
+# until the heap is 70% full. The default in Hotspot 8u40 is 40%.
+#-XX:InitiatingHeapOccupancyPercent=70
+
+# For systems with > 8 cores, the default ParallelGCThreads is 5/8 the number of logical cores.
+# Otherwise equal to the number of cores when 8 or less.
+# Machines with > 10 cores should try setting these to <= full cores.
+#-XX:ParallelGCThreads=16
+# By default, ConcGCThreads is 1/4 of ParallelGCThreads.
+# Setting both to the same value can reduce STW durations.
+#-XX:ConcGCThreads=16
+
+
+### JPMS
+
+-Djdk.attach.allowAttachSelf=true
+--add-exports java.base/jdk.internal.misc=ALL-UNNAMED
+--add-exports java.base/jdk.internal.ref=ALL-UNNAMED
+--add-exports java.base/sun.nio.ch=ALL-UNNAMED
+--add-exports java.management.rmi/com.sun.jmx.remote.internal.rmi=ALL-UNNAMED
+--add-exports java.rmi/sun.rmi.registry=ALL-UNNAMED
+--add-exports java.rmi/sun.rmi.server=ALL-UNNAMED
+--add-exports java.sql/java.sql=ALL-UNNAMED
+
+--add-opens java.base/java.lang.module=ALL-UNNAMED
+--add-opens java.base/jdk.internal.loader=ALL-UNNAMED
+--add-opens java.base/jdk.internal.ref=ALL-UNNAMED
+--add-opens java.base/jdk.internal.reflect=ALL-UNNAMED
+--add-opens java.base/jdk.internal.math=ALL-UNNAMED
+--add-opens java.base/jdk.internal.module=ALL-UNNAMED
+--add-opens java.base/jdk.internal.util.jar=ALL-UNNAMED
+--add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED
+
+
+### GC logging options -- uncomment to enable
+
+# Java 11 (and newer) GC logging options:
+# See description of https://bugs.openjdk.java.net/browse/JDK-8046148 for details about the syntax
+# The following is the equivalent to -XX:+PrintGCDetails -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=10 -XX:GCLogFileSize=10M
+#-Xlog:gc=info,heap*=trace,age*=debug,safepoint=info,promotion*=trace:file=/var/log/cassandra/gc.log:time,uptime,pid,tid,level:filecount=10,filesize=10485760
+
+# Notes for Java 8 migration:
+#
+# -XX:+PrintGCDetails                   maps to -Xlog:gc*:... - i.e. add a '*' after "gc"
+# -XX:+PrintGCDateStamps                maps to decorator 'time'
+#
+# -XX:+PrintHeapAtGC                    maps to 'heap' with level 'trace'
+# -XX:+PrintTenuringDistribution        maps to 'age' with level 'debug'
+# -XX:+PrintGCApplicationStoppedTime    maps to 'safepoint' with level 'info'
+# -XX:+PrintPromotionFailure            maps to 'promotion' with level 'trace'
+# -XX:PrintFLSStatistics=1              maps to 'freelist' with level 'trace'
+
+### Netty Options
+
+# On Java >= 9 Netty requires the io.netty.tryReflectionSetAccessible system property to be set to true to enable
+# creation of direct buffers using Unsafe. Without it, this falls back to ByteBuffer.allocateDirect which has
+# inferior performance and risks exceeding MaxDirectMemory
+-Dio.netty.tryReflectionSetAccessible=true
+
+### Preserve Frame pointer for flamegraph
+-XX:+PreserveFramePointer
+
+# The newline at the end of the file is intentional
+
+
diff --git a/docs/benchmarks/data-serving.md b/docs/benchmarks/data-serving.md
@@ -23,6 +23,8 @@ The following options can modify the settings of the server:
 - `--writer-count=<int>`: The number of writer threads Cassandra uses. Cassandra recommends 8 threads per CPU core. The default value is 32.
 - `--heap-size=<int>`: JVM heap size. Its unit is GB, and by default, JVM uses `max(min(1/2 ram, 1GB), min(1/4 ram, 8GB))`. It is good to increase the value when the server has enough DRAM for better performance or lower the value for explicit resource restriction.
 - `--affinity=<cpu_id, ...>`: The CPUs Cassandra works on. This setting let Cassandra be aware of its CPU affinity explicitly. It should be used together with the container's resource management option (e.g., `--cpuset-cpus`). 
+- `--row-cache=<size>`: The size of the row cache. Also specify the unit, for example `16GiB` or `256MiB`. By default, the row cache is disabled.
+
 
 ### Multiple Server Containers
 
@@ -68,6 +70,8 @@ You can give your expected load, and YCSB will try to meet the requirement. The
 
 More detailed instructions on generating the dataset and load can be found in Step 5 at [this](http://github.com/brianfrankcooper/YCSB/wiki/Running-a-Workload) link. Although Step 5 in the link describes the data loading procedure, other steps (e.g., 1, 2, 3, 4) are useful for understanding the YCSB settings. In this case, our scripts (`warmup.sh` and `load.sh`) are good templates for further customization.
 
+YCSB provides several pre-defined workloads. For example, Workload C has 100% read operations with no write operations. The default workload is Workload A (50% read + 50% write). You can change the workload by modifying the `load.sh` script. The other workloads can be found at [this](https://github.com/brianfrankcooper/YCSB/wiki/Core-Workloads) link.
+
 A rule of thumb on the dataset size
 -----------------------------------
 If you are only profiling CPU microarchitectures, you should ensure that the hot data part (3% ~ 5% of the dataset) cannot be buffered on-chip to mimic a realistic situation. Usually, a 10GB dataset is enough for a typical CPU with less than 50MB LLC.
@@ -78,7 +82,8 @@ Tuning the server performance
 2. The server settings are under the $CASSANDRA_PATH/conf folder. The main file is cassandra.yaml. The file has comments about all parameters. These parameters can also be found here: http://wiki.apache.org/cassandra/StorageConfiguration
 3. Make sure that half of the main memory is free for the operating system file buffers and caching. 
 4. As a workload based on JVM, you need to load the server to warm up the JIT cache. You can keep monitoring the throughput and tail latency and take measurement when it becomes relatively stable. As a reference, it takes around 2 minutes for a modern x86 machine (Skylake) to attain stable throughput (5000 RPS, 50% read and 50% update).
-5. The following links are useful pointers for performance tuning:
+5. The server has the row cache disabled by default. The row cache keeps data rows in memory. It is useful for read-intensive workloads (e.g., Workload B and Workload C) and can improve throughput. However, it is not recommended for write-intensive workloads, as frequent writes thrash the cache. Tune it according to your workload.
+6. The following links are useful pointers for performance tuning:
 
     a. http://spyced.blogspot.com/2010/01/linux-performance-basics.html