Skip to content

Commit

Permalink
Include ClickHouse beta JSON type results in super command doc (#5513)
Browse files Browse the repository at this point in the history
Run super cmd perf queries with ClickHouse JSON type
  • Loading branch information
philrz authored Dec 3, 2024
1 parent 26269cc commit 5ff9972
Show file tree
Hide file tree
Showing 12 changed files with 888 additions and 162 deletions.
461 changes: 311 additions & 150 deletions docs/commands/super.md

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions scripts/super-cmd-perf/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ The run proceeds in three phases:
2. Test data is downloaded and loaded into needed storage formats
3. Queries are executed on all data platforms

The scripts only exercise ClickHouse's [beta JSON type](https://clickhouse.com/blog/a-new-powerful-json-data-type-for-clickhouse)
when run on AWS. When we attempted to load data into this type on our
MacBooks, which have 16 GB of RAM, the load consistently failed with a
"too many open files" error.

As the benchmarks may take a long time to run, the use of [`screen`](https://www.gnu.org/software/screen/)
or a similar "detachable" terminal tool is recommended in case your remote
network connection drops during a run.
Expand Down
23 changes: 13 additions & 10 deletions scripts/super-cmd-perf/benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,6 @@ if command -v dmidecode && [ "$(sudo dmidecode --string system-uuid | cut -c1-3)
echo 'export TMPDIR="/mnt/tmpdir"' >> "$HOME"/.profile
mkdir /mnt/tmpdir

# Install ClickHouse
# Skipped entirely when a clickhouse-client binary is already on the PATH.
if ! command -v clickhouse-client > /dev/null 2>&1; then
# Packages needed to fetch and verify the ClickHouse apt repository.
sudo apt-get install -y apt-transport-https ca-certificates curl gnupg
# Download the repository signing key and store it de-armored as a keyring.
curl -fsSL 'https://packages.clickhouse.com/rpm/lts/repodata/repomd.xml.key' | sudo gpg --dearmor -o /usr/share/keyrings/clickhouse-keyring.gpg
# Register the ClickHouse "stable" apt repository, verified with that keyring.
echo "deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb stable main" | sudo tee \
/etc/apt/sources.list.d/clickhouse.list
sudo apt-get update
# DEBIAN_FRONTEND=noninteractive keeps apt from prompting during the install.
sudo DEBIAN_FRONTEND=noninteractive apt-get install -y clickhouse-client
fi

# Install DuckDB
if ! command -v duckdb > /dev/null 2>&1; then
curl -L -O https://github.com/duckdb/duckdb/releases/download/v1.1.3/duckdb_cli-linux-amd64.zip
Expand Down Expand Up @@ -69,6 +59,19 @@ if command -v dmidecode && [ "$(sudo dmidecode --string system-uuid | cut -c1-3)

cd scripts/super-cmd-perf

# Install ClickHouse
# Skipped entirely when a clickhouse-client binary is already on the PATH.
if ! command -v clickhouse-client > /dev/null 2>&1; then
# Packages needed to fetch and verify the ClickHouse apt repository.
sudo apt-get install -y apt-transport-https ca-certificates curl gnupg
# Download the repository signing key and store it de-armored as a keyring.
curl -fsSL 'https://packages.clickhouse.com/rpm/lts/repodata/repomd.xml.key' | sudo gpg --dearmor -o /usr/share/keyrings/clickhouse-keyring.gpg
# Register the ClickHouse "stable" apt repository, verified with that keyring.
echo "deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb stable main" | sudo tee \
/etc/apt/sources.list.d/clickhouse.list
sudo apt-get update
# Install both the server and the client; DEBIAN_FRONTEND=noninteractive
# keeps apt from prompting during the install.
sudo DEBIAN_FRONTEND=noninteractive apt-get install -y clickhouse-server clickhouse-client
# Drop in the storage overrides from the current directory. This relies on
# the working directory containing clickhouse-storage.xml at this point.
sudo cp clickhouse-storage.xml /etc/clickhouse-server/config.d
# Leave the server stopped and disabled; presumably so it only runs when the
# benchmark scripts start it explicitly -- confirm against the other scripts.
sudo systemctl stop clickhouse-server
sudo systemctl disable clickhouse-server.service
fi

fi

rundir="$(date +%F_%T)"
Expand Down
5 changes: 5 additions & 0 deletions scripts/super-cmd-perf/clickhouse-storage.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<clickhouse>
<!-- Server config override that places ClickHouse's data path, temporary
     path, and the base directory for cached disks under /mnt/clickhouse/. -->
<custom_cached_disks_base_directory>/mnt/clickhouse/caches/</custom_cached_disks_base_directory>
<path>/mnt/clickhouse/</path>
<tmp_path>/mnt/clickhouse/tmp/</tmp_path>
</clickhouse>
3 changes: 3 additions & 0 deletions scripts/super-cmd-perf/clickhouse-table-create.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- Turn on the JSON column type for this session before it is used below.
SET enable_json_type = 1;
-- One table with a single JSON column "v"; ORDER BY tuple() declares no
-- sorting key.
CREATE TABLE gha (v JSON) ENGINE MergeTree() ORDER BY tuple();
-- Read every gzipped JSON file matching the glob and insert each JSON object
-- as one value of column "v" (JSONAsObject input format).
INSERT INTO gha SELECT * FROM file('gharchive_gz/*.json.gz', JSONAsObject);
14 changes: 14 additions & 0 deletions scripts/super-cmd-perf/prep-data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ mkdir -p "$rundir"

# Default RUNNING_ON_AWS_EC2 to the empty string when the caller did not set
# it, so the -n tests on it are well-defined.
RUNNING_ON_AWS_EC2="${RUNNING_ON_AWS_EC2:-}"
# On AWS EC2, do the remaining work from /mnt. The ClickHouse table-creation
# SQL is copied there first so it can still be found by relative path after
# the directory change.
if [ -n "$RUNNING_ON_AWS_EC2" ]; then
cp clickhouse-table-create.sql /mnt
cd /mnt
fi

Expand Down Expand Up @@ -55,4 +56,17 @@ run_cmd \
"$rundir/super-bsup-create.out" \
"super -o gha.bsup gharchive_gz/*.json.gz"

# Load the test data into a ClickHouse table (AWS EC2 runs only).
#
# Steps: expose the gzipped JSON under ClickHouse's user_files directory,
# start the server, wait for it to accept queries, run the table-creation
# SQL through run_cmd, stop the server again, and report the on-disk size of
# the resulting store.
if [ -n "$RUNNING_ON_AWS_EC2" ]; then
  # The table-creation SQL reads 'gharchive_gz/*.json.gz' through ClickHouse's
  # file() table function, so the data directory is linked into user_files.
  sudo mkdir -p /var/lib/clickhouse/user_files
  sudo chown clickhouse:clickhouse /var/lib/clickhouse/user_files
  # -f/-n replace a link left behind by an earlier run instead of failing
  # with "File exists".
  sudo ln -sfn /mnt/gharchive_gz /var/lib/clickhouse/user_files/gharchive_gz
  sudo systemctl start clickhouse-server

  # Poll until the server answers a trivial query rather than trusting a
  # fixed sleep. After ~30 seconds give up waiting and carry on, so the
  # failure (if any) is captured by run_cmd below.
  ch_wait_attempts=0
  until clickhouse-client --query "SELECT 1" > /dev/null 2>&1; do
    ch_wait_attempts=$((ch_wait_attempts + 1))
    if [ "$ch_wait_attempts" -ge 30 ]; then
      echo "warning: clickhouse-server did not respond within 30 seconds" >&2
      break
    fi
    sleep 1
  done

  run_cmd \
    "$rundir/clickhouse-table-create.out" \
    "clickhouse-client < clickhouse-table-create.sql"
  sudo systemctl stop clickhouse-server
  # Relative path: resolved against the current working directory.
  du -h clickhouse/store
fi

du -h gha.db gha.parquet gha.bsup gharchive_gz
5 changes: 5 additions & 0 deletions scripts/super-cmd-perf/queries/agg-clickhouse-db.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
-- Aggregation query: number of events per event type for the duckdb/duckdb
-- repository.
-- The setting below is enabled because the GROUP BY key (v.type) is a path
-- inside the JSON column; presumably ClickHouse rejects that key type
-- otherwise -- confirm against the ClickHouse settings reference.
SET allow_suspicious_types_in_group_by = 1;
SELECT count(),v.type
-- The quoted source token is a placeholder; presumably the runner script
-- substitutes the real table name before execution.
FROM '__SOURCE__'
WHERE v.repo.name='duckdb/duckdb'
GROUP BY v.type
3 changes: 3 additions & 0 deletions scripts/super-cmd-perf/queries/count-clickhouse-db.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- Count query: number of rows whose actor login is exactly 'johnbieren'.
SELECT count()
-- The quoted source token is a placeholder; presumably the runner script
-- substitutes the real table name before execution.
FROM '__SOURCE__'
WHERE v.actor.login='johnbieren'
489 changes: 489 additions & 0 deletions scripts/super-cmd-perf/queries/search+-clickhouse-db.sql

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions scripts/super-cmd-perf/queries/search-clickhouse-db.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- Search query: number of rows whose pull request body contains the given
-- phrase (substring match via LIKE with leading and trailing wildcards).
SELECT count()
-- The quoted source token is a placeholder; presumably the runner script
-- substitutes the real table name before execution.
FROM '__SOURCE__'
WHERE v.payload.pull_request.body LIKE '%in case you have any feedback 😊%'
13 changes: 13 additions & 0 deletions scripts/super-cmd-perf/queries/union-clickhouse-db.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
-- Union query: the five most frequent pull request assignees.
-- The CTE gathers logins from two places: the single "assignee" object and
-- each element of the "assignees" array (arrayJoin yields one row per
-- array element).
-- The quoted source token in each FROM is a placeholder; presumably the
-- runner script substitutes the real table name before execution.
WITH assignees AS (
  SELECT v.payload.pull_request.assignee.login assignee
  FROM '__SOURCE__'
  UNION ALL
  SELECT arrayJoin(v.payload.pull_request.assignees).login assignee
  FROM '__SOURCE__'
)
-- Rows without an assignee are dropped before counting.
SELECT assignee, count(*) count
FROM assignees
WHERE assignee IS NOT NULL
GROUP BY assignee
ORDER BY count DESC
LIMIT 5
26 changes: 24 additions & 2 deletions scripts/super-cmd-perf/run-queries.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ function run_query {
cmd="$cmd < $final_query"
elif [ "$cmd" == "datafusion" ]; then
cmd="datafusion-cli --file $final_query"
elif [ "$cmd" == "clickhouse" ]; then
cmd="clickhouse --queries-file $final_query"
elif [[ "$cmd" == "clickhouse"* ]]; then
cmd="$cmd --queries-file $final_query"
fi

echo -e "About to execute\n================\n$cmd\n\nWith query\n==========" > "$outputfile"
Expand Down Expand Up @@ -144,3 +144,25 @@ do
done
echo >> "$report"
echo >> "$csv_report"

# Run the ClickHouse JSON-type queries against the "gha" table and append one
# row of results to both the Markdown report and the CSV report (AWS EC2 runs
# only). The server is started for the duration of the queries and stopped
# again afterwards.
if [ -n "$RUNNING_ON_AWS_EC2" ]; then
  sudo systemctl start clickhouse-server

  # Make sure the server is answering before the first query runs; otherwise
  # that query could fail on a refused connection. After ~30 seconds give up
  # waiting and carry on, so any failure lands in the per-query output file.
  ch_wait_attempts=0
  until clickhouse-client --query "SELECT 1" > /dev/null 2>&1; do
    ch_wait_attempts=$((ch_wait_attempts + 1))
    if [ "$ch_wait_attempts" -ge 30 ]; then
      echo "warning: clickhouse-server did not respond within 30 seconds" >&2
      break
    fi
    sleep 1
  done

  # Row label cells: tool name and storage format.
  echo -n "|\`clickhouse\`|\`db\`|" >> "$report"
  echo -n "clickhouse,db" >> "$csv_report"
  for queryfile in search-clickhouse-db.sql search+-clickhouse-db.sql count-clickhouse-db.sql agg-clickhouse-db.sql union-clickhouse-db.sql
  do
    # The union query is reported as N/A instead of being executed.
    if [ "$queryfile" == "union-clickhouse-db.sql" ]; then
      echo -n "N/A|" >> "$report"
      echo -n ",N/A" >> "$csv_report"
      continue
    fi
    run_query clickhouse-client "$queryfile" gha
    # Pull the fourth field of the "Time" line from this query's output file.
    # NOTE(review): $source is not assigned in this block; presumably
    # run_query sets it from its third argument ("gha") -- confirm.
    result=$(grep Time < "$rundir/clickhouse-client-$queryfile-$source.out" | awk '{ print $4 }')
    echo -n "$result" >> "$report"
    echo -n "|" >> "$report"
    echo -n ",$result" >> "$csv_report"
  done
  sudo systemctl stop clickhouse-server
  # Terminate the row in both reports.
  echo >> "$report"
  echo >> "$csv_report"
fi

0 comments on commit 5ff9972

Please sign in to comment.