Skip to content

Commit

Permalink
refactor(impala): modernize the impala backend
Browse files Browse the repository at this point in the history
Remove HDFS integration from the Impala backend to dramatically simplify it.

Also move away from an ancient version of impala that is no longer maintained.

The primary motivation here is to reduce the scope of the Impala backend and to lower its maintenance burden by supporting fewer features.

BREAKING CHANGE: Direct HDFS integration is removed, as is support for ingesting pandas DataFrames directly. The Impala backend still works with HDFS, but data in HDFS must be managed outside of ibis.
  • Loading branch information
cpcloud authored and gforsyth committed Dec 19, 2023
1 parent 64cbd5e commit 252833d
Show file tree
Hide file tree
Showing 37 changed files with 778 additions and 1,783 deletions.
2 changes: 1 addition & 1 deletion .github/renovate.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
"addLabels": ["duckdb"]
},
{
"matchPackagePatterns": ["fsspec", "impyla", "impala", "kudu"],
"matchPackagePatterns": ["impyla", "impala", "kudu"],
"addLabels": ["impala"]
},
{
Expand Down
11 changes: 1 addition & 10 deletions .github/workflows/ibis-backends.yml
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,6 @@ jobs:
- libgeos-dev
- name: impala
title: Impala
serial: true
extras:
- impala
services:
Expand Down Expand Up @@ -224,7 +223,6 @@ jobs:
backend:
name: impala
title: Impala
serial: true
extras:
- impala
services:
Expand Down Expand Up @@ -371,16 +369,9 @@ jobs:
- name: "run parallel tests: ${{ matrix.backend.name }}"
if: ${{ !matrix.backend.serial }}
run: just ci-check -m ${{ matrix.backend.name }} --numprocesses auto --dist=loadgroup

- name: "run serial tests: ${{ matrix.backend.name }}"
if: matrix.backend.serial && matrix.backend.name == 'impala'
run: just ci-check -m ${{ matrix.backend.name }} --randomly-dont-reorganize
env:
IBIS_TEST_NN_HOST: localhost
IBIS_TEST_IMPALA_HOST: localhost
IBIS_TEST_IMPALA_PORT: 21050
IBIS_TEST_WEBHDFS_PORT: 50070
IBIS_TEST_WEBHDFS_USER: hdfs
IBIS_EXAMPLES_DATA: ${{ runner.temp }}/examples-${{ matrix.backend.name }}-${{ matrix.os }}-${{ steps.install_python.outputs.python-version }}

# FIXME(deepyaman): If some backend-specific test, in test_ddl.py,
Expand All @@ -398,7 +389,7 @@ jobs:
FLINK_REMOTE_CLUSTER_PORT: "8081"

- name: "run serial tests: ${{ matrix.backend.name }}"
if: matrix.backend.serial && matrix.backend.name != 'impala' && matrix.backend.name != 'flink'
if: matrix.backend.serial && matrix.backend.name != 'flink'
run: just ci-check -m ${{ matrix.backend.name }}
env:
IBIS_EXAMPLES_DATA: ${{ runner.temp }}/examples-${{ matrix.backend.name }}-${{ matrix.os }}-${{ steps.install_python.outputs.python-version }}
Expand Down
199 changes: 131 additions & 68 deletions compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,74 +15,6 @@ services:
networks:
- clickhouse

impala:
depends_on:
- impala-postgres
- kudu
- kudu-tserver
environment:
PGPASSWORD: postgres
healthcheck:
interval: 1s
retries: 60
test:
- CMD-SHELL
- nc -z 127.0.0.1 21050 && nc -z 127.0.0.1 50070
hostname: localhost
image: ibisproject/impala:latest
ports:
- 50070:50070 # namenode http (hdfs)
- 50075:50075 # datanode http (hdfs)
- 8020:8020 # namenode metadata (hdfs)
- 21050:21050 # hiveserver2 (impala)
networks:
- impala

impala-postgres:
user: postgres
hostname: postgres
environment:
POSTGRES_PASSWORD: postgres
healthcheck:
interval: 1s
retries: 10
test:
- CMD
- pg_isready
image: postgres:13.13-alpine
networks:
- impala

kudu:
cap_add:
- SYS_TIME
image: apache/kudu:1.17.0
networks:
- impala
command: kudu master run --fs_wal_dir=/var/lib/kudu/master --fs_data_dirs=/var/lib/kudu/master
healthcheck:
interval: 1s
retries: 60
test:
- CMD-SHELL
- kudu cluster ksck kudu:7051

kudu-tserver:
cap_add:
- SYS_TIME
image: apache/kudu:1.17.0
depends_on:
- kudu # tablet server won't start if it can't find the master kudu node
networks:
- impala
command: kudu tserver run --fs_wal_dir=/var/lib/kudu/master --fs_data_dirs=/var/lib/kudu/master --tserver_master_addrs=kudu
healthcheck:
interval: 1s
retries: 60
test:
- CMD-SHELL
- kudu cluster ksck kudu:7051

mysql:
environment:
MYSQL_ALLOW_EMPTY_PASSWORD: "true"
Expand Down Expand Up @@ -455,8 +387,138 @@ services:
networks:
- flink

# Kudu master node. Its own health probe targets kudu:7051 on the impala
# network.
kudu:
  image: apache/kudu:1.17.0
  cap_add:
    - SYS_TIME
  # Folded scalar: the lines below are joined with single spaces into one
  # command string.
  command: >-
    kudu master run
    --fs_wal_dir=/var/lib/kudu/master
    --fs_data_dirs=/var/lib/kudu/master
  healthcheck:
    # Probe every second; tolerate up to 60 consecutive failures.
    test:
      - CMD-SHELL
      - kudu cluster ksck kudu:7051
    interval: 1s
    retries: 60
  networks:
    - impala

# Kudu tablet server; it is pointed at the master through
# --tserver_master_addrs.
kudu-tserver:
  image: apache/kudu:1.17.0
  cap_add:
    - SYS_TIME
  depends_on:
    # The tablet server won't start if it can't find the master kudu node.
    - kudu
  # Folded scalar: the lines below are joined with single spaces into one
  # command string.
  command: >-
    kudu tserver run
    --fs_wal_dir=/var/lib/kudu/master
    --fs_data_dirs=/var/lib/kudu/master
    --tserver_master_addrs=kudu
  healthcheck:
    # Probe every second; tolerate up to 60 consecutive failures.
    test:
      - CMD-SHELL
      - kudu cluster ksck kudu:7051
    interval: 1s
    retries: 60
  networks:
    - impala

# Hive Metastore (HMS) container, started with the image's `hms` command.
impala-hive-metastore:
  container_name: impala-hive-metastore
  image: apache/impala:4.0.0-impala_quickstart_hms
  command: hms
  networks:
    impala:
      # Names under which this container is reachable on the impala network.
      aliases:
        - impala-hive-metastore
        - impala-hive-metastore.impala
  volumes:
    # Storage for the Apache Derby database.
    - impala:/var/lib/hive
    # Warehouse directory: HMS performs file operations, so it needs the
    # shared volume too.
    - impala:/user/hive/warehouse
    # Hive configuration from the repository, mounted read-only.
    - ./docker/impala/conf:/opt/hive/conf:ro

# Impala statestore daemon.
statestored:
  image: apache/impala:4.0.0-statestored
  command:
    - -redirect_stdout_stderr=false
    - -logtostderr
    - -v=1
  ports:
    # Quoted so the mapping is always read as a string.
    - "25010:25010"  # Web debug UI
  volumes:
    # Impala configuration from the repository, mounted read-only.
    - ./docker/impala/conf:/opt/impala/conf:ro
  networks:
    - impala
  healthcheck:
    # The service counts as up once the debug UI port accepts connections.
    test:
      - CMD-SHELL
      - nc -z 127.0.0.1 25010
    interval: 30s
    retries: 20

# Impala catalog daemon; started after the statestore and the Hive Metastore.
catalogd:
  image: apache/impala:4.0.0-catalogd
  depends_on:
    - statestored
    - impala-hive-metastore
  command:
    - -redirect_stdout_stderr=false
    - -logtostderr
    - -v=1
    - -hms_event_polling_interval_s=1
    - -invalidate_tables_timeout_s=999999
  ports:
    # Quoted so the mapping is always read as a string.
    - "25020:25020"  # Web debug UI
  volumes:
    # Warehouse directory: the catalog performs file operations, so it needs
    # access to the shared volume.
    - impala:/user/hive/warehouse
    # Impala configuration from the repository, mounted read-only.
    - ./docker/impala/conf:/opt/impala/conf:ro
  networks:
    - impala
  healthcheck:
    # The service counts as up once the debug UI port accepts connections.
    test:
      - CMD-SHELL
      - nc -z 127.0.0.1 25020
    interval: 30s
    retries: 20

# Combined Impala coordinator/executor daemon; exposes the HS2 endpoint on
# port 21050.
impala:
  image: apache/impala:4.0.0-impalad_coord_exec
  depends_on:
    - statestored
    - catalogd
    - kudu
    - kudu-tserver
  environment:
    # Keep the Java heap small to preserve memory for query execution.
    JAVA_TOOL_OPTIONS: '-Xmx1g'
  command:
    - -v=1
    - -redirect_stdout_stderr=false
    - -logtostderr
    # Address of the Kudu master service on the impala network.
    - -kudu_master_hosts=kudu:7051
    - -mt_dop_auto_fallback=true
    - -default_query_options=mt_dop=4,default_file_format=parquet,default_transactional_type=insert_only
    - -mem_limit=4gb
  ports:
    # Quoted so the mapping is always read as a string.
    - "21050:21050"  # HS2 endpoint
  volumes:
    # Shared warehouse volume, also mounted by the metastore and catalogd.
    - impala:/user/hive/warehouse
    # Impala configuration from the repository, mounted read-only.
    - ./docker/impala/conf:/opt/impala/conf:ro
  networks:
    - impala
  healthcheck:
    # The service counts as up once the HS2 port accepts connections.
    test:
      - CMD-SHELL
      - nc -z 127.0.0.1 21050
    interval: 30s
    retries: 20

networks:
impala:
  # Docker would otherwise name this network "$PROJECT_$NETWORK". The Java
  # Hive Metastore clients don't accept underscores in thrift URIs, and
  # something too-aggressively rewrites the configured metastore URI to
  # $SPECIFIED_URI.$NETWORK, so pin the network name to an acceptable value.
  name: impala
mysql:
mssql:
clickhouse:
Expand All @@ -483,3 +545,4 @@ volumes:
postgres:
minio:
exasol:
impala:
81 changes: 81 additions & 0 deletions docker/impala/conf/hive-site.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!--
Hive configuration for Impala quickstart docker cluster.
-->
<configuration>
  <!-- Metastore endpoint. The host name matches the impala-hive-metastore
       service defined in compose.yaml. -->
  <property>
    <name>hive.metastore.uris</name>
    <value>thrift://impala-hive-metastore:9083</value>
  </property>

  <!-- Managed and external tablespaces must live on the Docker volumes that
       are configured for the cluster. -->
  <property>
    <name>hive.metastore.warehouse.dir</name>
    <value>/user/hive/warehouse/managed</value>
  </property>
  <property>
    <name>hive.metastore.warehouse.external.dir</name>
    <value>/user/hive/warehouse/external</value>
  </property>

  <!-- Required for automatic metadata sync. -->
  <property>
    <name>hive.metastore.dml.events</name>
    <value>true</value>
  </property>

  <!-- The impala user is not authorized to consume notifications by default;
       disabling authentication works around this. -->
  <property>
    <name>hive.metastore.event.db.notification.api.auth</name>
    <value>false</value>
  </property>

  <!-- Both of the following are required to enable Hive transactions. -->
  <property>
    <name>hive.support.concurrency</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.txn.manager</name>
    <value>org.apache.hadoop.hive.ql.lockmgr.DbTxnManager</value>
  </property>

  <!-- Stats autogathering negatively affects the latency of DDL operations
       and is not particularly useful for Impala, so it is turned off. -->
  <property>
    <name>hive.stats.autogather</name>
    <value>false</value>
  </property>

  <!-- NOTE(review): compactor settings; presumably they accompany the
       transaction support enabled above. Confirm against the Hive docs. -->
  <property>
    <name>hive.compactor.initiator.on</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.compactor.worker.threads</name>
    <value>1</value>
  </property>
</configuration>
Loading

0 comments on commit 252833d

Please sign in to comment.