Hive metastore integration with installation files and a sample application #5

Merged · 1 commit · Apr 9, 2020
7 changes: 6 additions & 1 deletion .gitignore
@@ -1,2 +1,7 @@
.DS_Store
_tmp
.idea
*.sw[po]
*Dockerfile.csi-driver-nfs
*Dockerfile.csi-s3
*s3-secret.yaml
21 changes: 21 additions & 0 deletions examples/hive/k8s/Dockerfile
@@ -0,0 +1,21 @@
FROM registry.access.redhat.com/ubi7/ubi

WORKDIR /opt

ENV JAVA_HOME=/usr/lib/jvm/jre/
ENV HADOOP_HOME=/opt/hadoop-3.1.2
ENV HIVE_HOME=/opt/apache-hive-3.1.2-bin

RUN yum update --disableplugin=subscription-manager -y && rm -rf /var/cache/yum && \
yum install --disableplugin=subscription-manager java-1.8.0-openjdk-headless -y && \
yum install --disableplugin=subscription-manager postgresql-devel -y

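# Download and unpack the Hadoop and Hive distributions into /opt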
RUN curl -L https://archive.apache.org/dist/hadoop/core/hadoop-3.1.2/hadoop-3.1.2.tar.gz | tar zxf - && \
curl -L https://www-us.apache.org/dist/hive/hive-3.1.2/apache-hive-3.1.2-bin.tar.gz | tar zxf -

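# Add the PostgreSQL JDBC driver (for the metastore database) and the S3A
# connector with its AWS SDK dependencies, then copy the AWS jars into Hive's classpath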
RUN curl -L https://jdbc.postgresql.org/download/postgresql-42.2.8.jar > ${HIVE_HOME}/lib/postgresql-42.2.8.jar && \
curl -L https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.1.2/hadoop-aws-3.1.2.jar > ${HADOOP_HOME}/lib/hadoop-aws-3.1.2.jar && \
curl -L https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-core/1.11.671/aws-java-sdk-core-1.11.671.jar > ${HADOOP_HOME}/lib/aws-java-sdk-core-1.11.671.jar && \
curl -L https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-s3/1.11.671/aws-java-sdk-s3-1.11.671.jar > ${HADOOP_HOME}/lib/aws-java-sdk-s3-1.11.671.jar && \
curl -L https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-dynamodb/1.11.671/aws-java-sdk-dynamodb-1.11.671.jar > ${HADOOP_HOME}/lib/aws-java-sdk-dynamodb-1.11.671.jar && \
cp -v ${HADOOP_HOME}/lib/*aws*.jar ${HIVE_HOME}/lib/
19 changes: 19 additions & 0 deletions examples/hive/k8s/Dockerfile.hiveserver
@@ -0,0 +1,19 @@
FROM dlf-hive-base:latest

ENV HIVE_HOME=/opt/apache-hive-3.1.2-bin

COPY conf/hive-site.xml ${HIVE_HOME}/conf/
COPY hiveserver-entrypoint.sh ${HIVE_HOME}/entrypoint.sh

RUN groupadd -r hive --gid=1000 && \
useradd -r -g hive --uid=1000 -d ${HIVE_HOME} hive && \
chown hive:hive -R ${HIVE_HOME} && \
mkdir /tmp/hive && \
chmod -R a+w /tmp/hive/

USER hive
WORKDIR $HIVE_HOME
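# 10001: HiveServer2 HTTP (Thrift) endpoint, per hive-site.xml; 10002: web UI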
EXPOSE 10001 10002

ENTRYPOINT ["./entrypoint.sh"]
CMD ["bin/hive","--service","hiveserver2"]
20 changes: 20 additions & 0 deletions examples/hive/k8s/Dockerfile.metastore
@@ -0,0 +1,20 @@
FROM dlf-hive-base:latest

ENV METASTORE_HOME=/opt/apache-hive-3.1.2-bin

COPY conf/metastore-site.xml ${METASTORE_HOME}/conf/hive-site.xml
COPY conf/metastore-log4j2.properties ${METASTORE_HOME}/conf/metastore-log4j2.properties
COPY metastore-entrypoint.sh ${METASTORE_HOME}/entrypoint.sh

RUN groupadd -r hive --gid=1000 && \
useradd -r -g hive --uid=1000 -d ${METASTORE_HOME} hive && \
chown hive:hive -R ${METASTORE_HOME} && \
mkdir /tmp/hive && \
chmod -R a+w /tmp/hive/

USER hive
WORKDIR $METASTORE_HOME
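# 9083: metastore Thrift endpoint (matches metastore.thrift.uris in hive-site)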
EXPOSE 9083

ENTRYPOINT ["./entrypoint.sh"]
CMD ["bin/hive","--service","metastore"]
123 changes: 123 additions & 0 deletions examples/hive/k8s/Makefile
@@ -0,0 +1,123 @@
SHELL=/bin/bash

DATASET_OPERATOR_NAMESPACE := default

DOCKER_REGISTRY_COMPONENTS := the_registry_to_use_for_components
DOCKER_REGISTRY_SECRET := your_already_installed_secrets

HIVE_FILE_PATH := $(shell pwd)

HIVE_BASE_DOCKERFILE := $(HIVE_FILE_PATH)/Dockerfile
HIVE_BASE_IMAGE := dlf-hive-base
HIVE_BASE_TAG := latest
HIVE_BASE_IMAGE := $(HIVE_BASE_IMAGE):$(HIVE_BASE_TAG)

HIVESERVER_IMAGE := hive-server
HIVESERVER_TAG := latest
HIVESERVER_IMAGE := $(DOCKER_REGISTRY_COMPONENTS)/$(HIVESERVER_IMAGE)
HIVESERVER_IMAGE := $(HIVESERVER_IMAGE):$(HIVESERVER_TAG)
HIVESERVER_DOCKERFILE := $(HIVE_FILE_PATH)/Dockerfile.hiveserver

HIVEMETASTORE_IMAGE := hive-metastore
HIVEMETASTORE_TAG := latest
HIVEMETASTORE_IMAGE := $(DOCKER_REGISTRY_COMPONENTS)/$(HIVEMETASTORE_IMAGE)
HIVEMETASTORE_IMAGE := $(HIVEMETASTORE_IMAGE):$(HIVEMETASTORE_TAG)
HIVEMETASTORE_DOCKERFILE := $(HIVE_FILE_PATH)/Dockerfile.metastore

MAKE_ENV += DATASET_OPERATOR_NAMESPACE
MAKE_ENV += DOCKER_REGISTRY_SECRET
MAKE_ENV += HIVESERVER_IMAGE
MAKE_ENV += HIVEMETASTORE_IMAGE

SHELL_EXPORT := $(foreach v,$(MAKE_ENV),$(v)='$($(v))' )

#K8S_FILES += $(shell find $(HIVE_FILE_PATH)/deploy -maxdepth 1 -name '*.yaml')
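# Save an image from the host Docker daemon and load it into Minikube's Docker daemon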
define load_containers_minikube
@mkdir -p _tmp ;\
docker save $(1) | gzip > _tmp/$(2).tar.gz ;\
eval $$(minikube docker-env) ;\
docker load < _tmp/$(2).tar.gz ;\
rm -rf _tmp/$(2).tar.gz
endef

define build-images
$(info Building $(1))
docker build -t $(1) -f $(2) $(3)
endef

define deploy-k8s
$(info Deploying $(1))
@$(SHELL_EXPORT) envsubst < $(HIVE_FILE_PATH)/deploy/$(1).yaml | kubectl apply -n $(DATASET_OPERATOR_NAMESPACE) -f -
@sleep 30s; kubectl wait --namespace $(DATASET_OPERATOR_NAMESPACE) --for condition=ready pods -l app=$(1) --timeout=90s > /dev/null 2>&1
endef

define undeploy-k8s
$(info Undeploying $(1))
@$(SHELL_EXPORT) envsubst < $(HIVE_FILE_PATH)/deploy/$(1).yaml | kubectl delete -n $(DATASET_OPERATOR_NAMESPACE) --ignore-not-found --wait -f -
endef

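# Resolve S3 connection parameters: from a local Noobaa install when S3_ENDPOINT is unset, otherwise from the environment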
noobaa-env:
ifeq ($(origin S3_ENDPOINT),undefined)
$(info Getting connection parameters from Noobaa)
ifeq ($(origin NOOBAA_HOME),environment)
NOOBAA_HOME := ${NOOBAA_HOME}
else
$(error NOOBAA_HOME not found or unset)
endif
S3_ENDPOINT := $(shell minikube service s3 --url | head -n1)
AWS_ACCESS_KEY_ID := $(shell $(NOOBAA_HOME)/noobaa status 2>/dev/null | grep AWS_ACCESS_KEY_ID | awk -F ": " '{print $$2}')
AWS_SECRET_ACCESS_KEY := $(shell $(NOOBAA_HOME)/noobaa status 2>/dev/null | grep AWS_SECRET_ACCESS_KEY | awk -F ": " '{print $$2}')
else ifeq ($(origin S3_ENDPOINT),environment)
$(info Getting connection parameters from env)
S3_ENDPOINT := ${S3_ENDPOINT}
AWS_ACCESS_KEY_ID := ${AWS_ACCESS_KEY_ID}
AWS_SECRET_ACCESS_KEY := ${AWS_SECRET_ACCESS_KEY}
endif

conf/hive-site.xml: noobaa-env
@sed -e "s|\$${S3_ENDPOINT}|$(S3_ENDPOINT)|g" conf/hive-site.tmpl > conf/hive-site.xml

conf/metastore-site.xml: noobaa-env
@sed -e "s|\$${S3_ENDPOINT}|$(S3_ENDPOINT)|g" conf/metastore-site.tmpl > conf/metastore-site.xml

deploy/s3-secret.yaml: noobaa-env
@sed -e "s|\$${AWS_ACCESS_KEY_ID}|$(AWS_ACCESS_KEY_ID)|g" \
-e "s|\$${AWS_SECRET_ACCESS_KEY}|$(AWS_SECRET_ACCESS_KEY)|g" \
deploy/s3-secret.tmpl > deploy/s3-secret.yaml

build-images: conf/hive-site.xml conf/metastore-site.xml
$(call build-images,$(HIVE_BASE_IMAGE),$(HIVE_BASE_DOCKERFILE),$(HIVE_FILE_PATH))
$(call build-images,$(HIVEMETASTORE_IMAGE),$(HIVEMETASTORE_DOCKERFILE),$(HIVE_FILE_PATH))
$(call build-images,$(HIVESERVER_IMAGE),$(HIVESERVER_DOCKERFILE),$(HIVE_FILE_PATH))

push-images: build-images
@docker push $(HIVESERVER_IMAGE) ;\
docker push $(HIVEMETASTORE_IMAGE)

minikube-load-containers: build-images
$(call load_containers_minikube,$(HIVEMETASTORE_IMAGE),hivemetastore)
$(call load_containers_minikube,$(HIVESERVER_IMAGE),hiveserver)

deploy-secret: deploy/s3-secret.yaml
@kubectl apply -n $(DATASET_OPERATOR_NAMESPACE) -f deploy/s3-secret.yaml; \
kubectl apply -n $(DATASET_OPERATOR_NAMESPACE) -f deploy/database-secret.yaml

deploy-database: deploy-secret
$(call deploy-k8s,database)

deploy-hivemetastore: minikube-load-containers deploy-database
$(call deploy-k8s,hivemetastore)

deploy-hive: deploy-hivemetastore
$(call deploy-k8s,hiveserver)

undeploy-hive:
$(call undeploy-k8s,hiveserver)
$(call undeploy-k8s,hivemetastore)
$(call undeploy-k8s,database)
$(call undeploy-k8s,database-secret)
$(call undeploy-k8s,s3-secret)

minikube-install: deploy-hive

minikube-uninstall: undeploy-hive
39 changes: 39 additions & 0 deletions examples/hive/k8s/README.md
@@ -0,0 +1,39 @@
# Installing Hive in Kubernetes

In this series of steps, we will be installing Hive in Kubernetes to have a
metadata catalog that can be queried by the framework to create datasets. Hive
should be installed in the same namespace as the rest of DLF. This
guide assumes Minikube as the target Kubernetes cluster, but it is applicable to any Kubernetes/OpenShift infrastructure.

## Initial steps

First, some configuration. The Object Storage integration of Hive requires that the endpoint be provided at the point of initial configuration.
If you are using the Noobaa install as described in the main installation guide, then all you have to do is export the directory where Noobaa is installed:

```
$ unset S3_ENDPOINT
$ export NOOBAA_HOME=path/to/Noobaa/directory
```
If you are using a different Object Storage service, then you need to set these environment variables instead:

```
$ export S3_ENDPOINT=http://<Object_Storage_Service_Provider_URL>
$ export AWS_ACCESS_KEY_ID="<Access key for Object Storage>"
$ export AWS_SECRET_ACCESS_KEY="<Secret access key for Object Storage>"
```
Then, examine `Makefile` in `examples/hive/k8s` and add values for `DATASET_OPERATOR_NAMESPACE`, `DOCKER_REGISTRY_COMPONENTS`, and `DOCKER_REGISTRY_SECRET`. Please ensure that these variable values are the same as those used when installing DLF.

Now go ahead and complete the install:
```
$ make minikube-install
```
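
If you want to confirm that everything came up before testing, the pods can be
checked with `kubectl` (a quick sanity check, assuming the default namespace
and the `app` labels that the `Makefile`'s `kubectl wait` also selects on):
```
$ kubectl get pods -l app=database
$ kubectl get pods -l app=hivemetastore
$ kubectl get pods -l app=hiveserver
```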

Test your installation with `test-hive.sh`. Examine the script in an editor and change the values of the namespace and repository variables:
```
$ ./test-hive.sh
```
If the output is
```
HTTP/1.1 200 OK
```
then you can open the URL provided in a browser and verify that the Hive landing page is displayed correctly.
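
You can also connect directly with `beeline` and run a statement. A minimal
sketch, assuming `beeline` is installed locally and substituting the host from
the URL above (the transport mode, port, and HTTP path match `conf/hive-site.tmpl`):
```
$ beeline -u "jdbc:hive2://<hiveserver-host>:10001/default;transportMode=http;httpPath=cliservice" \
    -e "SHOW DATABASES;"
```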
98 changes: 98 additions & 0 deletions examples/hive/k8s/conf/hive-site.tmpl
@@ -0,0 +1,98 @@
<configuration>

<property>
<name>metastore.thrift.uris</name>
<value>thrift://hivemetastore:9083</value>
</property>

<property>
<name>hive.metastore.schema.verification</name>
<value>false</value>
</property>

<property>
<name>hive.metastore.warehouse.dir</name>
<value>file:///tmp</value>
</property>

<property>
<name>fs.default.name</name>
<value>file:///tmp</value>
</property>

<property>
<name>fs.s3a.impl</name>
<value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
</property>

<property>
<name>fs.s3a.endpoint</name>
<value>${S3_ENDPOINT}</value>
</property>

<property>
<name>hive.exec.scratchdir</name>
<value>/tmp/hive</value>
</property>

<property>
<name>hive.server2.transport.mode</name>
<value>http</value>
</property>

<property>
<name>hive.server2.thrift.http.port</name>
<value>10001</value>
</property>

<property>
<name>hive.server2.thrift.http.path</name>
<value>cliservice</value>
</property>

<property>
<name>hive.server2.thrift.http.min.worker.threads</name>
<value>5</value>
</property>

<property>
<name>hive.server2.thrift.http.max.worker.threads</name>
<value>500</value>
</property>

<property>
<name>hive.server2.logging.operation.enabled</name>
<value>true</value>
</property>

<property>
<name>hive.server2.logging.operation.level</name>
<value>PERFORMANCE</value>
</property>

<property>
<name>mapred.input.dir.recursive</name>
<value>true</value>
</property>

<property>
<name>hive.mapred.supports.subdirectories</name>
<value>true</value>
</property>

<property>
<name>hive.server2.active.passive.ha.enable</name>
<value>true</value>
</property>

<property>
<name>hive.execution.engine</name>
<value>tez</value>
</property>

<property>
<name>hive.metastore.event.db.notification.api.auth</name>
<value>false</value>
</property>

</configuration>
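
With the `fs.s3a.*` settings above, table data can live directly in the object
store. As an illustration only (`my-bucket` is a placeholder bucket that must
already exist on the configured endpoint), an external table over S3 could be
declared through the same `beeline` connection described in the README:
```
$ beeline -u "jdbc:hive2://<hiveserver-host>:10001/default;transportMode=http;httpPath=cliservice" \
    -e "CREATE EXTERNAL TABLE demo (id INT, name STRING)
        ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
        STORED AS TEXTFILE
        LOCATION 's3a://my-bucket/demo/';"
```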