Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT-#2447: add docker file for census on omnisci #2448

Merged
merged 8 commits into from
Nov 20, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions examples/docker/census-on-omnisci/build-docker-image.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash -e

# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

echo "Note: a user is responsible for preparing the dataset.
The dataset must be named as 'ipums_education2income_1970-2010.csv' and
be in the folder with 'census-omnisci.dockerfile'. It can be downloaded by link:
'https://rapidsai-data.s3.us-east-2.amazonaws.com/datasets/ipums_education2income_1970-2010.csv.gz'"

cd "`dirname \"$0\"`"

docker build -f census-omnisci.dockerfile -t census-omnisci --build-arg no_proxy \
--build-arg https_proxy --build-arg http_proxy --build-arg conda_extra_channel .
printf "\n\nTo run the benchmark execute:\n\tdocker run --rm census-omnisci\n"
63 changes: 63 additions & 0 deletions examples/docker/census-on-omnisci/census-omnisci.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

FROM ubuntu:18.04
ENV http_proxy ${http_proxy}
ENV https_proxy ${https_proxy}
ENV no_proxy ${no_proxy}
ENV MODIN_BACKEND "omnisci"
ENV MODIN_EXPERIMENTAL "true"

ARG conda_extra_channel
ENV add_extra_channel=${conda_extra_channel:+"-c ${conda_extra_channel}"}

RUN apt-get update --yes \
&& apt-get install wget --yes && \
rm -rf /var/lib/apt/lists/*

ENV USER modin
ENV UID 1000
ENV HOME /home/$USER

RUN adduser --disabled-password \
--gecos "Non-root user" \
--uid $UID \
--home $HOME \
$USER

ENV CONDA_DIR ${HOME}/miniconda

SHELL ["/bin/bash", "--login", "-c"]

RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh && \
bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u && \
"${CONDA_DIR}/bin/conda" init bash && \
rm -f /tmp/miniconda3.sh && \
echo ". '${CONDA_DIR}/etc/profile.d/conda.sh'" >> "${HOME}/.profile"

RUN conda update -n base -c defaults conda -y && \
conda create -n modin --yes --no-default-packages && \
conda activate modin && \
conda install -c intel/label/modin -c conda-forge modin "ray>=1.0.0"

RUN conda activate modin && \
conda install -c intel/label/modin -c conda-forge -c intel ${add_extra_channel} \
"daal4py>=2021.1" dpcpp_cpp_rt && \
conda install -c conda-forge scikit-learn && \
conda clean --all --yes

COPY ipums_education2income_1970-2010.csv "${HOME}/ipums_education2income_1970-2010.csv"
amyskov marked this conversation as resolved.
Show resolved Hide resolved

COPY census-omnisci.py "${HOME}/census-omnisci.py"

CMD ["/bin/bash", "--login", "-c", "conda activate modin && python ${HOME}/census-omnisci.py"]
162 changes: 162 additions & 0 deletions examples/docker/census-on-omnisci/census-omnisci.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import os
import time
import modin.pandas as pd
from modin.experimental.engines.omnisci_on_ray.frame.omnisci_worker import OmnisciServer

from sklearn import config_context
import daal4py.sklearn as sklearn

sklearn.patch_sklearn()
from sklearn.model_selection import train_test_split
import sklearn.linear_model as lm
import numpy as np


def read():
columns_names = [
"YEAR0", "DATANUM", "SERIAL", "CBSERIAL", "HHWT", "CPI99", "GQ", "QGQ", "PERNUM", "PERWT", "SEX",
"AGE", "EDUC", "EDUCD", "INCTOT", "SEX_HEAD", "SEX_MOM", "SEX_POP", "SEX_SP", "SEX_MOM2", "SEX_POP2",
"AGE_HEAD", "AGE_MOM", "AGE_POP", "AGE_SP", "AGE_MOM2", "AGE_POP2", "EDUC_HEAD", "EDUC_MOM", "EDUC_POP",
"EDUC_SP", "EDUC_MOM2", "EDUC_POP2", "EDUCD_HEAD", "EDUCD_MOM", "EDUCD_POP", "EDUCD_SP", "EDUCD_MOM2",
"EDUCD_POP2", "INCTOT_HEAD", "INCTOT_MOM", "INCTOT_POP", "INCTOT_SP", "INCTOT_MOM2", "INCTOT_POP2",
]
columns_types = [
"int64", "int64", "int64", "float64", "int64", "float64", "int64", "float64", "int64", "int64",
"int64", "int64", "int64", "int64", "int64", "float64", "float64", "float64", "float64", "float64",
"float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64",
"float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64",
"float64", "float64", "float64", "float64", "float64", "float64", "float64",
]
dtypes = {columns_names[i]: columns_types[i] for i in range(len(columns_names))}

df = pd.read_csv(
os.path.expanduser('~/ipums_education2income_1970-2010.csv'),
names=columns_names,
dtype=dtypes,
skiprows=1,
)

df.shape # to trigger real execution
df._query_compiler._modin_frame._partitions[0][
0
].frame_id = OmnisciServer().put_arrow_to_omnisci(
anmyachev marked this conversation as resolved.
Show resolved Hide resolved
df._query_compiler._modin_frame._partitions[0][0].get()
) # to trigger real execution
return df


def etl(df):
keep_cols = [
"YEAR0", "DATANUM", "SERIAL", "CBSERIAL", "HHWT", "CPI99", "GQ", "PERNUM", "SEX", "AGE",
"INCTOT", "EDUC", "EDUCD", "EDUC_HEAD", "EDUC_POP", "EDUC_MOM", "EDUCD_MOM2", "EDUCD_POP2",
"INCTOT_MOM", "INCTOT_POP", "INCTOT_MOM2", "INCTOT_POP2", "INCTOT_HEAD", "SEX_HEAD",
]
df = df[keep_cols]

df = df[df["INCTOT"] != 9999999]
df = df[df["EDUC"] != -1]
df = df[df["EDUCD"] != -1]

df["INCTOT"] = df["INCTOT"] * df["CPI99"]

for column in keep_cols:
df[column] = df[column].fillna(-1)

df[column] = df[column].astype("float64")

y = df["EDUC"]
X = df.drop(columns=["EDUC", "CPI99"])

# to trigger real execution
df.shape
y.shape
X.shape

return (df, X, y)


def mse(y_test, y_pred):
return ((y_test - y_pred) ** 2).mean()


def cod(y_test, y_pred):
y_bar = y_test.mean()
total = ((y_test - y_bar) ** 2).sum()
residuals = ((y_test - y_pred) ** 2).sum()
return 1 - (residuals / total)


def ml(X, y, random_state, n_runs, test_size):
clf = lm.Ridge()

X = np.ascontiguousarray(X, dtype=np.float64)
y = np.ascontiguousarray(y, dtype=np.float64)

mse_values, cod_values = [], []
ml_scores = {}

print("ML runs: ", n_runs)
for i in range(n_runs):
(X_train, X_test, y_train, y_test) = train_test_split(
X, y, test_size=test_size, random_state=random_state
)
random_state += 777

with config_context(assume_finite=True):
model = clf.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse_values.append(mse(y_test, y_pred))
cod_values.append(cod(y_test, y_pred))

ml_scores["mse_mean"] = sum(mse_values) / len(mse_values)
ml_scores["cod_mean"] = sum(cod_values) / len(cod_values)
ml_scores["mse_dev"] = pow(
sum([(mse_value - ml_scores["mse_mean"]) ** 2 for mse_value in mse_values])
/ (len(mse_values) - 1),
0.5,
)
ml_scores["cod_dev"] = pow(
sum([(cod_value - ml_scores["cod_mean"]) ** 2 for cod_value in cod_values])
/ (len(cod_values) - 1),
0.5,
)

return ml_scores


def measure(name, func, *args, **kw):
t0 = time.time()
res = func(*args, **kw)
t1 = time.time()
print(f'{name}: {t1 - t0} sec')
return res


def main():
# ML specific
N_RUNS = 50
TEST_SIZE = 0.1
RANDOM_STATE = 777

df = measure('Reading', read)
_, X, y = measure('ETL', etl, df)
measure('ML', ml, X, y, random_state=RANDOM_STATE, n_runs=N_RUNS, test_size=TEST_SIZE)


if __name__ == '__main__':
main()
5 changes: 5 additions & 0 deletions examples/docker/taxi-on-omnisci/build-docker-image.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

echo "Note: a user is responsible for preparing the dataset.
The dataset must be named as 'trips_xaa.csv' and be in the folder with 'nyc-taxi-omnisci.dockerfile'.
It Can be generated by following the instructions on the link:
'https://github.com/toddwschneider/nyc-taxi-data#instructions'"

cd "`dirname \"$0\"`"

docker build -f nyc-taxi-omnisci.dockerfile -t nyc-taxi-omnisci --build-arg https_proxy --build-arg http_proxy .
Expand Down