Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT-#2444: add docker file for nyc on omnisci #2445

Merged
merged 1 commit into from
Nov 20, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions examples/docker/taxi-on-omnisci/build-docker-image.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash -e

# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

# Abort on the first failing command. The shebang already passes -e, but that
# flag is not applied when the script is started as `bash build-docker-image.sh`,
# so enable it explicitly here as well.
set -e

# Run from the directory that contains this script so that the dockerfile and
# the build context (".") resolve regardless of the caller's working directory.
cd "$(dirname "$0")"

# `--build-arg NAME` without a value forwards NAME from the calling environment.
docker build -f nyc-taxi-omnisci.dockerfile -t nyc-taxi-omnisci --build-arg https_proxy --build-arg http_proxy .
printf "\n\nTo run the benchmark execute:\n\tdocker run --rm nyc-taxi-omnisci\n"
53 changes: 53 additions & 0 deletions examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

FROM ubuntu:18.04

# Proxy settings come from the build arguments of the same name and are also
# exported as environment variables for every later step and for the container.
ENV http_proxy ${http_proxy}
ENV https_proxy ${https_proxy}

# Modin configuration read by the benchmark script at run time.
ENV MODIN_BACKEND "omnisci"
ENV MODIN_EXPERIMENTAL "true"

# wget is only needed to download the Miniconda installer below; the apt lists
# are removed in the same layer to keep the image small.
RUN apt-get update --yes \
    && apt-get install wget --yes && \
    rm -rf /var/lib/apt/lists/*

ENV USER modin
ENV UID 1000
ENV HOME /home/$USER

# NOTE(review): the account is created, but no USER instruction follows in this
# file, so later steps do not switch to it -- confirm that this is intended.
RUN adduser --disabled-password \
    --gecos "Non-root user" \
    --uid $UID \
    --home $HOME \
    $USER

ENV CONDA_DIR ${HOME}/miniconda

# Use a login shell for the remaining RUN steps; presumably so that
# ${HOME}/.profile (extended below) is read and `conda` becomes available.
SHELL ["/bin/bash", "--login", "-c"]

# Install Miniconda into CONDA_DIR and hook conda.sh into the login profile.
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh && \
    bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u && \
    "${CONDA_DIR}/bin/conda" init bash && \
    rm -f /tmp/miniconda3.sh && \
    echo ". '${CONDA_DIR}/etc/profile.d/conda.sh'" >> "${HOME}/.profile"

# Create the "modin" environment and install Modin and Ray into it.
# Every conda command passes --yes/-y so the build never waits for confirmation.
RUN conda update -n base -c defaults conda -y && \
    conda create -n modin --yes --no-default-packages && \
    conda activate modin && \
    conda install --yes -c intel/label/modin -c conda-forge modin "ray>=1.0.0" && \
    conda clean --all --yes

# Benchmark input data and script; both must be present in the build context.
COPY trips_xaa.csv "${HOME}/trips_xaa.csv"
COPY nyc-taxi-omnisci.py "${HOME}/nyc-taxi-omnisci.py"

# Run the benchmark inside the "modin" conda environment.
CMD ["/bin/bash", "--login", "-c", "conda activate modin && python ${HOME}/nyc-taxi-omnisci.py"]
108 changes: 108 additions & 0 deletions examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import os
import time
import modin.pandas as pd
from modin.experimental.engines.omnisci_on_ray.frame.omnisci_worker import OmnisciServer

def read():
    """Load ``~/trips_xaa.csv`` into a Modin DataFrame and force its materialization.

    The CSV is read without a header row; column names and dtypes are supplied
    explicitly. Columns typed ``"timestamp"`` are passed to ``parse_dates``
    instead of ``dtype``.

    Returns
    -------
    modin.pandas.DataFrame
        The loaded trips table.
    """
    columns_names = [
        "trip_id", "vendor_id", "pickup_datetime", "dropoff_datetime", "store_and_fwd_flag",
        "rate_code_id", "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude",
        "passenger_count", "trip_distance", "fare_amount", "extra", "mta_tax", "tip_amount",
        "tolls_amount", "ehail_fee", "improvement_surcharge", "total_amount", "payment_type",
        "trip_type", "pickup", "dropoff", "cab_type", "precipitation", "snow_depth", "snowfall",
        "max_temperature", "min_temperature", "average_wind_speed", "pickup_nyct2010_gid",
        "pickup_ctlabel", "pickup_borocode", "pickup_boroname", "pickup_ct2010",
        "pickup_boroct2010", "pickup_cdeligibil", "pickup_ntacode", "pickup_ntaname", "pickup_puma",
        "dropoff_nyct2010_gid", "dropoff_ctlabel", "dropoff_borocode", "dropoff_boroname",
        "dropoff_ct2010", "dropoff_boroct2010", "dropoff_cdeligibil", "dropoff_ntacode",
        "dropoff_ntaname", "dropoff_puma",
    ]
    # use string instead of category
    columns_types = [
        "int64", "string", "timestamp", "timestamp", "string", "int64", "float64", "float64",
        "float64", "float64", "int64", "float64", "float64", "float64", "float64", "float64", "float64",
        "float64", "float64", "float64", "string", "float64", "string", "string", "string", "float64",
        "int64", "float64", "int64", "int64", "float64", "float64", "float64", "float64", "string", "float64",
        "float64", "string", "string", "string", "float64", "float64", "float64", "float64", "string",
        "float64", "float64", "string", "string", "string", "float64",
    ]

    # Both lists are positional and have the same length, so pair them directly.
    dtypes = dict(zip(columns_names, columns_types))
    # "timestamp" is not handed to read_csv as a dtype; those columns are
    # parsed as dates instead.
    all_but_dates = {
        col: valtype for col, valtype in dtypes.items() if valtype != "timestamp"
    }
    dates_only = [col for col, valtype in dtypes.items() if valtype == "timestamp"]

    df = pd.read_csv(
        os.path.expanduser('~/trips_xaa.csv'),
        names=columns_names,
        dtype=all_but_dates,
        parse_dates=dates_only,
    )

    df.shape  # to trigger real execution
    # Reaches into Modin's private frame/partition objects: the first
    # partition's data is handed to OmnisciServer and the returned id is stored
    # on the partition. NOTE(review): presumably this imports the table up
    # front so that the import is not attributed to the first timed query.
    first_partition = df._query_compiler._modin_frame._partitions[0][0]
    first_partition.frame_id = OmnisciServer().put_arrow_to_omnisci(
        first_partition.get()
    )  # to trigger real execution
    return df


def q1_omnisci(df):
    """Query 1: return the number of rows for each ``cab_type`` value."""
    rows_per_cab_type = df.groupby("cab_type").size()
    # Accessing .shape forces lazy backends to compute the result here,
    # inside the timed section.
    rows_per_cab_type.shape  # to trigger real execution
    return rows_per_cab_type

def q2_omnisci(df):
    """Query 2: return the mean ``total_amount`` for each ``passenger_count``."""
    aggregation = {"total_amount": "mean"}
    mean_amount = df.groupby("passenger_count").agg(aggregation)
    # Accessing .shape forces lazy backends to compute the result here,
    # inside the timed section.
    mean_amount.shape  # to trigger real execution
    return mean_amount

def q3_omnisci(df):
    """Query 3: count rows per (``passenger_count``, pickup year).

    Note: ``df`` is modified in place -- its ``pickup_datetime`` column is
    overwritten with the year component -- so pass a copy if the original
    frame is still needed.
    """
    pickup_year = df["pickup_datetime"].dt.year
    df["pickup_datetime"] = pickup_year
    group_keys = ["passenger_count", "pickup_datetime"]
    rows_per_group = df.groupby(group_keys).size()
    rows_per_group.shape  # to trigger real execution
    return rows_per_group

def q4_omnisci(df):
    """Query 4: count rows per (``passenger_count``, pickup year, whole trip distance).

    The result is a flat frame ordered by pickup year ascending and then by
    group size descending; the size column is named ``0``.

    Note: ``df`` is modified in place -- ``pickup_datetime`` is overwritten
    with the year and ``trip_distance`` is cast to ``int64`` -- so pass a copy
    if the original frame is still needed.
    """
    df["pickup_datetime"] = df["pickup_datetime"].dt.year
    df["trip_distance"] = df["trip_distance"].astype("int64")

    group_keys = ["passenger_count", "pickup_datetime", "trip_distance"]
    group_sizes = df.groupby(group_keys, sort=False).size()
    # reset_index() turns the group keys back into columns; the unnamed size
    # series becomes column 0, which is what the sort below refers to.
    flat_sizes = group_sizes.reset_index()
    ordered = flat_sizes.sort_values(
        by=["pickup_datetime", 0],
        ignore_index=True,
        ascending=[True, False],
    )
    ordered.shape  # to trigger real execution
    return ordered

def measure(name, func, *args, **kw):
    """Call ``func(*args, **kw)``, print how long it took, and return its result.

    Parameters
    ----------
    name : str
        Label printed in front of the measured duration.
    func : callable
        The callable to time.
    *args, **kw
        Forwarded to ``func`` unchanged.

    Returns
    -------
    object
        Whatever ``func`` returned. Exceptions raised by ``func`` propagate and
        nothing is printed in that case.
    """
    # perf_counter() is monotonic and high-resolution, so the interval cannot
    # be distorted by system clock adjustments the way time.time() can.
    started = time.perf_counter()
    res = func(*args, **kw)
    elapsed = time.perf_counter() - started
    print(f'{name}: {elapsed} sec')
    return res

def main():
    """Load the trips data once, then run and time the four benchmark queries."""
    trips = measure('Reading', read)

    # (label, query, needs_copy): Q3 and Q4 overwrite columns of their input,
    # so each gets its own copy and the shared frame stays untouched.
    queries = (
        ('Q1', q1_omnisci, False),
        ('Q2', q2_omnisci, False),
        ('Q3', q3_omnisci, True),
        ('Q4', q4_omnisci, True),
    )
    for label, query, needs_copy in queries:
        # The copy is made before measure() starts its timer.
        frame = trips.copy() if needs_copy else trips
        measure(label, query, frame)


if __name__ == '__main__':
    main()