Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Workaround for pydantic 2.0 incompatibility, new AWS glue job runner, and new makefile targets. #202

Merged
merged 34 commits into from
Sep 5, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
cf41061
Merge pull request #132 from ray-project/repartition
valiantljk Jun 13, 2023
8c3b97d
Logging memory consumed to validate worker estimation correctness (#142)
raghumdani Jun 26, 2023
2ef0c40
Merge pull request #137 from Zyiqin-Miranda/efficiency-improvement-setup
Zyiqin-Miranda Jun 28, 2023
d96585e
Capturing all the performance metrics in an audit (#146)
raghumdani Jun 30, 2023
3218331
[skip download cold manifest] Add support for skipping download cold …
Zyiqin-Miranda Jul 5, 2023
765592a
Address review comments
Zyiqin-Miranda Jul 6, 2023
fe23a17
Merge pull request #147 from Zyiqin-Miranda/efficiency-skip-download-…
Zyiqin-Miranda Jul 6, 2023
d6c13a1
Adding a data model to represent compact_partition function parameter…
pfaraone Jul 6, 2023
9a1ba21
Object store implementation to allow elastically increasing the objec…
raghumdani Jul 14, 2023
e013bbf
[skip untouched files]Disable copy by reference during backfill and r…
Zyiqin-Miranda Jul 18, 2023
24240ba
add dd parallelism params (#154)
valiantljk Jul 21, 2023
fabc387
Allow s3 client kwargs as argument of compact_partition (#155)
raghumdani Jul 25, 2023
2f75297
Honor profile name in s3 client kwargs (#157)
raghumdani Jul 25, 2023
6471727
Allow s3_client_kwargs to be passed into repartition (#158)
rkenmi Jul 26, 2023
037b584
Move s3_client_kwargs default setter to parent scope (#159)
rkenmi Jul 26, 2023
ae3ba8e
keep null row and remove dw_int64 column (#161)
valiantljk Jul 31, 2023
3886384
version bump to 0.1.18b12 (#164)
pfaraone Jul 31, 2023
8eaa360
Cleaning up rehashing logic as it is a dead code as of now. (#166)
raghumdani Aug 1, 2023
154c2e9
Fix stream position and support latest pyarrow (#168)
raghumdani Aug 1, 2023
f192b0a
Bumped version from 0.1.18b12 to 0.1.18b13 (#169)
pfaraone Aug 1, 2023
2997dc6
Add pytest benchmarking for Parquet reads (#160)
jaychia Aug 4, 2023
f38ab82
Polling EC2 Instance Metadata endpoint until HTTP 200 OK (#172)
pfaraone Aug 7, 2023
c7073a1
Adding local deltacat storage module (#175)
raghumdani Aug 7, 2023
69b4d88
version bump from 0.1.18b13 to 0.1.18b14 (#179)
pfaraone Aug 7, 2023
0b01ff6
Now triggering publish-to-pypi on editing and creating a release (#180)
pfaraone Aug 8, 2023
d4e48ee
`compact_partition` incremental unit test (#188)
pfaraone Aug 16, 2023
7d630d4
Switch botocore retry mode to adaptive from standard (#191)
yankevn Aug 17, 2023
df6552b
Merge phash_main into main branch (#195)
raghumdani Aug 24, 2023
1d10d84
Daft Native Reader for Parquet Content Types (#183)
samster25 Aug 29, 2023
95ab6f3
[WIP] Read Iceberg to DeltaCAT Dataset (#131)
JonasJ-ap Jun 28, 2023
936273b
Add workaround for pydantic 2.0 incompatibility, add build & deploy t…
pdames Aug 29, 2023
4219a4e
Fix worker logging on AWS Glue, stop duplicate pip installs of DeltaC…
pdames Aug 31, 2023
a15112f
Add regionalization, remove assumptions about high-level errors/respo…
pdames Sep 1, 2023
8a44ceb
Merge branch 'iceberg' into iceberg
pdames Sep 5, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 22 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,21 +1,37 @@
clean:
rm -rf venv

install:
venv:
if [ ! -d "venv" ]; then \
if [[ '$(shell uname -m)' == 'arm64' ]]; then \
/usr/bin/python3 -m venv venv; \
else \
python -m venv venv; \
fi \
fi

clean-build:
rm -rf dist
rm -rf build

clean-venv:
rm -rf venv

clean: clean-build clean-venv

build: venv
venv/bin/python setup.py sdist bdist_wheel

rebuild: clean-build build

deploy-s3:
./s3-build-and-deploy.sh

install: venv
venv/bin/pip install --upgrade pip
venv/bin/pip install -r dev-requirements.txt

lint:
lint: venv
venv/bin/pre-commit run --all-files

test-integration:
test-integration: install
docker-compose -f dev/iceberg-integration/docker-compose-integration.yml kill
docker-compose -f dev/iceberg-integration/docker-compose-integration.yml rm -f
docker-compose -f dev/iceberg-integration/docker-compose-integration.yml up -d
Expand Down
Empty file added deltacat/examples/__init__.py
Empty file.
Empty file.
54 changes: 54 additions & 0 deletions deltacat/examples/common/fixtures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import os
import logging
from deltacat import logs

from deltacat.constants import (
DELTACAT_APP_LOG_LEVEL,
DELTACAT_SYS_LOG_LEVEL,
DELTACAT_APP_LOG_DIR,
DELTACAT_SYS_LOG_DIR,
DELTACAT_APP_INFO_LOG_BASE_FILE_NAME,
DELTACAT_SYS_INFO_LOG_BASE_FILE_NAME,
DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME,
DELTACAT_SYS_DEBUG_LOG_BASE_FILE_NAME,
)

logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


def create_runtime_environment():
# read packages to install in the runtime environment from system environment vars
package = os.environ.get("PACKAGE")
logger.debug(f"Pip Install Packages: {package}")

runtime_environment = None
if package:
# read the stage (e.g. alpha, beta, dev, etc.) from the system environment vars
stage = os.environ["STAGE"]
logger.debug(f"Job Run Stage: {stage}")

# log the job run system environment for debugging
logger.debug(f"Job Run System Environment: {os.environ}")

worker_env_vars = {
# forward the STAGE environment variable to workers
"STAGE": stage,
# forward deltacat logging environment variables to workers
"DELTACAT_APP_LOG_LEVEL": DELTACAT_APP_LOG_LEVEL,
"DELTACAT_SYS_LOG_LEVEL": DELTACAT_SYS_LOG_LEVEL,
"DELTACAT_APP_LOG_DIR": DELTACAT_APP_LOG_DIR,
"DELTACAT_SYS_LOG_DIR": DELTACAT_SYS_LOG_DIR,
"DELTACAT_APP_INFO_LOG_BASE_FILE_NAME": DELTACAT_APP_INFO_LOG_BASE_FILE_NAME,
"DELTACAT_SYS_INFO_LOG_BASE_FILE_NAME": DELTACAT_SYS_INFO_LOG_BASE_FILE_NAME,
"DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME": DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME,
"DELTACAT_SYS_DEBUG_LOG_BASE_FILE_NAME": DELTACAT_SYS_DEBUG_LOG_BASE_FILE_NAME,
}

# setup runtime environment from system environment variables:
# 1. STAGE: The execution environment stage (e.g. alpha, beta, dev, etc.)
# 2. PACKAGE: Package to Pip Install. (e.g. signed S3 URL for DeltaCAT)
runtime_environment = {
"env_vars": worker_env_vars,
"pip": package,
}
pdames marked this conversation as resolved.
Show resolved Hide resolved
return runtime_environment
18 changes: 18 additions & 0 deletions deltacat/examples/hello_world.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import ray
import deltacat


@ray.remote
def hello_worker():
print("Hello, Worker!")
print(f"Worker DeltaCAT Version: {deltacat.__version__}")


def run():
print("Hello, Driver!")
print(f"Driver DeltaCAT Version: {deltacat.__version__}")
hello_worker.remote()


if __name__ == "__main__":
run()
33 changes: 33 additions & 0 deletions deltacat/examples/logging.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import os
import sys
import ray
import logging

from deltacat import logs
from deltacat.examples.common.fixtures import create_runtime_environment

logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


@ray.remote
def logging_worker(var1, var2):
logger.debug(f"Worker System Environment: {os.environ}")
logger.debug(f"Worker Variable 1: {var1}")
logger.debug(f"Worker Variable 2: {var2}")


def run(var1="default1", var2="default2", **kwargs):
logger.debug(f"Driver Variable 1: {var1}")
logger.debug(f"Driver Variable 2: {var2}")
logging_worker.remote(var1, var2)


if __name__ == "__main__":
# create any runtime environment required to run the example
runtime_env = create_runtime_environment()

# initialize ray
ray.init(address="auto", runtime_env=runtime_env)

# run the example using os.environ as kwargs
run(**os.environ)
Empty file.
File renamed without changes.
File renamed without changes.
3 changes: 3 additions & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,6 @@ pre-commit == 2.20.0
pytest == 7.2.0
pytest-cov == 4.0.0
requests-mock == 1.11.0
pyiceberg @ git+https://github.com/apache/iceberg#subdirectory=python
pydantic == 2.0
ray[data] ~= 2.6
pdames marked this conversation as resolved.
Show resolved Hide resolved
Loading