[Python] Fix python host build #434

Merged · 1 commit · Jan 27, 2024
6 changes: 5 additions & 1 deletion .github/workflows/maven-test.yml
@@ -96,6 +96,7 @@ jobs:
- name: Init PG
run: |
./script/meta_init_for_local_test.sh -j 2
PGPASSWORD=lakesoul_test psql -h localhost -p 5432 -U lakesoul_test -c "ALTER DATABASE lakesoul_test SET DEFAULT_TRANSACTION_ISOLATION TO 'serializable';"
- name: Install Protoc
uses: arduino/setup-protoc@v2
with:
@@ -164,6 +165,7 @@ jobs:
- name: Init PG
run: |
./script/meta_init_for_local_test.sh -j 2
PGPASSWORD=lakesoul_test psql -h localhost -p 5432 -U lakesoul_test -c "ALTER DATABASE lakesoul_test SET DEFAULT_TRANSACTION_ISOLATION TO 'serializable';"
- name: Install Protoc
uses: arduino/setup-protoc@v2
with:
@@ -232,6 +234,7 @@ jobs:
- name: Init PG
run: |
./script/meta_init_for_local_test.sh -j 1
PGPASSWORD=lakesoul_test psql -h localhost -p 5432 -U lakesoul_test -c "ALTER DATABASE lakesoul_test SET DEFAULT_TRANSACTION_ISOLATION TO 'serializable';"
- name: Init PG RBAC
run: |
./script/meta_rbac_init_for_local_test.sh -j 1
@@ -312,6 +315,7 @@ jobs:
- name: Init PG
run: |
PGPASSWORD=lakesoul_test psql -h localhost -p 5432 -U lakesoul_test -f script/meta_init.sql lakesoul_test
PGPASSWORD=lakesoul_test psql -h localhost -p 5432 -U lakesoul_test -c "ALTER DATABASE lakesoul_test SET DEFAULT_TRANSACTION_ISOLATION TO 'serializable';"
- name: Install Protoc
uses: arduino/setup-protoc@v2
with:
@@ -380,7 +384,7 @@ jobs:
- name: Init PG
run: |
PGPASSWORD=lakesoul_test psql -h localhost -p 5432 -U lakesoul_test -f script/meta_init.sql lakesoul_test

PGPASSWORD=lakesoul_test psql -h localhost -p 5432 -U lakesoul_test -c "ALTER DATABASE lakesoul_test SET DEFAULT_TRANSACTION_ISOLATION TO 'serializable';"
- name: Init PG RBAC ROW POLICY
run: |
PGPASSWORD=lakesoul_test psql -h localhost -p 5432 -U lakesoul_test -f script/meta_rbac_init.sql lakesoul_test
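Each of these Init PG steps now forces serializable as the database-level default isolation before the tests run. One quick way to confirm the setting took effect from Python, as a sketch assuming psycopg2 is installed and the workflow's Postgres service is reachable with the lakesoul_test credentials:

    # Sketch: verify the database-level isolation default set by the Init PG step.
    # Assumes psycopg2 is installed and the CI Postgres service from the workflow
    # is reachable on localhost:5432 with the lakesoul_test credentials.
    import psycopg2

    conn = psycopg2.connect(
        host="localhost",
        port=5432,
        user="lakesoul_test",
        password="lakesoul_test",
        dbname="lakesoul_test",
    )
    with conn, conn.cursor() as cur:
        # ALTER DATABASE affects new sessions, so this connection (opened after
        # the init step) should already report the new default.
        cur.execute("SHOW default_transaction_isolation")
        print(cur.fetchone()[0])  # expected: serializable
    conn.close()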
1 change: 1 addition & 0 deletions .github/workflows/rust-ci.yml
@@ -54,6 +54,7 @@ jobs:
- name: Init PG
run: |
./script/meta_init_for_local_test.sh -j 2
PGPASSWORD=lakesoul_test psql -h localhost -p 5432 -U lakesoul_test -c "ALTER DATABASE lakesoul_test SET DEFAULT_TRANSACTION_ISOLATION TO 'serializable';"
- name: Install Protoc
uses: arduino/setup-protoc@v2
with:
2 changes: 1 addition & 1 deletion .gitignore
@@ -31,6 +31,6 @@ __pycache__/
/python/lakesoul/metadata/generated/entity_pb2_grpc.py
/python/build/
/python/lakesoul.egg-info/
/python/*.whl
*.whl
/wheelhouse/
/rust/.idea
5 changes: 3 additions & 2 deletions cpp/CMakeLists.txt
@@ -125,7 +125,8 @@ endif()
add_library(liblakesoul_io_c SHARED IMPORTED)
set_target_properties(liblakesoul_io_c PROPERTIES
IMPORTED_LOCATION "${PROJECT_SOURCE_DIR}/../rust/target/release/liblakesoul_io_c.so"
INTERFACE_INCLUDE_DIRECTORIES "${PROJECT_SOURCE_DIR}/../rust/lakesoul-io-c")
INTERFACE_INCLUDE_DIRECTORIES "${PROJECT_SOURCE_DIR}/../rust/lakesoul-io-c"
IMPORTED_NO_SONAME TRUE)

add_library(libarrow SHARED IMPORTED)
set_target_properties(libarrow PROPERTIES
@@ -175,4 +176,4 @@ add_library(lakesoul_dataset SHARED
${PROJECT_BINARY_DIR}/python/lakesoul/arrow/_lakesoul_dataset.cpp)
target_include_directories(lakesoul_dataset PRIVATE include)
target_link_libraries(lakesoul_dataset PRIVATE liblakesoul_io_c libarrow_python)
set_target_properties(lakesoul_dataset PROPERTIES PREFIX "_")
set_target_properties(lakesoul_dataset PROPERTIES PREFIX "_")
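Marking the imported Rust library with IMPORTED_NO_SONAME tells CMake that liblakesoul_io_c.so carries no SONAME field, so the generated link line can use the plain library name instead of baking the absolute build path into the extension module; that, at least, is the usual motivation, and it can be checked with readelf. A sketch (the extension-module path is a hypothetical build output location):

    # Sketch: confirm the Rust cdylib has no SONAME and inspect what the built
    # extension records as NEEDED. Assumes readelf (binutils) is on PATH; the
    # extension-module path below is a hypothetical build output location.
    import subprocess

    def dynamic_section(path):
        return subprocess.run(
            ["readelf", "-d", path], capture_output=True, text=True, check=True
        ).stdout

    print("SONAME" in dynamic_section("rust/target/release/liblakesoul_io_c.so"))  # typically False
    # Look for "NEEDED ... [liblakesoul_io_c.so]" rather than an absolute path:
    print(dynamic_section("python/build/lakesoul/arrow/_lakesoul_dataset.so"))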
11 changes: 6 additions & 5 deletions docker/lakesoul-docker-compose-env/docker-compose.yml
@@ -17,7 +17,11 @@ services:
POSTGRES_USER: lakesoul_test
POSTGRES_DB: lakesoul_test
command:
--max_connections=4096
- "postgres"
- "-c"
- "max_connections=4096"
- "-c"
- "default_transaction_isolation=serializable"
volumes:
- ./meta_init.sql:/docker-entrypoint-initdb.d/meta_init.sql
- ./meta_cleanup.sql:/meta_cleanup.sql
@@ -120,7 +124,4 @@ networks:
default:
driver: bridge
ipam:
driver: default
config:
- subnet: 10.16.1.0/16
gateway: 10.16.1.1
driver: default
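With the list-form command, every option is passed to the postgres entrypoint explicitly rather than as a single flag string. One way to confirm the rendered service definition, sketched under the assumption that Docker Compose v2 is installed and the check runs from the repository root:

    # Sketch: render the compose file and check the postgres command line.
    # Assumes Docker Compose v2 is installed and this runs from the repo root.
    import subprocess

    rendered = subprocess.run(
        ["docker", "compose",
         "-f", "docker/lakesoul-docker-compose-env/docker-compose.yml", "config"],
        capture_output=True, text=True, check=True,
    ).stdout
    # Expect max_connections=4096 and default_transaction_isolation=serializable
    # to appear under the service's resolved command.
    print([line for line in rendered.splitlines()
           if "max_connections" in line or "transaction_isolation" in line])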
4 changes: 2 additions & 2 deletions lakesoul-flink/src/test/resources/log4j2-test.properties
@@ -28,6 +28,6 @@ logger.hadooputil.level = ERROR
logger.webmonitor.name = org.apache.flink.runtime.webmonitor.WebMonitorUtils
logger.webmonitor.level = ERROR
logger.dispatchrest.name = org.apache.flink.runtime.dispatcher.DispatcherRestEndpoint
logger.dispatchrest.level = INFO
logger.dispatchrest.level = WARN
logger.lakesoul.name = org.apache.flink.lakesoul
logger.lakesoul.level = INFO
logger.lakesoul.level = WARN
63 additions & 0 deletions — new file: LargeColumnsBench (Spark benchmark)
@@ -0,0 +1,63 @@
package org.apache.spark.sql.lakesoul.benchmark

import org.apache.spark.sql.SparkSession

object LargeColumnsBench {
  def main(args: Array[String]): Unit = {
    val builder = SparkSession.builder()
      .appName("ParquetScanBenchmark")
      .master("local[1]")
      .config("spark.sql.shuffle.partitions", 1)
      .config("spark.sql.files.maxPartitionBytes", "2g")
      .config("spark.default.parallelism", 1)
      .config("spark.sql.extensions", "com.dmetasoul.lakesoul.sql.LakeSoulSparkSessionExtension")
      .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.lakesoul.catalog.LakeSoulCatalog")
    val spark = builder.getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    val tablePath = "/tmp/lakesoul/spark/large_columns"
    val tableName = "test_lakesoul_table"
    val rows = 1000
    val cols = 10000;
    {
      var df = spark.range(0, rows).toDF("row")
      df.createOrReplaceTempView("temp")
      var sql = "select row "
      for (i <- 0 until cols) {
        sql += s", rand($i) as col$i"
      }
      sql += " from temp"
      df = spark.sql(sql)
      println(s"Write $rows rows and $cols cols:")
      spark.time {
        df.write
          .format("lakesoul")
          .mode("overwrite")
          // .save(tablePath)
          .saveAsTable(tableName)
      }
    };
    {
      val df = spark.read.format("lakesoul").table(tableName)
      println(s"Read $rows rows and $cols cols:")
      spark.time {
        df.write.format("noop").mode("overwrite").save()
      }
    };
    /*
    {
      val df = spark.read.format("lakesoul").load(tablePath)
      df.createOrReplaceTempView("temp")
      var sql = "select row "
      for (i <- 0 until cols / 3) {
        sql += s", col$i"
      }
      sql += " from temp"
      println(s"Read $rows rows and ${cols/3} cols:")
      spark.time {
        spark.sql(sql).write.format("noop").mode("overwrite").save()
      }
    };

    */
  }
}
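A rough Python counterpart of the read half of this benchmark, going through the Arrow reader whose build this PR fixes; it assumes the Scala job above has already written test_lakesoul_table and that the freshly built lakesoul wheel is installed:

    # Sketch: time a scan of the wide table written by LargeColumnsBench using the
    # Python reader. Assumes the Scala benchmark has already created
    # test_lakesoul_table and the lakesoul wheel from this PR is installed.
    import time

    from lakesoul.arrow import lakesoul_dataset

    start = time.perf_counter()
    rows, cols = 0, 0
    for batch in lakesoul_dataset("test_lakesoul_table", batch_size=10240).to_batches():
        rows += batch.num_rows
        cols = batch.num_columns
    print(f"read {rows} rows x {cols} cols in {time.perf_counter() - start:.2f}s")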
CompactionSuite (Spark test suite)
@@ -229,23 +229,20 @@ class CompactionSuite extends QueryTest

assert(!rangeInfo.groupBy(_.file_bucket_id).forall(_._2.length == 1))


LakeSoulTable.forPath(tableName).compaction("range=1")

println(SparkUtil.allDataInfo(sm.updateSnapshot())
.filter(_.range_partitions.equals("range=1"))
.groupBy(_.file_bucket_id))
val allDataInfo = SparkUtil.allDataInfo(sm.updateSnapshot())
println(allDataInfo.mkString("Array(", ", ", ")"))

assert(SparkUtil.allDataInfo(sm.updateSnapshot())
assert(allDataInfo
.filter(_.range_partitions.equals("range=1"))
.groupBy(_.file_bucket_id).forall(_._2.length == 1)
)

assert(SparkUtil.allDataInfo(sm.updateSnapshot())
assert(allDataInfo
.filter(!_.range_partitions.equals("range=1"))
.groupBy(_.file_bucket_id).forall(_._2.length != 1)
)

})
}
test("compaction with call - simple condition") {
@@ -282,20 +279,18 @@ class CompactionSuite extends QueryTest
*/
sql("call LakeSoulTable.compaction(condition=>map('range',1),tablePath=>'" + tableName + "')")

println(SparkUtil.allDataInfo(sm.updateSnapshot())
.filter(_.range_partitions.equals("range=1"))
.groupBy(_.file_bucket_id))
val allDataInfo = SparkUtil.allDataInfo(sm.updateSnapshot())
println(allDataInfo.mkString("Array(", ", ", ")"))

assert(SparkUtil.allDataInfo(sm.updateSnapshot())
assert(allDataInfo
.filter(_.range_partitions.equals("range=1"))
.groupBy(_.file_bucket_id).forall(_._2.length == 1)
)

assert(SparkUtil.allDataInfo(sm.updateSnapshot())
assert(allDataInfo
.filter(!_.range_partitions.equals("range=1"))
.groupBy(_.file_bucket_id).forall(_._2.length != 1)
)

})
}

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

[build-system]
requires = ["setuptools >= 40.1.0", "wheel", "cython >= 0.29.31,<3"]
requires = ["setuptools >= 62.1.0", "wheel", "cython >= 0.29.31,<3"]
build-backend = "setuptools.build_meta"

[project]
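This [build-system] table is what a PEP 517 front end reads when it creates the isolated build environment, so the raised setuptools floor applies to any standards-based build as well as to the host build script below. A minimal sketch, assuming the build package is installed:

    # Sketch: a plain PEP 517 wheel build that honours the requires/build-backend
    # entries above. Assumes `pip install build`; the host build script in this PR
    # wraps extra steps (native libraries, auditwheel repair) around the wheel.
    import subprocess
    import sys

    subprocess.run([sys.executable, "-m", "build", "--wheel", "--outdir", "dist"], check=True)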
2 changes: 1 addition & 1 deletion python/host_build_wheel.py
@@ -124,7 +124,7 @@ def _build_wheel(self):

def _repair_wheel(self):
pa_abi_tag = self._get_pyarrow_abi_tag()
args = [self._get_python_path(), '-m', 'auditwheel', 'repair', '--plat', 'manylinux2014_x86_64']
args = [self._get_python_path(), '-m', 'auditwheel', 'repair', '--plat', 'linux_x86_64']
args += ['--exclude', 'libarrow_python.so']
args += ['--exclude', 'libarrow_dataset.so.%s' % pa_abi_tag]
args += ['--exclude', 'libarrow_acero.so.%s' % pa_abi_tag]
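Switching --plat to linux_x86_64 keeps auditwheel from enforcing the manylinux2014 policy, which a wheel built directly on the host (and linked against local Arrow libraries) generally cannot satisfy. Run in isolation, the repair step looks roughly like this; the wheel filename and output directory are illustrative, and only one of the --exclude flags is shown:

    # Sketch: the repair step on its own, mirroring _repair_wheel. The wheel name
    # and wheelhouse directory are illustrative.
    import subprocess
    import sys

    wheel = "dist/lakesoul-1.0.0-cp38-cp38-linux_x86_64.whl"  # hypothetical filename
    cmd = [sys.executable, "-m", "auditwheel", "repair", "--plat", "linux_x86_64"]
    cmd += ["--exclude", "libarrow_python.so"]  # leave pyarrow's own libraries alone
    cmd += ["-w", "wheelhouse", wheel]
    subprocess.run(cmd, check=True)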
10 changes: 10 additions & 0 deletions python/tests/arrow/test_lakesoul_read.py
@@ -0,0 +1,10 @@
from lakesoul.arrow import lakesoul_dataset

ds = lakesoul_dataset("test_lakesoul_table", batch_size=10240)

total_rows = 0
total_cols = 0
for batch in ds.to_batches():
    total_rows += batch.num_rows
    total_cols += batch.num_columns
print(total_rows, total_cols)
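If the batches yielded by to_batches() are ordinary pyarrow RecordBatch objects, as the dataset-style API suggests, the same scan can also be collected into a single Table; a sketch assuming pyarrow is installed:

    # Sketch: collect the stream into one pyarrow Table, assuming to_batches()
    # yields regular pyarrow RecordBatch objects.
    import pyarrow as pa

    from lakesoul.arrow import lakesoul_dataset

    ds = lakesoul_dataset("test_lakesoul_table", batch_size=10240)
    table = pa.Table.from_batches(ds.to_batches())
    print(table.num_rows, table.num_columns)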
File renamed without changes.