
Commit cc0b2e7 (merge commit)

shrshi committed Jan 28, 2025
2 parents 072b4fa + 9f4afb4
Showing 126 changed files with 1,484 additions and 680 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/build.yaml
@@ -90,7 +90,7 @@ jobs:
       package-name: libcudf
       package-type: cpp
   wheel-build-pylibcudf:
-    needs: [wheel-publish-libcudf]
+    needs: [wheel-build-libcudf]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04
     with:
@@ -111,7 +111,7 @@ jobs:
       package-name: pylibcudf
       package-type: python
   wheel-build-cudf:
-    needs: wheel-publish-pylibcudf
+    needs: wheel-build-pylibcudf
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04
     with:
@@ -132,7 +132,7 @@ jobs:
       package-name: cudf
       package-type: python
   wheel-build-dask-cudf:
-    needs: wheel-publish-cudf
+    needs: wheel-build-cudf
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04
     with:
@@ -155,7 +155,7 @@ jobs:
       package-name: dask_cudf
       package-type: python
   wheel-build-cudf-polars:
-    needs: wheel-publish-pylibcudf
+    needs: wheel-build-pylibcudf
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04
     with:
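The four `needs` edits all make the same move: each wheel build now waits on the upstream build job instead of its publish job, so the build chain no longer serializes behind publishing. A small sketch of the resulting dependency edges (job names taken from the hunks above; the dict and the check are illustrative, not part of the workflow):

    # Sketch: the wheel-build "needs" edges after this change.
    needs = {
        "wheel-build-pylibcudf": ["wheel-build-libcudf"],
        "wheel-build-cudf": ["wheel-build-pylibcudf"],
        "wheel-build-dask-cudf": ["wheel-build-cudf"],
        "wheel-build-cudf-polars": ["wheel-build-pylibcudf"],
    }

    # After the change, no build job gates on a publish job.
    assert all(
        "publish" not in dep for deps in needs.values() for dep in deps
    )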
13 changes: 2 additions & 11 deletions .pre-commit-config.yaml
@@ -41,13 +41,6 @@ repos:
                 "python/cudf_polars/cudf_polars",
                 "python/dask_cudf/dask_cudf"]
         pass_filenames: false
-  - repo: https://github.com/nbQA-dev/nbQA
-    rev: 1.9.1
-    hooks:
-      - id: nbqa-isort
-        # Use the cudf_kafka isort orderings in notebooks so that dask
-        # and RAPIDS packages have their own sections.
-        args: ["--settings-file=python/cudf_kafka/pyproject.toml"]
   - repo: https://github.com/pre-commit/mirrors-clang-format
     rev: v16.0.6
     hooks:
@@ -153,13 +146,11 @@ repos:
           ^CHANGELOG.md$
         )
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.8.0
+    rev: v0.9.3
     hooks:
       - id: ruff
         args: ["--fix"]
-        files: python/.*$
       - id: ruff-format
-        files: python/.*$
   - repo: https://github.com/rapidsai/pre-commit-hooks
     rev: v0.4.0
     hooks:
@@ -173,7 +164,7 @@ repos:
         )
       - id: verify-alpha-spec
   - repo: https://github.com/rapidsai/dependency-file-generator
-    rev: v1.16.0
+    rev: v1.17.0
    hooks:
      - id: rapids-dependency-file-generator
        args: ["--clean"]
26 changes: 18 additions & 8 deletions ci/cudf_pandas_scripts/fetch_pandas_versions.py
@@ -1,24 +1,34 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
+import argparse
+
 import requests
-from packaging.version import Version
 from packaging.specifiers import SpecifierSet
-import argparse
+from packaging.version import Version
 
 
 def get_pandas_versions(pandas_range):
     url = "https://pypi.org/pypi/pandas/json"
     response = requests.get(url)
     data = response.json()
-    versions = [Version(v) for v in data['releases']]
+    versions = [Version(v) for v in data["releases"]]
     specifier = SpecifierSet(pandas_range.lstrip("pandas"))
     matching_versions = [v for v in versions if v in specifier]
-    matching_minors = sorted(set(".".join((str(v.major), str(v.minor))) for v in matching_versions), key=Version)
+    matching_minors = sorted(
+        set(".".join((str(v.major), str(v.minor))) for v in matching_versions),
+        key=Version,
+    )
     return matching_minors
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Filter pandas versions by prefix.")
-    parser.add_argument("pandas_range", type=str, help="The version prefix to filter by.")
+    parser = argparse.ArgumentParser(
+        description="Filter pandas versions by prefix."
+    )
+    parser.add_argument(
+        "pandas_range", type=str, help="The version prefix to filter by."
+    )
     args = parser.parse_args()
 
     versions = get_pandas_versions(args.pandas_range)
-    print(','.join(versions))
+    print(",".join(versions))
30 changes: 22 additions & 8 deletions ci/cudf_pandas_scripts/pandas-tests/job-summary.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
@@ -68,17 +68,27 @@ def emoji_failed(x):
 pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index()
 main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index()
 total_usage = main_df["_slow_function_call"] + main_df["_fast_function_call"]
-main_df["CPU Usage"] = ((main_df["_slow_function_call"] / total_usage) * 100.0).round(1)
-main_df["GPU Usage"] = ((main_df["_fast_function_call"] / total_usage) * 100.0).round(1)
+main_df["CPU Usage"] = (
+    (main_df["_slow_function_call"] / total_usage) * 100.0
+).round(1)
+main_df["GPU Usage"] = (
+    (main_df["_fast_function_call"] / total_usage) * 100.0
+).round(1)
 
 total_usage = pr_df["_slow_function_call"] + pr_df["_fast_function_call"]
-pr_df["CPU Usage"] = ((pr_df["_slow_function_call"] / total_usage) * 100.0).round(1)
-pr_df["GPU Usage"] = ((pr_df["_fast_function_call"] / total_usage) * 100.0).round(1)
+pr_df["CPU Usage"] = (
+    (pr_df["_slow_function_call"] / total_usage) * 100.0
+).round(1)
+pr_df["GPU Usage"] = (
+    (pr_df["_fast_function_call"] / total_usage) * 100.0
+).round(1)
 
 cpu_usage_mean = pr_df["CPU Usage"].mean().round(2)
 gpu_usage_mean = pr_df["GPU Usage"].mean().round(2)
 
-gpu_usage_rate_change = abs(pr_df["GPU Usage"].mean() - main_df["GPU Usage"].mean())
+gpu_usage_rate_change = abs(
+    pr_df["GPU Usage"].mean() - main_df["GPU Usage"].mean()
+)
 pr_df["CPU Usage"] = pr_df["CPU Usage"].fillna(0)
 pr_df["GPU Usage"] = pr_df["GPU Usage"].fillna(0)
 main_df["CPU Usage"] = main_df["CPU Usage"].fillna(0)
@@ -92,8 +102,12 @@ def emoji_failed(x):
 pr_df["CPU Usage"] = pr_df["CPU Usage"].astype(str) + "%"
 pr_df["GPU Usage"] = pr_df["GPU Usage"].astype(str) + "%"
 
-pr_df = pr_df[["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"]]
-diff_df = diff_df[["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"]]
+pr_df = pr_df[
+    ["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"]
+]
+diff_df = diff_df[
+    ["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"]
+]
 diff_df.columns = diff_df.columns + "_diff"
 diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed)
 diff_df["failed_diff"] = diff_df["failed_diff"].map(emoji_failed)
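The reflowed expressions above are pure formatting; the underlying calculation splits each module's calls into slow (CPU) and fast (GPU) paths and reports them as percentages. A toy example of the same computation (the counts are made up for illustration):

    import pandas as pd

    # Hypothetical per-module call counts, standing in for the parsed results.
    df = pd.DataFrame(
        {"_slow_function_call": [10, 0], "_fast_function_call": [90, 50]},
        index=["test_frame", "test_series"],
    )
    total_usage = df["_slow_function_call"] + df["_fast_function_call"]
    df["CPU Usage"] = ((df["_slow_function_call"] / total_usage) * 100.0).round(1)
    df["GPU Usage"] = ((df["_fast_function_call"] / total_usage) * 100.0).round(1)
    print(df[["CPU Usage", "GPU Usage"]])
    #              CPU Usage  GPU Usage
    # test_frame        10.0       90.0
    # test_series        0.0      100.0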
34 changes: 18 additions & 16 deletions ci/utils/nbtestlog2junitxml.py
@@ -1,15 +1,16 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 # Generate a junit-xml file from parsing a nbtest log
 
 import re
-from xml.etree.ElementTree import Element, ElementTree
-from os import path
 import string
 from enum import Enum
+from os import path
+from xml.etree.ElementTree import Element, ElementTree
 
 startingPatt = re.compile(r"^STARTING: ([\w\.\-]+)$")
-skippingPatt = re.compile(r"^SKIPPING: ([\w\.\-]+)\s*(\(([\w\.\-\ \,]+)\))?\s*$")
+skippingPatt = re.compile(
+    r"^SKIPPING: ([\w\.\-]+)\s*(\(([\w\.\-\ \,]+)\))?\s*$"
+)
 exitCodePatt = re.compile(r"^EXIT CODE: (\d+)$")
 folderPatt = re.compile(r"^FOLDER: ([\w\.\-]+)$")
 timePatt = re.compile(r"^real\s+([\d\.ms]+)$")
@@ -37,12 +38,8 @@ def makeFailureElement(outputLines):
 
 
 def setFileNameAttr(attrDict, fileName):
-    attrDict.update(file=fileName,
-                    classname="",
-                    line="",
-                    name="",
-                    time=""
-                    )
+    attrDict.update(file=fileName, classname="", line="", name="", time="")
 
 
 def setClassNameAttr(attrDict, className):
     attrDict["classname"] = className
@@ -76,11 +73,12 @@ def parseLog(logFile, testSuiteElement):
     testSuiteElement.attrib["timestamp"] = ""
 
     attrDict = {}
-    #setFileNameAttr(attrDict, logFile)
+    # setFileNameAttr(attrDict, logFile)
     setFileNameAttr(attrDict, "nbtest")
 
-    parserStateEnum = Enum("parserStateEnum",
-                           "newTest startingLine finishLine exitCode")
+    parserStateEnum = Enum(
+        "parserStateEnum", "newTest startingLine finishLine exitCode"
+    )
     parserState = parserStateEnum.newTest
 
     testOutput = ""
@@ -98,7 +96,9 @@ def parseLog(logFile, testSuiteElement):
                 setTimeAttr(attrDict, "0m0s")
                 skippedElement = makeTestCaseElement(attrDict)
                 message = m.group(3) or ""
-                skippedElement.append(Element("skipped", message=message, type=""))
+                skippedElement.append(
+                    Element("skipped", message=message, type="")
+                )
                 testSuiteElement.append(skippedElement)
                 incrNumAttr(testSuiteElement, "skipped")
                 incrNumAttr(testSuiteElement, "tests")
@@ -160,4 +160,6 @@ def parseLog(logFile, testSuiteElement):
     testSuiteElement = Element("testsuite", name="nbtest", hostname="")
     parseLog(sys.argv[1], testSuiteElement)
     testSuitesElement.append(testSuiteElement)
-    ElementTree(testSuitesElement).write(sys.argv[1]+".xml", xml_declaration=True)
+    ElementTree(testSuitesElement).write(
+        sys.argv[1] + ".xml", xml_declaration=True
+    )
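The reformatted calls above all follow the ElementTree pattern the script is built on: assemble testsuite/testcase elements, then serialize with an XML declaration. A self-contained sketch of that pattern (the output file name and test data here are hypothetical):

    from xml.etree.ElementTree import Element, ElementTree

    testsuites = Element("testsuites")
    suite = Element("testsuite", name="nbtest", hostname="")

    # One skipped test case, mirroring how the parser records SKIPPING lines.
    case = Element(
        "testcase", file="nbtest", classname="demo", name="notebook-1", time="0m0s"
    )
    case.append(Element("skipped", message="requires GPU", type=""))
    suite.append(case)

    testsuites.append(suite)
    ElementTree(testsuites).write("demo.log.xml", xml_declaration=True)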
4 changes: 2 additions & 2 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -67,10 +67,10 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.4dev0
 - pandoc
-- polars>=1.11,<1.18
+- polars>=1.20,<1.22
 - pre-commit
 - ptxcompiler
-- pyarrow>=14.0.0,<19.0.0a0
+- pyarrow>=14.0.0,<20.0.0a0
 - pydata-sphinx-theme>=0.15.4
 - pynvml>=12.0.0,<13.0.0a0
 - pytest-benchmark
4 changes: 2 additions & 2 deletions conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -65,9 +65,9 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.4dev0
 - pandoc
-- polars>=1.11,<1.18
+- polars>=1.20,<1.22
 - pre-commit
-- pyarrow>=14.0.0,<19.0.0a0
+- pyarrow>=14.0.0,<20.0.0a0
 - pydata-sphinx-theme>=0.15.4
 - pynvjitlink>=0.0.0a0
 - pynvml>=12.0.0,<13.0.0a0
2 changes: 1 addition & 1 deletion conda/recipes/cudf-polars/meta.yaml
@@ -43,7 +43,7 @@ requirements:
   run:
     - python
     - pylibcudf ={{ version }}
-    - polars >=1.11,<1.18
+    - polars >=1.20,<1.22
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
 
 test:
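The polars pin moves from >=1.11,<1.18 to >=1.20,<1.22 in both conda environments and the recipe above. A quick way to check what the new bound admits (a sketch using the packaging library, which is not how conda resolves versions but approximates the same semantics):

    from packaging.specifiers import SpecifierSet
    from packaging.version import Version

    spec = SpecifierSet(">=1.20,<1.22")
    print(Version("1.20.1") in spec)  # True
    print(Version("1.21.0") in spec)  # True
    print(Version("1.17.1") in spec)  # False: below the new floor
    print(Version("1.22.0") in spec)  # False: excluded by the upper bound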
5 changes: 4 additions & 1 deletion cpp/benchmarks/CMakeLists.txt
@@ -428,7 +428,10 @@ ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp)
 # ##################################################################################################
 # * rolling benchmark
 # ---------------------------------------------------------------------------------
-ConfigureNVBench(ROLLING_NVBENCH rolling/grouped_rolling_sum.cpp rolling/rolling_sum.cpp)
+ConfigureNVBench(
+  ROLLING_NVBENCH rolling/grouped_range_rolling_sum.cu rolling/grouped_rolling_sum.cpp
+  rolling/range_rolling_sum.cu rolling/rolling_sum.cpp
+)
 
 add_custom_target(
   run_benchmarks
16 changes: 10 additions & 6 deletions cpp/benchmarks/io/parquet/parquet_reader_input.cpp
@@ -97,7 +97,6 @@ void BM_parquet_read_data(nvbench::state& state,
 void BM_parquet_read_long_strings(nvbench::state& state)
 {
   auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
-  auto const run_length  = static_cast<cudf::size_type>(state.get_int64("run_length"));
 
   auto const d_type      = get_type_or_group(static_cast<int32_t>(data_type::STRING));
   auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));
@@ -106,21 +105,26 @@ void BM_parquet_read_long_strings(nvbench::state& state)
 
   auto const avg_string_length = static_cast<cudf::size_type>(state.get_int64("avg_string_length"));
   // corresponds to 3 sigma (full width 6 sigma: 99.7% of range)
-  auto const half_width = static_cast<cudf::size_type>(state.get_int64("half_width_string_length"));
+  auto const half_width =
+    avg_string_length >> 3;  // 32 +/- 4, 128 +/- 16, 1024 +/- 128, 8k +/- 1k, etc.
   auto const length_min = avg_string_length - half_width;
   auto const length_max = avg_string_length + half_width;
 
   data_profile profile =
     data_profile_builder()
       .cardinality(cardinality)
-      .avg_run_length(run_length)
+      .avg_run_length(1)
       .distribution(data_type::STRING, distribution_id::NORMAL, length_min, length_max);
 
   auto const num_rows_written = [&]() {
     auto const tbl = create_random_table(
       cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, profile);  // THIS
     auto const view = tbl->view();
 
+    // set smaller threshold to reduce file size and execution time
+    auto const threshold = 1;
+    setenv("LIBCUDF_LARGE_STRINGS_THRESHOLD", std::to_string(threshold).c_str(), 1);
+
     cudf::io::parquet_writer_options write_opts =
       cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
         .compression(compression);
@@ -129,6 +133,7 @@ void BM_parquet_read_long_strings(nvbench::state& state)
   }();
 
   parquet_read_common(num_rows_written, num_cols, source_sink, state);
+  unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD");
 }
 
 template <data_type DataType>
@@ -409,6 +414,5 @@ NVBENCH_BENCH(BM_parquet_read_long_strings)
   .add_string_axis("io_type", {"DEVICE_BUFFER"})
   .set_min_samples(4)
   .add_int64_axis("cardinality", {0, 1000})
-  .add_int64_axis("run_length", {1, 32})
-  .add_int64_axis("avg_string_length", {16, 48, 96})
-  .add_int64_axis("half_width_string_length", {16, 32, 64});  // length = avg +/- half_width
+  .add_int64_power_of_two_axis("avg_string_length",
+                               nvbench::range(4, 16, 2));  // 16, 64, ... -> 64k
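The new axis derives the string-length spread from the average instead of sweeping it as a separate parameter: half_width is avg_string_length >> 3, i.e. one eighth of the mean, and the mean runs over powers of two from 2^4 to 2^16. A small sketch of the arithmetic the comments describe (Python used purely for illustration):

    # nvbench::range(4, 16, 2) yields exponents 4, 6, ..., 16 (inclusive),
    # so avg_string_length sweeps 16, 64, 256, ..., 65536 (64k).
    for exp in range(4, 17, 2):
        avg = 1 << exp
        half_width = avg >> 3  # one eighth of the mean, e.g. 1024 -> 128
        print(f"avg={avg}: lengths in [{avg - half_width}, {avg + half_width}]")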
