Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use dataset encoder rather than implicit value encoder #183

Merged
merged 21 commits into from
Aug 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/actions/build-whl/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ runs:
- name: Fetch Binaries Artifact
uses: actions/download-artifact@v3
with:
name: Binaries-${{ inputs.spark-compat-version }}-${{ inputs.scala-version }}
name: Binaries-${{ inputs.spark-compat-version }}-${{ inputs.scala-compat-version }}
path: .

- name: Cache Maven packages
Expand Down Expand Up @@ -69,7 +69,7 @@ runs:
- name: Upload whl
uses: actions/upload-artifact@v3
with:
name: Whl (Spark ${{ inputs.spark-version }} Scala ${{ inputs.scala-version }})
name: Whl (Spark ${{ inputs.spark-compat-version }} Scala ${{ inputs.scala-compat-version }})
path: |
python/dist/*.whl

Expand Down
5 changes: 4 additions & 1 deletion .github/actions/build/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ inputs:
spark-compat-version:
description: Spark compatibility version, e.g. 3.4
required: true
scala-compat-version:
description: Scala compatibility version, e.g. 2.12
required: true

runs:
using: 'composite'
Expand Down Expand Up @@ -45,7 +48,7 @@ runs:
- name: Upload Binaries
uses: actions/upload-artifact@v3
with:
name: Binaries-${{ inputs.spark-compat-version }}-${{ inputs.scala-version }}
name: Binaries-${{ inputs.spark-compat-version }}-${{ inputs.scala-compat-version }}
path: |
*
!.*
Expand Down
78 changes: 78 additions & 0 deletions .github/actions/check-compat/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
name: 'Check'
author: 'EnricoMi'
description: 'A GitHub Action that checks compatibility of spark-extension'

inputs:
spark-version:
description: Spark version, e.g. 3.4.0 or 3.4.0-SNAPSHOT
required: true
scala-version:
description: Scala version, e.g. 2.12.15
required: true
spark-compat-version:
description: Spark compatibility version, e.g. 3.4
required: true
scala-compat-version:
description: Scala compatibility version, e.g. 2.12
required: true
package-version:
description: Spark-Extension version to check against
required: true

runs:
using: 'composite'
steps:
- name: Set versions in pom.xml
run: |
./set-version.sh ${{ inputs.spark-version }} ${{ inputs.scala-version }}
git diff
shell: bash

- name: Fetch Binaries Artifact
uses: actions/download-artifact@v3
with:
name: Binaries-${{ inputs.spark-compat-version }}-${{ inputs.scala-compat-version }}
path: .

- name: Cache Maven packages
uses: actions/cache@v3
with:
path: ~/.m2/repository
key: ${{ runner.os }}-mvn-check-${{ inputs.spark-version }}-${{ inputs.scala-version }}-${{ hashFiles('pom.xml') }}
restore-keys: |
${{ runner.os }}-mvn-check-${{ inputs.spark-version }}-${{ inputs.scala-version }}
${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}-

- name: Setup JDK 1.8
uses: actions/setup-java@v3
with:
java-version: '8'
distribution: 'zulu'

- name: Install Checker
run: |
sudo apt update
sudo apt install japi-compliance-checker
shell: bash

- name: Fetch package
run: |
mvn dependency:get -Dtransitive=false -DremoteRepositories -Dartifact=uk.co.gresearch.spark:spark-extension_${{ inputs.scala-compat-version }}:${{ inputs.package-version }}-${{ inputs.spark-compat-version }}
shell: bash

- name: Check
run: |
ls -lah ~/.m2/repository/uk/co/gresearch/spark/spark-extension_${{ inputs.scala-compat-version }}/${{ inputs.package-version }}-${{ inputs.spark-compat-version }}/spark-extension_${{ inputs.scala-compat-version }}-${{ inputs.package-version }}-${{ inputs.spark-compat-version }}.jar target/spark-extension*.jar
japi-compliance-checker ~/.m2/repository/uk/co/gresearch/spark/spark-extension_${{ inputs.scala-compat-version }}/${{ inputs.package-version }}-${{ inputs.spark-compat-version }}/spark-extension_${{ inputs.scala-compat-version }}-${{ inputs.package-version }}-${{ inputs.spark-compat-version }}.jar target/spark-extension*.jar
shell: bash

- name: Upload Report
uses: actions/upload-artifact@v3
if: always()
with:
name: Compat-Report-${{ inputs.spark-compat-version }}-${{ inputs.scala-compat-version }}
path: compat_reports/spark-extension/*

branding:
icon: 'check-circle'
color: 'green'
2 changes: 1 addition & 1 deletion .github/actions/test-jvm/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ runs:
- name: Fetch Binaries Artifact
uses: actions/download-artifact@v3
with:
name: Binaries-${{ inputs.spark-compat-version }}-${{ inputs.scala-version }}
name: Binaries-${{ inputs.spark-compat-version }}-${{ inputs.scala-compat-version }}
path: .

- name: Cache Spark Binaries
Expand Down
2 changes: 1 addition & 1 deletion .github/actions/test-python/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ runs:
- name: Fetch Binaries Artifact
uses: actions/download-artifact@v3
with:
name: Binaries-${{ inputs.spark-compat-version }}-${{ inputs.scala-version }}
name: Binaries-${{ inputs.spark-compat-version }}-${{ inputs.scala-compat-version }}
path: .

- name: Cache Maven packages
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/build-jvm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,4 @@ jobs:
spark-version: ${{ matrix.spark-version }}
scala-version: ${{ matrix.scala-version }}
spark-compat-version: ${{ matrix.spark-compat-version }}
scala-compat-version: ${{ matrix.scala-compat-version }}
1 change: 1 addition & 0 deletions .github/workflows/build-python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,5 @@ jobs:
spark-version: ${{ matrix.spark-version }}
scala-version: ${{ matrix.scala-version }}
spark-compat-version: ${{ matrix.spark-compat-version }}
scala-compat-version: ${{ matrix.scala-compat-version }}
python-version: "3.8"
1 change: 1 addition & 0 deletions .github/workflows/build-snapshots.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,4 @@ jobs:
spark-version: ${{ matrix.spark-version }}
scala-version: ${{ matrix.scala-version }}
spark-compat-version: ${{ matrix.spark-compat-version }}-SNAPSHOT
scala-compat-version: ${{ matrix.scala-compat-version }}
77 changes: 77 additions & 0 deletions .github/workflows/check.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
name: Check

on:
workflow_call:
jobs:
config:
name: Configure check
runs-on: ubuntu-latest
outputs:
major-version: ${{ steps.versions.outputs.major-version }}
release-version: ${{ steps.versions.outputs.release-version }}
release-major-version: ${{ steps.versions.outputs.release-major-version }}

steps:
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: 0

- name: Get versions
id: versions
run: |
version=$(grep -m1 version pom.xml | sed -e "s/<[^>]*>//g" -e "s/ //g")
echo "version: $version"
echo "major-version: ${version/.*/}"
echo "version=$version" >> "$GITHUB_OUTPUT"
echo "major-version=${version/.*/}" >> "$GITHUB_OUTPUT"
release_version=$(git tag | tail -n1 | sed "s/^v//")
echo "release-version: $release_version"
echo "release-major-version: ${release_version/.*/}"
echo "release-version=$release_version" >> "$GITHUB_OUTPUT"
echo "release-major-version=${release_version/.*/}" >> "$GITHUB_OUTPUT"
shell: bash

check:
name: Check (Spark ${{ matrix.spark-compat-version }} Scala ${{ matrix.scala-compat-version }})
needs: config
runs-on: ubuntu-latest
if: needs.config.outputs.major-version == needs.config.outputs.release-major-version

strategy:
fail-fast: false
matrix:
include:
- spark-compat-version: '3.0'
spark-version: '3.0.3'
scala-compat-version: '2.12'
scala-version: '2.12.10'
- spark-compat-version: '3.1'
spark-version: '3.1.3'
scala-compat-version: '2.12'
scala-version: '2.12.10'
- spark-compat-version: '3.2'
spark-version: '3.2.4'
scala-compat-version: '2.12'
scala-version: '2.12.15'
- spark-compat-version: '3.3'
spark-version: '3.3.2'
scala-compat-version: '2.12'
scala-version: '2.12.15'
- spark-compat-version: '3.4'
scala-compat-version: '2.12'
scala-version: '2.12.17'
spark-version: '3.4.1'

steps:
- name: Checkout
uses: actions/checkout@v3

- name: Check
uses: ./.github/actions/check-compat
with:
spark-version: ${{ matrix.spark-version }}
scala-version: ${{ matrix.scala-version }}
spark-compat-version: ${{ matrix.spark-compat-version }}
scala-compat-version: ${{ matrix.scala-compat-version }}
package-version: ${{ needs.config.outputs.release-version }}
5 changes: 5 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ jobs:
needs: build-snapshots
uses: "./.github/workflows/test-snapshots.yml"

check:
name: "Check"
needs: build-jvm
uses: "./.github/workflows/check.yml"

test_success:
name: "Test success"
runs-on: ubuntu-latest
Expand Down
39 changes: 23 additions & 16 deletions src/main/scala/uk/co/gresearch/spark/group/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -81,19 +81,22 @@ package object group {
}

object SortedGroupByDataset {
def apply[K: Ordering : Encoder, V: Encoder](ds: Dataset[V],
groupColumns: Seq[Column],
orderColumns: Seq[Column],
partitions: Option[Int]): SortedGroupByDataset[K, V] = {
def apply[K: Ordering : Encoder, V](ds: Dataset[V],
groupColumns: Seq[Column],
orderColumns: Seq[Column],
partitions: Option[Int]): SortedGroupByDataset[K, V] = {
// make ds encoder implicitly available
implicit val valueEncoder: Encoder[V] = ds.encoder

// multiple group columns are turned into a tuple,
// while a single group column is taken as is
val keyColumn =
if (groupColumns.length == 1)
groupColumns.head
else
struct(groupColumns: _*)
if (groupColumns.length == 1)
groupColumns.head
else
struct(groupColumns: _*)

// all columns are turned into a single colum as a struct
// all columns are turned into a single column as a struct
val valColumn = struct(col("*"))

// repartition by group columns with given number of partitions (if given)
Expand All @@ -112,13 +115,17 @@ package object group {
SortedGroupByDataset(grouped)
}

def apply[K: Ordering : Encoder, V: Encoder, O: Encoder](ds: Dataset[V],
key: V => K,
order: V => O,
partitions: Option[Int],
reverse: Boolean): SortedGroupByDataset[K, V] = {
implicit val kvEncoder: Encoder[(K, V)] = Encoders.tuple(implicitly[Encoder[K]], implicitly[Encoder[V]])
implicit val kvoEncoder: Encoder[(K, V, O)] = Encoders.tuple(implicitly[Encoder[K]], implicitly[Encoder[V]], implicitly[Encoder[O]])
def apply[K: Ordering : Encoder, V, O: Encoder](ds: Dataset[V],
key: V => K,
order: V => O,
partitions: Option[Int],
reverse: Boolean): SortedGroupByDataset[K, V] = {
// prepare encoder needed for this exercise
val keyEncoder: Encoder[K] = implicitly[Encoder[K]]
implicit val valueEncoder: Encoder[V] = ds.encoder
val orderEncoder: Encoder[O] = implicitly[Encoder[O]]
implicit val kvEncoder: Encoder[(K, V)] = Encoders.tuple(keyEncoder, valueEncoder)
implicit val kvoEncoder: Encoder[(K, V, O)] = Encoders.tuple(keyEncoder, valueEncoder, orderEncoder)

// materialise the key and order class for each value
val kvo = ds.map(v => (key(v), v, order(v)))
Expand Down
Loading
Loading