Skip to content

Commit

Permalink
Merge pull request #1679 from apache/master
Browse files Browse the repository at this point in the history
Create a new pull request by comparing changes across two branches
  • Loading branch information
GulajavaMinistudio authored Sep 18, 2024
2 parents 808a61d + 669e63a commit d48c531
Show file tree
Hide file tree
Showing 857 changed files with 33,171 additions and 15,277 deletions.
2 changes: 2 additions & 0 deletions .asf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ github:
merge: false
squash: true
rebase: true
ghp_branch: master
ghp_path: /docs/_site

notifications:
pullrequests: reviews@spark.apache.org
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ jobs:
uses: actions/upload-artifact@v4
with:
name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
path: "**/target/unit-tests.log"
path: "**/target/*.log"

infra-image:
name: "Base image build"
Expand Down
90 changes: 90 additions & 0 deletions .github/workflows/pages.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

name: GitHub Pages deployment

on:
  push:
    branches:
      - master

# Only one docs deployment at a time; a newer push cancels the in-flight run.
concurrency:
  group: 'docs preview'
  cancel-in-progress: true

jobs:
  docs:
    name: Build and deploy documentation
    runs-on: ubuntu-latest
    # Minimal permissions required by configure-pages/deploy-pages (OIDC token + Pages write).
    permissions:
      id-token: write
      pages: write
    env:
      SPARK_TESTING: 1 # Reduce some noise in the logs
      RELEASE_VERSION: 'In-Progress'
    steps:
      - name: Checkout Spark repository
        uses: actions/checkout@v4
        with:
          repository: apache/spark
          ref: 'master'
      - name: Install Java 17
        uses: actions/setup-java@v4
        with:
          distribution: zulu
          java-version: 17
      - name: Install Python 3.9
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'
          architecture: x64
          cache: 'pip'
      - name: Install Python dependencies
        run: pip install --upgrade -r dev/requirements.txt
      - name: Install Ruby for documentation generation
        uses: ruby/setup-ruby@v1
        with:
          ruby-version: '3.3'
          bundler-cache: true
      - name: Install Pandoc
        run: |
          sudo apt-get update -y
          sudo apt-get install pandoc
      - name: Install dependencies for documentation generation
        run: |
          cd docs
          gem install bundler -v 2.4.22 -n /usr/local/bin
          bundle install --retry=100
      - name: Run documentation build
        # Stamp the in-progress version into the docs config and pyspark
        # version file before building the Jekyll site.
        run: |
          sed -i".tmp1" 's/SPARK_VERSION:.*$/SPARK_VERSION: '"$RELEASE_VERSION"'/g' docs/_config.yml
          sed -i".tmp2" 's/SPARK_VERSION_SHORT:.*$/SPARK_VERSION_SHORT: '"$RELEASE_VERSION"'/g' docs/_config.yml
          sed -i".tmp3" "s/'facetFilters':.*$/'facetFilters': [\"version:$RELEASE_VERSION\"]/g" docs/_config.yml
          sed -i".tmp4" 's/__version__: str = .*$/__version__: str = "'"$RELEASE_VERSION"'"/' python/pyspark/version.py
          cd docs
          SKIP_RDOC=1 bundle exec jekyll build
      - name: Setup Pages
        uses: actions/configure-pages@v5
      - name: Upload artifact
        uses: actions/upload-pages-artifact@v3
        with:
          path: 'docs/_site'
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4
4 changes: 2 additions & 2 deletions .github/workflows/test_report.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,14 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Download test results to report
uses: dawidd6/action-download-artifact@09385b76de790122f4da9c82b17bccf858b9557c # pin@v2
uses: dawidd6/action-download-artifact@bf251b5aa9c2f7eeb574a96ee720e24f801b7c11 # pin @v6
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
workflow: ${{ github.event.workflow_run.workflow_id }}
commit: ${{ github.event.workflow_run.head_commit.id }}
workflow_conclusion: completed
- name: Publish test report
uses: scacap/action-surefire-report@482f012643ed0560e23ef605a79e8e87ca081648 # pin@v1
uses: scacap/action-surefire-report@a2911bd1a4412ec18dde2d93b1758b3e56d2a880 # pin @v1.8.0
with:
check_name: Report test results
github_token: ${{ secrets.GITHUB_TOKEN }}
Expand Down
4 changes: 2 additions & 2 deletions assembly/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@
<!--
Because we don't shade dependencies anymore, we need to restore Guava to compile scope so
that the libraries Spark depend on have it available. We'll package the version that Spark
uses (14.0.1) which is not the same as Hadoop dependencies, but works.
uses which is not the same as Hadoop dependencies, but works.
-->
<dependency>
<groupId>com.google.guava</groupId>
Expand Down Expand Up @@ -200,7 +200,7 @@
<configuration>
<executable>cp</executable>
<arguments>
<argument>${basedir}/../connector/connect/client/jvm/target/spark-connect-client-jvm_${scala.binary.version}-${version}.jar</argument>
<argument>${basedir}/../connector/connect/client/jvm/target/spark-connect-client-jvm_${scala.binary.version}-${project.version}.jar</argument>
<argument>${basedir}/target/scala-${scala.binary.version}/jars/connect-repl</argument>
</arguments>
</configuration>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,14 @@
import java.util.function.Function;
import java.util.function.BiFunction;
import java.util.function.ToLongFunction;
import java.util.stream.Stream;

import com.ibm.icu.text.CollationKey;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.text.StringSearch;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.text.CollationKey;
import com.ibm.icu.text.Collator;
import com.ibm.icu.util.VersionInfo;

import org.apache.spark.SparkException;
import org.apache.spark.unsafe.types.UTF8String;
Expand Down Expand Up @@ -88,6 +90,17 @@ public Optional<String> getVersion() {
}
}

/**
 * Immutable metadata describing one collation as exposed to catalog queries:
 * its location (catalog/schema), name, locale display info (language/country
 * — null for Spark-provided collations), the backing ICU library version
 * (null when not ICU-based), the pad attribute, and the accent/case
 * sensitivity flags.
 */
public record CollationMeta(
String catalog,
String schema,
String collationName,
String language,
String country,
String icuVersion,
String padAttribute,
boolean accentSensitivity,
boolean caseSensitivity) { }

/**
* Entry encapsulating all information about a collation.
*/
Expand Down Expand Up @@ -342,6 +355,23 @@ private static int collationNameToId(String collationName) throws SparkException
}

protected abstract Collation buildCollation();

protected abstract CollationMeta buildCollationMeta();

// Returns identifiers of all supported collations: the Spark-provided UTF8
// collations followed by every ICU locale-based collation.
static List<CollationIdentifier> listCollations() {
return Stream.concat(
CollationSpecUTF8.listCollations().stream(),
CollationSpecICU.listCollations().stream()).toList();
}

// Resolves metadata for the given collation identifier. The UTF8 spec is
// consulted first; a null result means it did not recognize the name, and
// the lookup falls through to the ICU spec (which may also return null).
static CollationMeta loadCollationMeta(CollationIdentifier collationIdentifier) {
CollationMeta collationSpecUTF8 =
CollationSpecUTF8.loadCollationMeta(collationIdentifier);
if (collationSpecUTF8 == null) {
return CollationSpecICU.loadCollationMeta(collationIdentifier);
}
return collationSpecUTF8;
}
}

private static class CollationSpecUTF8 extends CollationSpec {
Expand All @@ -364,6 +394,9 @@ private enum CaseSensitivity {
*/
private static final int CASE_SENSITIVITY_MASK = 0b1;

private static final String UTF8_BINARY_COLLATION_NAME = "UTF8_BINARY";
private static final String UTF8_LCASE_COLLATION_NAME = "UTF8_LCASE";

private static final int UTF8_BINARY_COLLATION_ID =
new CollationSpecUTF8(CaseSensitivity.UNSPECIFIED).collationId;
private static final int UTF8_LCASE_COLLATION_ID =
Expand Down Expand Up @@ -406,7 +439,7 @@ private static CollationSpecUTF8 fromCollationId(int collationId) {
protected Collation buildCollation() {
if (collationId == UTF8_BINARY_COLLATION_ID) {
return new Collation(
"UTF8_BINARY",
UTF8_BINARY_COLLATION_NAME,
PROVIDER_SPARK,
null,
UTF8String::binaryCompare,
Expand All @@ -417,7 +450,7 @@ protected Collation buildCollation() {
/* supportsLowercaseEquality = */ false);
} else {
return new Collation(
"UTF8_LCASE",
UTF8_LCASE_COLLATION_NAME,
PROVIDER_SPARK,
null,
CollationAwareUTF8String::compareLowerCase,
Expand All @@ -428,6 +461,52 @@ protected Collation buildCollation() {
/* supportsLowercaseEquality = */ true);
}
}

// Builds catalog metadata for the two Spark-provided UTF8 collations.
// Language, country and ICU version are null because these collations are
// implemented by Spark itself, not backed by an ICU locale.
@Override
protected CollationMeta buildCollationMeta() {
if (collationId == UTF8_BINARY_COLLATION_ID) {
// UTF8_BINARY: byte-order comparison — both accent- and case-sensitive.
return new CollationMeta(
CATALOG,
SCHEMA,
UTF8_BINARY_COLLATION_NAME,
/* language = */ null,
/* country = */ null,
/* icuVersion = */ null,
COLLATION_PAD_ATTRIBUTE,
/* accentSensitivity = */ true,
/* caseSensitivity = */ true);
} else {
// UTF8_LCASE: same as UTF8_BINARY except comparisons ignore case.
return new CollationMeta(
CATALOG,
SCHEMA,
UTF8_LCASE_COLLATION_NAME,
/* language = */ null,
/* country = */ null,
/* icuVersion = */ null,
COLLATION_PAD_ATTRIBUTE,
/* accentSensitivity = */ true,
/* caseSensitivity = */ false);
}
}

// Identifiers for the two built-in UTF8 collations; "1.0" is the version
// string used for these Spark-provided collations.
static List<CollationIdentifier> listCollations() {
CollationIdentifier UTF8_BINARY_COLLATION_IDENT =
new CollationIdentifier(PROVIDER_SPARK, UTF8_BINARY_COLLATION_NAME, "1.0");
CollationIdentifier UTF8_LCASE_COLLATION_IDENT =
new CollationIdentifier(PROVIDER_SPARK, UTF8_LCASE_COLLATION_NAME, "1.0");
return Arrays.asList(UTF8_BINARY_COLLATION_IDENT, UTF8_LCASE_COLLATION_IDENT);
}

// Attempts to resolve the identifier as a UTF8 collation name and build its
// metadata. Returns null (rather than throwing) when the name is not a UTF8
// collation, so the caller can fall back to another provider.
static CollationMeta loadCollationMeta(CollationIdentifier collationIdentifier) {
try {
int collationId = CollationSpecUTF8.collationNameToId(
collationIdentifier.name, collationIdentifier.name.toUpperCase());
return CollationSpecUTF8.fromCollationId(collationId).buildCollationMeta();
} catch (SparkException ignored) {
// Unknown name — signal "not found" with null instead of propagating.
return null;
}
}
}

private static class CollationSpecICU extends CollationSpec {
Expand Down Expand Up @@ -684,6 +763,20 @@ protected Collation buildCollation() {
/* supportsLowercaseEquality = */ false);
}

// Builds catalog metadata for an ICU locale-based collation: display
// language/country come from the ICU locale map, the ICU library version is
// recorded, and sensitivity flags reflect this spec's case/accent settings.
@Override
protected CollationMeta buildCollationMeta() {
return new CollationMeta(
CATALOG,
SCHEMA,
collationName(),
ICULocaleMap.get(locale).getDisplayLanguage(),
ICULocaleMap.get(locale).getDisplayCountry(),
VersionInfo.ICU_VERSION.toString(),
COLLATION_PAD_ATTRIBUTE,
caseSensitivity == CaseSensitivity.CS,
accentSensitivity == AccentSensitivity.AS);
}

/**
* Compute normalized collation name. Components of collation name are given in order:
* - Locale name
Expand All @@ -704,6 +797,37 @@ private String collationName() {
}
return builder.toString();
}

// Enumerates every valid ICU collation name: each supported locale combined
// with the four case/accent sensitivity variants (no suffix for the default
// CS + AS form), returned in sorted order.
private static List<String> allCollationNames() {
List<String> collationNames = new ArrayList<>();
for (String locale: ICULocaleToId.keySet()) {
// CaseSensitivity.CS + AccentSensitivity.AS
collationNames.add(locale);
// CaseSensitivity.CS + AccentSensitivity.AI
collationNames.add(locale + "_AI");
// CaseSensitivity.CI + AccentSensitivity.AS
collationNames.add(locale + "_CI");
// CaseSensitivity.CI + AccentSensitivity.AI
collationNames.add(locale + "_CI_AI");
}
return collationNames.stream().sorted().toList();
}

// Wraps each ICU collation name in an identifier carrying the ICU provider
// and the current ICU library version.
static List<CollationIdentifier> listCollations() {
return allCollationNames().stream().map(name ->
new CollationIdentifier(PROVIDER_ICU, name, VersionInfo.ICU_VERSION.toString())).toList();
}

// Attempts to resolve the identifier as an ICU collation name and build its
// metadata. Returns null (rather than throwing) when the name is not a valid
// ICU collation, mirroring the UTF8 spec's lookup contract.
static CollationMeta loadCollationMeta(CollationIdentifier collationIdentifier) {
try {
int collationId = CollationSpecICU.collationNameToId(
collationIdentifier.name, collationIdentifier.name.toUpperCase());
return CollationSpecICU.fromCollationId(collationId).buildCollationMeta();
} catch (SparkException ignored) {
// Unknown name — signal "not found" with null instead of propagating.
return null;
}
}
}

/**
Expand All @@ -730,9 +854,12 @@ public CollationIdentifier identifier() {
}
}

public static final String CATALOG = "SYSTEM";
public static final String SCHEMA = "BUILTIN";
public static final String PROVIDER_SPARK = "spark";
public static final String PROVIDER_ICU = "icu";
public static final List<String> SUPPORTED_PROVIDERS = List.of(PROVIDER_SPARK, PROVIDER_ICU);
public static final String COLLATION_PAD_ATTRIBUTE = "NO_PAD";

public static final int UTF8_BINARY_COLLATION_ID =
Collation.CollationSpecUTF8.UTF8_BINARY_COLLATION_ID;
Expand Down Expand Up @@ -923,4 +1050,12 @@ public static String getClosestSuggestionsOnInvalidName(

return String.join(", ", suggestions);
}

// Public entry point: lists identifiers of all supported collations
// (Spark-provided UTF8 plus all ICU locale-based collations).
public static List<CollationIdentifier> listCollations() {
return Collation.CollationSpec.listCollations();
}

// Public entry point: resolves catalog metadata for a collation identifier,
// returning null when no provider recognizes it.
public static CollationMeta loadCollationMeta(CollationIdentifier collationIdentifier) {
return Collation.CollationSpec.loadCollationMeta(collationIdentifier);
}
}
Loading

0 comments on commit d48c531

Please sign in to comment.