Skip to content

Commit

Permalink
[CsvIO] Implement CsvIOParseHelpers::validate(CSVFormat) (apache#31853)
Browse files Browse the repository at this point in the history
* [CsvIO] document valid CSVFormat configuration for reading CSV.

* [CsvIO] implement

* [CsvIO] add tests for

* update CsvIOParseHelpers and CsvIOParseHelpersTest.

* [CsvIO] update CsvIOParseHelpers::validate() to disallow duplicate header names.
  • Loading branch information
francisohara24 authored and acrites committed Jul 17, 2024
1 parent 29856be commit 220bf64
Show file tree
Hide file tree
Showing 3 changed files with 194 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,69 @@
* <p>Reading from CSV files is not yet implemented. Please see <a
* href="https://github.com/apache/beam/issues/24552">https://github.com/apache/beam/issues/24552</a>.
*
* <h3>Valid CSVFormat Configuration</h3>
*
* <p>A <a
* href="https://javadoc.io/static/org.apache.commons/commons-csv/1.8/org/apache/commons/csv/CSVFormat.html">{@code
* CSVFormat}</a> must meet the following conditions to be considered valid when reading CSV:
*
* <ul>
* <li>{@code String[]} <a
* href="https://javadoc.io/static/org.apache.commons/commons-csv/1.8/org/apache/commons/csv/CSVFormat.html#withHeader-java.lang.Class-">header</a>
* - must contain at least one column name, and all column names must be non-empty.
* <li>{@code boolean} <a
* href="https://javadoc.io/static/org.apache.commons/commons-csv/1.8/org/apache/commons/csv/CSVFormat.html#withAllowDuplicateHeaderNames--">allowDuplicateHeaderNames</a>
* - must be false.
* <li>{@code boolean} <a
* href="https://javadoc.io/static/org.apache.commons/commons-csv/1.8/org/apache/commons/csv/CSVFormat.html#withAllowMissingColumnNames--">allowMissingColumnNames
* </a> - must be false.
* <li>{@code boolean} <a
* href="https://javadoc.io/static/org.apache.commons/commons-csv/1.8/org/apache/commons/csv/CSVFormat.html#withIgnoreHeaderCase--">ignoreHeaderCase</a>
* - must be false.
* </ul>
*
* <h4>Ignored CSVFormat parameters</h4>
*
* <p>The following {@code CSVFormat} parameters are either not relevant for parsing CSV or are
* validated satisfactorily by the <a
* href="https://javadoc.io/doc/org.apache.commons/commons-csv/1.8/index.html">Apache Commons CSV
* library</a>.
*
* <ul>
* <li>{@code boolean} <a
* href="https://javadoc.io/static/org.apache.commons/commons-csv/1.8/org/apache/commons/csv/CSVFormat.html#withAutoFlush-boolean-">autoFlush</a>
* <li>{@code char} <a
* href="https://javadoc.io/static/org.apache.commons/commons-csv/1.8/org/apache/commons/csv/CSVFormat.html#withCommentMarker-char-">commentMarker</a>
* <li>{@code char} <a
* href="https://javadoc.io/static/org.apache.commons/commons-csv/1.8/org/apache/commons/csv/CSVFormat.html#withDelimiter-char-">delimiter</a>
* <li>{@code char} <a
* href="https://javadoc.io/static/org.apache.commons/commons-csv/1.8/org/apache/commons/csv/CSVFormat.html#withEscape-char-">escape</a>
* <li>{@code char} <a
* href="https://javadoc.io/static/org.apache.commons/commons-csv/1.8/org/apache/commons/csv/CSVFormat.html#withQuote-char-">quote</a>
* <li>{@code org.apache.commons.csv.QuoteMode} <a
* href="https://javadoc.io/static/org.apache.commons/commons-csv/1.8/org/apache/commons/csv/CSVFormat.html#withQuoteMode-org.apache.commons.csv.QuoteMode-">quoteMode</a>
* <li>{@code String} <a
* href="https://javadoc.io/static/org.apache.commons/commons-csv/1.8/org/apache/commons/csv/CSVFormat.html#withNullString-java.lang.String-">nullString</a>
* <li>{@code char} <a
* href="https://javadoc.io/static/org.apache.commons/commons-csv/1.8/org/apache/commons/csv/CSVFormat.html#withRecordSeparator-char-">recordSeparator</a>
* <li><a
* href="https://javadoc.io/static/org.apache.commons/commons-csv/1.8/org/apache/commons/csv/CSVFormat.html#withSystemRecordSeparator--">systemRecordSeparator</a>
* <li><a
* href="https://javadoc.io/static/org.apache.commons/commons-csv/1.8/org/apache/commons/csv/CSVFormat.html#withFirstRecordAsHeader--">firstRecordAsHeader</a>
* <li>{@code java.lang.Object...} <a
* href="https://javadoc.io/static/org.apache.commons/commons-csv/1.8/org/apache/commons/csv/CSVFormat.html#withHeaderComments-java.lang.Object...-">headerComments</a>
* <li>{@code boolean} <a
* href="https://javadoc.io/static/org.apache.commons/commons-csv/1.8/org/apache/commons/csv/CSVFormat.html#withIgnoreEmptyLines--">ignoreEmptyLines</a>
* <li>{@code boolean} <a
* href="https://javadoc.io/static/org.apache.commons/commons-csv/1.8/org/apache/commons/csv/CSVFormat.html#withIgnoreSurroundingSpaces--">ignoreSurroundingSpaces</a>
* <li>{@code boolean} <a
* href="https://javadoc.io/static/org.apache.commons/commons-csv/1.8/org/apache/commons/csv/CSVFormat.html#withTrim--">trim</a>
* <li>{@code boolean} <a
* href="https://javadoc.io/static/org.apache.commons/commons-csv/1.8/org/apache/commons/csv/CSVFormat.html#withSkipHeaderRecord--">skipHeaderRecord</a>
* <li>{@code boolean} <a
* href="https://javadoc.io/static/org.apache.commons/commons-csv/1.8/org/apache/commons/csv/CSVFormat.html#withTrailingDelimiter--">trailingDelimiter</a>
* </ul>
*
* <h2>Writing CSV files</h2>
*
* <p>To write a {@link PCollection} to one or more CSV files, use {@link CsvIO.Write}, using {@link
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,50 @@
*/
package org.apache.beam.sdk.io.csv;

import static org.apache.beam.sdk.util.Preconditions.checkArgumentNotNull;
import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument;

import java.math.BigDecimal;
import java.time.Instant;
import java.util.ArrayList;
import java.util.List;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings;
import org.apache.commons.csv.CSVFormat;

/** A utility class containing shared methods for parsing CSV records. */
final class CsvIOParseHelpers {
/** Validate the {@link CSVFormat} for CSV record parsing requirements. */
// TODO(https://github.com/apache/beam/issues/31712): implement method.
static void validate(CSVFormat format) {}
/**
* Validate the {@link CSVFormat} for CSV record parsing requirements. See the public-facing
* "Reading CSV Files" section of the {@link CsvIO} documentation for information regarding which
* {@link CSVFormat} parameters are checked during validation.
*/
static void validate(CSVFormat format) {
String[] header =
checkArgumentNotNull(format.getHeader(), "Illegal %s: header is required", CSVFormat.class);

checkArgument(header.length > 0, "Illegal %s: header cannot be empty", CSVFormat.class);

checkArgument(
!format.getAllowMissingColumnNames(),
"Illegal %s: cannot allow missing column names",
CSVFormat.class);

checkArgument(
!format.getIgnoreHeaderCase(), "Illegal %s: cannot ignore header case", CSVFormat.class);

checkArgument(
!format.getAllowDuplicateHeaderNames(),
"Illegal %s: cannot allow duplicate header names",
CSVFormat.class);

for (String columnName : header) {
checkArgument(
!Strings.isNullOrEmpty(columnName),
"Illegal %s: column name is required",
CSVFormat.class);
}
}

/**
* Validate the {@link CSVFormat} in relation to the {@link Schema} for CSV record parsing
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import java.time.Instant;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.commons.collections.keyvalue.DefaultMapEntry;
import org.apache.commons.csv.CSVFormat;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
Expand All @@ -33,6 +34,88 @@
@RunWith(JUnit4.class)
public class CsvIOParseHelpersTest {

/** Tests for {@link CsvIOParseHelpers#validate(CSVFormat)}. */
@Test
public void givenCSVFormatWithHeader_validates() {
CSVFormat format = csvFormatWithHeader();
CsvIOParseHelpers.validate(format);
}

@Test
public void givenCSVFormatWithNullHeader_throwsException() {
CSVFormat format = csvFormat();
String gotMessage =
assertThrows(IllegalArgumentException.class, () -> CsvIOParseHelpers.validate(format))
.getMessage();
assertEquals("Illegal class org.apache.commons.csv.CSVFormat: header is required", gotMessage);
}

@Test
public void givenCSVFormatWithEmptyHeader_throwsException() {
CSVFormat format = csvFormat().withHeader();
String gotMessage =
assertThrows(IllegalArgumentException.class, () -> CsvIOParseHelpers.validate(format))
.getMessage();
assertEquals(
"Illegal class org.apache.commons.csv.CSVFormat: header cannot be empty", gotMessage);
}

@Test
public void givenCSVFormatWithHeaderContainingEmptyString_throwsException() {
CSVFormat format = csvFormat().withHeader("", "bar");
String gotMessage =
assertThrows(IllegalArgumentException.class, () -> CsvIOParseHelpers.validate(format))
.getMessage();
assertEquals(
"Illegal class org.apache.commons.csv.CSVFormat: column name is required", gotMessage);
}

@Test
public void givenCSVFormatWithHeaderContainingNull_throwsException() {
CSVFormat format = csvFormat().withHeader(null, "bar");
String gotMessage =
assertThrows(IllegalArgumentException.class, () -> CsvIOParseHelpers.validate(format))
.getMessage();
assertEquals(
"Illegal class org.apache.commons.csv.CSVFormat: column name is required", gotMessage);
}

@Test
public void givenCSVFormatThatAllowsMissingColumnNames_throwsException() {
CSVFormat format = csvFormatWithHeader().withAllowMissingColumnNames(true);
String gotMessage =
assertThrows(IllegalArgumentException.class, () -> CsvIOParseHelpers.validate(format))
.getMessage();
assertEquals(
"Illegal class org.apache.commons.csv.CSVFormat: cannot allow missing column names",
gotMessage);
}

@Test
public void givenCSVFormatThatIgnoresHeaderCase_throwsException() {
CSVFormat format = csvFormatWithHeader().withIgnoreHeaderCase(true);
String gotMessage =
assertThrows(IllegalArgumentException.class, () -> CsvIOParseHelpers.validate(format))
.getMessage();
assertEquals(
"Illegal class org.apache.commons.csv.CSVFormat: cannot ignore header case", gotMessage);
}

@Test
public void givenCSVFormatThatAllowsDuplicateHeaderNames_throwsException() {
CSVFormat format = csvFormatWithHeader().withAllowDuplicateHeaderNames(true);
String gotMessage =
assertThrows(IllegalArgumentException.class, () -> CsvIOParseHelpers.validate(format))
.getMessage();
assertEquals(
"Illegal class org.apache.commons.csv.CSVFormat: cannot allow duplicate header names",
gotMessage);
}

/** End of tests for {@link CsvIOParseHelpers#validate(CSVFormat)}. */
//////////////////////////////////////////////////////////////////////////////////////////////

/** Tests for {@link CsvIOParseHelpers#parseCell(String, Schema.Field)}. */
@Test
public void ignoresCaseFormat() {
String allCapsBool = "TRUE";
Expand Down Expand Up @@ -370,4 +453,17 @@ public void givenCellUnsupportedType_throws() {
+ ", consider using withCustomRecordParsing",
e.getMessage());
}

/** End of tests for {@link CsvIOParseHelpers#parseCell(String, Schema.Field)}. */
//////////////////////////////////////////////////////////////////////////////////////////////

/** Return a {@link CSVFormat} with a header and with no duplicate header names allowed. */
private static CSVFormat csvFormatWithHeader() {
return csvFormat().withHeader("foo", "bar");
}

/** Return a {@link CSVFormat} with no header and with no duplicate header names allowed. */
private static CSVFormat csvFormat() {
return CSVFormat.DEFAULT.withAllowDuplicateHeaderNames(false);
}
}

0 comments on commit 220bf64

Please sign in to comment.