From a829bdbc36badb31514ea55de6b295be96c17a2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Wed, 5 Feb 2025 18:07:34 +0100 Subject: [PATCH] Add extra columns when reading Delimited file (#12231) - Closes #12186 --- CHANGELOG.md | 6 ++ .../src/Delimited/Delimited_Format.enso | 21 ++-- .../0.0.0-dev/src/Delimited/Invalid_Rows.enso | 25 +++++ .../src/Internal/Delimited_Reader.enso | 14 ++- .../java/org/enso/table/data/table/Table.java | 13 +++ .../table/read/DelimitedFileMetadata.java | 4 +- .../org/enso/table/read/DelimitedReader.java | 78 ++++++++++++--- .../src/IO/Delimited_Read_Spec.enso | 95 ++++++++++++++++--- 8 files changed, 219 insertions(+), 37 deletions(-) create mode 100644 distribution/lib/Standard/Table/0.0.0-dev/src/Delimited/Invalid_Rows.enso diff --git a/CHANGELOG.md b/CHANGELOG.md index 42f0acc789e6..b9a7e90003ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,11 +42,17 @@ - [Reducing helper methods in `Standard.Base.Meta`.][12031] - [Added Table.Offset][12071] - [Added Column.Offset][12092] +- [When reading a Delimited file, if a row with more columns than expected is + encountered, extra columns can be added to the result.][12231] + - In `Delimited` format, the `keep_invalid_rows` setting has been renamed to + `on_invalid_rows`. The default behaviour was also changed to add any extra + columns instead of discarding them. [11926]: https://github.com/enso-org/enso/pull/11926 [12031]: https://github.com/enso-org/enso/pull/12031 [12071]: https://github.com/enso-org/enso/pull/12071 [12092]: https://github.com/enso-org/enso/pull/12092 +[12231]: https://github.com/enso-org/enso/pull/12231 #### Enso Language & Runtime diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Delimited/Delimited_Format.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Delimited/Delimited_Format.enso index 510f2ddf9f36..b5355c211b31 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Delimited/Delimited_Format.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Delimited/Delimited_Format.enso @@ -11,6 +11,7 @@ from Standard.Base.System.File_Format import parse_boolean_with_infer from Standard.Base.Widget_Helpers import make_file_read_delimiter_selector import project.Data_Formatter.Data_Formatter +import project.Delimited.Invalid_Rows.Invalid_Rows import project.Delimited.Quote_Style.Quote_Style import project.Headers.Headers import project.Internal.Delimited_Reader @@ -45,9 +46,8 @@ type Delimited_Format appended to disambiguate them. - value_formatter: Formatter to parse text values into numbers, dates, times, etc. If `Nothing` values are left as Text. - - keep_invalid_rows: Specifies whether rows that contain less or more - columns than expected should be kept (setting the missing columns to - `Nothing` or dropping the excess columns) or dropped. + - on_invalid_rows: Specifies how to handle rows that have less or more + columns than the first row. - line_endings: Sets the line ending style to use. 
Defaults to `Infer` - when reading a file or appending to an existing file, the line endings are detected from file contents; when writing a new file in `Infer` @@ -61,7 +61,7 @@ type Delimited_Format @delimiter make_file_read_delimiter_selector @encoding Encoding.default_widget @row_limit Rows_To_Read.default_widget - Delimited (delimiter:Text=',') (encoding:Encoding=Encoding.default) (skip_rows:Integer=0) (row_limit:Rows_To_Read=..All_Rows) (quote_style:Quote_Style=Quote_Style.With_Quotes) (headers:Headers=Headers.Detect_Headers) (value_formatter:Data_Formatter|Nothing=Data_Formatter.Value) (keep_invalid_rows:Boolean=True) (line_endings:Line_Ending_Style|Infer=Infer) (comment_character:Text|Nothing=Nothing) + Delimited (delimiter:Text=',') (encoding:Encoding=Encoding.default) (skip_rows:Integer=0) (row_limit:Rows_To_Read=..All_Rows) (quote_style:Quote_Style=Quote_Style.With_Quotes) (headers:Headers=Headers.Detect_Headers) (value_formatter:Data_Formatter|Nothing=Data_Formatter.Value) (on_invalid_rows:Invalid_Rows=Invalid_Rows.Add_Extra_Columns) (line_endings:Line_Ending_Style|Infer=Infer) (comment_character:Text|Nothing=Nothing) ## PRIVATE Resolve an unresolved constructor to the actual type. @@ -125,8 +125,8 @@ type Delimited_Format ## PRIVATE Clone the instance with some properties overridden. clone : Encoding -> Quote_Style -> Headers -> (Data_Formatter|Nothing) -> Boolean -> (Text|Nothing) -> (Text|Nothing) -> Delimited_Format - clone self (encoding:Encoding = self.encoding) (quote_style:Quote_Style=self.quote_style) (headers:Headers=self.headers) (value_formatter=self.value_formatter) (keep_invalid_rows:Boolean=self.keep_invalid_rows) (line_endings=self.line_endings) (comment_character=self.comment_character) = - Delimited_Format.Delimited self.delimiter encoding self.skip_rows self.row_limit quote_style headers value_formatter keep_invalid_rows line_endings comment_character + clone self (encoding:Encoding = self.encoding) (quote_style:Quote_Style=self.quote_style) (headers:Headers=self.headers) (value_formatter=self.value_formatter) (on_invalid_rows:Invalid_Rows=self.on_invalid_rows) (line_endings=self.line_endings) (comment_character=self.comment_character) = + Delimited_Format.Delimited self.delimiter encoding self.skip_rows self.row_limit quote_style headers value_formatter on_invalid_rows line_endings comment_character ## ICON data_input Create a clone of this with specified quoting settings. @@ -194,7 +194,12 @@ Delimited_Format.from (that : JS_Object) = headers = that.get "headers" |> parse_boolean_with_infer "headers" skip_rows = that.get "skip_rows" . if_nothing 0 row_limit = that.get "row_limit" - keep_invalid_rows = that.get "keep_invalid_rows" . 
if_nothing True
+    on_invalid_rows = case that.get "on_invalid_rows" of
+        True -> Invalid_Rows.Keep_Invalid_Rows
+        False -> Invalid_Rows.Drop_Invalid_Rows
+        "add_extra_columns" -> Invalid_Rows.Add_Extra_Columns
+        Nothing -> Invalid_Rows.Add_Extra_Columns
+        other -> Error.throw (Illegal_Argument.Error "Invalid value for `on_invalid_rows`: "+other.to_display_text)
     quote_style = case that.get "quote_style" of
         Nothing -> Quote_Style.With_Quotes
         json -> Quote_Style.from json
@@ -202,6 +207,6 @@ Delimited_Format.from (that : JS_Object) =
     unsupported_fields = ["value_formatter", "line_endings", "comment_character"]
     case unsupported_fields.find that.contains_key if_missing=Nothing of
         Nothing ->
-            Delimited_Format.Delimited delimiter=delimiter encoding=encoding headers=headers skip_rows=skip_rows row_limit=row_limit quote_style=quote_style keep_invalid_rows=keep_invalid_rows
+            Delimited_Format.Delimited delimiter=delimiter encoding=encoding headers=headers skip_rows=skip_rows row_limit=row_limit quote_style=quote_style on_invalid_rows=on_invalid_rows
         field -> Error.throw (Illegal_Argument.Error ("The field `" ++ field ++ "` is currently not supported when deserializing the Delimited format from JSON."))
diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Delimited/Invalid_Rows.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Delimited/Invalid_Rows.enso
new file mode 100644
index 000000000000..2cc0fbcf4b30
--- /dev/null
+++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Delimited/Invalid_Rows.enso
@@ -0,0 +1,25 @@
+from Standard.Base import Boolean, False, True
+
+## Describes what to do with rows that have an unexpected number of columns.
+type Invalid_Rows
+    ## Rows that contain too few or too many columns are dropped.
+    Drop_Invalid_Rows
+
+    ## Rows that contain too few or too many columns are kept.
+
+       - If a row has too few columns, the missing columns are filled with `Nothing`.
+       - If it has too many, the extra columns are dropped.
+    Keep_Invalid_Rows
+
+    ## Rows that contain more columns than expected are kept, and the extra columns are added.
+
+       - If a row has too few columns, the missing columns are filled with `Nothing`.
+       - If it has too many, the extra columns are kept. The previous rows that
+         had fewer columns are filled with `Nothing`.
+    Add_Extra_Columns
+
+## PRIVATE
+   A conversion for backward compatibility.
+Invalid_Rows.from (that : Boolean) = case that of + True -> Invalid_Rows.Keep_Invalid_Rows + False -> Invalid_Rows.Drop_Invalid_Rows diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso index 077455a2029a..6e257482e8e8 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso @@ -6,6 +6,7 @@ import Standard.Base.System.Input_Stream.Input_Stream import project.Data_Formatter.Data_Formatter import project.Delimited.Delimited_Format.Delimited_Format +import project.Delimited.Invalid_Rows.Invalid_Rows import project.Delimited.Quote_Style.Quote_Style import project.Headers.Headers import project.Internal.Java_Problems @@ -25,6 +26,7 @@ polyglot java import org.enso.table.parsing.problems.MismatchedQuote polyglot java import org.enso.table.parsing.TypeInferringParser polyglot java import org.enso.table.read.DelimitedReader polyglot java import org.enso.table.read.DelimitedReader.HeaderBehavior +polyglot java import org.enso.table.read.DelimitedReader.InvalidRowsBehavior polyglot java import org.enso.table.read.ParsingFailedException polyglot java import org.enso.table.read.QuoteStrippingParser @@ -124,9 +126,13 @@ prepare_reader format:Delimited_Format max_columns on_problems:Problem_Behavior newline = newline_override.if_nothing <| case format.line_endings of Infer -> Nothing endings -> endings.to_text + on_invalid_rows_java = case format.on_invalid_rows of + Invalid_Rows.Keep_Invalid_Rows -> InvalidRowsBehavior.KEEP + Invalid_Rows.Drop_Invalid_Rows -> InvalidRowsBehavior.DROP + Invalid_Rows.Add_Extra_Columns -> InvalidRowsBehavior.ADD_EXTRA_COLUMNS warnings_as_errors = on_problems == Problem_Behavior.Report_Error - DelimitedReader.new format.delimiter quote_characters.first quote_characters.second java_headers format.skip_rows row_limit max_columns value_parser cell_type_guesser format.keep_invalid_rows newline format.comment_character warnings_as_errors java_problem_aggregator + DelimitedReader.new format.delimiter quote_characters.first quote_characters.second java_headers format.skip_rows row_limit max_columns value_parser cell_type_guesser on_invalid_rows_java newline format.comment_character warnings_as_errors java_problem_aggregator ## PRIVATE An internal type representing columns deduced from an existing file. @@ -134,7 +140,11 @@ type Detected_Headers ## Represents the headers found in the file. Existing (column_names : Vector Text) - ## Indicates that the file exists but no headers have been found, so only positional column matching is possible. + ## Indicates that the file exists but no headers have been found, + so only positional column matching is possible. + + Note that the file may still contain rows that have less or more columns + than specified here. This column count is only based on the first row. 
None (column_count : Integer) ## PRIVATE diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/Table.java b/std-bits/table/src/main/java/org/enso/table/data/table/Table.java index 58e35d3d73d6..cd251dddd4ac 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/table/Table.java +++ b/std-bits/table/src/main/java/org/enso/table/data/table/Table.java @@ -52,6 +52,8 @@ public Table(Column[] columns) { throw new IllegalArgumentException("Column names must be unique within a Table."); } + assert checkAllColumnsHaveSameSize(columns) : "All columns must have the same row count."; + this.columns = columns; } @@ -67,6 +69,17 @@ private static boolean checkUniqueColumns(Column[] columns) { return true; } + private static boolean checkAllColumnsHaveSameSize(Column[] columns) { + int size = columns[0].getSize(); + for (Column column : columns) { + if (column.getSize() != size) { + return false; + } + } + + return true; + } + /** * @return the number of rows in this table */ diff --git a/std-bits/table/src/main/java/org/enso/table/read/DelimitedFileMetadata.java b/std-bits/table/src/main/java/org/enso/table/read/DelimitedFileMetadata.java index 98b2e6947e2d..d39848643f03 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/DelimitedFileMetadata.java +++ b/std-bits/table/src/main/java/org/enso/table/read/DelimitedFileMetadata.java @@ -1,5 +1,7 @@ package org.enso.table.read; +import java.util.List; + /** * Metadata that can be detected by the DelimitedReader. * @@ -14,6 +16,6 @@ */ public record DelimitedFileMetadata( long columnCount, - String[] definedColumnNames, + List definedColumnNames, boolean hasAnyContent, String effectiveLineSeparator) {} diff --git a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java index 5dd11ff52b72..eef35ce58326 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java +++ b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java @@ -60,7 +60,7 @@ public class DelimitedReader { private final CsvParser parser; private final DatatypeParser valueParser; private final TypeInferringParser cellTypeGuesser; - private final boolean keepInvalidRows; + private final InvalidRowsBehavior keepInvalidRows; private String newlineSetting; private final NoOpParseProblemAggregator noOpProblemAggregator = new NoOpParseProblemAggregator(); private long targetTableIndex = 0; @@ -69,6 +69,7 @@ public class DelimitedReader { private long currentLine = 0; private List> builders = null; + private int initialColumnCount = 0; private final DelimitedReaderProblemAggregator problemAggregator; /** @@ -111,7 +112,7 @@ public DelimitedReader( int maxColumns, DatatypeParser valueParser, TypeInferringParser cellTypeGuesser, - boolean keepInvalidRows, + InvalidRowsBehavior keepInvalidRows, String newline, String commentCharacter, boolean warningsAsErrors, @@ -266,17 +267,36 @@ private void appendRow(String[] row) { assert canFitMoreRows(); if (row.length != builders.size()) { - problemAggregator.reportInvalidRow( - currentLine, keepInvalidRows ? targetTableIndex : null, row, builders.size()); + boolean isRowKept = + switch (keepInvalidRows) { + case DROP -> false; + case KEEP, ADD_EXTRA_COLUMNS -> true; + }; + + // The error is only reported if the column count does not match the initial column count. 
+ // Otherwise, a single row with more columns in ADD_EXTRA_COLUMNS mode will expand the + // builders and all subsequent rows (that had original column count) would turn into warnings. + // Such flood of warnings is not useful. Instead, we only warn on the occurrences that expand + // the column count, or that have fewer columns than originally expected. + if (row.length != initialColumnCount) { + problemAggregator.reportInvalidRow( + currentLine, isRowKept ? targetTableIndex : null, row, builders.size()); + } + + if (isRowKept) { + // If the current row had more columns than expected, they are either discarded or added as + // extra columns. + if (keepInvalidRows == InvalidRowsBehavior.ADD_EXTRA_COLUMNS + && row.length > builders.size()) { + addExtraColumns(row.length - builders.size()); + } - if (keepInvalidRows) { for (int i = 0; i < builders.size() && i < row.length; i++) { builders.get(i).append(row[i]); } // If the current row had fewer columns than expected, nulls are inserted for the missing // values. - // If it had more columns, the excess columns are discarded. for (int i = row.length; i < builders.size(); i++) { builders.get(i).appendNulls(1); } @@ -292,6 +312,17 @@ private void appendRow(String[] row) { } } + private void addExtraColumns(int count) { + for (int i = 0; i < count; i++) { + int columnIndex = builders.size() + 1; + effectiveColumnNames.add(COLUMN_NAME + " " + columnIndex); + var builder = constructBuilder(targetTableIndex); + // We ensure the new builder has the same length as the previous ones by padding with nulls. + builder.appendNulls(Math.toIntExact(targetTableIndex)); + builders.add(builder); + } + } + private boolean canFitMoreRows() { return rowLimit < 0 || targetTableIndex < rowLimit; } @@ -324,7 +355,7 @@ private boolean isPlainText(String cell) { } /** The column names as defined in the input (if applicable, otherwise null). */ - private String[] definedColumnNames = null; + private List definedColumnNames = null; /** * The effective column names. @@ -332,10 +363,10 @@ private boolean isPlainText(String cell) { *
If {@code GENERATE_HEADERS} is used or if {@code INFER} is used and no headers are found, * this will be populated with automatically generated column names. */ - private String[] effectiveColumnNames; + private List effectiveColumnNames; private int getColumnCount() { - return effectiveColumnNames.length; + return effectiveColumnNames.size(); } /** @@ -380,7 +411,7 @@ private void detectHeaders() { } if (firstRow == null) { - effectiveColumnNames = new String[0]; + effectiveColumnNames = List.of(); return; } @@ -423,9 +454,11 @@ private void detectHeaders() { default -> throw new IllegalStateException("Impossible branch."); } - effectiveColumnNames = headerNames.toArray(new String[0]); + effectiveColumnNames = headerNames; if (wereHeadersDefined) { - definedColumnNames = effectiveColumnNames; + // We need a copy of the defined column names, as the effective column names may be modified + // later. + definedColumnNames = new ArrayList<>(effectiveColumnNames); } } @@ -445,6 +478,7 @@ public Table read(Reader input) { throw new EmptyFileException(); } + initialColumnCount = columnCount; initBuilders(columnCount); while (canFitMoreRows()) { var currentRow = readNextRow(); @@ -461,7 +495,7 @@ public Table read(Reader input) { Column[] columns = new Column[builders.size()]; for (int i = 0; i < builders.size(); i++) { - String columnName = effectiveColumnNames[i]; + String columnName = effectiveColumnNames.get(i); var stringStorage = builders.get(i).seal(); // We don't expect InvalidFormat to be propagated back to Enso, there is no particular type @@ -493,10 +527,14 @@ private void markUsed() { private void initBuilders(int count) { builders = new ArrayList<>(count); for (int i = 0; i < count; i++) { - builders.add(Builder.getForText(TextType.VARIABLE_LENGTH, INITIAL_ROW_CAPACITY)); + builders.add(constructBuilder(INITIAL_ROW_CAPACITY)); } } + private BuilderForType constructBuilder(long initialCapacity) { + return Builder.getForText(TextType.VARIABLE_LENGTH, initialCapacity); + } + /** Specifies how to set the headers for the returned table. */ public enum HeaderBehavior { /** Tries to infer if the headers are present in the file. */ @@ -510,4 +548,16 @@ public enum HeaderBehavior { */ GENERATE_HEADERS } + + /** Specifies how to handle rows with unexpected number of columns. */ + public enum InvalidRowsBehavior { + /** Discards rows with unexpected number of columns. */ + DROP, + + /** Keeps rows with unexpected number of columns, but the additional columns are discarded. */ + KEEP, + + /** Keeps rows with unexpected number of columns, adding extra columns. */ + ADD_EXTRA_COLUMNS + } } diff --git a/test/Table_Tests/src/IO/Delimited_Read_Spec.enso b/test/Table_Tests/src/IO/Delimited_Read_Spec.enso index c0dc864aa6cc..c14fbab5908a 100644 --- a/test/Table_Tests/src/IO/Delimited_Read_Spec.enso +++ b/test/Table_Tests/src/IO/Delimited_Read_Spec.enso @@ -283,9 +283,65 @@ add_specs suite_builder = t3.column_names.should_equal ["A", "B", "C"] t3.print - group_builder.specify "should handle too long and too short rows" <| - action keep_invalid_rows on_problems = - Data.read (enso_project.data / "varying_rows.csv") (..Delimited "," headers=True keep_invalid_rows=keep_invalid_rows value_formatter=Nothing) on_problems=on_problems + group_builder.specify "should default to adding extra columns if rows longer than first one appear, and padding shorter with Nothing" <| + r2 = Data.read (enso_project.data / "varying_rows2.csv") (..Delimited "," headers=..Has_Headers value_formatter=Nothing) + r2.column_names . 
should_equal ["a", "b", "c", "Column 4"] + r2.at "a" . to_vector . should_equal ["1", "0", "4"] + r2.at "b" . to_vector . should_equal ["2", "0", "5"] + r2.at "c" . to_vector . should_equal ["3", "0", "6"] + r2.at "Column 4" . to_vector . should_equal [Nothing, "10", Nothing] + + # If there are no extra columns, shorter rows are padded with Nothing like in `Keep_Invalid_Rows`. + r3 = Data.read (enso_project.data / "varying_rows3.csv") (..Delimited "," headers=..Has_Headers value_formatter=Nothing) + r3.column_names . should_equal ["a", "b", "c"] + r3.at "a" . to_vector . should_equal ["1", "0", "4"] + r3.at "b" . to_vector . should_equal ["2", "0", "5"] + r3.at "c" . to_vector . should_equal ["3", Nothing, "6"] + + r1 = Data.read (enso_project.data / "varying_rows.csv") (..Delimited "," headers=..Has_Headers value_formatter=Nothing) + r1.column_names . should_equal ["a", "b", "c", "Column 4", "Column 5", "Column 6", "Column 7", "Column 8"] + r1.at "a" . to_vector . should_equal ["1", "1", "1", Nothing, "1", "1"] + r1.at "b" . to_vector . should_equal ["2", "2", "2", Nothing, Nothing, "2"] + r1.at "c" . to_vector . should_equal ["3", "3", Nothing, Nothing, Nothing, "3"] + r1.at "Column 4" . to_vector . should_equal ["4", Nothing, Nothing, Nothing, Nothing, "4"] + r1.at "Column 5" . to_vector . should_equal [Nothing, Nothing, Nothing, Nothing, Nothing, "5"] + r1.at "Column 6" . to_vector . should_equal [Nothing, Nothing, Nothing, Nothing, Nothing, "6"] + r1.at "Column 7" . to_vector . should_equal [Nothing, Nothing, Nothing, Nothing, Nothing, "7"] + r1.at "Column 8" . to_vector . should_equal [Nothing, Nothing, Nothing, Nothing, Nothing, "8"] + + # After the first row that expands the expected column count, later the expected column count is at 4. + # But the second row that has 3 columns does not report a warning, because that was the original expected column count. + # This is so that if a big file contains a single row with more columns, all subsequent colunns that contain the _original_ number of columns do not raise superficious errors. + expected_problems = [Invalid_Row.Error 2 0 ['1', '2', '3', '4'] 3, Invalid_Row.Error 4 2 ['1', '2'] 4, Invalid_Row.Error 5 3 [Nothing] 4, Invalid_Row.Error 6 4 ['1'] 4, Invalid_Row.Error 7 5 ['1', '2', '3', '4', '5', '6', '7', '8'] 4] + Problems.get_attached_warnings r1 . should_equal_ignoring_order expected_problems + + group_builder.specify "should allow to explicitly choose to add extra columns on longer rows" <| + r2 = Data.read (enso_project.data / "varying_rows2.csv") (..Delimited "," headers=..Has_Headers on_invalid_rows=..Add_Extra_Columns value_formatter=Nothing) + r2.column_names . should_equal ["a", "b", "c", "Column 4"] + r2.at "a" . to_vector . should_equal ["1", "0", "4"] + r2.at "b" . to_vector . should_equal ["2", "0", "5"] + r2.at "c" . to_vector . should_equal ["3", "0", "6"] + r2.at "Column 4" . to_vector . should_equal [Nothing, "10", Nothing] + + group_builder.specify "should still be able to infer headers with column count mismatch at first lines" <| + r1 = Data.read (enso_project.data / "varying_rows.csv") (..Delimited "," headers=..Detect_Headers row_limit=(..First 3) value_formatter=Nothing) + r1.column_names . should_equal ["a", "b", "c", "Column 4"] + r1.at "a" . to_vector . should_equal ["1", "1", "1"] + r1.at "b" . to_vector . should_equal ["2", "2", "2"] + r1.at "c" . to_vector . should_equal ["3", "3", Nothing] + r1.at "Column 4" . to_vector . 
should_equal ["4", Nothing, Nothing] + + group_builder.specify "should correctly read no-headers with varying column count and Add_Extra_Columns" <| + r1 = Data.read (enso_project.data / "varying_rows.csv") (..Delimited "," headers=..No_Headers row_limit=(..First 3) on_invalid_rows=..Add_Extra_Columns value_formatter=Nothing) + r1.column_names.should_equal ["Column 1", "Column 2", "Column 3", "Column 4"] + r1.at "Column 1" . to_vector . should_equal ["a", "1", "1"] + r1.at "Column 2" . to_vector . should_equal ["b", "2", "2"] + r1.at "Column 3" . to_vector . should_equal ["c", "3", "3"] + r1.at "Column 4" . to_vector . should_equal [Nothing, "4", Nothing] + + group_builder.specify "should allow to decide to drop or keep rows with too many or too few columns, dropping excess columns" <| + action on_invalid_rows on_problems = + Data.read (enso_project.data / "varying_rows.csv") (..Delimited "," headers=..Has_Headers on_invalid_rows=on_invalid_rows value_formatter=Nothing) on_problems=on_problems tester_kept table = table.columns.map .name . should_equal ['a', 'b', 'c'] @@ -293,7 +349,7 @@ add_specs suite_builder = table.at 'b' . to_vector . should_equal ['2', '2', '2', Nothing, Nothing, '2'] table.at 'c' . to_vector . should_equal ['3', '3', Nothing, Nothing, Nothing, '3'] problems_kept = [Invalid_Row.Error 2 0 ['1', '2', '3', '4'] 3, Invalid_Row.Error 4 2 ['1', '2'] 3, Invalid_Row.Error 5 3 [Nothing] 3, Invalid_Row.Error 6 4 ['1'] 3, Invalid_Row.Error 7 5 ['1', '2', '3', '4', '5', '6', '7', '8'] 3] - Problems.test_problem_handling (action keep_invalid_rows=True) problems_kept tester_kept + Problems.test_problem_handling (action on_invalid_rows=..Keep_Invalid_Rows) problems_kept tester_kept tester_dropped table = table.columns.map .name . should_equal ['a', 'b', 'c'] @@ -301,21 +357,36 @@ add_specs suite_builder = table.at 'b' . to_vector . should_equal ['2'] table.at 'c' . to_vector . should_equal ['3'] problems_dropped = [Invalid_Row.Error 2 Nothing ['1', '2', '3', '4'] 3, Invalid_Row.Error 4 Nothing ['1', '2'] 3, Invalid_Row.Error 5 Nothing [Nothing] 3, Invalid_Row.Error 6 Nothing ['1'] 3, Invalid_Row.Error 7 Nothing ['1', '2', '3', '4', '5', '6', '7', '8'] 3] - Problems.test_problem_handling (action keep_invalid_rows=False) problems_dropped tester_dropped + Problems.test_problem_handling (action on_invalid_rows=..Drop_Invalid_Rows) problems_dropped tester_dropped - r2 = Data.read (enso_project.data / "varying_rows2.csv") (..Delimited "," headers=True keep_invalid_rows=False value_formatter=Nothing) + r2 = Data.read (enso_project.data / "varying_rows2.csv") (..Delimited "," headers=..Has_Headers on_invalid_rows=..Drop_Invalid_Rows value_formatter=Nothing) r2.column_names . should_equal ['a', 'b', 'c'] - Problems.expect_only_warning (Invalid_Row.Error 3 Nothing ['0', '0', '0', '10'] 3) r2 - warning2 = Problems.get_attached_warnings r2 . first + warning2 = Problems.expect_only_warning (Invalid_Row.Error 3 Nothing ['0', '0', '0', '10'] 3) r2 warning2.to_display_text . should_equal "The row (line 3) had too many columns (expected 3, got 4)." r2.at 'a' . to_vector . should_equal ['1', '4'] r2.at 'b' . to_vector . should_equal ['2', '5'] r2.at 'c' . to_vector . should_equal ['3', '6'] - r3 = Data.read (enso_project.data / "varying_rows3.csv") (..Delimited "," headers=True keep_invalid_rows=True value_formatter=Nothing) + r3 = Data.read (enso_project.data / "varying_rows3.csv") (..Delimited "," headers=..Has_Headers on_invalid_rows=..Keep_Invalid_Rows value_formatter=Nothing) r3.column_names . 
should_equal ['a', 'b', 'c'] - Problems.expect_only_warning (Invalid_Row.Error 3 1 ['0', '0'] 3) r3 - warning3 = Problems.get_attached_warnings r3 . first + warning3 = Problems.expect_only_warning (Invalid_Row.Error 3 1 ['0', '0'] 3) r3 + warning3.to_display_text . should_equal "The row (line 3, table row 1) had too few columns (expected 3, got 2)." + r3.at 'a' . to_vector . should_equal ['1', '0', '4'] + r3.at 'b' . to_vector . should_equal ['2', '0', '5'] + r3.at 'c' . to_vector . should_equal ['3', Nothing, '6'] + + group_builder.specify "should allow Boolean arguments for on_invalid_rows (backwards compatibility)" <| + r2 = Data.read (enso_project.data / "varying_rows2.csv") (..Delimited "," headers=True on_invalid_rows=False value_formatter=Nothing) + r2.column_names . should_equal ['a', 'b', 'c'] + warning2 = Problems.expect_only_warning Invalid_Row r2 + warning2.to_display_text . should_equal "The row (line 3) had too many columns (expected 3, got 4)." + r2.at 'a' . to_vector . should_equal ['1', '4'] + r2.at 'b' . to_vector . should_equal ['2', '5'] + r2.at 'c' . to_vector . should_equal ['3', '6'] + + r3 = Data.read (enso_project.data / "varying_rows3.csv") (..Delimited "," headers=True on_invalid_rows=True value_formatter=Nothing) + r3.column_names . should_equal ['a', 'b', 'c'] + warning3 = Problems.expect_only_warning Invalid_Row r3 warning3.to_display_text . should_equal "The row (line 3, table row 1) had too few columns (expected 3, got 2)." r3.at 'a' . to_vector . should_equal ['1', '0', '4'] r3.at 'b' . to_vector . should_equal ['2', '0', '5'] @@ -323,7 +394,7 @@ add_specs suite_builder = group_builder.specify "should aggregate invalid rows over some limit" <| action on_problems = - Data.read (enso_project.data / "many_invalid_rows.csv") (..Delimited "," headers=True keep_invalid_rows=False value_formatter=Nothing) on_problems + Data.read (enso_project.data / "many_invalid_rows.csv") (..Delimited "," headers=True on_invalid_rows=False value_formatter=Nothing) on_problems tester table = table.columns.map .name . should_equal ['a', 'b', 'c']
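
Illustrative usage of the `on_invalid_rows` option introduced in this patch — a minimal sketch only; the file name "data.csv" is hypothetical and not part of the patch, and the behaviour shown follows the changelog entry and the `Invalid_Rows` documentation above:

    from Standard.Base import all
    from Standard.Table import all

    main =
        # Default (`..Add_Extra_Columns`): a row longer than the first one adds
        # extra columns ("Column 4", ...); shorter rows are padded with Nothing.
        t1 = Data.read "data.csv" (..Delimited "," headers=..Has_Headers)

        # Old `keep_invalid_rows=True` behaviour: keep mismatched rows, pad short
        # ones with Nothing, and discard any excess columns.
        t2 = Data.read "data.csv" (..Delimited "," headers=..Has_Headers on_invalid_rows=..Keep_Invalid_Rows)

        # Old `keep_invalid_rows=False` behaviour: drop rows whose column count
        # does not match the first row.
        t3 = Data.read "data.csv" (..Delimited "," headers=..Has_Headers on_invalid_rows=..Drop_Invalid_Rows)

        [t1.column_names, t2.column_names, t3.column_names]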