From 4ba50c5d83d444c989306323b20abaccc5528e11 Mon Sep 17 00:00:00 2001 From: jsconan Date: Wed, 14 Feb 2024 21:12:56 +0100 Subject: [PATCH 1/4] feat: set the default CSV dialect to 'excel' to reflect the default value from the Python library --- src/cerbernetix/toolbox/files/csv_file.py | 3 ++- tests/files/test_csv_file.py | 13 +++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/cerbernetix/toolbox/files/csv_file.py b/src/cerbernetix/toolbox/files/csv_file.py index e1ad18e..81e4f2c 100644 --- a/src/cerbernetix/toolbox/files/csv_file.py +++ b/src/cerbernetix/toolbox/files/csv_file.py @@ -39,6 +39,7 @@ first = file.read() ``` """ + from __future__ import annotations import csv @@ -52,7 +53,7 @@ CSV_ENCODING = "utf-8" # The default CSV dialect -CSV_DIALECT = "unix" +CSV_DIALECT = "excel" # The amount of bytes to read for auto-detecting the CSV dialect CSV_SAMPLE_SIZE = 1024 diff --git a/tests/files/test_csv_file.py b/tests/files/test_csv_file.py index ad9bd67..82c908c 100644 --- a/tests/files/test_csv_file.py +++ b/tests/files/test_csv_file.py @@ -1,4 +1,5 @@ """Test the class for reading and writing CSV files.""" + import unittest import zipfile from typing import Iterator @@ -25,14 +26,14 @@ ["Jane", "Doe", "20", "Paris"], ] CSV_LINES_STRING = [ - '"first_name","last_name","age","city"\n', - '"John","Smith","18","London"\n', - '"Jane","Doe","20","Paris"\n', + "first_name,last_name,age,city\r\n", + "John,Smith,18,London\r\n", + "Jane,Doe,20,Paris\r\n", ] CSV_LINES_REDUCED = [ - '"first_name","last_name"\n', - '"John","Smith"\n', - '"Jane","Doe"\n', + "first_name,last_name\r\n", + "John,Smith\r\n", + "Jane,Doe\r\n", ] CSV_STRING = "".join(CSV_LINES_STRING) From ce514248ef1e29c6825dbebf8ede25dad3407c7c Mon Sep 17 00:00:00 2001 From: jsconan Date: Wed, 14 Feb 2024 21:48:06 +0100 Subject: [PATCH 2/4] feat: set the default CSV dialect to 'auto' when reading and to 'excel' when writing --- src/cerbernetix/toolbox/files/__init__.py | 1 + src/cerbernetix/toolbox/files/csv_file.py | 21 ++++++++++-------- tests/files/test_csv_file.py | 27 +++++++++++++++++------ 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/src/cerbernetix/toolbox/files/__init__.py b/src/cerbernetix/toolbox/files/__init__.py index 9be61a7..e98bc30 100644 --- a/src/cerbernetix/toolbox/files/__init__.py +++ b/src/cerbernetix/toolbox/files/__init__.py @@ -66,6 +66,7 @@ """ from cerbernetix.toolbox.files.csv_file import ( + CSV_AUTO, CSV_DIALECT, CSV_ENCODING, CSVFile, diff --git a/src/cerbernetix/toolbox/files/csv_file.py b/src/cerbernetix/toolbox/files/csv_file.py index 81e4f2c..3adb4d4 100644 --- a/src/cerbernetix/toolbox/files/csv_file.py +++ b/src/cerbernetix/toolbox/files/csv_file.py @@ -55,6 +55,9 @@ # The default CSV dialect CSV_DIALECT = "excel" +# The value for auto-detecting the CSV dialect +CSV_AUTO = "auto" + # The amount of bytes to read for auto-detecting the CSV dialect CSV_SAMPLE_SIZE = 1024 @@ -140,7 +143,7 @@ def __init__( read: bool = False, write: bool = False, encoding: str = CSV_ENCODING, - dialect: str = CSV_DIALECT, + dialect: str = CSV_AUTO, **kwargs, ): r"""Creates a file manager for CSV files. @@ -158,7 +161,7 @@ def __init__( encoding (str, optional): The file encoding. Defaults to CSV_ENCODING. dialect (str, optional): The CSV dialect to use. If 'auto' is given, the reader will try detecting the CSV dialect by reading a sample at the head of the file. - Defaults to CSV_DIALECT. + Defaults to CSV_AUTO for reading or to CSV_DIALECT for writing. delimiter (str, optional): A one-character string used to separate fields. Defaults to ",". doublequote (bool, optional): Controls how instances of quotechar appearing inside a @@ -401,7 +404,7 @@ def read(self) -> dict | list: reader = csv.DictReader dialect = self.dialect - if dialect == "auto": + if dialect == CSV_AUTO: dialect = csv.Sniffer().sniff(self._file.read(CSV_SAMPLE_SIZE)) self._file.seek(0) @@ -462,7 +465,7 @@ def write(self, data: dict | list) -> int: writer = csv.writer dialect = self.dialect - if dialect == "auto": + if dialect == CSV_AUTO: dialect = CSV_DIALECT self._writer = writer(self._file, dialect=dialect, **kwargs) @@ -476,7 +479,7 @@ def write(self, data: dict | list) -> int: def read_csv_file( filename: str, encoding: str = CSV_ENCODING, - dialect: str = CSV_DIALECT, + dialect: str = CSV_AUTO, iterator: bool = False, **kwargs, ) -> Iterable[dict | list]: @@ -490,7 +493,7 @@ def read_csv_file( encoding (str, optional): The file encoding. Defaults to CSV_ENCODING. dialect (str, optional): The CSV dialect to use. If 'auto' is given, the reader will try detecting the CSV dialect by reading a sample at the head of the file. - Defaults to CSV_DIALECT. + Defaults to CSV_AUTO. iterator (bool, optional): When True, the function will return an iterator instead of a list. Defaults to False. delimiter (str, optional): A one-character string used to separate fields. @@ -623,7 +626,7 @@ def read_zip_csv( filename: str = None, encoding: str = CSV_ENCODING, decoding_errors: str = "ignore", - dialect: str = CSV_DIALECT, + dialect: str = CSV_AUTO, iterator: bool = False, **kwargs, ) -> Iterable[dict | list]: @@ -643,7 +646,7 @@ def read_zip_csv( Defaults to "ignore". dialect (str, optional): The CSV dialect to use. If 'auto' is given, the reader will try detecting the CSV dialect by reading a sample at the head of the file. - Defaults to CSV_DIALECT. + Defaults to CSV_AUTO. iterator (bool, optional): When True, the function will return an iterator instead of a list. Defaults to False. delimiter (str, optional): A one-character string used to separate fields. @@ -705,7 +708,7 @@ def read_zip_csv( else: reader_factory = csv.DictReader - if dialect == "auto": + if dialect == CSV_AUTO: dialect = csv.Sniffer().sniff(text[:CSV_SAMPLE_SIZE]) lines = re.split(r"[\r\n]+", text.strip("\r\n")) diff --git a/tests/files/test_csv_file.py b/tests/files/test_csv_file.py index 82c908c..498c3c7 100644 --- a/tests/files/test_csv_file.py +++ b/tests/files/test_csv_file.py @@ -6,6 +6,7 @@ from unittest.mock import MagicMock, Mock, patch from cerbernetix.toolbox.files import ( + CSV_AUTO, CSV_DIALECT, CSV_ENCODING, CSVFile, @@ -51,7 +52,7 @@ def test_construction_default(self): self.assertEqual(file.filename, file_path) self.assertFalse(file.binary) - self.assertEqual(file.dialect, CSV_DIALECT) + self.assertEqual(file.dialect, CSV_AUTO) self.assertEqual(file.encoding, CSV_ENCODING) self.assertIsNone(file._file) self.assertEqual(file._open_args, {"newline": ""}) @@ -231,7 +232,8 @@ def test_close_auto(self, mock_file_open): CSV_LINES_HEADLESS, ], ["list", {"fieldnames": False}, CSV_LINES_STRING, CSV_LINES_LIST], - ["auto", {"dialect": "auto"}, CSV_LINES_STRING, CSV_LINES_DICT], + ["auto", {"dialect": CSV_AUTO}, CSV_LINES_STRING, CSV_LINES_DICT], + ["dialect", {"dialect": CSV_DIALECT}, CSV_LINES_STRING, CSV_LINES_DICT], ] ) def test_read_file(self, _, params, data, expected): @@ -296,7 +298,8 @@ def test_read_file_iterator(self, mock_file_open): CSV_LINES_LIST[1:], "".join(CSV_LINES_STRING[1:]), ], - ["auto", {"dialect": "auto"}, CSV_LINES_DICT, CSV_STRING], + ["auto", {"dialect": CSV_AUTO}, CSV_LINES_DICT, CSV_STRING], + ["dialect", {"dialect": CSV_DIALECT}, CSV_LINES_DICT, CSV_STRING], ] ) def test_write_file(self, _, params, data, expected): @@ -326,17 +329,24 @@ def write(line): mock_file.write.assert_called() mock_file.close.assert_called_once() + @test_cases( + [ + [CSV_AUTO], + [CSV_DIALECT], + ] + ) @patch("builtins.open") - def test_read(self, mock_file_open): + def test_read(self, dialect, mock_file_open): """Tests a file can be read line by line.""" file_path = "/root/folder/file" mock_file = MagicMock() mock_file.close = Mock() mock_file.__iter__.return_value = CSV_LINES_STRING + mock_file.read.return_value = CSV_STRING mock_file_open.return_value = mock_file - file = CSVFile(file_path) + file = CSVFile(file_path, dialect=dialect) self.assertRaises(ValueError, file.read) @@ -427,6 +437,7 @@ def test_iterator(self, mock_file_open): mock_file = MagicMock() mock_file.close = Mock() mock_file.__iter__.return_value = CSV_LINES_STRING + mock_file.read.return_value = CSV_STRING mock_file_open.return_value = mock_file file = CSVFile(file_path) @@ -457,6 +468,7 @@ def test_read_csv_file(self, mock_file_open): mock_file = MagicMock() mock_file.close = Mock() mock_file.__iter__.return_value = CSV_LINES_STRING + mock_file.read.return_value = CSV_STRING mock_file_open.return_value = mock_file result = read_csv_file(file_path) @@ -474,6 +486,7 @@ def test_read_csv_file_iterator(self, mock_file_open): mock_file = MagicMock() mock_file.close = Mock() mock_file.__iter__.return_value = CSV_LINES_STRING + mock_file.read.return_value = CSV_STRING mock_file_open.return_value = mock_file result = read_csv_file(file_path, iterator=True) @@ -539,8 +552,8 @@ def write(line): CSV_LINES_LIST, ], [ - "dialect auto", - {"dialect": "auto"}, + "dialect", + {"dialect": CSV_DIALECT}, "FOO.CSV", CSV_STRING, CSV_LINES_DICT, From 03dff9bceaffcea969bbd82145290a17df4d089b Mon Sep 17 00:00:00 2001 From: jsconan Date: Wed, 14 Feb 2024 21:50:59 +0100 Subject: [PATCH 3/4] chore: update the changelog --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f03869b..70a10c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- Add options to JSONFile implementation (sort_keys, skip_keys, ensure_ascii, separators, strict). +- Add options to JSONFile implementation (`sort_keys`, `skip_keys`, `ensure_ascii`, `separators`, `strict`). +- Set the default CSV dialect to `'excel'` when writing (this reflects the default value from the Python library). +- Set the default CSV dialect to `'auto'` when reading (the dialect will be sniffed from the first few rows). ### Fixed From 5acda8ab589f4befbca8a03b61fcd5e8d35742ac Mon Sep 17 00:00:00 2001 From: jsconan Date: Wed, 14 Feb 2024 21:51:39 +0100 Subject: [PATCH 4/4] doc: update the documentation --- docs/toolbox.files.csv_file.md | 35 +++++++++++++++++----------------- docs/toolbox.files.md | 1 + 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/docs/toolbox.files.csv_file.md b/docs/toolbox.files.csv_file.md index d239688..e827860 100644 --- a/docs/toolbox.files.csv_file.md +++ b/docs/toolbox.files.csv_file.md @@ -50,6 +50,7 @@ with file: --------------- - **CSV_ENCODING** - **CSV_DIALECT** +- **CSV_AUTO** - **CSV_SAMPLE_SIZE** - **CSV_READER_PARAMS** - **CSV_WRITER_PARAMS** @@ -57,7 +58,7 @@ with file: --- - + ## function `read_csv_file` @@ -65,7 +66,7 @@ with file: read_csv_file( filename: 'str', encoding: 'str' = 'utf-8', - dialect: 'str' = 'unix', + dialect: 'str' = 'auto', iterator: 'bool' = False, **kwargs ) → Iterable[dict | list] @@ -81,7 +82,7 @@ The returned value can be either a list (default) or an iterator (when the itera - `filename` (str): The path to the file to read. - `encoding` (str, optional): The file encoding. Defaults to CSV_ENCODING. - - `dialect` (str, optional): The CSV dialect to use. If 'auto' is given, the reader will try detecting the CSV dialect by reading a sample at the head of the file. Defaults to CSV_DIALECT. + - `dialect` (str, optional): The CSV dialect to use. If 'auto' is given, the reader will try detecting the CSV dialect by reading a sample at the head of the file. Defaults to CSV_AUTO. - `iterator` (bool, optional): When True, the function will return an iterator instead of a list. Defaults to False. - `delimiter` (str, optional): A one-character string used to separate fields. Defaults to ','. - `doublequote` (bool, optional): Controls how instances of quotechar appearing inside a field should themselves be quoted. When True, the character is doubled. When False, the escapechar is used as a prefix to the quotechar. Defaults to True. @@ -123,7 +124,7 @@ for row in read_csv_file('path/to/file', iterator=True): --- - + ## function `write_csv_file` @@ -132,7 +133,7 @@ write_csv_file( filename: 'str', data: 'Iterable[dict | list]', encoding: 'str' = 'utf-8', - dialect: 'str' = 'unix', + dialect: 'str' = 'excel', **kwargs ) → int ``` @@ -189,7 +190,7 @@ write_csv_file('path/to/file', csv_data, encoding='UTF-8', dialect='excel') --- - + ## function `read_zip_csv` @@ -199,7 +200,7 @@ read_zip_csv( filename: 'str' = None, encoding: 'str' = 'utf-8', decoding_errors: 'str' = 'ignore', - dialect: 'str' = 'unix', + dialect: 'str' = 'auto', iterator: 'bool' = False, **kwargs ) → Iterable[dict | list] @@ -217,7 +218,7 @@ The returned value can be either a list (default) or an iterator (when the itera - `filename` (str, optional): The name of the file to extract from the zip If omitted, the first file having a '.csv' extension will be selected. Defaults to None. - `encoding` (str, optional): The file encoding. Defaults to CSV_ENCODING. - `decoding_errors` (str, optional): Controls how decoding errors are handled. If 'strict', a UnicodeError exception is raised. Other possible values are 'ignore', 'replace', and any other name registered via codecs.register_error(). See Error Handlers for details. Defaults to "ignore". - - `dialect` (str, optional): The CSV dialect to use. If 'auto' is given, the reader will try detecting the CSV dialect by reading a sample at the head of the file. Defaults to CSV_DIALECT. + - `dialect` (str, optional): The CSV dialect to use. If 'auto' is given, the reader will try detecting the CSV dialect by reading a sample at the head of the file. Defaults to CSV_AUTO. - `iterator` (bool, optional): When True, the function will return an iterator instead of a list. Defaults to False. - `delimiter` (str, optional): A one-character string used to separate fields. Defaults to ','. - `doublequote` (bool, optional): Controls how instances of quotechar appearing inside a field should themselves be quoted. When True, the character is doubled. When False, the escapechar is used as a prefix to the quotechar. Defaults to True. @@ -265,7 +266,7 @@ with open('path/to/file.zip', 'rb') as file: --- - + ## class `CSVFile` Offers a simple API for reading and writing CSV files. @@ -310,7 +311,7 @@ with file(create=True): csv = file.read_file() ``` - + ### method `__init__` @@ -322,7 +323,7 @@ __init__( read: 'bool' = False, write: 'bool' = False, encoding: 'str' = 'utf-8', - dialect: 'str' = 'unix', + dialect: 'str' = 'auto', **kwargs ) ``` @@ -339,7 +340,7 @@ Creates a file manager for CSV files. - `read` (bool, optional): Expect to also read the file. Defaults to False. - `write` (bool, optional): Expect to also write to the file. Defaults to False. - `encoding` (str, optional): The file encoding. Defaults to CSV_ENCODING. - - `dialect` (str, optional): The CSV dialect to use. If 'auto' is given, the reader will try detecting the CSV dialect by reading a sample at the head of the file. Defaults to CSV_DIALECT. + - `dialect` (str, optional): The CSV dialect to use. If 'auto' is given, the reader will try detecting the CSV dialect by reading a sample at the head of the file. Defaults to CSV_AUTO for reading or to CSV_DIALECT for writing. - `delimiter` (str, optional): A one-character string used to separate fields. Defaults to ",". - `doublequote` (bool, optional): Controls how instances of quotechar appearing inside a field should themselves be quoted. When True, the character is doubled. When False, the escapechar is used as a prefix to the quotechar. Defaults to True. - `escapechar` (str, optional): A one-character string used by the writer to escape the delimiter if quoting is set to QUOTE_NONE and the quotechar if doublequote is False. On reading, the escapechar removes any special meaning from the following character. Defaults to None, which disables escaping. @@ -566,7 +567,7 @@ size = file.size --- - + ### method `close` @@ -604,7 +605,7 @@ file.close() --- - + ### method `read` @@ -648,7 +649,7 @@ csv_data = [row for row in file] --- - + ### method `read_file` @@ -699,7 +700,7 @@ for row in file.read_file(iterator=True): --- - + ### method `write` @@ -746,7 +747,7 @@ with file(create=True): --- - + ### method `write_file` diff --git a/docs/toolbox.files.md b/docs/toolbox.files.md index 2f7d14f..a26f814 100644 --- a/docs/toolbox.files.md +++ b/docs/toolbox.files.md @@ -73,6 +73,7 @@ csv_data = file.read_zip_csv(data) **Global Variables** --------------- +- **CSV_AUTO** - **CSV_DIALECT** - **CSV_ENCODING** - **JSON_ENCODING**