Skip to content

Commit

Permalink
Unify passing options to Loaders/Extractors (#1207)
Browse files Browse the repository at this point in the history
  • Loading branch information
mleczakm authored Sep 5, 2024
1 parent b31d6d7 commit adf86f1
Show file tree
Hide file tree
Showing 10 changed files with 20 additions and 163 deletions.
2 changes: 1 addition & 1 deletion examples/topics/data_source/csv/description.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ Read data from a csv file.

```php
function from_csv(
string|Path|array $path,
string|Path $path,
bool $with_header = true,
bool $empty_to_null = true,
?string $delimiter = null,
Expand Down
2 changes: 1 addition & 1 deletion examples/topics/data_source/json/description.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ Read data from a json file.

```php
function from_json(
string|Path|array $path,
string|Path $path,
?string $pointer = null,
?Schema $schema = null,
);
Expand Down
2 changes: 1 addition & 1 deletion examples/topics/data_source/xml/description.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ Read data from an xml file.

```php
function from_xml(
string|Path|array $path,
string|Path $path,
string $xml_node_path = ''
);
```
Expand Down
26 changes: 3 additions & 23 deletions src/adapter/etl-adapter-csv/src/Flow/ETL/Adapter/CSV/functions.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,45 +4,25 @@

namespace Flow\ETL\Adapter\CSV;

use function Flow\ETL\DSL\from_all;
use Flow\ETL\Adapter\CSV\Detector\{Option, Options};
use Flow\ETL\Row\Schema;
use Flow\ETL\{Attribute\DocumentationDSL, Attribute\Module, Attribute\Type as DSLType, Extractor, Loader};
use Flow\ETL\{Attribute\DocumentationDSL, Attribute\Module, Attribute\Type as DSLType, Loader};
use Flow\Filesystem\{Path, SourceStream};

/**
* @param int<1, max> $characters_read_in_line
*/
#[DocumentationDSL(module: Module::CSV, type: DSLType::EXTRACTOR)]
function from_csv(
string|Path|array $path,
string|Path $path,
bool $with_header = true,
bool $empty_to_null = true,
?string $delimiter = null,
?string $enclosure = null,
?string $escape = null,
int $characters_read_in_line = 1000,
?Schema $schema = null
) : Extractor {
if (\is_array($path)) {
$extractors = [];

foreach ($path as $file_path) {
$extractors[] = new CSVExtractor(
\is_string($file_path) ? Path::realpath($file_path) : $file_path,
$with_header,
$empty_to_null,
$delimiter,
$enclosure,
$escape,
$characters_read_in_line,
$schema
);
}

return from_all(...$extractors);
}

) : CSVExtractor {
return new CSVExtractor(
\is_string($path) ? Path::realpath($path) : $path,
$with_header,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,32 +99,6 @@ public function test_extracting_csv_empty_headers() : void
);
}

public function test_extracting_csv_files_from_directory_recursively() : void
{
$extractor = from_csv(
[
Path::realpath(__DIR__ . '/../Fixtures/annual-enterprise-survey-2019-financial-year-provisional-csv.csv'),
Path::realpath(__DIR__ . '/../Fixtures/nested/annual-enterprise-survey-2019-financial-year-provisional-csv.csv'),
],
false
);

$total = 0;

/** @var Rows $rows */
foreach ($extractor->extract(new FlowContext(Config::default())) as $rows) {
$rows->each(function (Row $row) : void {
$this->assertSame(
['e00', 'e01', 'e02', 'e03', 'e04', 'e05', 'e06', 'e07', 'e08', 'e09'],
\array_keys($row->toArray())
);
});
$total += $rows->count();
}

self::assertSame(1998, $total);
}

public function test_extracting_csv_files_with_header() : void
{
$path = __DIR__ . '/../Fixtures/annual-enterprise-survey-2019-financial-year-provisional-csv.csv';
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,36 +4,21 @@

namespace Flow\ETL\Adapter\JSON;

use function Flow\ETL\DSL\from_all;
use Flow\ETL\Adapter\JSON\JSONMachine\JsonExtractor;
use Flow\ETL\Row\Schema;
use Flow\ETL\{Attribute\DocumentationDSL, Attribute\Module, Attribute\Type, Extractor, Loader};
use Flow\ETL\{Attribute\DocumentationDSL, Attribute\Module, Attribute\Type, Loader};
use Flow\Filesystem\Path;

/**
* @param array<Path|string>|Path|string $path - string is internally turned into stream
* @param Path|string $path - string is internally turned into stream
* @param ?string $pointer - if you want to iterate only results of a subtree, use a pointer, read more at https://github.com/halaxa/json-machine#parsing-a-subtree
*/
#[DocumentationDSL(module: Module::JSON, type: Type::EXTRACTOR)]
function from_json(
string|Path|array $path,
string|Path $path,
?string $pointer = null,
?Schema $schema = null,
) : Extractor {
if (\is_array($path)) {
$extractors = [];

foreach ($path as $file) {
$extractors[] = new JsonExtractor(
\is_string($file) ? Path::realpath($file) : $file,
$pointer,
$schema
);
}

return from_all(...$extractors);
}

) : JsonExtractor {
return new JsonExtractor(
\is_string($path) ? Path::realpath($path) : $path,
$pointer,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,47 +4,23 @@

namespace Flow\ETL\Adapter\Parquet;

use function Flow\ETL\DSL\from_all;
use Flow\ETL\Exception\InvalidArgumentException;
use Flow\ETL\Row\Schema;
use Flow\ETL\{Attribute\DocumentationDSL, Attribute\Module, Attribute\Type as DSLType, Extractor, Loader};
use Flow\ETL\{Attribute\DocumentationDSL, Attribute\Module, Attribute\Type as DSLType, Loader};
use Flow\Filesystem\Path;
use Flow\Parquet\ParquetFile\Compressions;
use Flow\Parquet\{ByteOrder, Options};

/**
* @param array<Path>|Path|string $path
* @param array<string> $columns
*
* @return Extractor
*/
#[DocumentationDSL(module: Module::PARQUET, type: DSLType::EXTRACTOR)]
function from_parquet(
string|Path|array $path,
string|Path $path,
array $columns = [],
Options $options = new Options(),
ByteOrder $byte_order = ByteOrder::LITTLE_ENDIAN,
?int $offset = null,
) : Extractor {
if (\is_array($path)) {
$extractors = [];

if ($offset !== null) {
throw new InvalidArgumentException('Offset can be used only with single file path, not with pattern');
}

foreach ($path as $filePath) {
$extractors[] = new ParquetExtractor(
$filePath,
$options,
$byte_order,
$columns
);
}

return from_all(...$extractors);
}

) : ParquetExtractor {
return new ParquetExtractor(
\is_string($path) ? Path::realpath($path) : $path,
$options,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,30 +4,13 @@

namespace Flow\ETL\Adapter\Text;

use Flow\ETL\{Attribute\DocumentationDSL, Attribute\Module, Attribute\Type, Extractor, Loader};
use Flow\ETL\{Attribute\DocumentationDSL, Attribute\Module, Attribute\Type, Loader};
use Flow\Filesystem\Path;

/**
* @param array<Path|string>|Path|string $path
*
* @return Extractor
*/
#[DocumentationDSL(module: Module::TEXT, type: Type::EXTRACTOR)]
function from_text(
string|Path|array $path,
) : Extractor {
if (\is_array($path)) {
$extractors = [];

foreach ($path as $file_path) {
$extractors[] = new TextExtractor(
\is_string($file_path) ? Path::realpath($file_path) : $file_path,
);
}

return new Extractor\ChainExtractor(...$extractors);
}

string|Path $path,
) : TextExtractor {
return new TextExtractor(
\is_string($path) ? Path::realpath($path) : $path,
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
use function Flow\ETL\Adapter\Text\{from_text};
use Flow\ETL\Adapter\Text\TextExtractor;
use Flow\ETL\Extractor\Signal;
use Flow\ETL\{Config, Flow, FlowContext, Row, Rows};
use Flow\ETL\{Config, Flow, FlowContext, Row};
use Flow\Filesystem\Path;
use PHPUnit\Framework\TestCase;

Expand All @@ -28,28 +28,6 @@ public function test_extracting_text_file() : void
self::assertSame(1024, $rows->count());
}

public function test_extracting_text_files_from_directory() : void
{
$extractor = from_text(
[
__DIR__ . '/../Fixtures/annual-enterprise-survey-2019-financial-year-provisional-csv.csv',
__DIR__ . '/../Fixtures/nested/annual-enterprise-survey-2019-financial-year-provisional-csv.csv',
],
);

$total = 0;

/** @var Rows $rows */
foreach ($extractor->extract(new FlowContext(Config::default())) as $rows) {
$rows->each(function (Row $row) : void {
$this->assertInstanceOf(Row\Entry\StringEntry::class, $row->get('text'));
});
$total += $rows->count();
}

self::assertSame(2048, $total);
}

public function test_limit() : void
{
$extractor = new TextExtractor(Path::realpath(__DIR__ . '/../Fixtures/orders_flow.csv'));
Expand Down
25 changes: 3 additions & 22 deletions src/adapter/etl-adapter-xml/src/Flow/ETL/Adapter/XML/functions.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,37 +4,18 @@

namespace Flow\ETL\Adapter\XML;

use function Flow\ETL\DSL\from_all;
use Flow\ETL\{Adapter\XML\Loader\XMLLoader,
Adapter\XML\XMLWriter\DOMDocumentWriter,
Attribute\DocumentationDSL,
Attribute\Module,
Attribute\Type as DSLType,
Extractor};
Attribute\Type as DSLType};
use Flow\Filesystem\Path;

/**
* @param array<Path|string>|Path|string $path
*/
#[DocumentationDSL(module: Module::XML, type: DSLType::EXTRACTOR)]
function from_xml(
string|Path|array $path,
string|Path $path,
string $xml_node_path = ''
) : Extractor {
if (\is_array($path)) {
/** @var array<Extractor> $extractors */
$extractors = [];

foreach ($path as $next_path) {
$extractors[] = new XMLParserExtractor(
\is_string($next_path) ? Path::realpath($next_path) : $next_path,
$xml_node_path
);
}

return from_all(...$extractors);
}

) : XMLParserExtractor {
return new XMLParserExtractor(
\is_string($path) ? Path::realpath($path) : $path,
$xml_node_path
Expand Down

0 comments on commit adf86f1

Please sign in to comment.