Display DSL function usage examples at function doc page (#1215)

* Display DSL function usage examples at function doc page * Exclude examples from markdown linter check * Fixed join example
flow-php · Sep 7, 2024 · e1ef590 · e1ef590
1 parent 88a3943
commit e1ef590
Show file tree

Hide file tree

Showing 85 changed files with 686 additions and 14,849 deletions.
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -32,4 +32,4 @@ jobs:
         uses: "docker://norberttech/md-link-linter:latest"
         with:
           entrypoint: "/composer/vendor/bin/mdlinklint"
-          args: "--exclude=vendor --exclude=tests ."
+          args: "--exclude=vendor --exclude=tests --exclude=examples ."
diff --git a/composer.json b/composer.json
@@ -200,7 +200,7 @@
             "tools/phpunit/vendor/bin/phpunit"
         ],
         "test:docs": [
-            "docker run -t --rm -v $PWD:/app norberttech/md-link-linter --exclude=vendor --exclude=.scratchpad ."
+            "docker run -t --rm -v $PWD:/app norberttech/md-link-linter --exclude=vendor --exclude=.scratchpad --exclude=examples ."
         ],
         "test:benchmark": [
             "@test:benchmark:building_blocks",

diff --git a/examples/topics/cloud_storage/priority.txt b/examples/topics/cloud_storage/priority.txt
@@ -0,0 +1 @@
+7
diff --git a/examples/topics/data_frame/batch_size/code.php b/examples/topics/data_frame/batch_size/code.php
@@ -0,0 +1,19 @@
+<?php
+
+declare(strict_types=1);
+
+use function Flow\ETL\DSL\{data_frame, from_array, to_stream};
+
+require __DIR__ . '/../../../autoload.php';
+
+data_frame()
+    ->read(from_array([
+        ['id' => 1, 'name' => 'John'],
+        ['id' => 2, 'name' => 'Doe'],
+        ['id' => 3, 'name' => 'Jane'],
+        ['id' => 4, 'name' => 'Smith'],
+        ['id' => 5, 'name' => 'Alice'],
+    ]))
+    ->batchSize(2)
+    ->write(to_stream(__DIR__ . '/output.txt', truncate: false))
+    ->run();
diff --git a/examples/topics/data_frame/batch_size/description.md b/examples/topics/data_frame/batch_size/description.md
@@ -0,0 +1,8 @@
+Batch size defines the size of data frame. In other words, it defines how many rows are processed at once.
+This is useful when you have a large dataset, and you want to process it in smaller chunks.
+Larger batch size can speed up the processing, but it also requires more memory.
+There is no universal rule for the optimal batch size; it depends on the dataset and the types of applied transformations.
+
+The default batch size is `1`, which means that each extractor will yield one row at a time.
+
+To process all rows at once, you can use [collect](/data_frame/collect/#example) or set batchSize to `-1`.
diff --git a/examples/topics/data_frame/batch_size/output.txt b/examples/topics/data_frame/batch_size/output.txt
@@ -0,0 +1,20 @@
++----+------+
+| id | name |
++----+------+
+|  1 | John |
+|  2 |  Doe |
++----+------+
+2 rows
++----+-------+
+| id |  name |
++----+-------+
+|  3 |  Jane |
+|  4 | Smith |
++----+-------+
+2 rows
++----+-------+
+| id |  name |
++----+-------+
+|  5 | Alice |
++----+-------+
+1 rows
diff --git a/examples/topics/data_frame/batch_size/priority.txt b/examples/topics/data_frame/batch_size/priority.txt
@@ -0,0 +1 @@
+4
diff --git a/examples/topics/data_frame/collect/code.php b/examples/topics/data_frame/collect/code.php
@@ -0,0 +1,19 @@
+<?php
+
+declare(strict_types=1);
+
+use function Flow\ETL\DSL\{data_frame, from_array, to_stream};
+
+require __DIR__ . '/../../../autoload.php';
+
+data_frame()
+    ->read(from_array([
+        ['id' => 1, 'name' => 'John'],
+        ['id' => 2, 'name' => 'Doe'],
+        ['id' => 3, 'name' => 'Jane'],
+        ['id' => 4, 'name' => 'Smith'],
+        ['id' => 5, 'name' => 'Alice'],
+    ]))
+    ->collect() // alternatively we can also use ->batchSize(-1)
+    ->write(to_stream(__DIR__ . '/output.txt', truncate: false))
+    ->run();
diff --git a/examples/topics/data_frame/collect/description.md b/examples/topics/data_frame/collect/description.md
@@ -0,0 +1,4 @@
+Collect is used to make sure that all rows are processed at once. This means that all rows are loaded into memory and processed at once.
+It's useful mostly for debugging and while working with relatively small datasets.
+In order to control memory consumption please use [batchSize](/data_frame/batch_size/#example).
+
diff --git a/examples/topics/data_frame/collect/output.txt b/examples/topics/data_frame/collect/output.txt
@@ -0,0 +1,10 @@
++----+-------+
+| id |  name |
++----+-------+
+|  1 |  John |
+|  2 |   Doe |
+|  3 |  Jane |
+|  4 | Smith |
+|  5 | Alice |
++----+-------+
+5 rows
diff --git a/examples/topics/data_frame/collect/priority.txt b/examples/topics/data_frame/collect/priority.txt
@@ -0,0 +1 @@
+5
diff --git a/examples/topics/data_frame/create_entries/code.php b/examples/topics/data_frame/create_entries/code.php
@@ -0,0 +1,19 @@
+<?php
+
+declare(strict_types=1);
+
+use function Flow\ETL\DSL\{data_frame, from_array, lit, ref, to_stream};
+
+require __DIR__ . '/../../../autoload.php';
+
+data_frame()
+    ->read(from_array([
+        ['id' => 1, 'name' => 'Norbert'],
+        ['id' => 2, 'name' => 'John'],
+        ['id' => 3, 'name' => 'Jane'],
+    ]))
+    ->withEntry('active', ref('id')->isOdd())
+    ->withEntry('number', lit(5))
+    ->collect()
+    ->write(to_stream(__DIR__ . '/output.txt', truncate: false))
+    ->run();
diff --git a/examples/topics/data_frame/create_entries/description.md b/examples/topics/data_frame/create_entries/description.md
@@ -0,0 +1,7 @@
+To create new columns (row entries), we always use the `DataFrame::withEntry(string $entryName, ScalarFunction|WindowFunction $ref)` method.  
+We can create a new entry by providing a unique `$entryName`; if the entry already exists, it will be replaced.
+
+As a second argument we can provide a static value or a function that will be evaluated for each row. 
+
+* `DataFrame::withEntry('number', lit(5))` - creates a new column with a constant value of 5
+* `DataFrame::withEntry('is_odd', ref('another_column')->isOdd())` - creates a new column that checks if the value of `another_column` is odd
diff --git a/examples/topics/data_frame/create_entries/output.txt b/examples/topics/data_frame/create_entries/output.txt
@@ -0,0 +1,8 @@
++----+---------+--------+--------+
+| id |    name | active | number |
++----+---------+--------+--------+
+|  1 | Norbert |   true |      5 |
+|  2 |    John |  false |      5 |
+|  3 |    Jane |   true |      5 |
++----+---------+--------+--------+
+3 rows
diff --git a/examples/topics/data_frame/create_entries/priority.txt b/examples/topics/data_frame/create_entries/priority.txt
@@ -0,0 +1 @@
+2
diff --git a/examples/topics/data_frame/data_frame/code.php b/examples/topics/data_frame/data_frame/code.php
@@ -2,7 +2,7 @@
 
 declare(strict_types=1);
 
-use function Flow\ETL\DSL\{array_entry, array_expand, data_frame, from_rows, int_entry, ref, row, rows, to_output, to_stream};
+use function Flow\ETL\DSL\{array_entry, array_expand, data_frame, from_rows, int_entry, ref, row, rows, to_stream};
 
 data_frame()
     ->read(
@@ -12,7 +12,6 @@
             )
         )
     )
-    ->write(to_output(false))
     ->withEntry('expanded', array_expand(ref('array')))
     ->write(to_stream(__DIR__ . '/output.txt', truncate: false))
     ->run();
diff --git a/examples/topics/data_frame/data_frame/priority.txt b/examples/topics/data_frame/data_frame/priority.txt
@@ -0,0 +1 @@
+1
diff --git a/examples/topics/data_frame/priority.txt b/examples/topics/data_frame/priority.txt
@@ -0,0 +1 @@
+1
diff --git a/examples/topics/data_frame/rename_entries/code.php b/examples/topics/data_frame/rename_entries/code.php
@@ -0,0 +1,19 @@
+<?php
+
+declare(strict_types=1);
+
+use function Flow\ETL\DSL\{data_frame, from_array, to_stream};
+
+require __DIR__ . '/../../../autoload.php';
+
+data_frame()
+    ->read(from_array([
+        ['id' => 1, 'name' => 'Norbert', 'joined_id' => 1, 'joined_Status' => 'active'],
+        ['id' => 2, 'name' => 'John', 'joined_id' => 2, 'joined_Status' => 'inactive'],
+        ['id' => 3, 'name' => 'Jane', 'joined_id' => 3, 'joined_Status' => 'active'],
+    ]))
+    ->rename('id', 'user_id')
+    ->renameAll('joined_', '')
+    ->collect()
+    ->write(to_stream(__DIR__ . '/output.txt', truncate: false))
+    ->run();
diff --git a/examples/topics/data_frame/rename_entries/description.md b/examples/topics/data_frame/rename_entries/description.md
@@ -0,0 +1,6 @@
+There are multiple ways to rename entries in a DataFrame:
+
+- `rename(string $from, string $to)` - renames a single entry
+- `renameAll(string $search, string $replace)` - renames all entries that contain a given substring and replaces it with another substring
+- `renameAllToLowercase()` - renames all entries to lowercase
+- `renameAllStyle(StringStyles|string $style)` - renames all entries to a given style (e.g. camel, snake, kebab, etc.)
diff --git a/examples/topics/data_frame/rename_entries/output.txt b/examples/topics/data_frame/rename_entries/output.txt
@@ -0,0 +1,8 @@
++---------+----+----------+--------+
+|    name | id |   status | userId |
++---------+----+----------+--------+
+| Norbert |  1 |   active |      1 |
+|    John |  2 | inactive |      2 |
+|    Jane |  3 |   active |      3 |
++---------+----+----------+--------+
+3 rows
diff --git a/examples/topics/data_frame/rename_entries/priority.txt b/examples/topics/data_frame/rename_entries/priority.txt
@@ -0,0 +1 @@
+3
diff --git a/examples/topics/data_frame/reorder_entries/code.php b/examples/topics/data_frame/reorder_entries/code.php
@@ -20,7 +20,6 @@
     struct_element,
     struct_entry,
     struct_type,
-    to_output,
     to_stream,
     type_float,
     type_int,
@@ -81,6 +80,5 @@
         )
     )))
     ->reorderEntries(compare_entries_by_type_and_name())
-    ->write(to_output(false))
     ->write(to_stream(__DIR__ . '/output.txt', truncate: false))
     ->run();
diff --git a/examples/topics/data_source/array/description.md b/examples/topics/data_source/array/description.md
@@ -1 +1,9 @@
-Read data directly from a php associative array.  Relays on `array_to_rows` DSL function.
+Read data directly from a PHP associative array. Relies on the `array_to_rows` DSL function.
+
+```php
+function from_array(array $data);
+```
+
+Additional options:
+
+* `withSchema(Schema $schema)` - the schema of the dataset, when not set, it will be auto-detected
diff --git a/examples/topics/data_source/array/priority.txt b/examples/topics/data_source/array/priority.txt
@@ -0,0 +1 @@
+1
diff --git a/examples/topics/data_source/csv/priority.txt b/examples/topics/data_source/csv/priority.txt
@@ -0,0 +1 @@
+3
diff --git a/examples/topics/data_source/http_dynamic/output.txt b/examples/topics/data_source/http_dynamic/output.txt
@@ -1,6 +1,6 @@
 +----------+-----------------------------+---------------------+----------+--------------+-----------+----------------------+
 |     name |                    html_url |                blog |    login | public_repos | followers |           created_at |
 +----------+-----------------------------+---------------------+----------+--------------+-----------+----------------------+
-| Flow PHP | https://github.com/flow-php | http://flow-php.com | flow-php |           34 |        96 | 2020-10-26T18:40:27Z |
+| Flow PHP | https://github.com/flow-php | http://flow-php.com | flow-php |           34 |        98 | 2020-10-26T18:40:27Z |
 +----------+-----------------------------+---------------------+----------+--------------+-----------+----------------------+
 1 rows
diff --git a/examples/topics/data_source/json/code.php b/examples/topics/data_source/json/code.php
@@ -3,14 +3,22 @@
 declare(strict_types=1);
 
 use function Flow\ETL\Adapter\JSON\from_json;
-use function Flow\ETL\DSL\{data_frame, to_stream};
+use function Flow\ETL\DSL\{bool_schema, data_frame, int_schema, schema, str_schema, to_stream};
 
 require __DIR__ . '/../../../autoload.php';
 
+$schema = schema(
+    int_schema('id'),
+    str_schema('name'),
+    str_schema('email'),
+    bool_schema('active'),
+);
+
 data_frame()
-    ->read(from_json(
-        __DIR__ . '/input/dataset.json',
-    ))
+    ->read(
+        from_json(__DIR__ . '/input/dataset.json')
+            ->withSchema($schema)
+    )
     ->collect()
     ->write(to_stream(__DIR__ . '/output.txt', truncate: false))
     ->run();
diff --git a/examples/topics/data_source/json/description.md b/examples/topics/data_source/json/description.md
@@ -1,12 +1,10 @@
 Read data from a json file.
 
 ```php
-function from_json(
-    string|Path $path,
-    ?string $pointer = null,
-    ?Schema $schema = null,
-);
+function from_json(string|Path $path);
 ```
 
-* `pointer` - default null, used to iterate only results of a subtree, read more about [pointers](https://github.com/halaxa/json-machine#parsing-a-subtree)
-* `schema` - the schema of the csv file, when not set, it will be auto detected
+Additional options: 
+
+* `withPointer(string $pointer)` - default null, used to iterate only results of a subtree, read more about [pointers](https://github.com/halaxa/json-machine#parsing-a-subtree)
+* `withSchema(Schema $schema)` - the schema of the dataset, when not set, it will be auto-detected
diff --git a/examples/topics/data_source/json/priority.txt b/examples/topics/data_source/json/priority.txt
@@ -0,0 +1 @@
+4
diff --git a/examples/topics/data_source/parquet/description.md b/examples/topics/data_source/parquet/description.md
@@ -1,16 +1,12 @@
 Read data from a parquet file.
 
 ```php
-function from_parquet(
-    string|Path|array $uri,
-    array $columns = [],
-    Options $options = new Options(),
-    ByteOrder $byte_order = ByteOrder::LITTLE_ENDIAN,
-    ?int $offset = null,
-);
+function from_parquet(string|Path $uri);
 ```
 
-* `columns` - default [], list of columns to read, when empty, all columns will be read
-* `options` - custom Parquet Reader [Options](https://github.com/flow-php/flow/blob/1.x/src/lib/parquet/src/Flow/Parquet/Options.php)
-* `byte_order` - default `ByteOrder::LITTLE_ENDIAN`, the byte order of the parquet file
-* `offset` - default null, rows to skip from the beginning of the file 
+Additional options:
+
+* `withColumns(array $columns)` - default [], list of columns to read; when not set, all columns will be read
+* `withOptions(Options $options)` - custom Parquet Reader [Options](https://github.com/flow-php/flow/blob/1.x/src/lib/parquet/src/Flow/Parquet/Options.php)
+* `withByteOrder(ByteOrder $order)` - default `ByteOrder::LITTLE_ENDIAN`, the byte order of the parquet file
+* `withOffset(int $offset)` - default null, rows to skip from the beginning of the file 
diff --git a/examples/topics/data_source/parquet/priority.txt b/examples/topics/data_source/parquet/priority.txt
@@ -0,0 +1 @@
+2
diff --git a/examples/topics/data_source/priority.txt b/examples/topics/data_source/priority.txt
@@ -0,0 +1 @@
+2
diff --git a/examples/topics/data_source/sequence_date/hidden.txt b/examples/topics/data_source/sequence_date/hidden.txt
@@ -0,0 +1 @@
+1
diff --git a/examples/topics/data_source/sequence_date_recurrences/hidden.txt b/examples/topics/data_source/sequence_date_recurrences/hidden.txt
@@ -0,0 +1 @@
+1
diff --git a/examples/topics/data_source/sequence_number/hidden.txt b/examples/topics/data_source/sequence_number/hidden.txt
@@ -0,0 +1 @@
+1
diff --git a/examples/topics/data_source/xml/code.php b/examples/topics/data_source/xml/code.php
@@ -8,11 +8,11 @@
 require __DIR__ . '/../../../autoload.php';
 
 data_frame()
-    ->read(from_xml(
-        __DIR__ . '/input/dataset.xml',
-        'users/user'
-    ))
-    ->withEntry('id', ref('node')->xpath('@id')->domElementAttribute('id'))
+    ->read(
+        from_xml(__DIR__ . '/input/dataset.xml')
+            ->withXMLNodePath('users/user')
+    )
+    ->withEntry('id', ref('node')->domElementAttributeValue('id'))
     ->withEntry('name', ref('node')->xpath('name')->domElementValue())
     ->withEntry('active', ref('node')->xpath('active')->domElementValue())
     ->withEntry('email', ref('node')->xpath('email')->domElementValue())

diff --git a/examples/topics/data_source/xml/description.md b/examples/topics/data_source/xml/description.md
@@ -1,10 +1,10 @@
 Read data from an XML file.
 
 ```php
-function from_xml(
-    string|Path $path,
-    string $xml_node_path = ''
-);
+function from_xml(string|Path $path);
 ```
 
-* `xml_node_path` - default '', the path to the node to read, when empty, the root node will be read. It's not xpath, it is just a sequence of node names separated with slash.
+Additional options:
+
+* `withXMLNodePath(string $xmlNodePath)` - the path to the node to read; it doesn't support attributes and it's not XPath, it is just a sequence of node names separated by slashes
+* `withBufferSize(int $size)` - default 8096, the size of the buffer used to iterate through stream