Skip to content

Commit

Permalink
Merge pull request #29997 Add a ReadFromTsv example.
Browse files Browse the repository at this point in the history
  • Loading branch information
robertwb authored Jan 19, 2024
2 parents e019de8 + 02b369d commit 2ff551e
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 1 deletion.
3 changes: 2 additions & 1 deletion sdks/python/apache_beam/dataframe/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def read_gbq(


@frame_base.with_docs_from(pd)
def read_csv(path, *args, splittable=False, **kwargs):
def read_csv(path, *args, splittable=False, binary=True, **kwargs):
"""If your files are large and records do not contain quoted newlines, you may
pass the extra argument ``splittable=True`` to enable dynamic splitting for
this read on newlines. Using this option for records that do contain quoted
Expand All @@ -99,6 +99,7 @@ def read_csv(path, *args, splittable=False, **kwargs):
args,
kwargs,
incremental=True,
binary=binary,
splitter=_TextFileSplitter(args, kwargs) if splittable else None)


Expand Down
22 changes: 22 additions & 0 deletions sdks/python/apache_beam/yaml/inline_python.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,28 @@ as an input, e.g.
output_type: integer
```

This can be used to call arbitrary transforms in the Beam SDK, e.g.

```
pipeline:
transforms:
- type: PyTransform
name: ReadFromTsv
input: {}
config:
constructor: apache_beam.io.ReadFromCsv
kwargs:
path: '/path/to/*.tsv'
sep: '\t'
skip_blank_lines: True
true_values: ['yes']
false_values: ['no']
comment: '#'
on_bad_lines: 'skip'
binary: False
splittable: False
```


## Defining a transform inline using `__constructor__`

Expand Down
5 changes: 5 additions & 0 deletions sdks/python/apache_beam/yaml/readme_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,9 @@ def input_file(self, name, content):
def input_csv(self):
return self.input_file('input.csv', 'col1,col2,col3\nabc,1,2.5\n')

def input_tsv(self):
return self.input_file('input.tsv', 'col1\tcol2\tcol3\nabc\t1\t2.5\n')

def input_json(self):
return self.input_file(
'input.json', '{"col1": "abc", "col2": 1, "col3": 2.5"}\n')
Expand Down Expand Up @@ -192,6 +195,8 @@ def create_test_method(test_type, test_name, test_yaml):

def test(self):
with TestEnvironment() as env:
nonlocal test_yaml
test_yaml = test_yaml.replace('/path/to/*.tsv', env.input_tsv())
spec = yaml.load(test_yaml, Loader=SafeLoader)
if test_type == 'PARSE':
return
Expand Down

0 comments on commit 2ff551e

Please sign in to comment.