-
-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing 10 changed files with 288 additions and 120 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,7 @@ Parquet | |
|
||
read_parquet | ||
scan_parquet | ||
read_parquet_schema | ||
DataFrame.write_parquet | ||
|
||
SQL | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
from contextlib import contextmanager | ||
from io import BytesIO, StringIO | ||
from pathlib import Path | ||
from typing import ( | ||
Any, | ||
BinaryIO, | ||
ContextManager, | ||
Dict, | ||
Iterator, | ||
List, | ||
TextIO, | ||
Type, | ||
Union, | ||
overload, | ||
) | ||
from urllib.request import urlopen | ||
|
||
from polars.datatypes import DataType | ||
from polars.utils import format_path | ||
|
||
try: | ||
import fsspec | ||
from fsspec.utils import infer_storage_options | ||
|
||
_WITH_FSSPEC = True | ||
except ImportError: | ||
_WITH_FSSPEC = False | ||
|
||
try: | ||
from polars.polars import ipc_schema as _ipc_schema | ||
from polars.polars import parquet_schema as _parquet_schema | ||
except ImportError: # pragma: no cover | ||
pass | ||
|
||
|
||
def _process_http_file(path: str) -> BytesIO:
    """
    Fetch the resource at `path` and return its full content as a `BytesIO`.

    Parameters
    ----------
    path
        URL handed to `urllib.request.urlopen`.

    Returns
    -------
    In-memory buffer holding the complete response body.
    """
    with urlopen(path) as response:
        payload = response.read()
    # The response is closed at this point; the buffer owns its own copy.
    return BytesIO(payload)
|
||
|
||
@overload
def _prepare_file_arg(
    file: Union[str, List[str], Path, BinaryIO, bytes], **kwargs: Any
) -> ContextManager[Union[str, BinaryIO]]:
    ...


@overload
def _prepare_file_arg(
    file: Union[str, TextIO, Path, BinaryIO, bytes], **kwargs: Any
) -> ContextManager[Union[str, BinaryIO]]:
    ...


@overload
def _prepare_file_arg(
    file: Union[str, List[str], TextIO, Path, BinaryIO, bytes], **kwargs: Any
) -> ContextManager[Union[str, List[str], BinaryIO, List[BinaryIO]]]:
    ...


def _prepare_file_arg(
    file: Union[str, List[str], TextIO, Path, BinaryIO, bytes], **kwargs: Any
) -> ContextManager[Union[str, BinaryIO, List[str], List[BinaryIO]]]:
    """
    Normalise a `file` argument into something usable in a `with` statement.

    Utility for read_[csv, parquet] (not to be used by scan_[csv, parquet]).

    Dispatch rules, in order:

    - `StringIO`: its remaining content is utf8-encoded into a new `BytesIO`.
    - `BytesIO`: yielded unchanged.
    - `Path`: yielded as the result of `format_path`.
    - `str`, fsspec installed: a path whose inferred protocol is ``"file"``
      is yielded as the result of `format_path`; anything else is opened
      with `fsspec.open(file, **kwargs)`.
    - `str`, fsspec not installed: an ``http://`` / ``https://`` URL is
      downloaded into a `BytesIO`; anything else is yielded as the result of
      `format_path`.
    - non-empty list of `str`, fsspec installed: if every entry has the
      ``"file"`` protocol the list of `format_path` results is yielded,
      otherwise `fsspec.open_files(file, **kwargs)` is returned.
    - everything else (e.g. `bytes`, or a list when fsspec is missing) is
      yielded unchanged.

    Parameters
    ----------
    file
        Path, URL, list of paths, buffer or raw bytes to prepare.
    **kwargs
        Forwarded to `fsspec.open` / `fsspec.open_files` only.

    Returns
    -------
    A context manager yielding the prepared file argument.
    """

    # Small helper to use a plain value as a context; nothing to clean up.
    @contextmanager
    def managed_file(file: Any) -> Iterator[Any]:
        yield file

    if isinstance(file, StringIO):
        # A BytesIO is itself a context manager, so it can be returned as is.
        return BytesIO(file.read().encode("utf8"))
    if isinstance(file, BytesIO):
        return managed_file(file)
    if isinstance(file, Path):
        return managed_file(format_path(file))
    if isinstance(file, str):
        if _WITH_FSSPEC:
            if infer_storage_options(file)["protocol"] == "file":
                return managed_file(format_path(file))
            return fsspec.open(file, **kwargs)
        # Match the URL scheme, not just the "http" prefix: a local file
        # named e.g. "httpdata.csv" must not be handed to urlopen.
        if file.startswith(("http://", "https://")):
            return _process_http_file(file)
        return managed_file(format_path(file))
    if isinstance(file, list) and file and all(isinstance(f, str) for f in file):
        if _WITH_FSSPEC:
            if all(infer_storage_options(f)["protocol"] == "file" for f in file):
                return managed_file([format_path(f) for f in file])
            return fsspec.open_files(file, **kwargs)
    return managed_file(file)
|
||
|
||
def read_ipc_schema(
    file: Union[str, BinaryIO, Path, bytes]
) -> Dict[str, Type[DataType]]:
    """
    Get a schema of the IPC file without reading data.

    Parameters
    ----------
    file
        Path to a file or a file-like object.

    Returns
    -------
    Dictionary mapping column names to datatypes
    """
    # Only str / Path inputs go through format_path; buffers and bytes are
    # handed to the native reader untouched.
    source = format_path(file) if isinstance(file, (str, Path)) else file
    return _ipc_schema(source)
|
||
|
||
def read_parquet_schema(
    file: Union[str, BinaryIO, Path, bytes]
) -> Dict[str, Type[DataType]]:
    """
    Get a schema of the Parquet file without reading data.

    Parameters
    ----------
    file
        Path to a file or a file-like object.

    Returns
    -------
    Dictionary mapping column names to datatypes
    """
    # Only str / Path inputs go through format_path; buffers and bytes are
    # handed to the native reader untouched.
    source = format_path(file) if isinstance(file, (str, Path)) else file
    return _parquet_schema(source)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.