-
Notifications
You must be signed in to change notification settings - Fork 181
/
Copy pathfile_path.py
54 lines (42 loc) · 2.03 KB
/
file_path.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# isort: dont-add-import: from __future__ import annotations
from typing import Optional
import fsspec
from daft.api_annotations import PublicAPI
from daft.context import get_context
from daft.daft import PartitionScheme, PartitionSpec
from daft.dataframe import DataFrame
@PublicAPI
def from_glob_path(path: str, fs: Optional[fsspec.AbstractFileSystem] = None) -> DataFrame:
    """Build a DataFrame listing the files/directories matched by a glob pattern.

    Supported wildcards:

    1. "*" matches any number of any characters including none
    2. "?" matches any single character
    3. "[...]" matches any single character in the brackets
    4. "**" recursively matches any number of layers of directories

    Columns of the returned DataFrame:

    1. path: the path to the file/directory
    2. size: size of the object in bytes
    3. type: either "file" or "directory"

    Example:
        >>> df = daft.from_glob_path("/path/to/files/*.jpeg")
        >>> df = daft.from_glob_path("/path/to/files/**/*.jpeg")
        >>> df = daft.from_glob_path("/path/to/files/**/image-?.jpeg")

    Args:
        path (str): Glob pattern locating the files (wildcards allowed).
        fs (fsspec.AbstractFileSystem): fsspec FileSystem used for globbing and
            for fetching metadata. It is forwarded unchanged to the runner's IO
            layer; when left as ``None``, Daft constructs a FileSystem instance
            internally.

    Returns:
        DataFrame: One row per matched path, together with the metadata parsed
        from the filesystem listing.
    """
    ctx = get_context()
    io_layer = ctx.runner().runner_io()

    # The glob itself is performed by the runner's IO layer; the single pattern
    # is wrapped in a list because that API takes a list of paths.
    listing = io_layer.glob_paths_details([path], fs=fs)

    # Register the listing in the runner's partition cache so that the logical
    # plan below can refer to it through the returned cache entry.
    cached_listing = ctx.runner().put_partition_set_into_cache(listing)

    # No partitioning scheme is claimed for the listing; only the partition
    # count is carried over into the plan.
    plan_builder = ctx.logical_plan_builder_class().from_in_memory_scan(
        cached_listing,
        schema=io_layer.FS_LISTING_SCHEMA,
        partition_spec=PartitionSpec(PartitionScheme.Unknown, listing.num_partitions()),
    )
    return DataFrame(plan_builder)