Skip to content

Commit

Permalink
apacheGH-41034: [C++][FS][Azure] Adjust DeleteDir/DeleteDirContents/G…
Browse files Browse the repository at this point in the history
…etFileInfoSelector behaviors against Azure for generic filesystem tests (apache#41068)

### Rationale for this change

They are failing:

```text
[  FAILED  ] TestAzureHierarchicalNSGeneric.DeleteDir
[  FAILED  ] TestAzureHierarchicalNSGeneric.DeleteDirContents
[  FAILED  ] TestAzureHierarchicalNSGeneric.GetFileInfoSelector
```

### What changes are included in this PR?

* `DeleteDir()`: Check not a directory case
* `DeleteDirContents()`: Check not a directory case
* `GetFileInfoSelector()`:
  * Add not a directory check for input
  * Add support for returning metadata for directory 

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Yes.
* GitHub Issue: apache#41034

Authored-by: Sutou Kouhei <kou@clear-code.com>
Signed-off-by: Sutou Kouhei <kou@clear-code.com>
  • Loading branch information
kou authored Apr 10, 2024
1 parent 06f0e0c commit 06d4495
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 2 deletions.
83 changes: 82 additions & 1 deletion cpp/src/arrow/filesystem/azurefs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1179,6 +1179,15 @@ Status CreateContainerIfNotExists(const std::string& container_name,
}
}

FileInfo FileInfoFromPath(std::string_view container,
const DataLake::Models::PathItem& path) {
FileInfo info{internal::ConcatAbstractPath(container, path.Name),
path.IsDirectory ? FileType::Directory : FileType::File};
info.set_size(path.FileSize);
info.set_mtime(std::chrono::system_clock::time_point{path.LastModified});
return info;
}

FileInfo DirectoryFileInfoFromPath(std::string_view path) {
return FileInfo{std::string{internal::RemoveTrailingSlash(path)}, FileType::Directory};
}
Expand Down Expand Up @@ -1576,7 +1585,7 @@ class AzureFileSystem::Impl {
}

private:
/// \pref location.container is not empty.
/// \pre location.container is not empty.
template <typename ContainerClient>
Status CheckDirExists(const ContainerClient& container_client,
const AzureLocation& location) {
Expand Down Expand Up @@ -1623,6 +1632,50 @@ class AzureFileSystem::Impl {
return result;
}

/// \brief List the paths at the root of a filesystem or some dir in a filesystem.
///
/// \pre adlfs_client is the client for the filesystem named like the first
/// segment of select.base_dir.
Status GetFileInfoWithSelectorFromFileSystem(
const DataLake::DataLakeFileSystemClient& adlfs_client,
const Core::Context& context, Azure::Nullable<int32_t> page_size_hint,
const FileSelector& select, FileInfoVector* acc_results) {
ARROW_ASSIGN_OR_RAISE(auto base_location, AzureLocation::FromString(select.base_dir));

auto directory_client = adlfs_client.GetDirectoryClient(base_location.path);
bool found = false;
DataLake::ListPathsOptions options;
options.PageSizeHint = page_size_hint;

try {
auto list_response = directory_client.ListPaths(select.recursive, options, context);
for (; list_response.HasPage(); list_response.MoveToNextPage(context)) {
if (list_response.Paths.empty()) {
continue;
}
found = true;
for (const auto& path : list_response.Paths) {
if (path.Name == base_location.path && !path.IsDirectory) {
return NotADir(base_location);
}
acc_results->push_back(FileInfoFromPath(base_location.container, path));
}
}
} catch (const Storage::StorageException& exception) {
if (IsContainerNotFound(exception) || exception.ErrorCode == "PathNotFound") {
found = false;
} else {
return ExceptionToStatus(exception,
"Failed to list paths in a directory: ", select.base_dir,
": ", directory_client.GetUrl());
}
}

return found || select.allow_not_found
? Status::OK()
: ::arrow::fs::internal::PathNotFound(select.base_dir);
}

/// \brief List the blobs at the root of a container or some dir in a container.
///
/// \pre container_client is the client for the container named like the first
Expand Down Expand Up @@ -1772,6 +1825,20 @@ class AzureFileSystem::Impl {
return VisitContainers(context, std::move(on_container));
}

auto adlfs_client = GetFileSystemClient(base_location.container);
ARROW_ASSIGN_OR_RAISE(auto hns_support, HierarchicalNamespaceSupport(adlfs_client));
if (hns_support == HNSSupport::kContainerNotFound) {
if (select.allow_not_found) {
return Status::OK();
} else {
return ::arrow::fs::internal::PathNotFound(select.base_dir);
}
}
if (hns_support == HNSSupport::kEnabled) {
return GetFileInfoWithSelectorFromFileSystem(adlfs_client, context, page_size_hint,
select, acc_results);
}
DCHECK_EQ(hns_support, HNSSupport::kDisabled);
auto container_client =
blob_service_client_->GetBlobContainerClient(base_location.container);
return GetFileInfoWithSelectorFromContainer(container_client, context, page_size_hint,
Expand Down Expand Up @@ -2157,6 +2224,17 @@ class AzureFileSystem::Impl {
Azure::Nullable<std::string> lease_id = {}) {
DCHECK(!location.container.empty());
DCHECK(!location.path.empty());
ARROW_ASSIGN_OR_RAISE(auto file_info, GetFileInfo(adlfs_client, location, lease_id));
if (file_info.type() == FileType::NotFound) {
if (require_dir_to_exist) {
return PathNotFound(location);
} else {
return Status::OK();
}
}
if (file_info.type() != FileType::Directory) {
return NotADir(location);
}
auto directory_client = adlfs_client.GetDirectoryClient(
std::string(internal::RemoveTrailingSlash(location.path)));
DataLake::DeleteDirectoryOptions options;
Expand Down Expand Up @@ -2200,6 +2278,9 @@ class AzureFileSystem::Impl {
kDelimiter, path.Name, ": ", sub_directory_client.GetUrl());
}
} else {
if (path.Name == location.path) {
return NotADir(location);
}
auto sub_file_client = adlfs_client.GetFileClient(path.Name);
try {
sub_file_client.Delete();
Expand Down
6 changes: 5 additions & 1 deletion cpp/src/arrow/filesystem/azurefs_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,7 @@ class TestGeneric : public ::testing::Test, public GenericFileSystemTest {
bool allow_move_dir() const override { return false; }
bool allow_move_file() const override { return true; }
bool allow_append_to_file() const override { return true; }
bool have_directory_mtimes() const override { return false; }
bool have_directory_mtimes() const override { return true; }
bool have_flaky_directory_tree_deletion() const override { return false; }
bool have_file_metadata() const override { return true; }
// calloc() used in libxml2's xmlNewGlobalState() is detected as a
Expand Down Expand Up @@ -429,6 +429,8 @@ class TestAzuriteGeneric : public TestGeneric {
protected:
// Azurite doesn't support moving files over containers.
bool allow_move_file() const override { return false; }
// Azurite doesn't support directory mtime.
bool have_directory_mtimes() const override { return false; }
// DeleteDir() doesn't work with Azurite on macOS
bool have_flaky_directory_tree_deletion() const override {
return env_->HasSubmitBatchBug();
Expand All @@ -449,6 +451,8 @@ class TestAzureFlatNSGeneric : public TestGeneric {
protected:
// Flat namespace account doesn't support moving files over containers.
bool allow_move_file() const override { return false; }
// Flat namespace account doesn't support directory mtime.
bool have_directory_mtimes() const override { return false; }
};

class TestAzureHierarchicalNSGeneric : public TestGeneric {
Expand Down

0 comments on commit 06d4495

Please sign in to comment.