Merge pull request #783 from koordinates/fix-replace-existing
Modified `point-cloud-import --replace-existing` to reuse tiles
olsen232 authored Feb 1, 2023
2 parents ea41958 + 45982ae commit e96b4c0
Showing 3 changed files with 79 additions and 30 deletions.
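
Context for the diffs below: with this change, `point-cloud-import --replace-existing` skips re-converting any tile whose content hash matches a tile already in the dataset, the same optimization `--update-existing` already had. A minimal sketch of that flow, using hypothetical stand-ins (plain dicts and a simplified hash check) rather than the real objects in `kart/point_cloud/import_.py`:

```python
# Hypothetical, self-contained sketch of the reuse-vs-reimport decision.
# `sources` maps tile name -> sha256 oid of the file being imported;
# `existing_tiles` maps tile name -> summary of the tile already in the dataset.
# The real code streams blobs to git fast-import and delegates the check to
# existing_tile_matches_source(), added in the import_.py diff below.

def plan_import(sources, existing_tiles, convert_to_copc):
    plan = {}
    for name, source_oid in sources.items():
        existing = existing_tiles.get(name)
        # Simplified check: the real helper also matches "sourceOid", covering
        # tiles that were converted to COPC on a previous import.
        if existing is not None and existing.get("oid") == source_oid:
            # Reuse the already-imported tile as-is. In particular, don't run
            # PDAL again - reconversion recompresses and creates pointless diffs.
            plan[name] = ("reuse", existing)
        else:
            plan[name] = ("import", source_oid)
    return plan
```

In the real diff this is split across two places: the import loop in `import_.py`, which copies the existing tile's blob from the dataset tree, and the new `existing_tile_matches_source()` helper, which does the hash comparison.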
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,11 @@ Please note that compatibility for 0.x releases (software or repositories) isn't

_When adding new entries to the changelog, please include issue/PR numbers wherever possible._


## 0.12.1

- Modified `point-cloud-import --replace-existing` to reuse previously imported tiles rather than reimporting them when the same tiles are imported again, potentially saving time and disk space. Note that `point-cloud-import --update-existing` already had this optimization.

## 0.12.0

### Major changes
87 changes: 57 additions & 30 deletions kart/point_cloud/import_.py
@@ -39,6 +39,7 @@
rewrite_and_merge_metadata,
check_for_non_homogenous_metadata,
format_tile_for_pointer_file,
is_copc,
)
from kart.point_cloud.pdal_convert import convert_tile_to_copc
from kart.point_cloud.v1 import PointCloudV1
@@ -300,32 +301,34 @@ def convert_tile_to_copc_and_reextract_metadata(source, dest):
with git_fast_import(repo, *FastImportSettings().as_args(), "--quiet") as proc:
proc.stdin.write(header.encode("utf8"))

all_metadatas = []
existing_dataset = None
if update_existing:
try:
existing_dataset = repo.datasets()[ds_path]
except KeyError:
# Should it be an error to use --update-existing for a new dataset?
# Potentially not; it might be useful for callers to be agnostic
# about whether a dataset exists yet.
existing_dataset = None
else:
# Check that the metadata for the existing dataset matches the new tiles
all_metadatas.append(
{
"crs": existing_dataset.get_meta_item("crs.wkt"),
"format": existing_dataset.get_meta_item("format.json"),
"schema": existing_dataset.get_meta_item("schema.json"),
}
)
try:
existing_dataset = repo.datasets()[ds_path]
except KeyError:
existing_dataset = None
else:
existing_metadata = {
"crs": existing_dataset.get_meta_item("crs.wkt"),
"format": existing_dataset.get_meta_item("format.json"),
"schema": existing_dataset.get_meta_item("schema.json"),
}

if delete and existing_dataset is None:
# Trying to delete specific paths from a nonexistent dataset?
# This suggests the caller is confused.
raise InvalidOperation(
f"Dataset {ds_path} does not exist. Cannot delete paths from it."
)

include_existing_metadata = False
if update_existing and existing_dataset is not None:
# Should it be an error to use --update-existing for a new dataset?
# Potentially not; it might be useful for callers to be agnostic
# about whether a dataset exists yet.

# Check that the metadata for the existing dataset matches the new tiles
include_existing_metadata = True

if delete:
if existing_dataset is None:
# Trying to delete specific paths from a nonexistent dataset?
# This suggests the caller is confused.
raise InvalidOperation(
f"Dataset {ds_path} does not exist. Cannot delete paths from it."
)
root_tree = repo.head_tree
for tile_name in delete:
# Check that the blob exists; if not, error out
@@ -336,7 +339,8 @@ def convert_tile_to_copc_and_reextract_metadata(source, dest):
raise NotFound(f"{tile_name} does not exist, can't delete it")

proc.stdin.write(f"D {blob_path}\n".encode("utf8"))
else:

if not update_existing:
# Delete the entire existing dataset, before we re-import it.
proc.stdin.write(f"D {ds_path}\n".encode("utf8"))

@@ -356,10 +360,9 @@ def convert_tile_to_copc_and_reextract_metadata(source, dest):
tilename, missing_ok=True
)
if existing_summary:
source_oid = "sha256:" + source_to_hash_and_size[source][0]
if (
existing_summary["oid"] == source_oid
or existing_summary.get("sourceOid") == source_oid
source_oid = source_to_hash_and_size[source][0]
if existing_tile_matches_source(
source_oid, existing_summary, convert_to_copc
):
# This tile has already been imported before. Reuse it rather than re-importing it.
# (Especially don't use PDAL to reconvert it - that creates pointless diffs due to recompression).
@@ -368,6 +371,7 @@ def convert_tile_to_copc_and_reextract_metadata(source, dest):
blob_path,
(existing_dataset.inner_tree / rel_blob_path).data,
)
include_existing_metadata = True
del source_to_imported_metadata[source]
continue

@@ -394,6 +398,7 @@ def convert_tile_to_copc_and_reextract_metadata(source, dest):
rewrite_metadata = (
None if convert_to_copc else RewriteMetadata.DROP_OPTIMIZATION
)
all_metadatas = [existing_metadata] if include_existing_metadata else []
all_metadatas.extend(source_to_imported_metadata.values())
merged_metadata = rewrite_and_merge_metadata(all_metadatas, rewrite_metadata)
check_for_non_homogenous_metadata(merged_metadata)
@@ -476,3 +481,25 @@ def _common_prefix(collection, min_length=4):
if len(prefix) < min_length:
return None
return prefix


def existing_tile_matches_source(source_oid, existing_summary, convert_to_copc):
"""Check if the existing tile can be reused instead of reimporting."""
if not source_oid.startswith("sha256:"):
source_oid = "sha256:" + source_oid

if existing_summary.get("oid") == source_oid:
# The import source we were given has already been imported in its native format.
# Return True if that's what we would do anyway.
if convert_to_copc:
return is_copc(existing_summary["format"])
else:
return True

# NOTE: this logic would be more complicated if we supported more than one type of conversion.
if existing_summary.get("sourceOid") == source_oid:
# The import source we were given has already been imported, but converted to COPC.
# Return True if we were going to convert it to COPC too.
return convert_to_copc and is_copc(existing_summary["format"])

return False
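
To make the `oid` / `sourceOid` distinction concrete, here is a worked set of cases. This is a hypothetical, stand-alone restatement of the logic with made-up hash values and a plain boolean in place of the real `is_copc(format)` check; it does not call the kart API:

```python
# Hypothetical sketch: tile_reusable mirrors existing_tile_matches_source() above,
# but takes existing_is_copc as a boolean instead of calling is_copc().
def tile_reusable(source_oid, existing_summary, convert_to_copc, existing_is_copc):
    if not source_oid.startswith("sha256:"):
        source_oid = "sha256:" + source_oid
    if existing_summary.get("oid") == source_oid:
        # Previously imported in its native format.
        return existing_is_copc if convert_to_copc else True
    if existing_summary.get("sourceOid") == source_oid:
        # Previously imported, but converted to COPC along the way.
        return convert_to_copc and existing_is_copc
    return False

oid = "sha256:1111"  # made-up hash of the source file being imported

# 1. Same file imported before, unconverted: reusable unless we now want a COPC conversion.
assert tile_reusable(oid, {"oid": oid}, convert_to_copc=False, existing_is_copc=False)
assert not tile_reusable(oid, {"oid": oid}, convert_to_copc=True, existing_is_copc=False)

# 2. Same file imported before and converted to COPC: reusable only if we would convert again.
assert tile_reusable(oid, {"oid": "sha256:2222", "sourceOid": oid},
                     convert_to_copc=True, existing_is_copc=True)
assert not tile_reusable(oid, {"oid": "sha256:2222", "sourceOid": oid},
                         convert_to_copc=False, existing_is_copc=True)

# 3. No hash match at all: reimport.
assert not tile_reusable(oid, {"oid": "sha256:3333"}, convert_to_copc=True, existing_is_copc=False)
```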
17 changes: 17 additions & 0 deletions tests/point_cloud/test_imports.py
@@ -478,6 +478,23 @@ def test_import_update_existing(cli_runner, data_archive, requires_pdal):
assert deletes == 1


def test_import_replace_existing_with_no_changes(
cli_runner, data_archive, requires_pdal
):
with data_archive("point-cloud/laz-auckland.tgz") as src:
with data_archive("point-cloud/auckland.tgz"):
r = cli_runner.invoke(
[
"point-cloud-import",
"--dataset-path=auckland",
"--replace-existing",
"--convert-to-copc",
*glob(f"{src}/*.laz"),
]
)
assert r.exit_code == NO_CHANGES, r.stderr


def test_import_empty_commit_error(cli_runner, data_archive, requires_pdal):
with data_archive("point-cloud/laz-auckland.tgz") as src:
with data_archive("point-cloud/auckland.tgz"):