Skip to content

Commit

Permalink
Generate Parquet files by default in assessment colab.
Browse files Browse the repository at this point in the history
It's much faster to sample labeling examples from the Parquet dataset than from the TFRecord dataset.

Also allow the user to specify the resolution of the generated examples via the new EXAMPLE_RESOLUTION notebook parameter (default 0.5).

PiperOrigin-RevId: 715963282
  • Loading branch information
jzxu authored and copybara-github committed Jan 15, 2025
1 parent 08c554d commit 433d9b2
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 3 deletions.
24 changes: 21 additions & 3 deletions src/colab/skai_assessment_notebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def install_requirements():
ASSESSMENT_NAME = '' # @param {type:"string"}
EVENT_DATE = '' # @param {type:"date"}
OUTPUT_DIR = '' # @param {type:"string"}
EXAMPLE_RESOLUTION = 0.5 # @param {type:"number"}

# @markdown ---
BEFORE_IMAGE_0 = '' # @param {type:"string"}
Expand Down Expand Up @@ -120,6 +121,9 @@ def install_requirements():
UNLABELED_TFRECORD_PATTERN = os.path.join(
OUTPUT_DIR, 'examples', 'unlabeled-large', 'unlabeled-*-of-*.tfrecord'
)
UNLABELED_PARQUET_PATTERN = os.path.join(
OUTPUT_DIR, 'examples', 'unlabeled-parquet', 'examples-*-of-*.parquet'
)
ZERO_SHOT_DIR = os.path.join(OUTPUT_DIR, 'zero_shot_model')
ZERO_SHOT_SCORES = os.path.join(ZERO_SHOT_DIR, 'dataset_0_output.csv')
LABELING_IMAGES_DIR = os.path.join(OUTPUT_DIR, 'labeling_images')
Expand Down Expand Up @@ -470,9 +474,13 @@ def check_assessment_status():
yes_no_text(_file_exists(EXAMPLE_GENERATION_CONFIG_PATH)),
)
print(
'Unlabeled examples generated:',
'Unlabeled tfrecord files generated:',
yes_no_text(_file_exists(UNLABELED_TFRECORD_PATTERN)),
)
print(
'Unlabeled parquet files generated:',
yes_no_text(_file_exists(UNLABELED_PARQUET_PATTERN)),
)
print(
'Zero-shot assessment generated:',
yes_no_text(_file_exists(ZERO_SHOT_SCORES)),
Expand Down Expand Up @@ -671,13 +679,14 @@ def write_example_generation_config(path: str) -> None:
'output_dir': OUTPUT_DIR,
'buildings_method': 'file',
'buildings_file': buildings_file,
'resolution': 0.5,
'resolution': EXAMPLE_RESOLUTION,
'use_dataflow': True,
'cloud_project': GCP_PROJECT,
'cloud_region': GCP_LOCATION,
'worker_service_account': GCP_SERVICE_ACCOUNT,
'max_dataflow_workers': 100,
'output_shards': 100,
'output_parquet': True,
'output_metadata_file': True,
'before_image_patterns': BEFORE_IMAGES,
'after_image_patterns': AFTER_IMAGES,
Expand Down Expand Up @@ -819,12 +828,20 @@ def visualize_labeling_images(images_dir: str, num: int):


def create_labeling_images(
examples_pattern: str,
tfrecord_pattern: str,
parquet_pattern: str,
scores_file: str,
output_dir: str,
max_images: int,
):
"""Creates labeling images."""

# Prefer using Parquet dataset over TFRecords.
if tf.io.gfile.glob(parquet_pattern):
examples_pattern = parquet_pattern
else:
examples_pattern = tfrecord_pattern

if not tf.io.gfile.glob(examples_pattern):
print(
f'No files match "{examples_pattern}". Please run example generation'
Expand Down Expand Up @@ -878,6 +895,7 @@ def create_labeling_images(

create_labeling_images(
UNLABELED_TFRECORD_PATTERN,
UNLABELED_PARQUET_PATTERN,
ZERO_SHOT_SCORES,
LABELING_IMAGES_DIR,
MAX_LABELING_IMAGES,
Expand Down
2 changes: 2 additions & 0 deletions src/colab/sync_notebook_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@
'DEFAULT_THRESHOLD': 0.5,
'HIGH_PRECISION_THRESHOLD': 0.6,
'HIGH_RECALL_THRESHOLD': 0.4,
'MAX_LABELING_IMAGES': 1000,
'EXAMPLE_RESOLUTION': 0.5,
}


Expand Down

0 comments on commit 433d9b2

Please sign in to comment.