From 433d9b26fc21f03496fb4a4196ba3810e3d9b4f0 Mon Sep 17 00:00:00 2001 From: Joseph Xu Date: Wed, 15 Jan 2025 15:10:05 -0800 Subject: [PATCH] Generate Parquet files by default in assessment colab. It's much faster to sample labeling examples from the Parquet dataset than the TFRecord dataset. Also allow user to specify the resolution of the generated examples. PiperOrigin-RevId: 715963282 --- src/colab/skai_assessment_notebook.py | 24 +++++++++++++++++++++--- src/colab/sync_notebook_source.py | 2 ++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/src/colab/skai_assessment_notebook.py b/src/colab/skai_assessment_notebook.py index 213d9d9..5a32c7d 100644 --- a/src/colab/skai_assessment_notebook.py +++ b/src/colab/skai_assessment_notebook.py @@ -77,6 +77,7 @@ def install_requirements(): ASSESSMENT_NAME = '' # @param {type:"string"} EVENT_DATE = '' # @param {type:"date"} OUTPUT_DIR = '' # @param {type:"string"} +EXAMPLE_RESOLUTION = 0.5 # @param {type:"number"} # @markdown --- BEFORE_IMAGE_0 = '' # @param {type:"string"} @@ -120,6 +121,9 @@ def install_requirements(): UNLABELED_TFRECORD_PATTERN = os.path.join( OUTPUT_DIR, 'examples', 'unlabeled-large', 'unlabeled-*-of-*.tfrecord' ) +UNLABELED_PARQUET_PATTERN = os.path.join( + OUTPUT_DIR, 'examples', 'unlabeled-parquet', 'examples-*-of-*.parquet' +) ZERO_SHOT_DIR = os.path.join(OUTPUT_DIR, 'zero_shot_model') ZERO_SHOT_SCORES = os.path.join(ZERO_SHOT_DIR, 'dataset_0_output.csv') LABELING_IMAGES_DIR = os.path.join(OUTPUT_DIR, 'labeling_images') @@ -470,9 +474,13 @@ def check_assessment_status(): yes_no_text(_file_exists(EXAMPLE_GENERATION_CONFIG_PATH)), ) print( - 'Unlabeled examples generated:', + 'Unlabeled tfrecord files generated:', yes_no_text(_file_exists(UNLABELED_TFRECORD_PATTERN)), ) + print( + 'Unlabeled parquet files generated:', + yes_no_text(_file_exists(UNLABELED_PARQUET_PATTERN)), + ) print( 'Zero-shot assessment generated:', yes_no_text(_file_exists(ZERO_SHOT_SCORES)), @@ -671,13 +679,14 @@ def write_example_generation_config(path: str) -> None: 'output_dir': OUTPUT_DIR, 'buildings_method': 'file', 'buildings_file': buildings_file, - 'resolution': 0.5, + 'resolution': EXAMPLE_RESOLUTION, 'use_dataflow': True, 'cloud_project': GCP_PROJECT, 'cloud_region': GCP_LOCATION, 'worker_service_account': GCP_SERVICE_ACCOUNT, 'max_dataflow_workers': 100, 'output_shards': 100, + 'output_parquet': True, 'output_metadata_file': True, 'before_image_patterns': BEFORE_IMAGES, 'after_image_patterns': AFTER_IMAGES, @@ -819,12 +828,20 @@ def visualize_labeling_images(images_dir: str, num: int): def create_labeling_images( - examples_pattern: str, + tfrecord_pattern: str, + parquet_pattern: str, scores_file: str, output_dir: str, max_images: int, ): """Creates labeling images.""" + + # Prefer using Parquet dataset over TFRecords. + if tf.io.gfile.glob(parquet_pattern): + examples_pattern = parquet_pattern + else: + examples_pattern = tfrecord_pattern + if not tf.io.gfile.glob(examples_pattern): print( f'No files match "{examples_pattern}". Please run example generation' @@ -878,6 +895,7 @@ def create_labeling_images( create_labeling_images( UNLABELED_TFRECORD_PATTERN, + UNLABELED_PARQUET_PATTERN, ZERO_SHOT_SCORES, LABELING_IMAGES_DIR, MAX_LABELING_IMAGES, diff --git a/src/colab/sync_notebook_source.py b/src/colab/sync_notebook_source.py index 15b6bfd..3d818be 100644 --- a/src/colab/sync_notebook_source.py +++ b/src/colab/sync_notebook_source.py @@ -63,6 +63,8 @@ 'DEFAULT_THRESHOLD': 0.5, 'HIGH_PRECISION_THRESHOLD': 0.6, 'HIGH_RECALL_THRESHOLD': 0.4, + 'MAX_LABELING_IMAGES': 1000, + 'EXAMPLE_RESOLUTION': 0.5, }