diff --git a/core/src/main/java/google/registry/beam/common/RegistryPipelineOptions.java b/core/src/main/java/google/registry/beam/common/RegistryPipelineOptions.java index bbd3e480639..74186e2e4c4 100644 --- a/core/src/main/java/google/registry/beam/common/RegistryPipelineOptions.java +++ b/core/src/main/java/google/registry/beam/common/RegistryPipelineOptions.java @@ -16,6 +16,7 @@ import google.registry.beam.common.RegistryJpaIO.Write; import google.registry.config.RegistryEnvironment; +import google.registry.model.annotations.DeleteAfterMigration; import google.registry.persistence.PersistenceModule.JpaTransactionManagerType; import google.registry.persistence.PersistenceModule.TransactionIsolationLevel; import java.util.Objects; @@ -65,6 +66,17 @@ public interface RegistryPipelineOptions extends GcpOptions { void setSqlWriteShards(int maxConcurrentSqlWriters); + @DeleteAfterMigration + @Description( + "Whether to use self allocated primary IDs when building entities. This should only be used" + + " when the IDs are not significant and the resulting entities are not persisted back to" + + " the database. 
Use with caution as self allocated IDs are not unique across workers," + + " and persisting entities with these IDs can be dangerous.") + @Default.Boolean(false) + boolean getUseSelfAllocatedId(); + + void setUseSelfAllocatedId(boolean useSelfAllocatedId); + static RegistryPipelineComponent toRegistryPipelineComponent(RegistryPipelineOptions options) { return DaggerRegistryPipelineComponent.builder() .isolationOverride(options.getIsolationOverride()) diff --git a/core/src/main/java/google/registry/beam/common/RegistryPipelineWorkerInitializer.java b/core/src/main/java/google/registry/beam/common/RegistryPipelineWorkerInitializer.java index f4d13e9039d..ea6899b68c8 100644 --- a/core/src/main/java/google/registry/beam/common/RegistryPipelineWorkerInitializer.java +++ b/core/src/main/java/google/registry/beam/common/RegistryPipelineWorkerInitializer.java @@ -22,6 +22,8 @@ import google.registry.config.RegistryEnvironment; import google.registry.config.SystemPropertySetter; import google.registry.model.AppEngineEnvironment; +import google.registry.model.IdService; +import google.registry.model.IdService.SelfAllocatedIdSupplier; import google.registry.persistence.transaction.JpaTransactionManager; import google.registry.persistence.transaction.TransactionManagerFactory; import org.apache.beam.sdk.harness.JvmInitializer; @@ -65,12 +67,20 @@ public void beforeProcessing(PipelineOptions options) { transactionManagerLazy = registryPipelineComponent.getJpaTransactionManager(); } TransactionManagerFactory.setJpaTmOnBeamWorker(transactionManagerLazy::get); - // Masquerade all threads as App Engine threads so we can create Ofy keys in the pipeline. Also + // Masquerade all threads as App Engine threads, so we can create Ofy keys in the pipeline. Also // loads all ofy entities. 
new AppEngineEnvironment("s~" + registryPipelineComponent.getProjectId()) .setEnvironmentForAllThreads(); - // Set the system property so that we can call IdService.allocateId() without access to - // datastore. SystemPropertySetter.PRODUCTION_IMPL.setProperty(PROPERTY, "true"); + // Use self-allocated IDs if requested. Note that this inevitably results in duplicate IDs from + // multiple workers, which can also collide with existing IDs in the database. So they cannot be + // depended upon for comparison or anything significant. The resulting entities can never be + // persisted back into the database. This is a stop-gap measure that should only be used when + // you need to create Buildables in Beam, but do not have control over how the IDs are + // allocated, and you don't care about the generated IDs as long + // as you can build the entities. + if (registryOptions.getUseSelfAllocatedId()) { + IdService.setIdSupplier(SelfAllocatedIdSupplier.getInstance()); + } } } diff --git a/core/src/main/java/google/registry/beam/rde/RdePipeline.java b/core/src/main/java/google/registry/beam/rde/RdePipeline.java index 0bd4088c4a7..07a4b06a69a 100644 --- a/core/src/main/java/google/registry/beam/rde/RdePipeline.java +++ b/core/src/main/java/google/registry/beam/rde/RdePipeline.java @@ -128,7 +128,7 @@ *
In non-test, non-beam environments the Id is generated by Datastore, otherwise it's from an - * atomic long number that's incremented every time this method is called. + *
Normally, the ID is globally unique and allocated by Datastore. It is possible to override
+ * this behavior by providing an ID supplier, such as in unit tests, where a self-allocated ID based
+ * on a monotonically increasing atomic {@code long} is used. Such an ID supplier can also be used
+ * in other scenarios, such as in a Beam pipeline to get around the limitation of Beam's inability
+ * to use GAE SDK to access Datastore. The override should be used with great care lest it result
+ * in irreversible data corruption.
+ *
+ * @see #setIdSupplier(Supplier)
*/
@DeleteAfterMigration
public final class IdService {
- /**
- * A placeholder String passed into DatastoreService.allocateIds that ensures that all ids are
- * initialized from the same id pool.
- */
- private static final String APP_WIDE_ALLOCATION_KIND = "common";
+ private static final FluentLogger logger = FluentLogger.forEnclosingClass();
+
+ private IdService() {}
+
+ private static Supplier Currently, the only use case for an override is in the Beam pipeline, where access to
+ * Datastore is not possible through the App Engine API. As such, the setter explicitly checks if
+ * the runtime is Beam.
*
- * Note that one should only use self-allocate Ids in Beam for entities whose Ids are not
- * important and are not persisted back to the database, i. e. nowhere the uniqueness of the ID is
- * required.
+ * Because the supplied IDs are not guaranteed to be globally unique and compatible with
+ * existing IDs in the database, one should proceed with great care. It is safe to use an
+ * arbitrary supplier when the resulting IDs are not significant and not persisted back to the
+ * database, i.e. the IDs are only required by the {@link Buildable} contract but are not used in
+ * any meaningful way. One example is the RDE pipeline where we project EPP resource entities from
+ * history entries to watermark time, which are then marshalled into XML elements in the RDE
+ * deposits, where the IDs are omitted.
*/
- private static final AtomicLong nextSelfAllocatedId = new AtomicLong(1); // ids cannot be zero
-
- private static final boolean isSelfAllocated() {
- return RegistryEnvironment.UNITTEST.equals(RegistryEnvironment.get())
- || "true".equals(System.getProperty(RegistryPipelineWorkerInitializer.PROPERTY, "false"));
+ public static void setIdSupplier(Supplier The generated IDs are only unique within the same JVM. It is not suitable for production use
+ * except in cases where the IDs are not significant.
+ */
+ public static class SelfAllocatedIdSupplier implements Supplier