Skip to content

Commit

Permalink
Handle reserved characters (colon, dot, white space) in Lineage FQN
Browse files Browse the repository at this point in the history
  • Loading branch information
Abacn committed Aug 7, 2024
1 parent cfd3e06 commit 1c599d3
Show file tree
Hide file tree
Showing 25 changed files with 212 additions and 70 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -162,5 +162,5 @@ protected abstract void rename(
*
* <p>Unless overridden by FileSystem implementations, defaults to a no-op.
*/
protected void reportLineage(ResourceIdT unusedId, Lineage.Type unusedType) {}
protected void reportLineage(ResourceIdT unusedId, Lineage unusedLineage) {}
}
Original file line number Diff line number Diff line change
Expand Up @@ -398,12 +398,12 @@ public ResourceId apply(@Nonnull Metadata input) {

/** Report source {@link Lineage} metrics for resource id. */
public static void reportSourceLineage(ResourceId resourceId) {
getFileSystemInternal(resourceId.getScheme()).reportLineage(resourceId, Lineage.Type.SOURCE);
getFileSystemInternal(resourceId.getScheme()).reportLineage(resourceId, Lineage.getSources());
}

/** Report sink {@link Lineage} metrics for resource id. */
public static void reportSinkLineage(ResourceId resourceId) {
getFileSystemInternal(resourceId.getScheme()).reportLineage(resourceId, Lineage.Type.SINK);
getFileSystemInternal(resourceId.getScheme()).reportLineage(resourceId, Lineage.getSinks());
}

private static class FilterResult {
Expand Down
104 changes: 88 additions & 16 deletions sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/Lineage.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,36 +19,108 @@

import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.beam.sdk.annotations.Internal;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings;
import org.checkerframework.checker.nullness.qual.Nullable;

/**
* Standard collection of metrics used to record source and sinks information for lineage tracking.
*/
public class Lineage {

public static final String LINEAGE_NAMESPACE = "lineage";
private static final StringSet SOURCES =
Metrics.stringSet(LINEAGE_NAMESPACE, Type.SOURCE.toString());
private static final StringSet SINKS = Metrics.stringSet(LINEAGE_NAMESPACE, Type.SINK.toString());
private static final Lineage SOURCES = new Lineage(Type.SOURCE);
private static final Lineage SINKS = new Lineage(Type.SINK);

private final StringSet metric;

/**
 * Creates a Lineage for the given {@link Type}.
 *
 * <p>The backing {@link StringSet} is obtained from {@code
 * Metrics.stringSet(LINEAGE_NAMESPACE, type.toString())}. The constructor is private; instances
 * are handed out through the static accessors of this class.
 */
private Lineage(Type type) {
this.metric = Metrics.stringSet(LINEAGE_NAMESPACE, type.toString());
}

/** {@link StringSet} representing sources and optionally side inputs. */
public static StringSet getSources() {
/** {@link Lineage} representing sources and optionally side inputs. */
public static Lineage getSources() {
return SOURCES;
}

/** {@link StringSet} representing sinks. */
public static StringSet getSinks() {
/** {@link Lineage} representing sinks. */
public static Lineage getSinks() {
return SINKS;
}

/** {@link StringSet} representing {@link Type}. */
public static StringSet get(Type type) {
switch (type) {
case SOURCE:
return getSources();
case SINK:
return getSinks();
default:
throw new IllegalArgumentException(String.format("Unsupported Lineage type: %s", type));
private static final Pattern RESERVED_CHARS = Pattern.compile("[:\\s.]");

/**
 * Returns a form of the segment that is safe to embed in a fully qualified name.
 *
 * <p>A segment that already starts and ends with a backtick is returned unchanged. Otherwise, a
 * segment containing a reserved character (colon, dot, or any whitespace) is enclosed in
 * backticks. Every other segment is returned as is.
 */
private static String wrapSegment(String value) {
  boolean alreadyWrapped = value.startsWith("`") && value.endsWith("`");
  // Only scan for reserved characters when the caller has not wrapped the segment already.
  boolean needsWrapping = !alreadyWrapped && RESERVED_CHARS.matcher(value).find();
  return needsWrapping ? String.format("`%s`", value) : value;
}

/**
 * Assembles a fully qualified name (<a
 * href="https://cloud.google.com/data-catalog/docs/fully-qualified-names">FQN</a>).
 *
 * <p>The result starts with {@code system}, followed by {@code :routine} when a non-empty routine
 * is given. The first segment is appended after a colon and each following segment after a dot;
 * every segment is passed through {@code wrapSegment} before being appended. Examples:
 *
 * <ul>
 *   <li>{@code system:segment1.segment2}
 *   <li>{@code system:routine:segment1.segment2}
 *   <li>{@code system:`segment1.with.dots:colons`.segment2}
 * </ul>
 *
 * <p>This helper method is for internal and testing usage only.
 */
@Internal
public static String getFqName(
    String system, @Nullable String routine, Iterable<String> segments) {
  StringBuilder fqn = new StringBuilder(system);
  if (!Strings.isNullOrEmpty(routine)) {
    fqn.append(":").append(routine);
  }
  // A colon introduces the first segment; all later segments are dot-separated.
  String separator = ":";
  for (String segment : segments) {
    fqn.append(separator).append(wrapSegment(segment));
    separator = ".";
  }
  return fqn.toString();
}

/**
 * Assemble the FQN of the given system and segments, without a routine component.
 *
 * <p>Delegates to the three-argument overload, passing {@code null} as the routine.
 *
 * <p>This helper method is for internal and testing usage only.
 */
@Internal
public static String getFqName(String system, Iterable<String> segments) {
return getFqName(system, null, segments);
}

/**
 * Records a FQN (fully-qualified name) in this Lineage.
 *
 * <p>The name is assembled from the arguments via {@link #getFqName} and then added to the
 * backing metric.
 */
public void add(String system, @Nullable String routine, Iterable<String> segments) {
  String fqn = getFqName(system, routine, segments);
  metric.add(fqn);
}

/**
 * Add a FQN (fully-qualified name) without a routine component to Lineage.
 *
 * <p>Delegates to the three-argument overload, passing {@code null} as the routine. Segments will
 * be processed via {@link #getFqName}.
 */
public void add(String system, Iterable<String> segments) {
add(system, null, segments);
}

/** Query {@link StringSet} metrics from {@link MetricResults}. */
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.metrics;

import static org.junit.Assert.assertEquals;

import java.util.Map;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

/** Tests for {@link Lineage}. */
@RunWith(JUnit4.class)
public class LineageTest {
  /**
   * Checks how {@link Lineage#getFqName} renders individual segments.
   *
   * <p>The cases cover a segment without reserved characters, segments that are already wrapped
   * in backticks, and segments containing a dot, colon, space, tab, or newline. Each case is
   * checked without a routine, with a routine, and with the segment repeated twice.
   */
  @Test
  public void testGetFqName() {
    // Maps a raw segment to the form expected in the assembled name.
    Map<String, String> expectedBySegment =
        ImmutableMap.<String, String>builder()
            .put("apache-beam", "apache-beam")
            .put("`apache-beam`", "`apache-beam`")
            .put("apache.beam", "`apache.beam`")
            .put("apache:beam", "`apache:beam`")
            .put("apache beam", "`apache beam`")
            .put("`apache beam`", "`apache beam`")
            .put("apache\tbeam", "`apache\tbeam`")
            .put("apache\nbeam", "`apache\nbeam`")
            .build();

    // System and a single segment.
    for (Map.Entry<String, String> testCase : expectedBySegment.entrySet()) {
      String segment = testCase.getKey();
      String expected = testCase.getValue();
      assertEquals("apache:" + expected, Lineage.getFqName("apache", ImmutableList.of(segment)));
    }

    // System, routine and a single segment.
    for (Map.Entry<String, String> testCase : expectedBySegment.entrySet()) {
      String segment = testCase.getKey();
      String expected = testCase.getValue();
      assertEquals(
          "apache:beam:" + expected,
          Lineage.getFqName("apache", "beam", ImmutableList.of(segment)));
    }

    // System, routine and the same segment twice, joined by a dot.
    for (Map.Entry<String, String> testCase : expectedBySegment.entrySet()) {
      String segment = testCase.getKey();
      String expected = testCase.getValue();
      assertEquals(
          "apache:beam:" + expected + "." + expected,
          Lineage.getFqName("apache", "beam", ImmutableList.of(segment, segment)));
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -216,10 +216,10 @@ protected String getScheme() {
}

@Override
protected void reportLineage(GcsResourceId resourceId, Lineage.Type type) {
protected void reportLineage(GcsResourceId resourceId, Lineage lineage) {
GcsPath path = resourceId.getGcsPath();
if (!path.getBucket().isEmpty()) {
Lineage.get(type).add(String.format("gcs:%s.%s", path.getBucket(), path.getObject()));
lineage.add("gcs", ImmutableList.of(path.getBucket(), path.getObject()));
} else {
LOG.warn("Report Lineage on relative path {} is unsupported", path.getObject());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -626,8 +626,8 @@ protected S3ResourceId matchNewResource(String singleResourceSpec, boolean isDir
}

@Override
protected void reportLineage(S3ResourceId resourceId, Lineage.Type type) {
Lineage.get(type).add(String.format("s3:%s.%s", resourceId.getBucket(), resourceId.getKey()));
protected void reportLineage(S3ResourceId resourceId, Lineage lineage) {
lineage.add("s3", ImmutableList.of(resourceId.getBucket(), resourceId.getKey()));
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -657,8 +657,8 @@ protected S3ResourceId matchNewResource(String singleResourceSpec, boolean isDir
}

@Override
protected void reportLineage(S3ResourceId resourceId, Lineage.Type type) {
Lineage.get(type).add(String.format("s3:%s.%s", resourceId.getBucket(), resourceId.getKey()));
protected void reportLineage(S3ResourceId resourceId, Lineage lineage) {
lineage.add("s3", ImmutableList.of(resourceId.getBucket(), resourceId.getKey()));
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -452,16 +452,14 @@ protected AzfsResourceId matchNewResource(String singleResourceSpec, boolean isD
}

@Override
protected void reportLineage(AzfsResourceId resourceId, Lineage.Type type) {
protected void reportLineage(AzfsResourceId resourceId, Lineage lineage) {
if (!Strings.isNullOrEmpty(resourceId.getBlob())) {
Lineage.get(type)
.add(
String.format(
"abs:%s.%s.%s",
resourceId.getAccount(), resourceId.getContainer(), resourceId.getBlob()));
lineage.add(
"abs",
ImmutableList.of(
resourceId.getAccount(), resourceId.getContainer(), resourceId.getBlob()));
} else {
Lineage.get(type)
.add(String.format("abs:%s.%s", resourceId.getAccount(), resourceId.getContainer()));
lineage.add("abs", ImmutableList.of(resourceId.getAccount(), resourceId.getContainer()));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
import org.apache.beam.sdk.util.FluentBackoff;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists;
import org.checkerframework.checker.nullness.qual.NonNull;
import org.checkerframework.checker.nullness.qual.Nullable;
Expand Down Expand Up @@ -413,7 +414,7 @@ public static String toTableSpec(TableReference ref) {
return sb.toString();
}

public static String dataCatalogName(TableReference ref, BigQueryOptions options) {
public static List<String> dataCatalogSegments(TableReference ref, BigQueryOptions options) {
String tableIdBase;
int ix = ref.getTableId().indexOf('$');
if (ix == -1) {
Expand All @@ -429,7 +430,7 @@ public static String dataCatalogName(TableReference ref, BigQueryOptions options
} else {
projectId = options.getProject();
}
return String.format("bigquery:%s.%s.%s", projectId, ref.getDatasetId(), tableIdBase);
return ImmutableList.of(projectId, ref.getDatasetId(), tableIdBase);
}

static <K, V> List<V> getOrCreateMapListValue(Map<K, List<V>> map, K key) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,8 @@ protected ExtractResult extractFiles(PipelineOptions options) throws Exception {
BigQueryHelpers.toTableSpec(tableToExtract)));
}
// emit this table ID as a lineage source
Lineage.getSources().add(BigQueryHelpers.dataCatalogName(tableToExtract, bqOptions));
Lineage.getSources()
.add("bigquery", BigQueryHelpers.dataCatalogSegments(tableToExtract, bqOptions));

TableSchema schema = table.getSchema();
JobService jobService = bqServices.getJobService(bqOptions);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.StorageClient;
import org.apache.beam.sdk.metrics.Lineage;
import org.apache.beam.sdk.metrics.StringSet;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.transforms.SerializableFunction;
Expand Down Expand Up @@ -109,12 +108,12 @@ public List<BigQueryStorageStreamSource<T>> split(
@Nullable Table targetTable = getTargetTable(bqOptions);

ReadSession.Builder readSessionBuilder = ReadSession.newBuilder();
StringSet lineageSources = Lineage.getSources();
Lineage lineage = Lineage.getSources();
if (targetTable != null) {
TableReference tableReference = targetTable.getTableReference();
readSessionBuilder.setTable(BigQueryHelpers.toTableResourceName(tableReference));
// register the table as lineage source
lineageSources.add(BigQueryHelpers.dataCatalogName(tableReference, bqOptions));
lineage.add("bigquery", BigQueryHelpers.dataCatalogSegments(tableReference, bqOptions));
} else {
// If the table does not exist targetTable will be null.
// Construct the table id if we can generate it. For error recording/logging.
Expand All @@ -123,7 +122,7 @@ public List<BigQueryStorageStreamSource<T>> split(
readSessionBuilder.setTable(tableReferenceId);
// register the table as lineage source
TableReference tableReference = BigQueryHelpers.parseTableUrn(tableReferenceId);
lineageSources.add(BigQueryHelpers.dataCatalogName(tableReference, bqOptions));
lineage.add("bigquery", BigQueryHelpers.dataCatalogSegments(tableReference, bqOptions));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,8 @@ public void processElement(ProcessContext context) {
BigQueryOptions bqOptions = context.getPipelineOptions().as(BigQueryOptions.class);
Lineage.getSinks()
.add(
BigQueryHelpers.dataCatalogName(
"bigquery",
BigQueryHelpers.dataCatalogSegments(
tableDestination1.getTableReference(), bqOptions));
return CreateTableHelpers.possiblyCreateTable(
bqOptions,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1109,7 +1109,8 @@ public void process(
pipelineOptions.as(BigQueryOptions.class)));
Lineage.getSinks()
.add(
BigQueryHelpers.dataCatalogName(
"bigquery",
BigQueryHelpers.dataCatalogSegments(
state.getTableDestination().getTableReference(),
pipelineOptions.as(BigQueryOptions.class)));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -472,7 +472,8 @@ public void process(

Lineage.getSinks()
.add(
BigQueryHelpers.dataCatalogName(
"bigquery",
BigQueryHelpers.dataCatalogSegments(
tableDestination.getTableReference(), bigQueryOptions));

Coder<DestinationT> destinationCoder = dynamicDestinations.getDestinationCoder();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,8 @@ public void processElement(
if (!entry.getValue().isEmpty()) {
Lineage.getSinks()
.add(
BigQueryHelpers.dataCatalogName(
"bigquery",
BigQueryHelpers.dataCatalogSegments(
entry.getKey().getTableReference(),
c.getPipelineOptions().as(BigQueryOptions.class)));
pendingJobs.add(startWriteRename(entry.getKey(), entry.getValue(), c, window));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,8 @@ public void processElement(
} else {
Lineage.getSinks()
.add(
BigQueryHelpers.dataCatalogName(
"bigquery",
BigQueryHelpers.dataCatalogSegments(
tableReference, c.getPipelineOptions().as(BigQueryOptions.class)));
}

Expand Down
Loading

0 comments on commit 1c599d3

Please sign in to comment.