Skip to content

Commit

Permalink
fix: crawler not handling descriptions properly (#993)
Browse files Browse the repository at this point in the history
  • Loading branch information
richardtreier authored Jul 10, 2024
1 parent e2e41d6 commit af68e61
Show file tree
Hide file tree
Showing 10 changed files with 211 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,22 +32,22 @@ alter table connector
-- Data offers, additionally keyed by env ID
create table data_offer
(
connector_id text not null,
asset_id text not null,
ui_asset_json jsonb not null,
created_at timestamp with time zone not null,
updated_at timestamp with time zone,
asset_title text collate alphanumeric_with_natural_sort not null,
description text not null default ''::text,
curator_organization_name text not null default ''::text,
data_category text not null default ''::text,
data_subcategory text not null default ''::text,
data_model text not null default ''::text,
transport_mode text not null default ''::text,
geo_reference_method text not null default ''::text,
keywords text[] not null default '{}'::text[],
keywords_comma_joined text not null default ''::text,
version text not null default ''::text,
connector_id text not null,
asset_id text not null,
ui_asset_json jsonb not null,
created_at timestamp with time zone not null,
updated_at timestamp with time zone,
asset_title text collate alphanumeric_with_natural_sort not null,
description_no_markdown text not null default ''::text,
short_description_no_markdown text not null default ''::text,
data_category text not null default ''::text,
data_subcategory text not null default ''::text,
data_model text not null default ''::text,
transport_mode text not null default ''::text,
geo_reference_method text not null default ''::text,
keywords text[] not null default '{}'::text[],
keywords_comma_joined text not null default ''::text,
version text not null default ''::text,
primary key (connector_id, asset_id),
constraint data_offer_connector_fkey foreign key (connector_id) references connector (connector_id)
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,8 @@ public static CrawlerExtensionContext buildContext(
objectMapperJsonLd
);
var contractOfferRecordUpdater = new ContractOfferRecordUpdater();
var dataOfferRecordUpdater = new DataOfferRecordUpdater(connectorQueries);
var shortDescriptionBuilder = new ShortDescriptionBuilder();
var dataOfferRecordUpdater = new DataOfferRecordUpdater(shortDescriptionBuilder);
var contractOfferQueries = new ContractOfferQueries();
var dataOfferLimitsEnforcer = new DataOfferLimitsEnforcer(crawlerConfig, crawlerEventLogger);
var dataOfferPatchBuilder = new CatalogPatchBuilder(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@

import de.sovity.edc.ext.catalog.crawler.dao.connectors.ConnectorRef;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.Setter;
import lombok.experimental.FieldDefaults;

Expand All @@ -27,6 +30,9 @@
*/
@Getter
@Setter
@Builder
@RequiredArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class FetchedCatalog {
ConnectorRef connectorRef;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,18 @@
package de.sovity.edc.ext.catalog.crawler.crawling.fetching.model;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import lombok.experimental.FieldDefaults;

@Getter
@Setter
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class FetchedContractOffer {
String contractOfferId;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@

import de.sovity.edc.ext.wrapper.api.common.model.UiAsset;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import lombok.experimental.FieldDefaults;

Expand All @@ -27,6 +30,9 @@
*/
@Getter
@Setter
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class FetchedDataOffer {
String assetId;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@

import de.sovity.edc.ext.catalog.crawler.crawling.fetching.model.FetchedDataOffer;
import de.sovity.edc.ext.catalog.crawler.crawling.writing.utils.ChangeTracker;
import de.sovity.edc.ext.catalog.crawler.dao.connectors.ConnectorQueries;
import de.sovity.edc.ext.catalog.crawler.dao.connectors.ConnectorRef;
import de.sovity.edc.ext.catalog.crawler.dao.utils.JsonbUtils;
import de.sovity.edc.ext.catalog.crawler.db.jooq.tables.records.DataOfferRecord;
import de.sovity.edc.ext.catalog.crawler.utils.JsonUtils2;
import de.sovity.edc.ext.wrapper.api.common.mappers.asset.utils.ShortDescriptionBuilder;
import lombok.RequiredArgsConstructor;
import org.jooq.JSONB;

Expand All @@ -36,8 +36,7 @@
*/
@RequiredArgsConstructor
public class DataOfferRecordUpdater {

private final ConnectorQueries connectorQueries;
private final ShortDescriptionBuilder shortDescriptionBuilder;

/**
* Create a new {@link DataOfferRecord}.
Expand Down Expand Up @@ -84,9 +83,15 @@ public boolean updateDataOffer(
);

changes.setIfChanged(
blankIfNull(record.getDescription()),
blankIfNull(asset.getDescription()),
record::setDescription
blankIfNull(record.getDescriptionNoMarkdown()),
shortDescriptionBuilder.extractMarkdownText(blankIfNull(asset.getDescription())),
record::setDescriptionNoMarkdown
);

changes.setIfChanged(
blankIfNull(record.getShortDescriptionNoMarkdown()),
blankIfNull(asset.getDescriptionShortText()),
record::setShortDescriptionNoMarkdown
);

changes.setIfChanged(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
public class CrawlerTestDb implements BeforeAllCallback, AfterAllCallback {
private final TestDatabaseViaTestcontainers db = new TestDatabaseViaTestcontainers();


private HikariDataSource dataSource = null;
private DslContextFactory dslContextFactory = null;

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
/*
* Copyright (c) 2023 sovity GmbH
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0
*
* SPDX-License-Identifier: Apache-2.0
*
* Contributors:
* sovity GmbH - initial implementation
*
*/

package de.sovity.edc.ext.catalog.crawler.crawling.writing;

import de.sovity.edc.ext.catalog.crawler.CrawlerTestDb;
import de.sovity.edc.ext.catalog.crawler.TestData;
import de.sovity.edc.ext.catalog.crawler.crawling.fetching.model.FetchedCatalog;
import de.sovity.edc.ext.catalog.crawler.crawling.fetching.model.FetchedContractOffer;
import de.sovity.edc.ext.catalog.crawler.crawling.fetching.model.FetchedDataOffer;
import de.sovity.edc.ext.catalog.crawler.db.jooq.Tables;
import de.sovity.edc.ext.catalog.crawler.db.jooq.enums.ConnectorContractOffersExceeded;
import de.sovity.edc.ext.catalog.crawler.db.jooq.enums.ConnectorDataOffersExceeded;
import de.sovity.edc.ext.catalog.crawler.db.jooq.enums.ConnectorOnlineStatus;
import de.sovity.edc.ext.wrapper.api.common.model.UiAsset;
import org.assertj.core.data.TemporalUnitLessThanOffset;
import org.jooq.impl.DSL;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.RegisterExtension;

import java.time.OffsetDateTime;
import java.time.temporal.ChronoUnit;
import java.util.List;

import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.Mockito.when;

class ConnectorSuccessWriterTest {
@RegisterExtension
private static final CrawlerTestDb TEST_DATABASE = new CrawlerTestDb();

ConnectorUpdateSuccessWriter connectorUpdateSuccessWriter;

@BeforeEach
void setup() {
var container = new DataOfferWriterTestDydi();
connectorUpdateSuccessWriter = container.getConnectorUpdateSuccessWriter();
when(container.getCrawlerConfig().getMaxContractOffersPerDataOffer()).thenReturn(1);
when(container.getCrawlerConfig().getMaxDataOffersPerConnector()).thenReturn(1);
}

@Test
void testDataOfferWriter_fullSingleUpdate() {
TEST_DATABASE.testTransaction(dsl -> {
// arrange
var connectorRef = TestData.connectorRef;
TestData.insertConnector(dsl, connectorRef, unused -> {
});
var uiAsset = UiAsset.builder()
.assetId("assetId")
.title("title")
.description("# Description\n\n**with Markdown**")
.descriptionShortText("descriptionShortText")
.dataCategory("dataCategory")
.dataSubcategory("dataSubCategory")
.dataModel("dataModel")
.transportMode("transportMode")
.geoReferenceMethod("geoReferenceMethod")
.keywords(List.of("a", "b"))
.build();
var fetchedContractOffer = FetchedContractOffer.builder()
.contractOfferId("contractOfferId")
.uiPolicyJson("\"test-policy\"")
.build();
var fetchedDataOffer = FetchedDataOffer.builder()
.assetId("assetId")
.uiAsset(uiAsset)
.uiAssetJson("\"test\"")
.contractOffers(List.of(fetchedContractOffer))
.build();
var fetchedCatalog = FetchedCatalog.builder()
.connectorRef(connectorRef)
.dataOffers(List.of(fetchedDataOffer))
.build();

// act
connectorUpdateSuccessWriter.handleConnectorOnline(
dsl,
connectorRef,
dsl.fetchOne(
Tables.CONNECTOR,
Tables.CONNECTOR.CONNECTOR_ID.eq(connectorRef.getConnectorId())
),
fetchedCatalog
);

// assert
var connector = dsl.fetchOne(
Tables.CONNECTOR,
Tables.CONNECTOR.CONNECTOR_ID.eq(connectorRef.getConnectorId())
);
var dataOffer = dsl.fetchOne(
Tables.DATA_OFFER,
DSL.and(
Tables.DATA_OFFER.CONNECTOR_ID.eq(connectorRef.getConnectorId()),
Tables.DATA_OFFER.ASSET_ID.eq("assetId")
)
);
var contractOffer = dsl.fetchOne(
Tables.CONTRACT_OFFER,
DSL.and(
Tables.CONTRACT_OFFER.CONNECTOR_ID.eq(connectorRef.getConnectorId()),
Tables.CONTRACT_OFFER.ASSET_ID.eq("assetId"),
Tables.CONTRACT_OFFER.CONTRACT_OFFER_ID.eq("contractOfferId")
)
);

var now = OffsetDateTime.now();
var minuteAccuracy = new TemporalUnitLessThanOffset(1, ChronoUnit.MINUTES);
assertThat(connector).isNotNull();
assertThat(connector.getOnlineStatus()).isEqualTo(ConnectorOnlineStatus.ONLINE);
assertThat(connector.getLastRefreshAttemptAt()).isCloseTo(now, minuteAccuracy);
assertThat(connector.getLastSuccessfulRefreshAt()).isCloseTo(now, minuteAccuracy);
assertThat(connector.getDataOffersExceeded()).isEqualTo(ConnectorDataOffersExceeded.OK);
assertThat(connector.getContractOffersExceeded()).isEqualTo(ConnectorContractOffersExceeded.OK);

assertThat(dataOffer).isNotNull();
assertThat(dataOffer.getAssetTitle()).isEqualTo("title");
assertThat(dataOffer.getDescriptionNoMarkdown()).isEqualTo("Description with Markdown");
assertThat(dataOffer.getShortDescriptionNoMarkdown()).isEqualTo("descriptionShortText");
assertThat(dataOffer.getDataCategory()).isEqualTo("dataCategory");
assertThat(dataOffer.getDataSubcategory()).isEqualTo("dataSubCategory");
assertThat(dataOffer.getDataModel()).isEqualTo("dataModel");
assertThat(dataOffer.getTransportMode()).isEqualTo("transportMode");
assertThat(dataOffer.getGeoReferenceMethod()).isEqualTo("geoReferenceMethod");
assertThat(dataOffer.getKeywords()).containsExactly("a", "b");
assertThat(dataOffer.getKeywordsCommaJoined()).isEqualTo("a, b");
assertThat(dataOffer.getUiAssetJson().data()).isEqualTo("\"test\"");

assertThat(contractOffer).isNotNull();
assertThat(contractOffer.getUiPolicyJson().data()).isEqualTo("\"test-policy\"");
});
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,15 @@

package de.sovity.edc.ext.catalog.crawler.crawling.writing;

import de.sovity.edc.ext.catalog.crawler.crawling.logging.CrawlerEventLogger;
import de.sovity.edc.ext.catalog.crawler.dao.CatalogPatchApplier;
import de.sovity.edc.ext.catalog.crawler.dao.connectors.ConnectorQueries;
import de.sovity.edc.ext.catalog.crawler.dao.contract_offers.ContractOfferQueries;
import de.sovity.edc.ext.catalog.crawler.dao.contract_offers.ContractOfferRecordUpdater;
import de.sovity.edc.ext.catalog.crawler.dao.data_offers.DataOfferQueries;
import de.sovity.edc.ext.catalog.crawler.dao.data_offers.DataOfferRecordUpdater;
import de.sovity.edc.ext.catalog.crawler.orchestration.config.CrawlerConfig;
import de.sovity.edc.ext.wrapper.api.common.mappers.asset.utils.ShortDescriptionBuilder;
import lombok.Value;
import org.eclipse.edc.spi.system.configuration.Config;

Expand All @@ -34,9 +36,8 @@ class DataOfferWriterTestDydi {
ContractOfferQueries contractOfferQueries = new ContractOfferQueries();
ContractOfferRecordUpdater contractOfferRecordUpdater = new ContractOfferRecordUpdater();
ConnectorQueries connectorQueries = new ConnectorQueries(crawlerConfig);
DataOfferRecordUpdater dataOfferRecordUpdater = new DataOfferRecordUpdater(
connectorQueries
);
ShortDescriptionBuilder shortDescriptionBuilder = new ShortDescriptionBuilder();
DataOfferRecordUpdater dataOfferRecordUpdater = new DataOfferRecordUpdater(shortDescriptionBuilder);
CatalogPatchBuilder catalogPatchBuilder = new CatalogPatchBuilder(
contractOfferQueries,
dataOfferQueries,
Expand All @@ -45,4 +46,16 @@ class DataOfferWriterTestDydi {
);
CatalogPatchApplier catalogPatchApplier = new CatalogPatchApplier();
ConnectorUpdateCatalogWriter connectorUpdateCatalogWriter = new ConnectorUpdateCatalogWriter(catalogPatchBuilder, catalogPatchApplier);

// for the ConnectorUpdateSuccessWriterTest
CrawlerEventLogger crawlerEventLogger = new CrawlerEventLogger();
DataOfferLimitsEnforcer dataOfferLimitsEnforcer = new DataOfferLimitsEnforcer(
crawlerConfig,
crawlerEventLogger
);
ConnectorUpdateSuccessWriter connectorUpdateSuccessWriter = new ConnectorUpdateSuccessWriter(
crawlerEventLogger,
connectorUpdateCatalogWriter,
dataOfferLimitsEnforcer
);
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@ public String buildShortDescription(String descriptionMarkdown) {
return null;
}

var text = extractText(descriptionMarkdown);
var text = extractMarkdownText(descriptionMarkdown);
return abbreviate(text, 300);
}

String extractText(String markdown) {
public String extractMarkdownText(String markdown) {
var options = new MutableDataSet();
var parser = Parser.builder(options).build();
var renderer = HtmlRenderer.builder(options).build();
Expand Down

0 comments on commit af68e61

Please sign in to comment.