Skip to content

Commit

Permalink
ENH: Add new Semantic Types - SPATIAL.WKT, SPATIAL.GEOJSON (#121)
Browse files Browse the repository at this point in the history
  • Loading branch information
tsegall committed Jan 21, 2025
1 parent 2691aad commit 33bbf51
Show file tree
Hide file tree
Showing 21 changed files with 455 additions and 41 deletions.
6 changes: 6 additions & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@

## Changes ##

### 15.12.0
- ENH: Add new Semantic Types - SPATIAL.WKT, SPATIAL.GEOJSON (#121)
- INT: Bump logback-classic to 1.5.16, google phonenumber to 8.13.53
- INT: Some minor cleaning
- BUG: If FORMAT_DETECTION is enabled - error messages were being printed to stdout

### 15.11.2
- INT: Minor gradle cleanup
- INT: Minor code cleanup
Expand Down
6 changes: 4 additions & 2 deletions SemanticTypes.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
|FILENAME|Name of file|https://en.wikipedia.org/wiki/Filename|*|
|FREE_TEXT|Free Text field - e.g. Description, Notes, Comments, ...||bg, ca, da, de, en, es, fi, fr, ga, hr, hu, it, lv, nl, pt, ro, ru, sk, sv, tr|
|FULL_ADDRESS_EN|Full Address (English Language)||en-CA,en-US|
|GENDER.TEXT_<LANGUAGE>|Gender|https://www.wikidata.org/wiki/Property:P21, https://en.wikipedia.org/wiki/Gender, https://schema.org/gender|bg, ca, en, de, es, fi, fr, hr, it, ms, nl, pl, pt, ro, ru, sv, tr|
|GENDER.TEXT_<LANGUAGE>|Gender|https://www.wikidata.org/wiki/Property:P21, https://en.wikipedia.org/wiki/Gender, https://schema.org/gender|bg, ca, en, de, es, fi, fr, hr, is, it, ms, nl, pl, pt, ro, ru, sv, tr|
|GENDER.TEXT_JA|Gender (Japanese Language)|https://www.wikidata.org/wiki/Property:P21, https://en.wikipedia.org/wiki/Gender, https://schema.org/gender|ja|
|GENDER.TEXT_ZH|Gender (Chinese Language)|https://www.wikidata.org/wiki/Property:P21, https://en.wikipedia.org/wiki/Gender, https://schema.org/gender|zh|
|GUID|Globally Unique Identifier, e.g. 30DD879E-FE2F-11DB-8314-9800310C9A67|https://www.wikidata.org/wiki/Q254972, https://en.wikipedia.org/wiki/Universally_unique_identifier|*|
Expand Down Expand Up @@ -82,7 +82,7 @@
|MONTH.FULL_<LOCALE>|Full Month name (<LOCALE> = Locale, e.g. en-US for English language in US)|https://en.wikipedia.org/wiki/Month|*|
|NAME.FIRST|First Name|https://www.wikidata.org/wiki/Property:P735, https://schema.org/givenName|en, de, es, fr, it, nl, pt|
|NAME.FIRST_LAST|Merged Name (First Last)|https://en.wikipedia.org/wiki/Personal_name|en, de, es, fr, it, nl, pt|
|NAME.LAST|Last Name|https://www.wikidata.org/wiki/Property:P734, https://en.wikipedia.org/wiki/https://en.wikipedia.org/wiki/Surname, https://schema.org/familyName|en, de, es, fr, it, nl, pt|
|NAME.LAST|Last Name|https://www.wikidata.org/wiki/Property:P734, https://en.wikipedia.org/wiki/Surname, https://schema.org/familyName|en, de, el, es, fr, it, nl, pt|
|NAME.LAST_FIRST|Merged Name (Last, First)|https://en.wikipedia.org/wiki/Personal_name|en, de, es, fr, it, nl, pt|
|NAME.MIDDLE|Middle Name|https://en.wikipedia.org/wiki/Middle_name, https://schema.org/additionalName|en|
|NAME.MIDDLE_INITIAL|Middle Initial|https://en.wikipedia.org/wiki/Middle_name|en|
Expand Down Expand Up @@ -115,6 +115,8 @@
|POSTAL_CODE.POSTAL_CODE_UY|Postal Code (UY)|https://www.wikidata.org/wiki/Property:P281, https://schema.org/postalCode|es-UY|
|POSTAL_CODE.ZIP5_PLUS4_US|Postal Code + 4 (US)|https://www.wikidata.org/wiki/Property:P281, https://en.wikipedia.org/wiki/ZIP_Code, https://schema.org/postalCode|en-US,en-CA,fr-CA|
|POSTAL_CODE.ZIP5_US|Postal Code (US)|https://www.wikidata.org/wiki/Property:P281, https://en.wikipedia.org/wiki/ZIP_Code, https://schema.org/postalCode|en-US,en-Latn-US,en-CA,fr-CA|
|SPATIAL.GEOJSON|Spatial geometry - GeoJSON format|https://en.wikipedia.org/wiki/GeoJSON|*|
|SPATIAL.WKT|Spatial geometry - Well-known Text (WKT) format|https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry|*|
|SSN|Social Security Number (US)|https://en.wikipedia.org/wiki/Social_Security_number|en-US|
|STATE_PROVINCE.CANTON_CH|Swiss CANTON Code|https://en.wikipedia.org/wiki/Cantons_of_Switzerland, https://schema.org/State|de-CH,fr-CH,it-CH|
|STATE_PROVINCE.CANTON_NAME_CH|Swiss CANTON Name|https://en.wikipedia.org/wiki/Cantons_of_Switzerland|de-CH,fr-CH,it-CH|
Expand Down
4 changes: 4 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ wrapper {
gradleVersion = '8.12'
}

// Aggregate task: depends on the ':clean' task of every included build.
tasks.register('examples.clean') {
    dependsOn(gradle.includedBuilds.collect { it.task(':clean') })
}

// Aggregate task: depends on the ':build' task of every included build.
tasks.register('examples.build') {
    dependsOn(gradle.includedBuilds.collect { it.task(':build') })
}
Expand Down
4 changes: 3 additions & 1 deletion cli/src/main/java/com/cobber/fta/driver/Driver.java
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,9 @@ else if ("--version".equals(unprocessed[idx])) {
System.exit(0);
}

if (helpRequested && cmdLineOptions.verbose != 0) {
if (helpRequested) {
if (cmdLineOptions.verbose == 0)
System.exit(0);
final TextAnalyzer analyzer = TextAnalyzer.getDefaultAnalysis(cmdLineOptions.locale);
final Collection<LogicalType> registered = analyzer.getPlugins().getRegisteredSemanticTypes();
final Set<String> names = new TreeSet<>();
Expand Down
44 changes: 42 additions & 2 deletions core/src/main/java/com/cobber/fta/core/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,13 @@
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

import com.fasterxml.jackson.databind.ObjectMapper;

Expand Down Expand Up @@ -282,9 +286,24 @@ public static String determineStreamFormat(final ObjectMapper mapper, final Map<
// Ignore
}
}
if (first == '<' && last == '>'&& samples - fmtXML < 5) {
if (first == '<' && last == '>' && samples - fmtXML < 5) {
try {
DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(new InputSource(new StringReader(sample)));
DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
db.setErrorHandler(new ErrorHandler()
{
@Override
public void fatalError(SAXParseException exception) throws SAXException {
}

@Override
public void error(SAXParseException exception) throws SAXException {
}

@Override
public void warning(SAXParseException exception) throws SAXException {
}
});
db.parse(new InputSource(new StringReader(sample)));
fmtXML++;
} catch (Exception e) {
// Ignore
Expand Down Expand Up @@ -406,4 +425,25 @@ public static double uniquenessProbability(final int sampleSpace, final int samp

return 1.0 - result;
}

/**
 * Test whether one string contains another, ignoring case.
 * Case-insensitivity follows the rules of {@link String#regionMatches(boolean, int, String, int, int)},
 * i.e. characters are compared after upper-casing and then lower-casing, independent of locale.
 *
 * @param src The string to search in.
 * @param hunting The string to search for; the empty string is contained in every string.
 * @return True if hunting occurs anywhere in src, ignoring case.
 * @throws NullPointerException if either argument is null.
 */
public static boolean containsIgnoreCase(String src, String hunting) {
	final int len = hunting.length();
	if (len == 0)
		return true; // Empty string is contained

	// Fold the first character exactly the way regionMatches(true, ...) does (upper-case, then
	// lower-case the result) so the pre-filter below can never reject a position that
	// regionMatches() would accept - e.g. the Kelvin sign (U+212A) versus 'k', or 'i' versus dotless i.
	final char firstUpper = Character.toUpperCase(hunting.charAt(0));
	final char firstFolded = Character.toLowerCase(firstUpper);

	// The scan direction is irrelevant for a boolean answer; starting at the last feasible offset
	// means a src shorter than hunting never enters the loop.
	for (int i = src.length() - len; i >= 0; i--) {
		// Quick check on the first character before calling the more expensive regionMatches() method:
		final char upper = Character.toUpperCase(src.charAt(i));
		if (upper != firstUpper && Character.toLowerCase(upper) != firstFolded)
			continue;

		if (src.regionMatches(true, i, hunting, 0, len))
			return true;
	}

	return false;
}
}
6 changes: 3 additions & 3 deletions settings.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ includeBuild 'examples/core/speed'
dependencyResolutionManagement {
versionCatalogs {
libs {
version('fta', '15.11.2')
version('fta', '15.12.0')
version('jacoco', '0.8.12')

// https://mvnrepository.com/artifact/com.univocity/univocity-parsers
Expand All @@ -58,7 +58,7 @@ dependencyResolutionManagement {
// https://mvnrepository.com/artifact/com.fasterxml.jackson.datatype/jackson-datatype-jsr310
library('jacksonDataType', 'com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.18.2')
// https://mvnrepository.com/artifact/com.googlecode.libphonenumber/libphonenumber
library('googlePhoneNumber', 'com.googlecode.libphonenumber:libphonenumber:8.13.52')
library('googlePhoneNumber', 'com.googlecode.libphonenumber:libphonenumber:8.13.53')
// https://mvnrepository.com/artifact/org.apache.commons/commons-text
library('commonsText', 'org.apache.commons:commons-text:1.13.0')
// https://mvnrepository.com/artifact/com.github.krraghavan/xeger
Expand All @@ -70,7 +70,7 @@ dependencyResolutionManagement {
// https://mvnrepository.com/artifact/com.google.guava/guava
library('guava', 'com.google.guava:guava:33.4.0-jre')
// https://mvnrepository.com/artifact/ch.qos.logback/logback-classic
library('logbackClassic', 'ch.qos.logback:logback-classic:1.5.15')
library('logbackClassic', 'ch.qos.logback:logback-classic:1.5.16')
// https://mvnrepository.com/artifact/com.datadoghq/sketches-java
library('sketches', 'com.datadoghq:sketches-java:0.8.3')
}
Expand Down
1 change: 0 additions & 1 deletion types/src/main/java/com/cobber/fta/plugins/Gender.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ public class Gender extends LogicalTypeFinite {
private GenderData genderData;
private Set<String> languageMembers;


class GenderPair {
String feminine;
String masculine;
Expand Down
160 changes: 160 additions & 0 deletions types/src/main/java/com/cobber/fta/plugins/SpatialGeoJSON.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
/*
* Copyright 2017-2024 Tim Segall
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cobber.fta.plugins;

import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

import com.cobber.fta.AnalysisConfig;
import com.cobber.fta.AnalyzerContext;
import com.cobber.fta.Facts;
import com.cobber.fta.FiniteMap;
import com.cobber.fta.LogicalTypeInfinite;
import com.cobber.fta.PluginAnalysis;
import com.cobber.fta.PluginDefinition;
import com.cobber.fta.token.TokenStreams;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

/**
 * Plugin to detect Spatial data in GeoJSON format.
 * See https://en.wikipedia.org/wiki/GeoJSON
 *
 * A value is accepted when it is a JSON object (optionally wrapped in double quotes) that has a
 * textual "type" member naming one of the supported geometry types (compared case-insensitively)
 * and a "coordinates" member that is a JSON array.
 */
public class SpatialGeoJSON extends LogicalTypeInfinite {
	/** The Regular Expression for this Semantic type. */
	public static final String REGEXP = ".*";

	/** Shared JSON parser - it is never reconfigured after construction, so one instance serves all plugins. */
	private static final ObjectMapper MAPPER = new ObjectMapper();

	/** Upper-cased names of the geometry types recognized in the "type" member. */
	private static final Set<String> GEOMETRY_TYPES = new HashSet<>(Arrays.asList(
			"POINT",
			"MULTIPOINT",
			"LINESTRING",
			"MULTILINESTRING",
			"POLYGON",
			"MULTIPOLYGON"));

	/** Anything shorter than the shortest geometry type name is rejected without further inspection. */
	private static final int MIN_INPUT_LENGTH = "POINT".length();

	/**
	 * Construct a Spatial plugin based on the Plugin Definition.
	 * @param plugin The definition of this plugin.
	 */
	public SpatialGeoJSON(final PluginDefinition plugin) {
		super(plugin);
	}

	/**
	 * Generate a random GeoJSON LineString with between 2 and 9 points.
	 * @return A JSON object with a "coordinates" array of [x, y] pairs followed by a "type" of "LineString".
	 */
	@Override
	public String nextRandom() {
		final StringBuilder s = new StringBuilder();
		s.append("{ \"coordinates\": [");
		final int points = getRandom().nextInt(8) + 2;
		for (int i = 0; i < points; i++) {
			s.append('[').
				append(getRandom().nextDouble() * 100).
				append(", ").
				append(getRandom().nextDouble() * 100).
				append(']');
			if (i != points - 1)
				s.append(", ");
		}

		s.append("], \"type\": \"LineString\" }");

		return s.toString();
	}

	@Override
	public String getRegExp() {
		return REGEXP;
	}

	/**
	 * The Regular Expression (".*") matches far more than valid GeoJSON, so it is reported as incomplete.
	 * @return Always false.
	 */
	@Override
	public boolean isRegExpComplete() {
		return false;
	}

	/**
	 * Decide whether the supplied (already trimmed) value looks like a GeoJSON geometry.
	 * @param trimmed The trimmed input to test.
	 * @return True if the input parses as a JSON object with a supported "type" and a "coordinates" array.
	 */
	private boolean check(final String trimmed) {
		String input = trimmed;
		if (input.length() < MIN_INPUT_LENGTH)
			return false;

		// If the whole field is quoted - then inspect it without the quotes.
		// The length guard above guarantees the unquoted remainder is non-empty.
		if (input.charAt(0) == '"' && input.charAt(input.length() - 1) == '"')
			input = input.substring(1, input.length() - 1);

		// Quick test to see if it might be a JSON structure (note: must see entire JSON field, i.e. not truncated)
		if (input.charAt(0) != '{' || input.charAt(input.length() - 1) != '}')
			return false;

		final JsonNode root;
		try {
			root = MAPPER.readTree(input);
		} catch (IOException e) {
			// Not parseable as JSON - so it cannot be GeoJSON
			return false;
		}

		// The geometry type must be a string, anything else can never name a supported geometry
		final JsonNode typeNode = root.path("type");
		if (!typeNode.isTextual())
			return false;

		// A GeoJSON geometry carries its positions in a "coordinates" array
		if (!root.path("coordinates").isArray())
			return false;

		return GEOMETRY_TYPES.contains(typeNode.asText().toUpperCase(Locale.ROOT));
	}

	/**
	 * Validate a single input, surrounding whitespace is ignored.
	 * The detectMode and count arguments are not used by this plugin.
	 */
	@Override
	public boolean isValid(final String input, final boolean detectMode, final long count) {
		return check(input.trim());
	}

	/**
	 * Test whether a sample is a plausible instance of this Semantic Type.
	 * Samples with no '{' character are rejected immediately, anything else gets the full check.
	 */
	@Override
	public boolean isCandidate(final String trimmed, final StringBuilder compressed, final int[] charCounts, final int[] lastIndex) {
		if (charCounts['{'] == 0)
			return false;

		return check(trimmed);
	}

	/**
	 * Accept the set if the fraction of matching samples reaches the plugin threshold (a percentage).
	 * Only matchCount and realSamples are consulted.
	 */
	@Override
	public PluginAnalysis analyzeSet(final AnalyzerContext context, final long matchCount, final long realSamples, final String currentRegExp,
			final Facts facts, final FiniteMap cardinality, final FiniteMap outliers, final TokenStreams tokenStreams, final AnalysisConfig analysisConfig) {
		return (double) matchCount / realSamples >= getThreshold() / 100.0 ? PluginAnalysis.OK : PluginAnalysis.SIMPLE_NOT_OK;
	}
}

Loading

0 comments on commit 33bbf51

Please sign in to comment.