Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

expect regex extracted tokens in database bloom filters #103

Merged
merged 26 commits into from
Nov 11, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
b9d2731
expect regex extracted tokens in database bloom filters
elliVM Oct 18, 2024
465cf83
add RegexExtractedValueTest
elliVM Oct 22, 2024
7d4559d
remove unnecessary test
elliVM Oct 22, 2024
945e839
TokenizedValue: call tokenizer only when needed, clean up tests
elliVM Oct 22, 2024
2a8d97c
clear up exception message in BloomFilterFromRecord
elliVM Oct 22, 2024
b3ac348
BloomFilterFromRecord: remove ULong.longValue() from constructor, cla…
elliVM Oct 22, 2024
0d72406
set logger level to debug when indexstatement is reached with bloom d…
elliVM Oct 22, 2024
a682789
remove consumer class and use for loop in TableFilters
elliVM Oct 22, 2024
b354614
use try with resources and add comments on equals methods about DSLCo…
elliVM Oct 22, 2024
5f907c8
add Tokenizable interface and decorators, rename BloomFilterFromRecor…
elliVM Oct 23, 2024
0bf7410
add missing assertion to test
elliVM Oct 23, 2024
31e0bf9
move method after constructors
elliVM Oct 23, 2024
f370356
fix hard coded filter size and fix testing that different sizes are a…
elliVM Oct 25, 2024
eda9878
refactor code to simplify, add testing for SQL temp table values crea…
elliVM Oct 28, 2024
aefb66a
use UncheckedIOException constructor
elliVM Oct 29, 2024
73efeb9
TableFilters returns a batch that CategoryTableWithFilters executes
elliVM Oct 29, 2024
441e40b
add test for SafeBatch
elliVM Oct 29, 2024
4844ebd
update comments and clean up code, add constructors for RegexLikeCond…
elliVM Oct 29, 2024
145fcb6
use qualified names update tests
elliVM Oct 29, 2024
40c2165
more descriptive naming of methods and variables, update comments, ja…
elliVM Oct 30, 2024
6513861
apply spotless
elliVM Oct 30, 2024
adc7faf
add missing hashCode() methods
elliVM Oct 31, 2024
2c01634
don't wrap jooq.Batch object and execute in CategoryTableWithFilters,…
elliVM Oct 31, 2024
34efb98
throw exception if search term filter tokens size larger than expecte…
elliVM Nov 1, 2024
7412ec2
improve TableFiltersTest and TokensAsStringsTest
elliVM Nov 1, 2024
7dea6f7
allow search term filter tokens to be larger than expected tokens
elliVM Nov 11, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,12 @@
<version>2.2.224</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>nl.jqno.equalsverifier</groupId>
<artifactId>equalsverifier</artifactId>
<version>3.16.1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
/*
* Teragrep Archive Datasource (pth_06)
* Copyright (C) 2021-2024 Suomen Kanuuna Oy
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
* Additional permission under GNU Affero General Public License version 3
* section 7
*
* If you modify this Program, or any covered work, by linking or combining it
* with other code, such other code is not for that reason alone subject to any
* of the requirements of the GNU Affero GPL version 3 as long as this Program
* is the same Program as licensed from Suomen Kanuuna Oy without any additional
* modifications.
*
* Supplemented terms under GNU Affero General Public License version 3
* section 7
*
* Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
* versions must be marked as "Modified version of" The Program.
*
* Names of the licensors and authors may not be used for publicity purposes.
*
* No rights are granted for use of trade names, trademarks, or service marks
* which are in The Program if any.
*
* Licensee must indemnify licensors and authors for any liability that these
* contractual assumptions impose on licensors and authors.
*
* To the extent this program is licensed as part of the Commercial versions of
* Teragrep, the applicable Commercial License may apply to this file if you as
* a licensee so wish it.
*/
package com.teragrep.pth_06.planner.bloomfilter;

import org.apache.spark.util.sketch.BloomFilter;
import org.jooq.Record;
import org.jooq.Table;
import org.jooq.impl.DSL;
import org.jooq.types.ULong;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Objects;
import java.util.Set;

import static com.teragrep.pth_06.jooq.generated.bloomdb.Bloomdb.BLOOMDB;

/**
* Extracts filter type from record, creates a bloom filter and returns the filters byte array
*/
public final class BloomFilterFromRecord {

private final Logger LOGGER = LoggerFactory.getLogger(BloomFilterFromRecord.class);
private final Long expected;
private final Double fpp;
private final String pattern;
private final String searchTerm;

private BloomFilter create() {
if (expected == null || fpp == null) {
LOGGER
.error(
"Null field while creating bloom filter expected <{}>, fpp <{}>, pattern <{}>, search term <{}>",
expected, fpp, pattern, searchTerm
);
throw new RuntimeException("Object field was null");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this exception message could be a bit clearer

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Clarified the exception messages, added tests for the exceptions, and removed the use of the .longValue() method in the constructor, which would lead to an NPE.

}
final BloomFilter filter = BloomFilter.create(expected, fpp);
// if no pattern use to tokenized value (currently BLOOMDB.FILTERTYPE.PATTERN is NOT NULL)
if (pattern == null) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

object is configurable

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Refactored object to be not configurable

LOGGER.info("Table pattern was null using tokenizer to generate tokens");
new TokenizedValue(searchTerm).stringTokens().forEach(filter::put);
}
else { // get tokens using regex
final Set<String> tokens = new RegexExtractedValue(searchTerm, pattern).tokens();
LOGGER.info("Insert pattern <{}> tokens to temp table filter <{}>", pattern, tokens);
if (tokens.isEmpty()) {
throw new IllegalStateException(
"Trying to insert empty filter, pattern match joined table should always have tokens"
);
}
tokens.forEach(filter::put);
}
return filter;
}

/**
 * Creates the filter source from a record that carries the filter type information.
 * <p>
 * Reads the {@code expectedElements} and {@code targetFpp} fields qualified with the name of {@code table} and the
 * {@code BLOOMDB.FILTERTYPE.PATTERN} field from {@code record}. A null {@code expectedElements} value is passed on
 * as null instead of being unboxed here, so it is reported by the null check done when the filter is created
 * rather than surfacing as a NullPointerException from this constructor.
 *
 * @param record     record holding the expectedElements, targetFpp and pattern values
 * @param table      table whose name qualifies the expectedElements and targetFpp fields
 * @param searchTerm search term the filter tokens are generated from
 */
public BloomFilterFromRecord(Record record, Table<?> table, String searchTerm) {
    this(
            nullableLong(record.getValue(DSL.field(DSL.name(table.getName(), "expectedElements"), ULong.class))),
            record.getValue(DSL.field(DSL.name(table.getName(), "targetFpp"), Double.class)),
            record.getValue(BLOOMDB.FILTERTYPE.PATTERN, String.class),
            searchTerm
    );
}

/**
 * Converts an unsigned long to a boxed Long, keeping null as null.
 *
 * @param value value read from the record, may be null
 * @return the long value boxed, or null when {@code value} is null
 */
private static Long nullableLong(final ULong value) {
    if (value == null) {
        return null;
    }
    return value.longValue();
}

/**
 * Creates the filter source directly from its values.
 *
 * @param expected   expected number of elements, given to {@code BloomFilter.create} when the filter is built
 * @param fpp        false positive probability, given to {@code BloomFilter.create} when the filter is built
 * @param pattern    regex pattern used to extract tokens from the search term; when null the search term is
 *                   tokenized instead
 * @param searchTerm search term the filter tokens are generated from
 */
public BloomFilterFromRecord(Long expected, Double fpp, String pattern, String searchTerm) {
    // plain field capture, no validation is done at construction time
    this.searchTerm = searchTerm;
    this.pattern = pattern;
    this.fpp = fpp;
    this.expected = expected;
}

public byte[] bytes() {
final BloomFilter filter = create();
final ByteArrayOutputStream filterBAOS = new ByteArrayOutputStream();
try {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not try-with-resources?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

refactored to use try-with-resources

filter.writeTo(filterBAOS);
filterBAOS.close();
}
catch (IOException e) {
throw new UncheckedIOException(new IOException("Error writing filter bytes: " + e.getMessage()));
}
return filterBAOS.toByteArray();
}

/**
 * Compares expected element count, false positive probability, pattern and search term.
 * <p>
 * All fields are compared null-safely: the class accepts null values at construction, so comparing two such
 * objects must not throw.
 *
 * @param object object compared
 * @return true if the object is of the same class and all four fields are equal
 */
@Override
public boolean equals(final Object object) {
    if (this == object) {
        return true;
    }
    if (object == null || getClass() != object.getClass()) {
        return false;
    }
    final BloomFilterFromRecord cast = (BloomFilterFromRecord) object;
    return Objects.equals(expected, cast.expected) && Objects.equals(fpp, cast.fpp)
            && Objects.equals(pattern, cast.pattern) && Objects.equals(searchTerm, cast.searchTerm);
}

/**
 * Hash code computed from the expected element count, false positive probability, pattern and search term.
 *
 * @return combined hash of the four fields
 */
@Override
public int hashCode() {
    final int combined = Objects.hash(expected, fpp, pattern, searchTerm);
    return combined;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
* Teragrep, the applicable Commercial License may apply to this file if you as
* a licensee so wish it.
*/
package com.teragrep.pth_06.planner;
package com.teragrep.pth_06.planner.bloomfilter;

import com.teragrep.pth_06.planner.walker.conditions.QueryCondition;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
* Teragrep, the applicable Commercial License may apply to this file if you as
* a licensee so wish it.
*/
package com.teragrep.pth_06.planner;
package com.teragrep.pth_06.planner.bloomfilter;

import com.teragrep.pth_06.config.ConditionConfig;
import com.teragrep.pth_06.planner.walker.conditions.CategoryTableCondition;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
* Teragrep, the applicable Commercial License may apply to this file if you as
* a licensee so wish it.
*/
package com.teragrep.pth_06.planner;
package com.teragrep.pth_06.planner.bloomfilter;

import com.teragrep.pth_06.planner.walker.conditions.QueryCondition;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,126 +43,70 @@
* Teragrep, the applicable Commercial License may apply to this file if you as
* a licensee so wish it.
*/
package com.teragrep.pth_06.planner;
package com.teragrep.pth_06.planner.bloomfilter;

import com.teragrep.blf_01.Token;
import org.apache.spark.util.sketch.BloomFilter;
import org.jooq.*;
import org.jooq.DSLContext;
import org.jooq.Field;
import org.jooq.Record;
import org.jooq.Table;
import org.jooq.impl.DSL;
import org.jooq.types.ULong;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.regex.Pattern;
import java.util.Objects;
import java.util.function.Consumer;

import static com.teragrep.pth_06.jooq.generated.bloomdb.Bloomdb.BLOOMDB;
import static org.jooq.impl.SQLDataType.BIGINTUNSIGNED;

/**
* Filter types of a table that can be inserted into the tables category table
*/
public final class TableFilters {
public final class FilterFromRecordToCategoryTableConsumer implements Consumer<Record> {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please use the object-oriented way instead of the functional way for iterating over the records, i.e. a for loop instead of a Consumer.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

refactored to use for loop


private final DSLContext ctx;
private final Table<?> table;
private final long bloomTermId;
private final TokenizedValue value;
private final TableRecords recordsInMetadata;

public TableFilters(DSLContext ctx, Table<?> table, long bloomTermId, String input) {
this(
ctx,
table,
bloomTermId,
new TokenizedValue(input),
new TableFilterTypesFromMetadata(ctx, table, bloomTermId)
);
}
private final String searchTerm;

public TableFilters(
public FilterFromRecordToCategoryTableConsumer(
DSLContext ctx,
Table<?> table,
long bloomTermId,
TokenizedValue value,
TableFilterTypesFromMetadata recordsInMetadata
String searchTerm
) {
this.ctx = ctx;
this.table = table;
this.bloomTermId = bloomTermId;
this.value = value;
this.recordsInMetadata = recordsInMetadata;
this.searchTerm = searchTerm;
}

/**
* Extracts filter type from record, creates a bloom filter and returns the filters byte array
*
* @param record record with filter info
* @return byte[] of the created filter
*/
private byte[] filterBytesFromRecord(final Record record) {
final ULong expected = record.getValue(DSL.field(DSL.name(table.getName(), "expectedElements"), ULong.class));
final Double fpp = record.getValue(DSL.field(DSL.name(table.getName(), "targetFpp"), Double.class));
final String pattern = record.getValue(BLOOMDB.FILTERTYPE.PATTERN, String.class);
final BloomFilter filter = BloomFilter.create(expected.longValue(), fpp);
final Pattern compiled = Pattern.compile(pattern);
boolean isEmpty = true;
for (final Token token : value.tokens()) {
final String tokenString = token.toString();
if (compiled.matcher(tokenString).matches()) {
isEmpty = false;
filter.put(tokenString);
}
}
if (isEmpty) {
throw new IllegalStateException("Trying to insert empty filter");
}
final ByteArrayOutputStream filterBAOS = new ByteArrayOutputStream();
try {
filter.writeTo(filterBAOS);
filterBAOS.close();
}
catch (IOException e) {
throw new UncheckedIOException(new IOException("Error writing filter bytes: " + e.getMessage()));
}
return filterBAOS.toByteArray();
}

private void insertFilterRecordToCategoryTable(final Record record) {
@Override
public void accept(final Record record) {
final Table<Record> categoryTable = DSL.table(DSL.name(("term_" + bloomTermId + "_" + this.table.getName())));
final Field<?>[] insertFields = {
DSL.field("term_id", BIGINTUNSIGNED.nullable(false)),
DSL.field("type_id", BIGINTUNSIGNED.nullable(false)),
DSL.field(DSL.name(categoryTable.getName(), "filter"), byte[].class)
};
final BloomFilterFromRecord filterFromRecord = new BloomFilterFromRecord(record, table, searchTerm);
final Field<?>[] valueFields = {
DSL.val(bloomTermId, ULong.class),
DSL.val(record.getValue(BLOOMDB.FILTERTYPE.ID), ULong.class),
DSL.val(filterBytesFromRecord(record), byte[].class)
DSL.val(filterFromRecord.bytes(), byte[].class)
};
ctx.insertInto(categoryTable).columns(insertFields).values(valueFields).execute();
}

public void insertFiltersIntoCategoryTable() {
recordsInMetadata.toResult().forEach(this::insertFilterRecordToCategoryTable);
}

/**
* Expects DSLContext values to be the same instance
*
* @param object object compared
* @return true if object is equal
*/
@Override
public boolean equals(final Object object) {
if (this == object)
return true;
if (object == null)
return false;
if (object.getClass() != this.getClass())
if (object == null || this.getClass() != object.getClass())
return false;
final TableFilters cast = (TableFilters) object;
return this.ctx == cast.ctx && this.value.equals(cast.value) && this.table.equals(cast.table)
&& this.bloomTermId == cast.bloomTermId;
final FilterFromRecordToCategoryTableConsumer cast = (FilterFromRecordToCategoryTableConsumer) object;
return bloomTermId == cast.bloomTermId && ctx == cast.ctx && table.equals(cast.table)
&& searchTerm.equals(cast.searchTerm);
}

@Override
public int hashCode() {
return Objects.hash(ctx, table, bloomTermId, searchTerm);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
* Teragrep, the applicable Commercial License may apply to this file if you as
* a licensee so wish it.
*/
package com.teragrep.pth_06.planner;
package com.teragrep.pth_06.planner.bloomfilter;

import com.teragrep.pth_06.planner.walker.conditions.PatternMatchCondition;
import com.teragrep.pth_06.planner.walker.conditions.QueryCondition;
Expand Down
Loading