Skip to content

Commit

Permalink
Major refactoring:
Browse files Browse the repository at this point in the history
removing collation id from utf8string.
  • Loading branch information
dbatomic committed Feb 2, 2024
1 parent f68b511 commit 95a9dfc
Show file tree
Hide file tree
Showing 67 changed files with 230 additions and 352 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ private CollatorFactory() {
// UNICODE case sensitive comparison (ROOT locale, in ICU).
collatorTable[2] = new CollatorInfo("UNICODE", Collator.getInstance(ULocale.ROOT), "153.120.0.0");


// UNICODE case insensitive comparison (ROOT locale, in ICU).
collatorTable[3] = new CollatorInfo("UNICODE_CI", Collator.getInstance(ULocale.ROOT), "153.120.0.0");
collatorTable[3].collator.setStrength(Collator.SECONDARY);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,17 +47,14 @@
* <p>
* Note: This is not designed for general use cases and should not be used outside SQL.
*/
public class UTF8String implements Comparable<UTF8String>, Externalizable, KryoSerializable,
Cloneable {
public class UTF8String implements Externalizable, KryoSerializable, Cloneable {

// These are only updated by readExternal() or read()
@Nonnull
protected Object base;
protected long offset;
protected int numBytes;

private transient int comparatorId;

/** Returns the {@code base} object reference held by this string. */
public Object getBaseObject() {
  return this.base;
}
/** Returns the {@code offset} value held by this string. */
public long getBaseOffset() {
  return this.offset;
}

Expand Down Expand Up @@ -102,24 +99,14 @@ public class UTF8String implements Comparable<UTF8String>, Externalizable, KryoS
private static final UTF8String COMMA_UTF8 = UTF8String.fromString(",");
public static final UTF8String EMPTY_UTF8 = UTF8String.fromString("");

public static final int DefaultCollationId = 0;

/**
* Creates a UTF8String from a byte array, which should be encoded in UTF-8.
*
* Note: `bytes` will be held by the returned UTF8String.
*/
public static UTF8String fromBytes(byte[] bytes) {
if (bytes != null) {
return new UTF8String(bytes, BYTE_ARRAY_OFFSET, bytes.length, DefaultCollationId);
} else {
return null;
}
}

public static UTF8String fromBytes(byte[] bytes, int collatorId) {
if (bytes != null) {
return new UTF8String(bytes, BYTE_ARRAY_OFFSET, bytes.length, collatorId);
return new UTF8String(bytes, BYTE_ARRAY_OFFSET, bytes.length);
} else {
return null;
}
Expand All @@ -132,15 +119,7 @@ public static UTF8String fromBytes(byte[] bytes, int collatorId) {
*/
public static UTF8String fromBytes(byte[] bytes, int offset, int numBytes) {
if (bytes != null) {
return new UTF8String(bytes, BYTE_ARRAY_OFFSET + offset, numBytes, DefaultCollationId);
} else {
return null;
}
}

public static UTF8String fromBytes(byte[] bytes, int offset, int numBytes, int collatorId) {
if (bytes != null) {
return new UTF8String(bytes, BYTE_ARRAY_OFFSET + offset, numBytes, collatorId);
return new UTF8String(bytes, BYTE_ARRAY_OFFSET + offset, numBytes);
} else {
return null;
}
Expand All @@ -150,11 +129,7 @@ public static UTF8String fromBytes(byte[] bytes, int offset, int numBytes, int c
* Creates a UTF8String from the given address (base and offset) and length.
*/
public static UTF8String fromAddress(Object base, long offset, int numBytes) {
return new UTF8String(base, offset, numBytes, DefaultCollationId);
}

public static UTF8String fromAddress(Object base, long offset, int numBytes, int collationId) {
return new UTF8String(base, offset, numBytes, collationId);
return new UTF8String(base, offset, numBytes);
}

/**
Expand All @@ -164,13 +139,6 @@ public static UTF8String fromString(String str) {
return str == null ? null : fromBytes(str.getBytes(StandardCharsets.UTF_8));
}

/**
 * Builds a UTF8String from the UTF-8 encoding of {@code str}, forwarding
 * {@code collatorId} to {@code fromBytes}.
 *
 * @param str the Java string to encode; may be {@code null}
 * @param collatorId id handed through to {@code fromBytes} unchanged
 * @return {@code null} when {@code str} is {@code null}, otherwise the result of
 *     {@code fromBytes}
 */
public static UTF8String fromString(String str, int collatorId) {
  if (str == null) {
    return null;
  }
  final byte[] encoded = str.getBytes(StandardCharsets.UTF_8);
  return fromBytes(encoded, collatorId);
}

/**
* Creates an UTF8String that contains `length` spaces.
*/
Expand All @@ -188,16 +156,15 @@ public static boolean isWhitespaceOrISOControl(int codePoint) {
return Character.isWhitespace(codePoint) || Character.isISOControl(codePoint);
}

protected UTF8String(Object base, long offset, int numBytes, int comparatorId) {
protected UTF8String(Object base, long offset, int numBytes) {
this.base = base;
this.offset = offset;
this.numBytes = numBytes;
this.comparatorId = comparatorId;
}

// for serialization
public UTF8String() {
this(null, 0, 0, DefaultCollationId);
this(null, 0, 0);
}

/**
Expand All @@ -218,16 +185,6 @@ public void writeTo(ByteBuffer buffer) {
buffer.position(pos + numBytes);
}

/**
* Stores {@code comparatorId} on this instance, replacing the previous value.
*
* @param comparatorId the id to record in the {@code comparatorId} field
* @return this same instance (not a copy), so the call can be chained
*/
public UTF8String installCollationAwareComparator(int comparatorId) {
this.comparatorId = comparatorId;
return this;
}

// public UTF8String installCollationAwareComparator(String collationName) {
// this.comparatorId = CollatorFactory.getInstance().collationNameToId(collationName);
// return this;
// }

/**
* Returns a {@link ByteBuffer} wrapping the base object if it is a byte array
* or a copy of the data if the base object is not a byte array.
Expand Down Expand Up @@ -293,9 +250,6 @@ public int numChars() {
* Returns a 64-bit integer that can be used as the prefix used in sorting.
*/
public long getPrefix() {
  // A non-zero comparator id marks the string as collated; prefixes are refused for those.
  final boolean collated = this.comparatorId != 0;
  if (collated) {
    throw new RuntimeException("Can't do prefix on collated string.");
  }
  return ByteArray.getPrefix(this.base, this.offset, this.numBytes);
}

Expand All @@ -314,6 +268,19 @@ public byte[] getBytes() {
}
}

/**
 * Always fails: direct {@code equals} on a UTF8String is deliberately disabled, so every
 * call throws. Use {@code binaryEquals} or {@code collationAwareEquals} instead.
 *
 * <p>{@link UnsupportedOperationException} is a {@link RuntimeException}, so callers that
 * catch {@code RuntimeException} are unaffected.
 *
 * @param other ignored
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
@Override
public boolean equals(final Object other) {
  throw new UnsupportedOperationException("can't call direct equals against utf8 string");
}

/**
 * Compares this string with {@code other} using the comparator that
 * {@code CollatorFactory.getInfoForId} returns for {@code collationId}.
 *
 * @param other the string to compare against
 * @param collationId id used to look up the collation info
 * @return whatever that comparator's {@code compare(this, other)} returns
 */
public int collationAwareCompareTo(final UTF8String other, int collationId) {
  final var collationInfo = CollatorFactory.getInfoForId(collationId);
  return collationInfo.comparator.compare(this, other);
}

/**
 * Tests this string against {@code other} using the equality function that
 * {@code CollatorFactory.getInfoForId} returns for {@code collationId}.
 *
 * @param other the string to test against
 * @param collationId id used to look up the collation info
 * @return whatever that equality function's {@code apply(this, other)} returns
 */
public boolean collationAwareEquals(final UTF8String other, int collationId) {
  final var collationInfo = CollatorFactory.getInfoForId(collationId);
  return collationInfo.equalsFunction.apply(this, other);
}

/**
* Returns a substring of this.
* @param start the position of first code point
Expand Down Expand Up @@ -372,10 +339,6 @@ public UTF8String substringSQL(int pos, int length) {
* Returns whether this contains `substring` or not.
*/
public boolean contains(final UTF8String substring) {
if (this.comparatorId != 0) {
throw new RuntimeException("Can't do contains on collated string.");
}

if (substring.numBytes == 0) {
return true;
}
Expand Down Expand Up @@ -415,10 +378,6 @@ public boolean endsWith(final UTF8String suffix) {
* Returns the upper case of this string
*/
public UTF8String toUpperCase() {
if (this.comparatorId != 0) {
throw new RuntimeException("Can't do toUpperCase on collated string.");
}

if (numBytes == 0) {
return EMPTY_UTF8;
}
Expand Down Expand Up @@ -505,10 +464,6 @@ public UTF8String toTitleCase() {
}

private UTF8String toTitleCaseSlow() {
if (this.comparatorId != 0) {
throw new RuntimeException("Can't do toTitleCaseSlow on collated string.");
}

StringBuilder sb = new StringBuilder();
String s = toString();
sb.append(s);
Expand All @@ -527,10 +482,6 @@ private UTF8String toTitleCaseSlow() {
* 0 will be returned, else the index of match (1-based index)
*/
public int findInSet(UTF8String match) {
if (this.comparatorId != 0) {
throw new RuntimeException("Can't do findInSet on collated string.");
}

if (match.contains(COMMA_UTF8)) {
return 0;
}
Expand Down Expand Up @@ -565,7 +516,7 @@ private UTF8String copyUTF8String(int start, int end) {
int len = end - start + 1;
byte[] newBytes = new byte[len];
copyMemory(base, offset + start, newBytes, BYTE_ARRAY_OFFSET, len);
return UTF8String.fromBytes(newBytes, this.comparatorId);
return UTF8String.fromBytes(newBytes);
}

/**
Expand Down Expand Up @@ -815,9 +766,6 @@ public UTF8String repeat(int times) {
* @return the position of the first occurrence of substr, if not found, -1 returned.
*/
public int indexOf(UTF8String v, int start) {
if (this.comparatorId != 0) {
throw new RuntimeException("Can't do indexOf on collated string.");
}
if (v.numBytes() == 0) {
return 0;
}
Expand Down Expand Up @@ -848,9 +796,6 @@ public int indexOf(UTF8String v, int start) {
* Find the `str` from left to right.
*/
private int find(UTF8String str, int start) {
if (this.comparatorId != 0) {
throw new RuntimeException("Can't do find on collated string.");
}
assert (str.numBytes > 0);
while (start <= numBytes - str.numBytes) {
if (ByteArrayMethods.arrayEquals(base, offset + start, str.base, str.offset, str.numBytes)) {
Expand All @@ -865,9 +810,6 @@ private int find(UTF8String str, int start) {
* Find the `str` from right to left.
*/
private int rfind(UTF8String str, int start) {
if (this.comparatorId != 0) {
throw new RuntimeException("Can't do rfind on collated string.");
}
assert (str.numBytes > 0);
while (start >= 0) {
if (ByteArrayMethods.arrayEquals(base, offset + start, str.base, str.offset, str.numBytes)) {
Expand All @@ -885,9 +827,6 @@ private int rfind(UTF8String str, int start) {
* right) is returned. subStringIndex performs a case-sensitive match when searching for delim.
*/
public UTF8String subStringIndex(UTF8String delim, int count) {
if (this.comparatorId != 0) {
throw new RuntimeException("Can't do subStringIndex on collated string.");
}
if (delim.numBytes == 0 || count == 0) {
return EMPTY_UTF8;
}
Expand Down Expand Up @@ -1012,7 +951,6 @@ public static UTF8String concat(UTF8String... inputs) {

// Compute the total length of the result.
long totalLength = 0;
int commonComparator = inputs[0].comparatorId;
for (UTF8String input : inputs) {
if (input == null) {
return null;
Expand All @@ -1032,8 +970,7 @@ public static UTF8String concat(UTF8String... inputs) {
offset += len;
}

// TODO: Check all the collations.
return fromBytes(result, commonComparator);
return fromBytes(result);
}

/**
Expand Down Expand Up @@ -1089,9 +1026,6 @@ public static UTF8String concatWs(UTF8String separator, UTF8String... inputs) {
}

public UTF8String[] split(UTF8String pattern, int limit) {
if (this.comparatorId != 0) {
throw new RuntimeException("Can't do split on collated string.");
}
// For the empty `pattern` a `split` function ignores trailing empty strings unless original
// string is empty.
if (numBytes() != 0 && pattern.numBytes() == 0) {
Expand All @@ -1111,9 +1045,6 @@ public UTF8String[] split(UTF8String pattern, int limit) {
}

public UTF8String[] splitSQL(UTF8String delimiter, int limit) {
if (this.comparatorId != 0) {
throw new RuntimeException("Can't do split on collated string.");
}
// if delimiter is empty string, skip the regex based splitting directly as regex
// treats empty string as matching anything, thus use the input directly.
if (delimiter.numBytes() == 0) {
Expand All @@ -1128,9 +1059,6 @@ public UTF8String[] splitSQL(UTF8String delimiter, int limit) {
}

private UTF8String[] split(String delimiter, int limit) {
if (this.comparatorId != 0) {
throw new RuntimeException("Can't do split on collated string.");
}
// Java String's split method supports "ignore empty string" behavior when the limit is 0
// whereas other languages do not. To avoid this java specific behavior, we fall back to
// -1 when the limit is 0.
Expand All @@ -1146,9 +1074,6 @@ private UTF8String[] split(String delimiter, int limit) {
}

public UTF8String replace(UTF8String search, UTF8String replace) {
if (this.comparatorId != 0) {
throw new RuntimeException("Can't do replace on collated string.");
}
// This implementation is loosely based on commons-lang3's StringUtils.replace().
if (numBytes == 0 || search.numBytes == 0) {
return this;
Expand All @@ -1175,9 +1100,6 @@ public UTF8String replace(UTF8String search, UTF8String replace) {
}

public UTF8String translate(Map<String, String> dict) {
if (this.comparatorId != 0) {
throw new RuntimeException("Can't do translate on collated string.");
}
String srcStr = this.toString();

StringBuilder sb = new StringBuilder();
Expand Down Expand Up @@ -1490,37 +1412,16 @@ public int binaryCompare(@Nonnull final UTF8String other) {
base, offset, numBytes, other.base, other.offset, other.numBytes);
}

/**
 * Orders this string relative to {@code other} with the comparator that
 * {@code CollatorFactory.getInfoForId} returns for this instance's {@code comparatorId}.
 */
@Override
public int compareTo(@Nonnull final UTF8String other) {
  final var collationInfo = CollatorFactory.getInfoForId(this.comparatorId);
  return collationInfo.comparator.compare(this, other);
}

/** Alias for {@code compareTo(other)}; returns exactly what that call returns. */
public int compare(final UTF8String other) {
  final int ordering = this.compareTo(other);
  return ordering;
}

/**
 * Byte-for-byte equality: true only when both strings have the same number of bytes and
 * those bytes are identical.
 *
 * @param other the string to compare against; must not be {@code null}
 * @return {@code true} if the two byte sequences are identical
 */
public boolean binaryEquals(final UTF8String other) {
  // arrayEquals is given a single length (this.numBytes), so the lengths must be compared
  // first. Without this check a string would "equal" any longer string it is a prefix of,
  // and comparing against a shorter string would read beyond that string's bytes.
  if (numBytes != other.numBytes) {
    return false;
  }
  return ByteArrayMethods.arrayEquals(base, offset, other.base, other.offset, numBytes);
}

/**
 * Returns {@code false} for anything that is not a UTF8String; otherwise defers to the
 * equality function that {@code CollatorFactory.getInfoForId} returns for this instance's
 * {@code comparatorId}.
 */
@Override
public boolean equals(final Object other) {
  if (!(other instanceof UTF8String that)) {
    return false;
  }
  final var collationInfo = CollatorFactory.getInfoForId(this.comparatorId);
  return collationInfo.equalsFunction.apply(this, that);
}

/**
* Levenshtein distance is a metric for measuring the distance of two strings. The distance is
* defined by the minimum number of single-character edits (i.e. insertions, deletions or
* substitutions) that are required to change one of the strings into the other.
*/
public int levenshteinDistance(UTF8String other) {
if (this.comparatorId != 0) {
throw new RuntimeException("Can't do levenshtein on collated string.");
}
// Implementation adopted from
// org.apache.commons.text.similarity.LevenshteinDistance.unlimitedCompare

Expand Down Expand Up @@ -1581,9 +1482,6 @@ public int levenshteinDistance(UTF8String other) {
}

public int levenshteinDistance(UTF8String other, int threshold) {
if (this.comparatorId != 0) {
throw new RuntimeException("Can't do levenshtein on collated string.");
}
// Implementation adopted from
// org.apache.commons.text.similarity.LevenshteinDistance.limitedCompare

Expand Down Expand Up @@ -1673,16 +1571,10 @@ public int levenshteinDistance(UTF8String other, int threshold) {
return -1;
}

/**
 * Hashes this string with the hash function that {@code CollatorFactory.getInfoForId}
 * returns for this instance's {@code comparatorId}.
 */
@Override
public int hashCode() {
  final var collationInfo = CollatorFactory.getInfoForId(this.comparatorId);
  return collationInfo.hashFunction.apply(this);
}

/**
 * Hashes the raw bytes of this string ({@code base}, {@code offset}, {@code numBytes})
 * with {@code Murmur3_x86_32.hashUnsafeBytes}, using the fixed seed 42.
 */
public int binaryHash() {
  final int seed = 42;
  return Murmur3_x86_32.hashUnsafeBytes(this.base, this.offset, this.numBytes, seed);
}


/**
* Soundex mapping table
*/
Expand All @@ -1695,9 +1587,6 @@ public int binaryHash() {
* https://en.wikipedia.org/wiki/Soundex
*/
public UTF8String soundex() {
if (this.comparatorId != 0) {
throw new RuntimeException("Can't do levenshtein on collated string.");
}
if (numBytes == 0) {
return EMPTY_UTF8;
}
Expand Down
Loading

0 comments on commit 95a9dfc

Please sign in to comment.