Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: use proper indexes for full text search #4787

Merged
merged 16 commits into from
Jul 22, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 42 additions & 1 deletion pkg/query-service/app/logs/v3/json_filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ const (
ARRAY_INT64 = "Array(Int64)"
ARRAY_FLOAT64 = "Array(Float64)"
ARRAY_BOOL = "Array(Bool)"
NGRAM_SIZE = 4
)

var dataTypeMapping = map[string]string{
Expand Down Expand Up @@ -106,6 +107,29 @@ func getJSONFilterKey(key v3.AttributeKey, op v3.FilterOperator, isArray bool) (
return keyname, nil
}

// takes the path and the values and generates where clauses for better usage of index
func getPathIndexFilter(path string) string {
filters := []string{}
keyArr := strings.Split(path, ".")
if len(keyArr) < 2 {
return ""
}
nityanandagohain marked this conversation as resolved.
Show resolved Hide resolved

for i, key := range keyArr {
if i == 0 {
continue
}
key = strings.TrimSuffix(key, "[*]")
if len(key) >= NGRAM_SIZE {
filters = append(filters, strings.ToLower(key))
}
}
if len(filters) > 0 {
return fmt.Sprintf("lower(body) like '%%%s%%'", strings.Join(filters, "%"))
}
return ""
}

func GetJSONFilter(item v3.FilterItem) (string, error) {

dataType := item.Key.DataType
Expand Down Expand Up @@ -153,11 +177,28 @@ func GetJSONFilter(item v3.FilterItem) (string, error) {
return "", fmt.Errorf("unsupported operator: %s", op)
}

filters := []string{}

pathFilter := getPathIndexFilter(item.Key.Key)
if pathFilter != "" {
filters = append(filters, pathFilter)
}
if op == v3.FilterOperatorContains ||
op == v3.FilterOperatorEqual ||
op == v3.FilterOperatorHas {
val, ok := item.Value.(string)
if ok && len(val) >= NGRAM_SIZE {
filters = append(filters, fmt.Sprintf("lower(body) like '%%%s%%'", strings.ToLower(val)))
}
}

// add exists check for non array items as default values of int/float/bool will corrupt the results
if !isArray && !(item.Operator == v3.FilterOperatorExists || item.Operator == v3.FilterOperatorNotExists) {
existsFilter := fmt.Sprintf("JSON_EXISTS(body, '$.%s')", getPath(strings.Split(item.Key.Key, ".")[1:]))
filter = fmt.Sprintf("%s AND %s", existsFilter, filter)
}

return filter, nil
filters = append(filters, filter)

return strings.Join(filters, " AND "), nil
}
24 changes: 12 additions & 12 deletions pkg/query-service/app/logs/v3/json_filter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ var testGetJSONFilterData = []struct {
Operator: "has",
Value: "index_service",
},
Filter: "has(JSONExtract(JSON_QUERY(body, '$.\"requestor_list\"[*]'), 'Array(String)'), 'index_service')",
Filter: "lower(body) like '%requestor_list%' AND lower(body) like '%index_service%' AND has(JSONExtract(JSON_QUERY(body, '$.\"requestor_list\"[*]'), 'Array(String)'), 'index_service')",
},
{
Name: "Array membership int64",
Expand All @@ -181,7 +181,7 @@ var testGetJSONFilterData = []struct {
Operator: "has",
Value: 2,
},
Filter: "has(JSONExtract(JSON_QUERY(body, '$.\"int_numbers\"[*]'), '" + ARRAY_INT64 + "'), 2)",
Filter: "lower(body) like '%int_numbers%' AND has(JSONExtract(JSON_QUERY(body, '$.\"int_numbers\"[*]'), '" + ARRAY_INT64 + "'), 2)",
},
{
Name: "Array membership float64",
Expand All @@ -194,7 +194,7 @@ var testGetJSONFilterData = []struct {
Operator: "nhas",
Value: 2.2,
},
Filter: "NOT has(JSONExtract(JSON_QUERY(body, '$.\"nested_num\"[*].\"float_nums\"[*]'), '" + ARRAY_FLOAT64 + "'), 2.200000)",
Filter: "lower(body) like '%nested_num%float_nums%' AND NOT has(JSONExtract(JSON_QUERY(body, '$.\"nested_num\"[*].\"float_nums\"[*]'), '" + ARRAY_FLOAT64 + "'), 2.200000)",
},
{
Name: "Array membership bool",
Expand All @@ -207,7 +207,7 @@ var testGetJSONFilterData = []struct {
Operator: "has",
Value: true,
},
Filter: "has(JSONExtract(JSON_QUERY(body, '$.\"bool\"[*]'), '" + ARRAY_BOOL + "'), true)",
Filter: "lower(body) like '%bool%' AND has(JSONExtract(JSON_QUERY(body, '$.\"bool\"[*]'), '" + ARRAY_BOOL + "'), true)",
},
{
Name: "eq operator",
Expand All @@ -220,7 +220,7 @@ var testGetJSONFilterData = []struct {
Operator: "=",
Value: "hello",
},
Filter: "JSON_EXISTS(body, '$.\"message\"') AND JSON_VALUE(body, '$.\"message\"') = 'hello'",
Filter: "lower(body) like '%message%' AND lower(body) like '%hello%' AND JSON_EXISTS(body, '$.\"message\"') AND JSON_VALUE(body, '$.\"message\"') = 'hello'",
},
{
Name: "eq operator number",
Expand All @@ -233,7 +233,7 @@ var testGetJSONFilterData = []struct {
Operator: "=",
Value: 1,
},
Filter: "JSON_EXISTS(body, '$.\"status\"') AND JSONExtract(JSON_VALUE(body, '$.\"status\"'), '" + INT64 + "') = 1",
Filter: "lower(body) like '%status%' AND JSON_EXISTS(body, '$.\"status\"') AND JSONExtract(JSON_VALUE(body, '$.\"status\"'), '" + INT64 + "') = 1",
},
{
Name: "neq operator number",
Expand All @@ -246,7 +246,7 @@ var testGetJSONFilterData = []struct {
Operator: "=",
Value: 1.1,
},
Filter: "JSON_EXISTS(body, '$.\"status\"') AND JSONExtract(JSON_VALUE(body, '$.\"status\"'), '" + FLOAT64 + "') = 1.100000",
Filter: "lower(body) like '%status%' AND JSON_EXISTS(body, '$.\"status\"') AND JSONExtract(JSON_VALUE(body, '$.\"status\"'), '" + FLOAT64 + "') = 1.100000",
},
{
Name: "eq operator bool",
Expand All @@ -259,7 +259,7 @@ var testGetJSONFilterData = []struct {
Operator: "=",
Value: true,
},
Filter: "JSON_EXISTS(body, '$.\"boolkey\"') AND JSONExtract(JSON_VALUE(body, '$.\"boolkey\"'), '" + BOOL + "') = true",
Filter: "lower(body) like '%boolkey%' AND JSON_EXISTS(body, '$.\"boolkey\"') AND JSONExtract(JSON_VALUE(body, '$.\"boolkey\"'), '" + BOOL + "') = true",
},
{
Name: "greater than operator",
Expand All @@ -272,7 +272,7 @@ var testGetJSONFilterData = []struct {
Operator: ">",
Value: 1,
},
Filter: "JSON_EXISTS(body, '$.\"status\"') AND JSONExtract(JSON_VALUE(body, '$.\"status\"'), '" + INT64 + "') > 1",
Filter: "lower(body) like '%status%' AND JSON_EXISTS(body, '$.\"status\"') AND JSONExtract(JSON_VALUE(body, '$.\"status\"'), '" + INT64 + "') > 1",
},
{
Name: "regex operator",
Expand All @@ -285,7 +285,7 @@ var testGetJSONFilterData = []struct {
Operator: "regex",
Value: "a*",
},
Filter: "JSON_EXISTS(body, '$.\"message\"') AND match(JSON_VALUE(body, '$.\"message\"'), 'a*')",
Filter: "lower(body) like '%message%' AND JSON_EXISTS(body, '$.\"message\"') AND match(JSON_VALUE(body, '$.\"message\"'), 'a*')",
},
{
Name: "contains operator",
Expand All @@ -298,7 +298,7 @@ var testGetJSONFilterData = []struct {
Operator: "contains",
Value: "a",
},
Filter: "JSON_EXISTS(body, '$.\"message\"') AND JSON_VALUE(body, '$.\"message\"') ILIKE '%a%'",
Filter: "lower(body) like '%message%' AND JSON_EXISTS(body, '$.\"message\"') AND JSON_VALUE(body, '$.\"message\"') ILIKE '%a%'",
},
{
Name: "exists",
Expand All @@ -311,7 +311,7 @@ var testGetJSONFilterData = []struct {
Operator: "exists",
Value: "",
},
Filter: "JSON_EXISTS(body, '$.\"message\"')",
Filter: "lower(body) like '%message%' AND JSON_EXISTS(body, '$.\"message\"')",
},
}

Expand Down
17 changes: 12 additions & 5 deletions pkg/query-service/app/logs/v3/query_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@ var logOperators = map[v3.FilterOperator]string{
v3.FilterOperatorLessThanOrEq: "<=",
v3.FilterOperatorGreaterThan: ">",
v3.FilterOperatorGreaterThanOrEq: ">=",
v3.FilterOperatorLike: "ILIKE",
v3.FilterOperatorNotLike: "NOT ILIKE",
v3.FilterOperatorContains: "ILIKE",
v3.FilterOperatorNotContains: "NOT ILIKE",
v3.FilterOperatorLike: "LIKE",
v3.FilterOperatorNotLike: "NOT LIKE",
v3.FilterOperatorContains: "LIKE",
v3.FilterOperatorNotContains: "NOT LIKE",
v3.FilterOperatorRegex: "match(%s, %s)",
v3.FilterOperatorNotRegex: "NOT match(%s, %s)",
v3.FilterOperatorIn: "IN",
Expand Down Expand Up @@ -192,10 +192,17 @@ func buildLogsTimeSeriesFilterQuery(fs *v3.FilterSet, groupBy []v3.AttributeKey,
conditions = append(conditions, fmt.Sprintf(logsOp, columnName, fmtVal))
case v3.FilterOperatorContains, v3.FilterOperatorNotContains:
columnName := getClickhouseColumnName(item.Key)
conditions = append(conditions, fmt.Sprintf("%s %s '%%%s%%'", columnName, logsOp, item.Value))
conditions = append(conditions, fmt.Sprintf("lower(%s) %s lower('%%%s%%')", columnName, logsOp, item.Value))
default:
columnName := getClickhouseColumnName(item.Key)
fmtVal := utils.ClickHouseFormattedValue(value)

// lower-case both the column and the pattern for like / not like
if op == v3.FilterOperatorLike || op == v3.FilterOperatorNotLike {
columnName = fmt.Sprintf("lower(%s)", columnName)
nityanandagohain marked this conversation as resolved.
Show resolved Hide resolved
fmtVal = strings.ToLower(fmtVal)
nityanandagohain marked this conversation as resolved.
Show resolved Hide resolved
}

conditions = append(conditions, fmt.Sprintf("%s %s %s", columnName, logsOp, fmtVal))
}
} else {
Expand Down
22 changes: 11 additions & 11 deletions pkg/query-service/app/logs/v3/query_builder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ var timeSeriesFilterQueryData = []struct {
FilterSet: &v3.FilterSet{Operator: "AND", Items: []v3.FilterItem{
{Key: v3.AttributeKey{Key: "host", DataType: v3.AttributeKeyDataTypeString, Type: v3.AttributeKeyTypeTag}, Value: "102.%", Operator: "like"},
}},
ExpectedFilter: "attributes_string_value[indexOf(attributes_string_key, 'host')] ILIKE '102.%'",
ExpectedFilter: "lower(attributes_string_value[indexOf(attributes_string_key, 'host')]) LIKE '102.%'",
},
{
Name: "Test IN",
Expand Down Expand Up @@ -185,14 +185,14 @@ var timeSeriesFilterQueryData = []struct {
FilterSet: &v3.FilterSet{Operator: "AND", Items: []v3.FilterItem{
{Key: v3.AttributeKey{Key: "host", DataType: v3.AttributeKeyDataTypeString, Type: v3.AttributeKeyTypeTag}, Value: "102.", Operator: "contains"},
}},
ExpectedFilter: "attributes_string_value[indexOf(attributes_string_key, 'host')] ILIKE '%102.%'",
ExpectedFilter: "lower(attributes_string_value[indexOf(attributes_string_key, 'host')]) LIKE lower('%102.%')",
},
{
Name: "Test not contains",
FilterSet: &v3.FilterSet{Operator: "AND", Items: []v3.FilterItem{
{Key: v3.AttributeKey{Key: "host", DataType: v3.AttributeKeyDataTypeString, Type: v3.AttributeKeyTypeTag}, Value: "102.", Operator: "ncontains"},
}},
ExpectedFilter: "attributes_string_value[indexOf(attributes_string_key, 'host')] NOT ILIKE '%102.%'",
ExpectedFilter: "lower(attributes_string_value[indexOf(attributes_string_key, 'host')]) NOT LIKE lower('%102.%')",
},
{
Name: "Test regex",
Expand All @@ -214,15 +214,15 @@ var timeSeriesFilterQueryData = []struct {
{Key: v3.AttributeKey{Key: "host", DataType: v3.AttributeKeyDataTypeString, Type: v3.AttributeKeyTypeTag}, Value: "102.", Operator: "ncontains"},
}},
GroupBy: []v3.AttributeKey{{Key: "host", DataType: v3.AttributeKeyDataTypeString, Type: v3.AttributeKeyTypeTag}},
ExpectedFilter: "attributes_string_value[indexOf(attributes_string_key, 'host')] NOT ILIKE '%102.%' AND has(attributes_string_key, 'host')",
ExpectedFilter: "lower(attributes_string_value[indexOf(attributes_string_key, 'host')]) NOT LIKE lower('%102.%') AND has(attributes_string_key, 'host')",
},
{
Name: "Test groupBy isColumn",
FilterSet: &v3.FilterSet{Operator: "AND", Items: []v3.FilterItem{
{Key: v3.AttributeKey{Key: "host", DataType: v3.AttributeKeyDataTypeString, Type: v3.AttributeKeyTypeTag}, Value: "102.", Operator: "ncontains"},
}},
GroupBy: []v3.AttributeKey{{Key: "host", DataType: v3.AttributeKeyDataTypeString, Type: v3.AttributeKeyTypeTag, IsColumn: true}},
ExpectedFilter: "attributes_string_value[indexOf(attributes_string_key, 'host')] NOT ILIKE '%102.%' AND `attribute_string_host_exists`=true",
ExpectedFilter: "lower(attributes_string_value[indexOf(attributes_string_key, 'host')]) NOT LIKE lower('%102.%') AND `attribute_string_host_exists`=true",
},
{
Name: "Wrong data",
Expand All @@ -236,7 +236,7 @@ var timeSeriesFilterQueryData = []struct {
FilterSet: &v3.FilterSet{Operator: "AND", Items: []v3.FilterItem{
{Key: v3.AttributeKey{Key: "body", DataType: v3.AttributeKeyDataTypeString, Type: v3.AttributeKeyTypeTag}, Value: "%test%", Operator: "like"},
}},
ExpectedFilter: "attributes_string_value[indexOf(attributes_string_key, 'body')] ILIKE '%test%'",
ExpectedFilter: "lower(attributes_string_value[indexOf(attributes_string_key, 'body')]) LIKE '%test%'",
},
{
Name: "Test exists on top level field",
Expand Down Expand Up @@ -845,7 +845,7 @@ var testBuildLogsQueryData = []struct {
},
},
TableName: "logs",
ExpectedQuery: "SELECT toStartOfInterval(fromUnixTimestamp64Nano(timestamp), INTERVAL 60 SECOND) AS ts, toFloat64(count(distinct(attributes_string_value[indexOf(attributes_string_key, 'name')]))) as value from signoz_logs.distributed_logs where (timestamp >= 1680066360726210000 AND timestamp <= 1680066458000000000) AND body ILIKE '%test%' AND has(attributes_string_key, 'name') group by ts having value > 10 order by value DESC",
ExpectedQuery: "SELECT toStartOfInterval(fromUnixTimestamp64Nano(timestamp), INTERVAL 60 SECOND) AS ts, toFloat64(count(distinct(attributes_string_value[indexOf(attributes_string_key, 'name')]))) as value from signoz_logs.distributed_logs where (timestamp >= 1680066360726210000 AND timestamp <= 1680066458000000000) AND lower(body) LIKE '%test%' AND has(attributes_string_key, 'name') group by ts having value > 10 order by value DESC",
},
{
Name: "Test attribute with same name as top level key",
Expand All @@ -871,7 +871,7 @@ var testBuildLogsQueryData = []struct {
},
},
TableName: "logs",
ExpectedQuery: "SELECT toStartOfInterval(fromUnixTimestamp64Nano(timestamp), INTERVAL 60 SECOND) AS ts, toFloat64(count(distinct(attributes_string_value[indexOf(attributes_string_key, 'name')]))) as value from signoz_logs.distributed_logs where (timestamp >= 1680066360726210000 AND timestamp <= 1680066458000000000) AND attributes_string_value[indexOf(attributes_string_key, 'body')] ILIKE '%test%' AND has(attributes_string_key, 'name') group by ts having value > 10 order by value DESC",
ExpectedQuery: "SELECT toStartOfInterval(fromUnixTimestamp64Nano(timestamp), INTERVAL 60 SECOND) AS ts, toFloat64(count(distinct(attributes_string_value[indexOf(attributes_string_key, 'name')]))) as value from signoz_logs.distributed_logs where (timestamp >= 1680066360726210000 AND timestamp <= 1680066458000000000) AND lower(attributes_string_value[indexOf(attributes_string_key, 'body')]) LIKE '%test%' AND has(attributes_string_key, 'name') group by ts having value > 10 order by value DESC",
},

// Tests for table panel type
Expand Down Expand Up @@ -958,7 +958,7 @@ var testBuildLogsQueryData = []struct {
},
},
TableName: "logs",
ExpectedQuery: "SELECT now() as ts, attributes_string_value[indexOf(attributes_string_key, 'name')] as `name`, toFloat64(count(*)) as value from signoz_logs.distributed_logs where (timestamp >= 1680066360726210000 AND timestamp <= 1680066458000000000) AND JSON_EXISTS(body, '$.\"message\"') AND JSON_VALUE(body, '$.\"message\"') ILIKE '%a%' AND has(attributes_string_key, 'name') group by `name` order by `name` DESC",
ExpectedQuery: "SELECT now() as ts, attributes_string_value[indexOf(attributes_string_key, 'name')] as `name`, toFloat64(count(*)) as value from signoz_logs.distributed_logs where (timestamp >= 1680066360726210000 AND timestamp <= 1680066458000000000) AND lower(body) like '%message%' AND JSON_EXISTS(body, '$.\"message\"') AND JSON_VALUE(body, '$.\"message\"') ILIKE '%a%' AND has(attributes_string_key, 'name') group by `name` order by `name` DESC",
},
{
Name: "TABLE: Test count with JSON Filter Array, groupBy, orderBy",
Expand Down Expand Up @@ -992,7 +992,7 @@ var testBuildLogsQueryData = []struct {
},
},
TableName: "logs",
ExpectedQuery: "SELECT now() as ts, attributes_string_value[indexOf(attributes_string_key, 'name')] as `name`, toFloat64(count(*)) as value from signoz_logs.distributed_logs where (timestamp >= 1680066360726210000 AND timestamp <= 1680066458000000000) AND has(JSONExtract(JSON_QUERY(body, '$.\"requestor_list\"[*]'), 'Array(String)'), 'index_service') AND has(attributes_string_key, 'name') group by `name` order by `name` DESC",
ExpectedQuery: "SELECT now() as ts, attributes_string_value[indexOf(attributes_string_key, 'name')] as `name`, toFloat64(count(*)) as value from signoz_logs.distributed_logs where (timestamp >= 1680066360726210000 AND timestamp <= 1680066458000000000) AND lower(body) like '%requestor_list%' AND lower(body) like '%index_service%' AND has(JSONExtract(JSON_QUERY(body, '$.\"requestor_list\"[*]'), 'Array(String)'), 'index_service') AND has(attributes_string_key, 'name') group by `name` order by `name` DESC",
},
}

Expand Down Expand Up @@ -1305,7 +1305,7 @@ var testPrepLogsQueryData = []struct {
},
},
TableName: "logs",
ExpectedQuery: "SELECT timestamp, id, trace_id, span_id, trace_flags, severity_text, severity_number, body,CAST((attributes_string_key, attributes_string_value), 'Map(String, String)') as attributes_string,CAST((attributes_int64_key, attributes_int64_value), 'Map(String, Int64)') as attributes_int64,CAST((attributes_float64_key, attributes_float64_value), 'Map(String, Float64)') as attributes_float64,CAST((attributes_bool_key, attributes_bool_value), 'Map(String, Bool)') as attributes_bool,CAST((resources_string_key, resources_string_value), 'Map(String, String)') as resources_string from signoz_logs.distributed_logs where attributes_string_value[indexOf(attributes_string_key, 'method')] ILIKE '%GET%' AND ",
ExpectedQuery: "SELECT timestamp, id, trace_id, span_id, trace_flags, severity_text, severity_number, body,CAST((attributes_string_key, attributes_string_value), 'Map(String, String)') as attributes_string,CAST((attributes_int64_key, attributes_int64_value), 'Map(String, Int64)') as attributes_int64,CAST((attributes_float64_key, attributes_float64_value), 'Map(String, Float64)') as attributes_float64,CAST((attributes_bool_key, attributes_bool_value), 'Map(String, Bool)') as attributes_bool,CAST((resources_string_key, resources_string_value), 'Map(String, String)') as resources_string from signoz_logs.distributed_logs where lower(attributes_string_value[indexOf(attributes_string_key, 'method')]) LIKE lower('%GET%') AND ",
Options: Options{IsLivetailQuery: true},
},
{
Expand Down
3 changes: 2 additions & 1 deletion pkg/query-service/model/v3/v3.go
Original file line number Diff line number Diff line change
Expand Up @@ -827,7 +827,8 @@ const (
FilterOperatorNotContains FilterOperator = "ncontains"
FilterOperatorRegex FilterOperator = "regex"
FilterOperatorNotRegex FilterOperator = "nregex"
// (I)LIKE is faster than REGEX and supports index
// (I)LIKE is faster than REGEX
// ILIKE cannot use the index, so internally the query is built with lower(body) LIKE instead
FilterOperatorLike FilterOperator = "like"
FilterOperatorNotLike FilterOperator = "nlike"

Expand Down
Loading