From 16d17833fa024bd82107b4adf8554a5370bb7a64 Mon Sep 17 00:00:00 2001 From: zwwhdls Date: Sat, 7 Dec 2024 20:24:29 +0800 Subject: [PATCH] feat: subContent plugin Signed-off-by: zwwhdls --- api/request.go | 14 ---- api/server.go | 2 +- config/config.go | 62 ++++++++-------- pkg/dispatch/plugin/header.go | 10 +++ pkg/dispatch/plugin/header_test.go | 63 ++++++++++++++++ pkg/dispatch/plugin/subcontent.go | 114 +++++++++++++++++++++++++++++ pkg/models/doc/document.go | 3 +- pkg/service/chain.go | 3 + 8 files changed, 224 insertions(+), 47 deletions(-) create mode 100644 pkg/dispatch/plugin/header_test.go create mode 100644 pkg/dispatch/plugin/subcontent.go diff --git a/api/request.go b/api/request.go index 652b261..8ea756c 100644 --- a/api/request.go +++ b/api/request.go @@ -137,20 +137,6 @@ func (q *DocQuery) ToQuery() *doc.DocumentQuery { Value: q.WebUrl, }) } - if q.ParentID != "" { - attrQueries = append(attrQueries, doc.AttrQuery{ - Attr: "parentId", - Option: "=", - Value: q.ParentID, - }) - } - if q.UnRead != nil { - attrQueries = append(attrQueries, doc.AttrQuery{ - Attr: "unRead", - Option: "=", - Value: true, - }) - } query.AttrQueries = attrQueries return query diff --git a/api/server.go b/api/server.go index b45b7ac..613a17c 100644 --- a/api/server.go +++ b/api/server.go @@ -86,6 +86,6 @@ func (s *HttpServer) handle(group *gin.RouterGroup) { docGroup := group.Group("/namespace/:namespace/docs") docGroup.POST("/entry/:entryId", s.store()) docGroup.DELETE("/entry/:entryId", s.delete()) - docGroup.PUT("/entry/:entryId/", s.update()) + docGroup.PUT("/entry/:entryId", s.update()) docGroup.GET("/search", s.search()) } diff --git a/config/config.go b/config/config.go index 650624e..cc49613 100644 --- a/config/config.go +++ b/config/config.go @@ -32,30 +32,30 @@ type Config struct { Plugins []string `json:"plugins,omitempty"` // meilisearch - MeiliConfig MeiliConfig `json:"meili_config,omitempty"` + MeiliConfig MeiliConfig `json:"meiliConfig,omitempty"` // llm limit token - LimitToken int `json:"limit_token,omitempty"` // used by summary, split input into mutil sub-docs summaried by llm separately. + LimitToken int `json:"limitToken,omitempty"` // used by summary, split input into mutil sub-docs summaried by llm separately. // openai key - OpenAIBaseUrl string `json:"open_ai_base_url,omitempty"` // if openai is used for embedding or llm, it is needed, default is "https://api.openai.com" - OpenAIKey string `json:"open_ai_key,omitempty"` // if openai is used for embedding or llm, it is needed + OpenAIBaseUrl string `json:"openAiBaseUrl,omitempty"` // if openai is used for embedding or llm, it is needed, default is "https://api.openai.com" + OpenAIKey string `json:"openAiKey,omitempty"` // if openai is used for embedding or llm, it is needed // gemini key - GeminiBaseUri string `json:"gemini_base_uri,omitempty"` // if gemini is used for embedding or llm, it is needed, default is "https://generativelanguage.googleapis.com" - GeminiKey string `json:"gemini_key,omitempty"` // if gemini is used for embedding or llm, it is needed + GeminiBaseUri string `json:"geminiBaseUri,omitempty"` // if gemini is used for embedding or llm, it is needed, default is "https://generativelanguage.googleapis.com" + GeminiKey string `json:"geminiKey,omitempty"` // if gemini is used for embedding or llm, it is needed // embedding config - EmbeddingConfig EmbeddingConfig `json:"embedding_config,omitempty"` + EmbeddingConfig EmbeddingConfig `json:"embeddingConfig,omitempty"` // vector store config - VectorStoreConfig VectorStoreConfig `json:"vector_store_config,omitempty"` + VectorStoreConfig VectorStoreConfig `json:"vectorStoreConfig,omitempty"` // LLM - LLMConfig LLMConfig `json:"llm_config,omitempty"` + LLMConfig LLMConfig `json:"llmConfig,omitempty"` // text spliter - TextSpliterConfig TextSpliterConfig `json:"text_spliter_config,omitempty"` + TextSpliterConfig TextSpliterConfig `json:"textSpliterConfig,omitempty"` } type MeiliConfig struct { @@ -67,7 +67,7 @@ type MeiliConfig struct { } type LLMConfig struct { - LLMType LLMType `json:"llm_type"` + LLMType LLMType `json:"llmType"` Prompts map[string]string `json:"prompts,omitempty"` OpenAI OpenAIConfig `json:"openai,omitempty"` GLM6B GLM6BConfig `json:"glm6b,omitempty"` @@ -79,44 +79,44 @@ type GLM6BConfig struct { } type OpenAIConfig struct { - QueryPerMinute int `json:"query_per_minute,omitempty"` // qpm, default is 3 - Burst int `json:"burst,omitempty"` // burst, default is 5 - Model *string `json:"model,omitempty"` // model of openai, default for llm is "gpt-3.5-turbo"; default for embedding is "text-embedding-ada-002" - MaxReturnToken *int `json:"max_return_token,omitempty"` // maxReturnToken + VectorStoreConfig.TopK * TextSpliterConfig.SpliterChunkSize <= token limit of llm model - FrequencyPenalty *uint `json:"frequency_penalty,omitempty"` - PresencePenalty *uint `json:"presence_penalty,omitempty"` + QueryPerMinute int `json:"queryPerMinute,omitempty"` // qpm, default is 3 + Burst int `json:"burst,omitempty"` // burst, default is 5 + Model *string `json:"model,omitempty"` // model of openai, default for llm is "gpt-3.5-turbo"; default for embedding is "text-embedding-ada-002" + MaxReturnToken *int `json:"maxReturnToken,omitempty"` // maxReturnToken + VectorStoreConfig.TopK * TextSpliterConfig.SpliterChunkSize <= token limit of llm model + FrequencyPenalty *uint `json:"frequencyPenalty,omitempty"` + PresencePenalty *uint `json:"presencePenalty,omitempty"` Temperature *float32 `json:"temperature,omitempty"` } type GeminiConfig struct { - QueryPerMinute int `json:"query_per_minute,omitempty"` // qpm, default is 3 - Burst int `json:"burst,omitempty"` // burst, default is 5 - Model *string `json:"model,omitempty"` // model of gemini, default for llm is "gemini-pro"; default for embedding is "embedding-001" + QueryPerMinute int `json:"queryPerMinute,omitempty"` // qpm, default is 3 + Burst int `json:"burst,omitempty"` // burst, default is 5 + Model *string `json:"model,omitempty"` // model of gemini, default for llm is "gemini-pro"; default for embedding is "embedding-001" } type EmbeddingConfig struct { - EmbeddingType EmbeddingType `json:"embedding_type"` + EmbeddingType EmbeddingType `json:"embeddingType"` OpenAI OpenAIConfig `json:"openai,omitempty"` - HuggingFace HuggingFaceConfig `json:"hugging_face,omitempty"` + HuggingFace HuggingFaceConfig `json:"huggingFace,omitempty"` Gemini GeminiConfig `json:"gemini,omitempty"` } type HuggingFaceConfig struct { - EmbeddingUrl string `json:"embedding_url,omitempty"` - EmbeddingModel string `json:"embedding_model,omitempty"` + EmbeddingUrl string `json:"embeddingUrl,omitempty"` + EmbeddingModel string `json:"embeddingModel,omitempty"` } type VectorStoreConfig struct { - VectorStoreType VectorStoreType `json:"vector_store_type"` - VectorUrl string `json:"vector_url"` - TopK *int `json:"top_k,omitempty"` // topk of knn, default is 6 - EmbeddingDim int `json:"embedding_dim,omitempty"` // embedding dimension, default is 1536 + VectorStoreType VectorStoreType `json:"vectorStoreType"` + VectorUrl string `json:"vectorUrl"` + TopK *int `json:"topK,omitempty"` // topk of knn, default is 6 + EmbeddingDim int `json:"embeddingDim,omitempty"` // embedding dimension, default is 1536 } type TextSpliterConfig struct { - SpliterChunkSize int `json:"spliter_chunk_size,omitempty"` // chunk of files splited to store, default is 4000 - SpliterChunkOverlap int `json:"spliter_chunk_overlap,omitempty"` // overlap of each chunks, default is 200 - SpliterSeparator string `json:"spliter_separator,omitempty"` // separator to split files, default is \n + SpliterChunkSize int `json:"spliterChunkSize,omitempty"` // chunk of files splited to store, default is 4000 + SpliterChunkOverlap int `json:"spliterChunkOverlap,omitempty"` // overlap of each chunks, default is 200 + SpliterSeparator string `json:"spliterSeparator,omitempty"` // separator to split files, default is \n } type LLMType string diff --git a/pkg/dispatch/plugin/header.go b/pkg/dispatch/plugin/header.go index 6cb31aa..4ffff22 100644 --- a/pkg/dispatch/plugin/header.go +++ b/pkg/dispatch/plugin/header.go @@ -20,6 +20,7 @@ import ( "bytes" "context" "fmt" + "net/url" "github.com/PuerkitoBio/goquery" @@ -68,6 +69,15 @@ func (h *HeaderImgPlugin) Run(ctx context.Context, doc *doc.Document) error { } return true }) + hurl, err := url.Parse(headerImgUrl) + if err != nil { + return nil + } + // relative address + if hurl.Host == "" { + websiteUrl, _ := url.Parse(doc.WebUrl) + headerImgUrl = websiteUrl.ResolveReference(&url.URL{Path: headerImgUrl}).String() + } doc.HeaderImage = headerImgUrl return nil } diff --git a/pkg/dispatch/plugin/header_test.go b/pkg/dispatch/plugin/header_test.go new file mode 100644 index 0000000..a84896c --- /dev/null +++ b/pkg/dispatch/plugin/header_test.go @@ -0,0 +1,63 @@ +/* + Copyright 2024 Friday Author. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package plugin + +import ( + "context" + "testing" + + "github.com/basenana/friday/pkg/models/doc" +) + +func TestHeaderImgPlugin_Run(t *testing.T) { + tests := []struct { + name string + document *doc.Document + wantErr bool + wantHeaderImg string + }{ + { + name: "test-relative-address", + document: &doc.Document{ + WebUrl: "https://blog.abc/123", + Content: "

\"\"

", + }, + wantErr: false, + wantHeaderImg: "https://blog.abc/media/123.png", + }, + { + name: "test-normal", + document: &doc.Document{ + WebUrl: "https://blog.abc", + Content: "

", + }, + wantErr: false, + wantHeaderImg: "https://def/123.png", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + h := &HeaderImgPlugin{} + if err := h.Run(context.TODO(), tt.document); (err != nil) != tt.wantErr { + t.Errorf("Run() error = %v, wantErr %v", err, tt.wantErr) + } + if tt.document.HeaderImage != tt.wantHeaderImg { + t.Errorf("Run() got = %v, want %v", tt.document.HeaderImage, tt.wantHeaderImg) + } + }) + } +} diff --git a/pkg/dispatch/plugin/subcontent.go b/pkg/dispatch/plugin/subcontent.go new file mode 100644 index 0000000..8ae620b --- /dev/null +++ b/pkg/dispatch/plugin/subcontent.go @@ -0,0 +1,114 @@ +/* + Copyright 2024 Friday Author. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package plugin + +import ( + "bytes" + "context" + "regexp" + "strings" + + "github.com/PuerkitoBio/goquery" + + "github.com/basenana/friday/pkg/models/doc" +) + +type SubContentPlugin struct { +} + +func (s *SubContentPlugin) Name() string { + return "subContent" +} + +func (s *SubContentPlugin) Run(ctx context.Context, doc *doc.Document) error { + doc.SubContent = GenerateContentSubContent(doc.Content) + return nil +} + +var _ ChainPlugin = &SubContentPlugin{} + +var repeatSpace = regexp.MustCompile(`\s+`) +var htmlCharFilterRegexp = regexp.MustCompile(`\s]+))?)+\s*|\s*)/?>`) + +func ContentTrim(contentType, content string) string { + switch contentType { + case "html", "htm", "webarchive", ".webarchive": + content = strings.ReplaceAll(content, "

", "

\n") + content = strings.ReplaceAll(content, "

", "

\n") + content = strings.ReplaceAll(content, "", "\n") + content = strings.ReplaceAll(content, "", "\n") + content = htmlCharFilterRegexp.ReplaceAllString(content, "") + } + content = repeatSpace.ReplaceAllString(content, " ") + return content +} + +func GenerateContentSubContent(content string) string { + if subContent, err := slowPathContentSubContent([]byte(content)); err == nil { + return subContent + } + + content = ContentTrim("html", content) + subContents := strings.Split(content, "\n") + contents := make([]string, 0) + i := 0 + for _, subContent := range subContents { + subContent = strings.TrimSpace(subContent) + if subContent != "" { + contents = append(contents, subContent) + i++ + if i >= 3 { + break + } + } + } + return strings.Join(contents, " ") +} + +func slowPathContentSubContent(content []byte) (string, error) { + query, err := goquery.NewDocumentFromReader(bytes.NewReader(content)) + if err != nil { + return "", err + } + + contents := make([]string, 0) + query.Find("p").EachWithBreak(func(i int, selection *goquery.Selection) bool { + if len(contents) > 10 { + return false + } + t := strings.TrimSpace(selection.Text()) + if t != "" { + contents = append(contents, strings.ReplaceAll(t, "\n", " ")) + } + return true + }) + + return trimDocumentContent(strings.Join(contents, " "), 400), nil +} + +func trimDocumentContent(str string, m int) string { + str = ContentTrim("html", str) + runes := []rune(str) + if len(runes) > m { + return string(runes[:m]) + } + return str +} + +func init() { + RegisterPlugin(&SubContentPlugin{}) +} diff --git a/pkg/models/doc/document.go b/pkg/models/doc/document.go index 2d69954..6022bcb 100644 --- a/pkg/models/doc/document.go +++ b/pkg/models/doc/document.go @@ -26,7 +26,7 @@ import ( var ( DocFilterableAttrs = []string{"namespace", "id", "entryId", "name", "source", "webUrl", "createdAt", "updatedAt"} - DocAttrFilterableAttrs = []string{"namespace", "entryId", "key", "id"} + DocAttrFilterableAttrs = []string{"namespace", "entryId", "key", "id", "value"} DocSortAttrs = []string{"createdAt", "updatedAt"} ) @@ -47,6 +47,7 @@ type Document struct { Content string `json:"content"` Summary string `json:"summary,omitempty"` HeaderImage string `json:"headerImage,omitempty"` + SubContent string `json:"subContent,omitempty"` CreatedAt time.Time `json:"createdAt,omitempty"` UpdatedAt time.Time `json:"updatedAt,omitempty"` diff --git a/pkg/service/chain.go b/pkg/service/chain.go index 3676780..ec7b6f5 100644 --- a/pkg/service/chain.go +++ b/pkg/service/chain.go @@ -98,6 +98,9 @@ func (c *Chain) Search(ctx context.Context, query *doc.DocumentQuery, attrQuerie for _, attr := range attrs { ids = append(ids, attr.EntryId) } + if len(ids) == 0 && len(attrQueries) != 0 { + return []doc.Document{}, nil + } if len(ids) != 0 { query.AttrQueries = append(query.AttrQueries, doc.AttrQuery{ Attr: "entryId",