Skip to content

Commit

Permalink
Merge pull request #49 from basenana/feature/plugin
Browse files Browse the repository at this point in the history
feat: subContent plugin
  • Loading branch information
zwwhdls authored Dec 7, 2024
2 parents 67550fb + 16d1783 commit f0cc594
Show file tree
Hide file tree
Showing 8 changed files with 224 additions and 47 deletions.
14 changes: 0 additions & 14 deletions api/request.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,20 +137,6 @@ func (q *DocQuery) ToQuery() *doc.DocumentQuery {
Value: q.WebUrl,
})
}
if q.ParentID != "" {
attrQueries = append(attrQueries, doc.AttrQuery{
Attr: "parentId",
Option: "=",
Value: q.ParentID,
})
}
if q.UnRead != nil {
attrQueries = append(attrQueries, doc.AttrQuery{
Attr: "unRead",
Option: "=",
Value: true,
})
}

query.AttrQueries = attrQueries
return query
Expand Down
2 changes: 1 addition & 1 deletion api/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,6 @@ func (s *HttpServer) handle(group *gin.RouterGroup) {
docGroup := group.Group("/namespace/:namespace/docs")
docGroup.POST("/entry/:entryId", s.store())
docGroup.DELETE("/entry/:entryId", s.delete())
docGroup.PUT("/entry/:entryId/", s.update())
docGroup.PUT("/entry/:entryId", s.update())
docGroup.GET("/search", s.search())
}
62 changes: 31 additions & 31 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,30 +32,30 @@ type Config struct {
Plugins []string `json:"plugins,omitempty"`

// meilisearch
MeiliConfig MeiliConfig `json:"meili_config,omitempty"`
MeiliConfig MeiliConfig `json:"meiliConfig,omitempty"`

// llm limit token
LimitToken int `json:"limit_token,omitempty"` // used by summary, split input into mutil sub-docs summaried by llm separately.
LimitToken int `json:"limitToken,omitempty"` // used by summary, split input into multiple sub-docs summarized by llm separately.

// openai key
OpenAIBaseUrl string `json:"open_ai_base_url,omitempty"` // if openai is used for embedding or llm, it is needed, default is "https://api.openai.com"
OpenAIKey string `json:"open_ai_key,omitempty"` // if openai is used for embedding or llm, it is needed
OpenAIBaseUrl string `json:"openAiBaseUrl,omitempty"` // if openai is used for embedding or llm, it is needed, default is "https://api.openai.com"
OpenAIKey string `json:"openAiKey,omitempty"` // if openai is used for embedding or llm, it is needed

// gemini key
GeminiBaseUri string `json:"gemini_base_uri,omitempty"` // if gemini is used for embedding or llm, it is needed, default is "https://generativelanguage.googleapis.com"
GeminiKey string `json:"gemini_key,omitempty"` // if gemini is used for embedding or llm, it is needed
GeminiBaseUri string `json:"geminiBaseUri,omitempty"` // if gemini is used for embedding or llm, it is needed, default is "https://generativelanguage.googleapis.com"
GeminiKey string `json:"geminiKey,omitempty"` // if gemini is used for embedding or llm, it is needed

// embedding config
EmbeddingConfig EmbeddingConfig `json:"embedding_config,omitempty"`
EmbeddingConfig EmbeddingConfig `json:"embeddingConfig,omitempty"`

// vector store config
VectorStoreConfig VectorStoreConfig `json:"vector_store_config,omitempty"`
VectorStoreConfig VectorStoreConfig `json:"vectorStoreConfig,omitempty"`

// LLM
LLMConfig LLMConfig `json:"llm_config,omitempty"`
LLMConfig LLMConfig `json:"llmConfig,omitempty"`

// text spliter
TextSpliterConfig TextSpliterConfig `json:"text_spliter_config,omitempty"`
TextSpliterConfig TextSpliterConfig `json:"textSpliterConfig,omitempty"`
}

type MeiliConfig struct {
Expand All @@ -67,7 +67,7 @@ type MeiliConfig struct {
}

type LLMConfig struct {
LLMType LLMType `json:"llm_type"`
LLMType LLMType `json:"llmType"`
Prompts map[string]string `json:"prompts,omitempty"`
OpenAI OpenAIConfig `json:"openai,omitempty"`
GLM6B GLM6BConfig `json:"glm6b,omitempty"`
Expand All @@ -79,44 +79,44 @@ type GLM6BConfig struct {
}

type OpenAIConfig struct {
QueryPerMinute int `json:"query_per_minute,omitempty"` // qpm, default is 3
Burst int `json:"burst,omitempty"` // burst, default is 5
Model *string `json:"model,omitempty"` // model of openai, default for llm is "gpt-3.5-turbo"; default for embedding is "text-embedding-ada-002"
MaxReturnToken *int `json:"max_return_token,omitempty"` // maxReturnToken + VectorStoreConfig.TopK * TextSpliterConfig.SpliterChunkSize <= token limit of llm model
FrequencyPenalty *uint `json:"frequency_penalty,omitempty"`
PresencePenalty *uint `json:"presence_penalty,omitempty"`
QueryPerMinute int `json:"queryPerMinute,omitempty"` // qpm, default is 3
Burst int `json:"burst,omitempty"` // burst, default is 5
Model *string `json:"model,omitempty"` // model of openai, default for llm is "gpt-3.5-turbo"; default for embedding is "text-embedding-ada-002"
MaxReturnToken *int `json:"maxReturnToken,omitempty"` // maxReturnToken + VectorStoreConfig.TopK * TextSpliterConfig.SpliterChunkSize <= token limit of llm model
FrequencyPenalty *uint `json:"frequencyPenalty,omitempty"`
PresencePenalty *uint `json:"presencePenalty,omitempty"`
Temperature *float32 `json:"temperature,omitempty"`
}

type GeminiConfig struct {
QueryPerMinute int `json:"query_per_minute,omitempty"` // qpm, default is 3
Burst int `json:"burst,omitempty"` // burst, default is 5
Model *string `json:"model,omitempty"` // model of gemini, default for llm is "gemini-pro"; default for embedding is "embedding-001"
QueryPerMinute int `json:"queryPerMinute,omitempty"` // qpm, default is 3
Burst int `json:"burst,omitempty"` // burst, default is 5
Model *string `json:"model,omitempty"` // model of gemini, default for llm is "gemini-pro"; default for embedding is "embedding-001"
}

type EmbeddingConfig struct {
EmbeddingType EmbeddingType `json:"embedding_type"`
EmbeddingType EmbeddingType `json:"embeddingType"`
OpenAI OpenAIConfig `json:"openai,omitempty"`
HuggingFace HuggingFaceConfig `json:"hugging_face,omitempty"`
HuggingFace HuggingFaceConfig `json:"huggingFace,omitempty"`
Gemini GeminiConfig `json:"gemini,omitempty"`
}

type HuggingFaceConfig struct {
EmbeddingUrl string `json:"embedding_url,omitempty"`
EmbeddingModel string `json:"embedding_model,omitempty"`
EmbeddingUrl string `json:"embeddingUrl,omitempty"`
EmbeddingModel string `json:"embeddingModel,omitempty"`
}

type VectorStoreConfig struct {
VectorStoreType VectorStoreType `json:"vector_store_type"`
VectorUrl string `json:"vector_url"`
TopK *int `json:"top_k,omitempty"` // topk of knn, default is 6
EmbeddingDim int `json:"embedding_dim,omitempty"` // embedding dimension, default is 1536
VectorStoreType VectorStoreType `json:"vectorStoreType"`
VectorUrl string `json:"vectorUrl"`
TopK *int `json:"topK,omitempty"` // topk of knn, default is 6
EmbeddingDim int `json:"embeddingDim,omitempty"` // embedding dimension, default is 1536
}

type TextSpliterConfig struct {
SpliterChunkSize int `json:"spliter_chunk_size,omitempty"` // chunk of files splited to store, default is 4000
SpliterChunkOverlap int `json:"spliter_chunk_overlap,omitempty"` // overlap of each chunks, default is 200
SpliterSeparator string `json:"spliter_separator,omitempty"` // separator to split files, default is \n
SpliterChunkSize int `json:"spliterChunkSize,omitempty"` // chunk of files splited to store, default is 4000
SpliterChunkOverlap int `json:"spliterChunkOverlap,omitempty"` // overlap of each chunks, default is 200
SpliterSeparator string `json:"spliterSeparator,omitempty"` // separator to split files, default is \n
}

type LLMType string
Expand Down
10 changes: 10 additions & 0 deletions pkg/dispatch/plugin/header.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"bytes"
"context"
"fmt"
"net/url"

"github.com/PuerkitoBio/goquery"

Expand Down Expand Up @@ -68,6 +69,15 @@ func (h *HeaderImgPlugin) Run(ctx context.Context, doc *doc.Document) error {
}
return true
})
hurl, err := url.Parse(headerImgUrl)
if err != nil {
return nil
}
// relative address
if hurl.Host == "" {
websiteUrl, _ := url.Parse(doc.WebUrl)
headerImgUrl = websiteUrl.ResolveReference(&url.URL{Path: headerImgUrl}).String()
}
doc.HeaderImage = headerImgUrl
return nil
}
Expand Down
63 changes: 63 additions & 0 deletions pkg/dispatch/plugin/header_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
Copyright 2024 Friday Author.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package plugin

import (
"context"
"testing"

"github.com/basenana/friday/pkg/models/doc"
)

// TestHeaderImgPlugin_Run checks that HeaderImgPlugin fills in
// Document.HeaderImage from the first image found in the document's HTML
// content, resolving relative image paths against the document's WebUrl.
func TestHeaderImgPlugin_Run(t *testing.T) {
	cases := []struct {
		name          string
		input         *doc.Document
		wantErr       bool
		wantHeaderImg string
	}{
		{
			name: "test-relative-address",
			input: &doc.Document{
				WebUrl:  "https://blog.abc/123",
				Content: "<p><img src=\"media/123.png\" alt=\"\" /></p>",
			},
			wantErr:       false,
			wantHeaderImg: "https://blog.abc/media/123.png",
		},
		{
			name: "test-normal",
			input: &doc.Document{
				WebUrl:  "https://blog.abc",
				Content: "<p><img src=\"https://def/123.png\" /></p>",
			},
			wantErr:       false,
			wantHeaderImg: "https://def/123.png",
		},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			plugin := &HeaderImgPlugin{}
			err := plugin.Run(context.TODO(), tc.input)
			if (err != nil) != tc.wantErr {
				t.Errorf("Run() error = %v, wantErr %v", err, tc.wantErr)
			}
			if tc.input.HeaderImage != tc.wantHeaderImg {
				t.Errorf("Run() got = %v, want %v", tc.input.HeaderImage, tc.wantHeaderImg)
			}
		})
	}
}
114 changes: 114 additions & 0 deletions pkg/dispatch/plugin/subcontent.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
/*
Copyright 2024 Friday Author.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package plugin

import (
"bytes"
"context"
"regexp"
"strings"

"github.com/PuerkitoBio/goquery"

"github.com/basenana/friday/pkg/models/doc"
)

// SubContentPlugin is a ChainPlugin that derives a short preview snippet
// (Document.SubContent) from a document's full content.
type SubContentPlugin struct {
}

// Name returns the plugin's registry name.
func (s *SubContentPlugin) Name() string {
	return "subContent"
}

// Run populates doc.SubContent from doc.Content. It always returns nil.
func (s *SubContentPlugin) Run(ctx context.Context, doc *doc.Document) error {
	doc.SubContent = GenerateContentSubContent(doc.Content)
	return nil
}

// Compile-time assertion that SubContentPlugin implements ChainPlugin.
var _ ChainPlugin = &SubContentPlugin{}

// repeatSpace collapses any run of whitespace into a single space.
var repeatSpace = regexp.MustCompile(`\s+`)

// htmlCharFilterRegexp matches HTML/XML tags (with optional attributes) so
// they can be stripped out of content.
var htmlCharFilterRegexp = regexp.MustCompile(`</?[!\w:]+((\s+[\w-]+(\s*=\s*(?:\\*".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>`)

// ContentTrim strips markup from content and normalizes its whitespace.
// For HTML-like content types, closing </p> and </div> tags (in either
// case) are first turned into line breaks so paragraph boundaries survive
// tag removal; all tags are then deleted. Finally, every run of whitespace
// is collapsed to a single space.
func ContentTrim(contentType, content string) string {
	switch contentType {
	case "html", "htm", "webarchive", ".webarchive":
		// Preserve paragraph boundaries as newlines before dropping tags.
		breaker := strings.NewReplacer(
			"</p>", "</p>\n",
			"</P>", "</P>\n",
			"</div>", "</div>\n",
			"</DIV>", "</DIV>\n",
		)
		content = htmlCharFilterRegexp.ReplaceAllString(breaker.Replace(content), "")
	}
	return repeatSpace.ReplaceAllString(content, " ")
}

// GenerateContentSubContent produces a short plain-text preview of content.
// It first tries an HTML-aware path that joins the text of the leading <p>
// elements (capped at 400 runes); if HTML parsing fails, it falls back to
// stripping tags and joining the first three non-empty lines.
func GenerateContentSubContent(content string) string {
	if preview, err := slowPathContentSubContent([]byte(content)); err == nil {
		return preview
	}

	// Fallback: strip markup, then keep the first three non-empty lines.
	// NOTE(review): ContentTrim collapses newlines into spaces, so this
	// split typically yields a single line — confirm that is intended.
	picked := make([]string, 0, 3)
	for _, line := range strings.Split(ContentTrim("html", content), "\n") {
		if trimmed := strings.TrimSpace(line); trimmed != "" {
			picked = append(picked, trimmed)
			if len(picked) == 3 {
				break
			}
		}
	}
	return strings.Join(picked, " ")
}

// slowPathContentSubContent extracts a preview by parsing content as HTML
// and concatenating the text of the leading <p> elements. At most 11
// paragraphs are collected, and the joined text is trimmed to 400 runes.
// An error is returned only when the HTML parser rejects the input.
func slowPathContentSubContent(content []byte) (string, error) {
	parsed, err := goquery.NewDocumentFromReader(bytes.NewReader(content))
	if err != nil {
		return "", err
	}

	var paragraphs []string
	parsed.Find("p").EachWithBreak(func(_ int, sel *goquery.Selection) bool {
		if len(paragraphs) > 10 {
			return false
		}
		if text := strings.TrimSpace(sel.Text()); text != "" {
			paragraphs = append(paragraphs, strings.ReplaceAll(text, "\n", " "))
		}
		return true
	})

	return trimDocumentContent(strings.Join(paragraphs, " "), 400), nil
}

// trimDocumentContent normalizes str via ContentTrim and truncates the
// result to at most m runes (not bytes), so multi-byte characters are
// never split mid-sequence.
func trimDocumentContent(str string, m int) string {
	cleaned := []rune(ContentTrim("html", str))
	if len(cleaned) <= m {
		return string(cleaned)
	}
	return string(cleaned[:m])
}

// Register the plugin at package load time so the dispatch chain can use it.
func init() {
	RegisterPlugin(&SubContentPlugin{})
}
3 changes: 2 additions & 1 deletion pkg/models/doc/document.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import (

var (
DocFilterableAttrs = []string{"namespace", "id", "entryId", "name", "source", "webUrl", "createdAt", "updatedAt"}
DocAttrFilterableAttrs = []string{"namespace", "entryId", "key", "id"}
DocAttrFilterableAttrs = []string{"namespace", "entryId", "key", "id", "value"}
DocSortAttrs = []string{"createdAt", "updatedAt"}
)

Expand All @@ -47,6 +47,7 @@ type Document struct {
Content string `json:"content"`
Summary string `json:"summary,omitempty"`
HeaderImage string `json:"headerImage,omitempty"`
SubContent string `json:"subContent,omitempty"`

CreatedAt time.Time `json:"createdAt,omitempty"`
UpdatedAt time.Time `json:"updatedAt,omitempty"`
Expand Down
3 changes: 3 additions & 0 deletions pkg/service/chain.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ func (c *Chain) Search(ctx context.Context, query *doc.DocumentQuery, attrQuerie
for _, attr := range attrs {
ids = append(ids, attr.EntryId)
}
if len(ids) == 0 && len(attrQueries) != 0 {
return []doc.Document{}, nil
}
if len(ids) != 0 {
query.AttrQueries = append(query.AttrQueries, doc.AttrQuery{
Attr: "entryId",
Expand Down

0 comments on commit f0cc594

Please sign in to comment.