Skip to content

Commit

Permalink
fix: get more accurate embedder segmentation
Browse files (browse the repository at this point in the history)
Signed-off-by: Abirdcfly <fp544037857@gmail.com>
  • Loading branch information
Abirdcfly committed Dec 11, 2023
1 parent b175755 commit 2b3daec
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 9 deletions.
25 changes: 18 additions & 7 deletions controllers/knowledgebase_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
"github.com/tmc/langchaingo/documentloaders"
langchainembeddings "github.com/tmc/langchaingo/embeddings"
"github.com/tmc/langchaingo/llms/openai"
"github.com/tmc/langchaingo/schema"
"github.com/tmc/langchaingo/textsplitter"
"github.com/tmc/langchaingo/vectorstores/chroma"
apierrors "k8s.io/apimachinery/pkg/api/errors"
Expand Down Expand Up @@ -435,17 +436,22 @@ func (r *KnowledgeBaseReconciler) handleFile(ctx context.Context, log logr.Logge
return err
}
dataReader := bytes.NewReader(data)
var documents []schema.Document
var loader documentloaders.Loader
switch filepath.Ext(fileName) {
case "txt":
case ".txt":
loader = documentloaders.NewText(dataReader)
case "csv":
case ".csv":
if v == arcadiav1alpha1.ObjectTypeQA {
loader = pkgdocumentloaders.NewQACSV(dataReader, fileName, "q", "a")
documents, err = loader.Load(ctx)
if err != nil {
return err
}
} else {
loader = documentloaders.NewCSV(dataReader)
}
case "html", "htm":
case ".html", ".htm":
loader = documentloaders.NewHTML(dataReader)
default:
loader = documentloaders.NewText(dataReader)
Expand Down Expand Up @@ -475,11 +481,15 @@ func (r *KnowledgeBaseReconciler) handleFile(ctx context.Context, log logr.Logge
// )
//}

documents, err := loader.LoadAndSplit(ctx, split)
if err != nil {
return err
if len(documents) == 0 {
documents, err = loader.LoadAndSplit(ctx, split)
if err != nil {
return err
}
}
for i, doc := range documents {
log.V(5).Info(fmt.Sprintf("document[%d]: embedding:%s, metadata:%v", i, doc.PageContent, doc.Metadata))
}

switch store.Spec.Type() { // nolint: gocritic
case arcadiav1alpha1.VectorStoreTypeChroma:
s, err := chroma.New(
Expand Down Expand Up @@ -511,6 +521,7 @@ func (r *KnowledgeBaseReconciler) reconcileDelete(ctx context.Context, log logr.
chroma.WithChromaURL(vectorStore.Spec.Enpoint.URL),
chroma.WithDistanceFunction(vectorStore.Spec.Chroma.DistanceFunction),
chroma.WithNameSpace(kb.VectorStoreCollectionName()),
chroma.WithOpenAiAPIKey("fake"),
)
if err != nil {
log.Error(err, "reconcile delete: init vector store error, may leave garbage data")
Expand Down
2 changes: 1 addition & 1 deletion deploy/charts/arcadia/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ apiVersion: v2
name: arcadia
description: A Helm chart(KubeBB Component) for KubeAGI Arcadia
type: application
version: 0.1.38
version: 0.1.39
appVersion: "0.0.1"

keywords:
Expand Down
1 change: 1 addition & 0 deletions deploy/charts/arcadia/templates/controller.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ spec:
containers:
- command:
- /manager
- --v={{ .Values.controller.logLevel }}
env:
- name: POD_NAME
valueFrom:
Expand Down
1 change: 1 addition & 0 deletions deploy/charts/arcadia/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ controller:
requests:
cpu: 10m
memory: 64Mi
logLevel: 2
# graphql and bff server
apiserver:
image: kubeagi/arcadia:v0.1.0-20231207-11c8738
Expand Down
3 changes: 2 additions & 1 deletion tests/example-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ kind load docker-image controller:example-e2e --name=$KindName

info "3. install arcadia"
kubectl create namespace arcadia
helm install -narcadia arcadia deploy/charts/arcadia -f tests/deploy-values.yaml --set controller.image=controller:example-e2e --set apiserver.image=controller:example-e2e --wait --timeout $HelmTimeout
helm install -narcadia arcadia deploy/charts/arcadia -f tests/deploy-values.yaml --set controller.image=controller:example-e2e --set apiserver.image=controller:example-e2e --wait --timeout $HelmTimeout --set controller.logLevel=5

info "4. check system datasource arcadia-minio(system datasource)"
waitCRDStatusReady "Datasource" "arcadia" "arcadia-minio"
Expand Down Expand Up @@ -264,4 +264,5 @@ waitCRDStatusReady "Application" "arcadia" "base-chat-with-knowledgebase"
sleep 3
curl -XPOST http://127.0.0.1:8081/chat --data '{"query":"旷工最小计算单位为多少天?","response_mode":"blocking","conversion_id":"","app_name":"base-chat-with-knowledgebase", "app_namespace":"arcadia"}' | jq -e '.message'

exit 1
info "all finished! ✅"

0 comments on commit 2b3daec

Please sign in to comment.