From 97b6ae4ababab66cc327d483a696a6858e0abd04 Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Fri, 8 Nov 2024 14:43:06 +0800 Subject: [PATCH 1/3] update document and segment word count --- api/services/dataset_service.py | 44 ++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py index 50da547fd84c84..9ec471466279b0 100644 --- a/api/services/dataset_service.py +++ b/api/services/dataset_service.py @@ -1415,9 +1415,13 @@ def create_segment(cls, args: dict, document: Document, dataset: Dataset): created_by=current_user.id, ) if document.doc_form == "qa_model": + segment_document.word_count += len(args["answer"]) segment_document.answer = args["answer"] - + db.session.add(segment_document) + # update document word count + document.word_count += segment_document.word_count + db.session.add(document) db.session.commit() # save vector index @@ -1436,6 +1440,7 @@ def create_segment(cls, args: dict, document: Document, dataset: Dataset): @classmethod def multi_create_segment(cls, segments: list, document: Document, dataset: Dataset): lock_name = "multi_add_segment_lock_document_id_{}".format(document.id) + increment_word_count = 0 with redis_client.lock(lock_name, timeout=600): embedding_model = None if dataset.indexing_technique == "high_quality": @@ -1461,7 +1466,10 @@ def multi_create_segment(cls, segments: list, document: Document, dataset: Datas tokens = 0 if dataset.indexing_technique == "high_quality" and embedding_model: # calc embedding use tokens - tokens = embedding_model.get_text_embedding_num_tokens(texts=[content]) + if document.doc_form == "qa_model": + tokens = embedding_model.get_text_embedding_num_tokens(texts=[content + segment_item["answer"]]) + else: + tokens = embedding_model.get_text_embedding_num_tokens(texts=[content]) segment_document = DocumentSegment( tenant_id=current_user.current_tenant_id, dataset_id=document.dataset_id, @@ -1479,6 +1487,8 @@ def multi_create_segment(cls, segments: list, document: Document, dataset: Datas ) if document.doc_form == "qa_model": segment_document.answer = segment_item["answer"] + segment_document.word_count += len(segment_item["answer"]) + increment_word_count += segment_document.word_count db.session.add(segment_document) segment_data_list.append(segment_document) @@ -1487,7 +1497,9 @@ def multi_create_segment(cls, segments: list, document: Document, dataset: Datas keywords_list.append(segment_item["keywords"]) else: keywords_list.append(None) - + # update document word count + document.word_count += increment_word_count + db.session.add(document) try: # save vector index VectorService.create_segments_vector(keywords_list, pre_segment_data_list, dataset) @@ -1527,17 +1539,25 @@ def update_segment(cls, args: dict, segment: DocumentSegment, document: Document else: raise ValueError("Can't update disabled segment") try: + word_count_change = segment.word_count content = args["content"] if segment.content == content: - if document.doc_form == "qa_model": - segment.answer = args["answer"] if args.get("keywords"): segment.keywords = args["keywords"] + segment.word_count = len(content) + if document.doc_form == "qa_model": + segment.answer = args["answer"] + segment.word_count += len(args["answer"]) + word_count_change = segment.word_count - word_count_change segment.enabled = True segment.disabled_at = None segment.disabled_by = None db.session.add(segment) db.session.commit() + # update document word count + if word_count_change != 0: + document.word_count = max(0, document.word_count + word_count_change) + db.session.add(document) # update segment index task if "keywords" in args: keyword = Keyword(dataset) @@ -1565,7 +1585,10 @@ def update_segment(cls, args: dict, segment: DocumentSegment, document: Document ) # calc embedding use tokens - tokens = embedding_model.get_text_embedding_num_tokens(texts=[content]) + if document.doc_form == "qa_model": + tokens = embedding_model.get_text_embedding_num_tokens(texts=[content + segment.answer]) + else: + tokens = embedding_model.get_text_embedding_num_tokens(texts=[content]) segment.content = content segment.index_node_hash = segment_hash segment.word_count = len(content) @@ -1580,6 +1603,12 @@ def update_segment(cls, args: dict, segment: DocumentSegment, document: Document segment.disabled_by = None if document.doc_form == "qa_model": segment.answer = args["answer"] + segment.word_count += len(args["answer"]) + word_count_change = segment.word_count - word_count_change + # update document word count + if word_count_change != 0: + document.word_count = max(0, document.word_count + word_count_change) + db.session.add(document) db.session.add(segment) db.session.commit() # update segment vector index @@ -1608,6 +1637,9 @@ def delete_segment(cls, segment: DocumentSegment, document: Document, dataset: D redis_client.setex(indexing_cache_key, 600, 1) delete_segment_from_index_task.delay(segment.id, segment.index_node_id, dataset.id, document.id) db.session.delete(segment) + # update document word count + document.word_count -= segment.word_count + db.session.add(document) db.session.commit() From 8385af0bd0a443cb6d7538f51ce2149c4041637f Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Fri, 8 Nov 2024 14:50:25 +0800 Subject: [PATCH 2/3] update document and segment word count --- api/services/dataset_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py index 9ec471466279b0..a2cc87a57979db 100644 --- a/api/services/dataset_service.py +++ b/api/services/dataset_service.py @@ -1417,7 +1417,7 @@ def create_segment(cls, args: dict, document: Document, dataset: Dataset): if document.doc_form == "qa_model": segment_document.word_count += len(args["answer"]) segment_document.answer = args["answer"] - + db.session.add(segment_document) # update document word count document.word_count += segment_document.word_count From 47af46866020542babd4ed1f15e14f56a1ca65e1 Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Fri, 8 Nov 2024 17:17:07 +0800 Subject: [PATCH 3/3] document word count --- api/tasks/batch_create_segment_to_index_task.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/api/tasks/batch_create_segment_to_index_task.py b/api/tasks/batch_create_segment_to_index_task.py index de7f0ddec1f3b6..d1b41f26751519 100644 --- a/api/tasks/batch_create_segment_to_index_task.py +++ b/api/tasks/batch_create_segment_to_index_task.py @@ -57,7 +57,7 @@ def batch_create_segment_to_index_task( model_type=ModelType.TEXT_EMBEDDING, model=dataset.embedding_model, ) - + word_count_change = 0 for segment in content: content = segment["content"] doc_id = str(uuid.uuid4()) @@ -86,8 +86,13 @@ def batch_create_segment_to_index_task( ) if dataset_document.doc_form == "qa_model": segment_document.answer = segment["answer"] + segment_document.word_count += len(segment["answer"]) + word_count_change += segment_document.word_count db.session.add(segment_document) document_segments.append(segment_document) + # update document word count + dataset_document.word_count += word_count_change + db.session.add(dataset_document) # add index to db indexing_runner = IndexingRunner() indexing_runner.batch_add_segments(document_segments, dataset)