Upgrades Document Layout Analysis model. #4054

Merged · 3 commits · Dec 17, 2024
4 changes: 2 additions & 2 deletions api/db/services/task_service.py
@@ -247,8 +247,8 @@ def new_task():
task["progress"] = 0.0

prev_tasks = TaskService.get_tasks(doc["id"])
ck_num = 0
if prev_tasks:
ck_num = 0
for task in tsks:
ck_num += reuse_prev_task_chunks(task, prev_tasks, chunking_config)
TaskService.filter_delete([Task.doc_id == doc["id"]])
@@ -258,7 +258,7 @@ def new_task():
chunk_ids.extend(task["chunk_ids"].split())
if chunk_ids:
settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(chunking_config["tenant_id"]), chunking_config["kb_id"])
DocumentService.update_by_id(doc["id"], {"chunk_num": ck_num})
DocumentService.update_by_id(doc["id"], {"chunk_num": ck_num})

bulk_insert_into_db(Task, tsks, True)
DocumentService.begin2parse(doc["id"])
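This hunk initializes the chunk-reuse counter before the `if prev_tasks:` branch so it is always defined when the document's `chunk_num` is written back afterwards. A minimal sketch of that accounting, with hypothetical stand-ins for everything not shown in the hunk:

```python
# Sketch only (not part of this PR): the chunk-reuse accounting around the hunk above.
# new_tasks, reuse_prev_task_chunks and chunking_config are stand-ins for the real objects.
def count_reused_chunks(new_tasks, prev_tasks, reuse_prev_task_chunks, chunking_config):
    ck_num = 0  # defined unconditionally, so the later chunk_num update never sees an unbound name
    if prev_tasks:
        for task in new_tasks:
            ck_num += reuse_prev_task_chunks(task, prev_tasks, chunking_config)
    return ck_num  # written back via DocumentService.update_by_id(doc["id"], {"chunk_num": ck_num})
```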
2 changes: 2 additions & 0 deletions conf/infinity_mapping.json
@@ -16,6 +16,8 @@
"content_with_weight": {"type": "varchar", "default": ""},
"content_ltks": {"type": "varchar", "default": "", "analyzer": "whitespace"},
"content_sm_ltks": {"type": "varchar", "default": "", "analyzer": "whitespace"},
"authors_tks": {"type": "varchar", "default": "", "analyzer": "whitespace"},
"authors_sm_tks": {"type": "varchar", "default": "", "analyzer": "whitespace"},
"page_num_int": {"type": "varchar", "default": ""},
"top_int": {"type": "varchar", "default": ""},
"position_int": {"type": "varchar", "default": ""},
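The two new columns follow the existing tokenized-field pattern (compare `content_ltks` / `content_sm_ltks`): whitespace-analyzed author tokens so author names become searchable in the Infinity store. A rough sketch of how a chunk might fill them, assuming RAGFlow's `rag_tokenizer` helpers; only the field names come from the mapping above:

```python
# Illustrative only: populate the new author-token fields on a chunk dict
# before it is written to the document store.
from rag.nlp import rag_tokenizer

def add_author_tokens(chunk: dict, authors: str) -> dict:
    chunk["authors_tks"] = rag_tokenizer.tokenize(authors)  # coarse tokens, whitespace analyzer
    chunk["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(chunk["authors_tks"])  # finer-grained tokens
    return chunk
```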
3 changes: 2 additions & 1 deletion deepdoc/vision/__init__.py
@@ -15,9 +15,10 @@

from .ocr import OCR
from .recognizer import Recognizer
from .layout_recognizer import LayoutRecognizer
from .layout_recognizer import LayoutRecognizer4YOLOv10 as LayoutRecognizer
from .table_structure_recognizer import TableStructureRecognizer


def init_in_out(args):
from PIL import Image
import os
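Because the new class is re-exported under the old name, downstream code that imports `LayoutRecognizer` from `deepdoc.vision` transparently picks up the YOLOv10 implementation. A tiny illustration of what the alias means:

```python
# Illustrative check: after this change both names resolve to the same class.
from deepdoc.vision import LayoutRecognizer
from deepdoc.vision.layout_recognizer import LayoutRecognizer4YOLOv10

assert LayoutRecognizer is LayoutRecognizer4YOLOv10
```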
88 changes: 88 additions & 0 deletions deepdoc/vision/layout_recognizer.py
@@ -14,11 +14,14 @@
import re
from collections import Counter
from copy import deepcopy

import cv2
import numpy as np
from huggingface_hub import snapshot_download

from api.utils.file_utils import get_project_base_directory
from deepdoc.vision import Recognizer
from deepdoc.vision.operators import nms


class LayoutRecognizer(Recognizer):
@@ -149,3 +152,88 @@ def findLayout(ty):

ocr_res = [b for b in ocr_res if b["text"].strip() not in garbag_set]
return ocr_res, page_layout


class LayoutRecognizer4YOLOv10(LayoutRecognizer):
labels = [
"title",
"Text",
"Reference",
"Figure",
"Figure caption",
"Table",
"Table caption",
"Table caption",
"Equation",
"Figure caption",
]

def __init__(self, domain):
domain = "layout"
super().__init__(domain)
self.auto = False
self.scaleFill = False
self.scaleup = True
self.stride = 32
self.center = True

def preprocess(self, image_list):
inputs = []
new_shape = self.input_shape # height, width
for img in image_list:
shape = img.shape[:2]# current shape [height, width]
# Scale ratio (new / old)
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
# Compute padding
new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
dw /= 2 # divide padding into 2 sides
dh /= 2
ww, hh = new_unpad
img = np.array(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)).astype(np.float32)
img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
top, bottom = int(round(dh - 0.1)) if self.center else 0, int(round(dh + 0.1))
left, right = int(round(dw - 0.1)) if self.center else 0, int(round(dw + 0.1))
img = cv2.copyMakeBorder(
img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
) # add border
img /= 255.0
img = img.transpose(2, 0, 1)
img = img[np.newaxis, :, :, :].astype(np.float32)
inputs.append({self.input_names[0]: img, "scale_factor": [shape[1]/ww, shape[0]/hh, dw, dh]})

return inputs

def postprocess(self, boxes, inputs, thr):
thr = 0.08
boxes = np.squeeze(boxes)
scores = boxes[:, 4]
boxes = boxes[scores > thr, :]
scores = scores[scores > thr]
if len(boxes) == 0:
return []
class_ids = boxes[:, -1].astype(int)
boxes = boxes[:, :4]
boxes[:, 0] -= inputs["scale_factor"][2]
boxes[:, 2] -= inputs["scale_factor"][2]
boxes[:, 1] -= inputs["scale_factor"][3]
boxes[:, 3] -= inputs["scale_factor"][3]
input_shape = np.array([inputs["scale_factor"][0], inputs["scale_factor"][1], inputs["scale_factor"][0],
inputs["scale_factor"][1]])
boxes = np.multiply(boxes, input_shape, dtype=np.float32)

unique_class_ids = np.unique(class_ids)
indices = []
for class_id in unique_class_ids:
class_indices = np.where(class_ids == class_id)[0]
class_boxes = boxes[class_indices, :]
class_scores = scores[class_indices]
class_keep_boxes = nms(class_boxes, class_scores, 0.45)
indices.extend(class_indices[class_keep_boxes])

return [{
"type": self.label_list[class_ids[i]].lower(),
"bbox": [float(t) for t in boxes[i].tolist()],
"score": float(scores[i])
} for i in indices]

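`preprocess` is standard YOLO letterboxing: scale the page to fit the model's square input, pad the remainder with gray (114), and remember the scale and padding; `postprocess` thresholds the raw detections, undoes the padding and scaling to return to page coordinates, and applies per-class NMS. A worked example of that arithmetic, assuming a 640×640 `input_shape` (the actual shape comes from the ONNX model, so the numbers are illustrative):

```python
# Letterbox arithmetic from preprocess()/postprocess() above, for an 800x1000 (h x w) page
# and an assumed 640x640 model input.
h, w = 800, 1000
new_h, new_w = 640, 640

r = min(new_h / h, new_w / w)                       # 0.64: single scale that fits both sides
resized_w, resized_h = round(w * r), round(h * r)   # 640 x 512 after cv2.resize
dw = (new_w - resized_w) / 2                        # 0   -> no horizontal padding
dh = (new_h - resized_h) / 2                        # 64  -> 64 px of gray padding top and bottom

# postprocess() inverts this: subtract the padding, then rescale to page pixels.
scale_x, scale_y = w / resized_w, h / resized_h     # both 1.5625 == 1 / r
x1m, y1m, x2m, y2m = 100.0, 80.0, 300.0, 200.0      # a detection in 640x640 model space
page_box = [(x1m - dw) * scale_x, (y1m - dh) * scale_y,
            (x2m - dw) * scale_x, (y2m - dh) * scale_y]
print(page_box)                                     # [156.25, 25.0, 468.75, 212.5]
```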
26 changes: 26 additions & 0 deletions deepdoc/vision/operators.py
@@ -709,3 +709,29 @@ def preprocess(im, preprocess_ops):
for operator in preprocess_ops:
im, im_info = operator(im, im_info)
return im, im_info


def nms(bboxes, scores, iou_thresh):
import numpy as np
x1 = bboxes[:, 0]
y1 = bboxes[:, 1]
x2 = bboxes[:, 2]
y2 = bboxes[:, 3]
areas = (y2 - y1) * (x2 - x1)

indices = []
index = scores.argsort()[::-1]
while index.size > 0:
i = index[0]
indices.append(i)
x11 = np.maximum(x1[i], x1[index[1:]])
y11 = np.maximum(y1[i], y1[index[1:]])
x22 = np.minimum(x2[i], x2[index[1:]])
y22 = np.minimum(y2[i], y2[index[1:]])
w = np.maximum(0, x22 - x11 + 1)
h = np.maximum(0, y22 - y11 + 1)
overlaps = w * h
ious = overlaps / (areas[i] + areas[index[1:]] - overlaps)
idx = np.where(ious <= iou_thresh)[0]
index = index[idx + 1]
return indices
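`nms` is plain greedy non-maximum suppression: boxes are visited in descending score order, and every remaining box whose IoU with a kept box exceeds `iou_thresh` is discarded. A small usage sketch (the boxes and scores are made up):

```python
# Tiny usage sketch for nms(): two heavily overlapping boxes plus one disjoint box.
import numpy as np
from deepdoc.vision.operators import nms

boxes = np.array([
    [0.0, 0.0, 10.0, 10.0],     # score 0.9, kept
    [1.0, 1.0, 10.0, 10.0],     # score 0.8, IoU with the first box is far above 0.45 -> suppressed
    [20.0, 20.0, 30.0, 30.0],   # score 0.7, disjoint from both -> kept
])
scores = np.array([0.9, 0.8, 0.7])

keep = nms(boxes, scores, iou_thresh=0.45)
print([int(i) for i in keep])   # [0, 2]
```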