Skip to content

Commit

Permalink
Merge branch 'infiniflow:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
isthaison authored Dec 2, 2024
2 parents 6e4f401 + 59a5813 commit eb5f2f1
Show file tree
Hide file tree
Showing 17 changed files with 267 additions and 162 deletions.
4 changes: 2 additions & 2 deletions README_zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -332,9 +332,9 @@ RAGFlow 只有通过开源协作才能蓬勃发展。秉持这一精神,我们

## 👥 加入社区

扫二维码添加 RAGFlow 小助手,进 RAGFlow 交流群。
扫二维码添加 InfiniFlow 小助手,进 RAGFlow 交流群。

<p align="center">
<img src="https://github.com/infiniflow/ragflow/assets/7248/bccf284f-46f2-4445-9809-8f1030fb7585" width=50% height=50%>
<img src="https://github.com/user-attachments/assets/87095713-7ad2-4c48-bd11-10030d0e30ae" width=50% height=50%>
</p>

5 changes: 5 additions & 0 deletions api/apps/chunk_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@ def list_chunk():
sres = settings.retrievaler.search(query, search.index_name(tenant_id), kb_ids, highlight=True)
res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
for id in sres.ids:
chunk_elem = sres.field[id]
if 'position_list' in chunk_elem:
if isinstance(chunk_elem["position_list"], str):
chunk_elem.pop('position_list') # Infinity will store position list as empty str

d = {
"chunk_id": id,
"content_with_weight": rmSpace(sres.highlight[id]) if question and id in sres.highlight else sres.field[
Expand Down
2 changes: 2 additions & 0 deletions api/ragflow_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
from api.db.init_data import init_web_data
from api.versions import get_ragflow_version
from api.utils import show_configs
from rag.settings import print_rag_settings


def update_progress():
Expand Down Expand Up @@ -75,6 +76,7 @@ def update_progress():
)
show_configs()
settings.init_settings()
print_rag_settings()

# init db
init_web_db()
Expand Down
36 changes: 36 additions & 0 deletions conf/llm_factories.json
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,18 @@
"tags": "TEXT EMBEDDING",
"max_tokens": 8196,
"model_type": "embedding"
},
{
"llm_name": "jina-reranker-v2-base-multilingual",
"tags": "RE-RANK,8k",
"max_tokens": 8196,
"model_type": "rerank"
},
{
"llm_name": "jina-embeddings-v3",
"tags": "TEXT EMBEDDING",
"max_tokens": 8196,
"model_type": "embedding"
}
]
},
Expand Down Expand Up @@ -2432,6 +2444,18 @@
"max_tokens": 4000,
"model_type": "embedding"
},
{
"llm_name": "voyage-3",
"tags": "TEXT EMBEDDING,32000",
"max_tokens": 32000,
"model_type": "embedding"
},
{
"llm_name": "voyage-3-lite",
"tags": "TEXT EMBEDDING,32000",
"max_tokens": 32000,
"model_type": "embedding"
},
{
"llm_name": "rerank-1",
"tags": "RE-RANK, 8000",
Expand All @@ -2443,6 +2467,18 @@
"tags": "RE-RANK, 4000",
"max_tokens": 4000,
"model_type": "rerank"
},
{
"llm_name": "rerank-2",
"tags": "RE-RANK, 16000",
"max_tokens": 16000,
"model_type": "rerank"
},
{
"llm_name": "rerank-2-lite",
"tags": "RE-RANK, 8000",
"max_tokens": 8000,
"model_type": "rerank"
}
]
},
Expand Down
19 changes: 11 additions & 8 deletions rag/app/book.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,30 +26,33 @@
class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
callback(msg="OCR is running...")
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback)
callback(msg="OCR finished")
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))

from timeit import default_timer as timer
start = timer()
self._layouts_rec(zoomin)
callback(0.67, "Layout analysis finished")
callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start))
logging.debug("layouts: {}".format(timer() - start))

start = timer()
self._table_transformer_job(zoomin)
callback(0.68, "Table analysis finished")
callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start))

start = timer()
self._text_merge()
tbls = self._extract_table_figure(True, zoomin, True, True)
self._naive_vertical_merge()
self._filter_forpages()
self._merge_with_same_bullet()
callback(0.75, "Text merging finished.")

callback(0.8, "Text extraction finished")
callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))

return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
for b in self.boxes], tbls
Expand Down
11 changes: 6 additions & 5 deletions rag/app/laws.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,25 +108,26 @@ def __init__(self):

def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
callback(msg="OCR is running...")
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
callback(msg="OCR finished")
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))

from timeit import default_timer as timer
start = timer()
self._layouts_rec(zoomin)
callback(0.67, "Layout analysis finished")
callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start))
logging.debug("layouts:".format(
))
self._naive_vertical_merge()

callback(0.8, "Text extraction finished")
callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))

return [(b["text"], self._line_tag(b, zoomin))
for b in self.boxes], None
Expand Down
15 changes: 10 additions & 5 deletions rag/app/manual.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,30 +36,35 @@ def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR is running...")
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
callback(msg="OCR finished.")
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
# for bb in self.boxes:
# for b in bb:
# print(b)
logging.debug("OCR: {}".format(timer() - start))

start = timer()
self._layouts_rec(zoomin)
callback(0.65, "Layout analysis finished.")
callback(0.65, "Layout analysis ({:.2f}s)".format(timer() - start))
logging.debug("layouts: {}".format(timer() - start))

start = timer()
self._table_transformer_job(zoomin)
callback(0.67, "Table analysis finished.")
callback(0.67, "Table analysis ({:.2f}s)".format(timer() - start))

start = timer()
self._text_merge()
tbls = self._extract_table_figure(True, zoomin, True, True)
self._concat_downward()
self._filter_forpages()
callback(0.68, "Text merging finished")
callback(0.68, "Text merged ({:.2f}s)".format(timer() - start))

# clean mess
for b in self.boxes:
Expand Down
43 changes: 23 additions & 20 deletions rag/app/naive.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,30 +124,35 @@ class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
start = timer()
callback(msg="OCR is running...")
first_start = start
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
callback(msg="OCR finished")
logging.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))

start = timer()
self._layouts_rec(zoomin)
callback(0.63, "Layout analysis finished.")
callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))

start = timer()
self._table_transformer_job(zoomin)
callback(0.65, "Table analysis finished.")
callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))

start = timer()
self._text_merge()
callback(0.67, "Text merging finished")
callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
tbls = self._extract_table_figure(True, zoomin, True, True)
# self._naive_vertical_merge()
self._concat_downward()
# self._filter_forpages()

logging.info("layouts cost: {}s".format(timer() - start))
logging.info("layouts cost: {}s".format(timer() - first_start))
return [(b["text"], self._line_tag(b, zoomin))
for b in self.boxes], tbls

Expand All @@ -170,7 +175,7 @@ def __call__(self, filename, binary=None):
else:
if sections and sections[-1][0].strip().find("#") == 0:
sec_, _ = sections.pop(-1)
sections.append((sec_+"\n"+sec, ""))
sections.append((sec_ + "\n" + sec, ""))
else:
sections.append((sec, ""))

Expand All @@ -188,7 +193,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
Next, these successive pieces are merged into chunks whose token number is no more than 'Max token number'.
"""

eng = lang.lower() == "english" # is_english(cks)
is_english = lang.lower() == "english" # is_english(cks)
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
Expand All @@ -201,8 +206,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_parser = None
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections, tbls = Docx()(filename, binary)
res = tokenize_table(tbls, doc, eng) # just for table
sections, tables = Docx()(filename, binary)
res = tokenize_table(tables, doc, is_english) # just for table

callback(0.8, "Finish parsing.")
st = timer()
Expand All @@ -215,16 +220,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if kwargs.get("section_only", False):
return chunks

res.extend(tokenize_chunks_docx(chunks, doc, eng, images))
res.extend(tokenize_chunks_docx(chunks, doc, is_english, images))
logging.info("naive_merge({}): {}".format(filename, timer() - st))
return res

elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf(
) if parser_config.get("layout_recognize", True) else PlainParser()
sections, tbls = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
res = tokenize_table(tbls, doc, eng)
pdf_parser = Pdf() if parser_config.get("layout_recognize", True) else PlainParser()
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
res = tokenize_table(tables, doc, is_english)

elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
Expand All @@ -243,8 +246,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections, tbls = Markdown(int(parser_config.get("chunk_token_num", 128)))(filename, binary)
res = tokenize_table(tbls, doc, eng)
sections, tables = Markdown(int(parser_config.get("chunk_token_num", 128)))(filename, binary)
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")

elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
Expand Down Expand Up @@ -284,7 +287,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if kwargs.get("section_only", False):
return chunks

res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
logging.info("naive_merge({}): {}".format(filename, timer() - st))
return res

Expand Down
17 changes: 11 additions & 6 deletions rag/app/one.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,25 +24,30 @@
class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
callback(msg="OCR is running...")
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
callback(msg="OCR finished")
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))

from timeit import default_timer as timer
start = timer()
self._layouts_rec(zoomin, drop=False)
callback(0.63, "Layout analysis finished.")
callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
logging.debug("layouts cost: {}s".format(timer() - start))

start = timer()
self._table_transformer_job(zoomin)
callback(0.65, "Table analysis finished.")
callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))

start = timer()
self._text_merge()
callback(0.67, "Text merging finished")
callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
tbls = self._extract_table_figure(True, zoomin, True, True)
self._concat_downward()

Expand Down
Loading

0 comments on commit eb5f2f1

Please sign in to comment.