Skip to content

Commit

Permalink
Added support for extracting info from image in the docs (#120)
Browse files Browse the repository at this point in the history
Signed-off-by: Ye, Xinyu <xinyu.ye@intel.com>
  • Loading branch information
XinyuYe-Intel authored Jun 26, 2024
1 parent 72a48d0 commit e237454
Show file tree
Hide file tree
Showing 7 changed files with 95 additions and 22 deletions.
7 changes: 7 additions & 0 deletions comps/dataprep/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@

The Dataprep Microservice aims to preprocess data from various sources (either structured or unstructured data) into text data, convert the text data to embedding vectors, and then store them in the database.

## Install Requirements

```bash
apt-get update
apt-get install libreoffice
```

## Use LVM (Large Vision Model) for Summarizing Image Data

Occasionally unstructured data will contain image data; to convert the image data to text data, an LVM (Large Vision Model) can be used to summarize the image. To leverage an LVM, please refer to this [readme](../lvms/README.md) to start the LVM microservice first, and then set the environment variable below before starting any dataprep microservice.
Expand Down
4 changes: 4 additions & 0 deletions comps/dataprep/milvus/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
beautifulsoup4
cairosvg
docarray[full]
docx2txt
easyocr
fastapi
frontend==0.0.3
Expand All @@ -8,6 +10,7 @@ langchain
langchain-community
langchain-text-splitters
langchain_milvus
markdown
numpy
opentelemetry-api
opentelemetry-exporter-otlp
Expand All @@ -19,6 +22,7 @@ pydantic==2.7.3
pymilvus==2.4.3
pymupdf==1.24.5
python-docx==0.8.11
python-pptx
sentence_transformers
shortuuid
unstructured[all-docs]==0.11.5
3 changes: 2 additions & 1 deletion comps/dataprep/qdrant/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ apt-get install poppler-utils -y

## Start Qdrant Server

Please refer to this [readme](../../../vectorstores/langchain/qdrant/README.md).
Please refer to this [readme](../../vectorstores/langchain/qdrant/README.md).

## Setup Environment Variables

Expand All @@ -24,6 +24,7 @@ export https_proxy=${your_http_proxy}
export QDRANT=${host_ip}
export QDRANT_PORT=6333
export COLLECTION_NAME=${your_collection_name}
export PYTHONPATH=${path_to_comps}
```

## Start Document Preparation Microservice for Qdrant with Python Script
Expand Down
4 changes: 4 additions & 0 deletions comps/dataprep/qdrant/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
beautifulsoup4
cairosvg
docarray[full]
docx2txt
easyocr
fastapi
huggingface_hub
langchain
langchain-community
langchain-text-splitters
markdown
numpy
opentelemetry-api
opentelemetry-exporter-otlp
Expand All @@ -15,6 +18,7 @@ Pillow
prometheus-fastapi-instrumentator
pymupdf
python-docx
python-pptx
qdrant-client
sentence_transformers
shortuuid
Expand Down
3 changes: 2 additions & 1 deletion comps/dataprep/redis/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ cd langchain_ray; pip install -r requirements_ray.txt

## 1.2 Start Redis Stack Server

Please refer to this [readme](../../../vectorstores/langchain/redis/README.md).
Please refer to this [readme](../../vectorstores/langchain/redis/README.md).

## 1.3 Setup Environment Variables

Expand All @@ -41,6 +41,7 @@ export INDEX_NAME=${your_index_name}
export LANGCHAIN_TRACING_V2=true
export LANGCHAIN_API_KEY=${your_langchain_api_key}
export LANGCHAIN_PROJECT="opea/gen-ai-comps:dataprep"
export PYTHONPATH=${path_to_comps}
```

## 1.4 Start Document Preparation Microservice for Redis with Python Script
Expand Down
3 changes: 3 additions & 0 deletions comps/dataprep/redis/langchain/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
beautifulsoup4
cairosvg
docarray[full]
docx2txt
easyocr
fastapi
huggingface_hub
Expand All @@ -18,6 +20,7 @@ prometheus-fastapi-instrumentator
pymupdf
pyspark
python-docx
python-pptx
redis
sentence_transformers
shortuuid
Expand Down
93 changes: 73 additions & 20 deletions comps/dataprep/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,20 @@
import multiprocessing
import os
import re
import shutil
import signal
import timeit
import unicodedata
from urllib.parse import urlparse, urlunparse

import cairosvg
import docx
import docx2txt
import easyocr
import fitz
import numpy as np
import pandas as pd
import pptx
import requests
import yaml
from bs4 import BeautifulSoup
Expand All @@ -27,7 +32,6 @@
UnstructuredHTMLLoader,
UnstructuredImageLoader,
UnstructuredMarkdownLoader,
UnstructuredPowerPointLoader,
UnstructuredXMLLoader,
)
from langchain_community.llms import HuggingFaceEndpoint
Expand Down Expand Up @@ -131,32 +135,81 @@ def load_txt(txt_path):

def load_doc(doc_path):
    """Convert a legacy .doc file to .docx via LibreOffice, then extract its text.

    Args:
        doc_path: Path to the input ``.doc`` file.

    Returns:
        Text extracted from the converted ``.docx`` (delegates to ``load_docx``,
        which also pulls text out of embedded images).

    Raises:
        AssertionError: If the LibreOffice conversion produced no output file
            (LibreOffice missing or conversion failed).
    """
    print("Converting doc file to docx file...")
    docx_path = doc_path + "x"
    # Quote both paths so filenames containing spaces survive the shell
    # command; dirname can be "" for a bare filename, so fall back to ".".
    outdir = os.path.dirname(docx_path) or "."
    os.system(f'libreoffice --headless --invisible --convert-to docx --outdir "{outdir}" "{doc_path}"')
    if not os.path.exists(docx_path):
        # os.system does not raise on failure, so verify the output exists.
        raise AssertionError(
            "LibreOffice conversion failed or LibreOffice is not installed; "
            'if not installed, use "apt-get update && apt-get install -y libreoffice" to install it.'
        )
    print("Converted doc file to docx file.")
    try:
        text = load_docx(docx_path)
    finally:
        # Always clean up the intermediate .docx, even if extraction fails.
        os.remove(docx_path)
    return text


def load_docx(docx_path):
    """Load a .docx file, extracting paragraph text and text from embedded images.

    Images referenced by the document are exported to a temporary directory,
    run through ``load_image`` (OCR / LVM summarization), and the recognized
    text is appended immediately after the paragraph that embeds the image.

    Args:
        docx_path: Path to the ``.docx`` file.

    Returns:
        The concatenated document text, one paragraph (or image caption) per line.
    """
    import tempfile

    doc = docx.Document(docx_path)
    text = ""
    # Map each relationship id (rId) to the image file name it points at so
    # paragraphs can be matched to the images they embed.
    rid2img = {}
    for rel in doc.part.rels.values():
        if isinstance(rel._target, docx.parts.image.ImagePart):
            rid2img[rel.rId] = os.path.basename(rel._target.partname)
    save_path = None
    if rid2img:
        # Use a throwaway directory instead of a fixed "./imgs/" so we never
        # clobber (and later rmtree) a directory the caller already owns.
        save_path = tempfile.mkdtemp(prefix="docx_imgs_")
        docx2txt.process(docx_path, save_path)
    try:
        for paragraph in doc.paragraphs:
            if hasattr(paragraph, "text"):
                text += paragraph.text + "\n"
            # "graphicData" in the raw paragraph XML signals an inline drawing.
            if "graphicData" in paragraph._p.xml:
                for rid, img_name in rid2img.items():
                    if rid in paragraph._p.xml:
                        img_text = load_image(os.path.join(save_path, img_name))
                        if img_text:
                            text += img_text + "\n"
    finally:
        # Remove the extracted images even if image loading raised.
        if save_path:
            shutil.rmtree(save_path)
    return text


def load_ppt(ppt_path):
    """Convert a legacy .ppt file to .pptx via LibreOffice, then extract its text.

    Args:
        ppt_path: Path to the input ``.ppt`` file.

    Returns:
        Text extracted from the converted ``.pptx`` (delegates to ``load_pptx``,
        which also pulls text out of embedded images and tables).

    Raises:
        AssertionError: If the LibreOffice conversion produced no output file.
    """
    print("Converting ppt file to pptx file...")
    pptx_path = ppt_path + "x"
    # Quote both paths so filenames containing spaces survive the shell
    # command; dirname can be "" for a bare filename, so fall back to ".".
    outdir = os.path.dirname(pptx_path) or "."
    os.system(f'libreoffice --headless --invisible --convert-to pptx --outdir "{outdir}" "{ppt_path}"')
    if not os.path.exists(pptx_path):
        # os.system does not raise on failure, so verify the output exists.
        raise AssertionError(
            "LibreOffice conversion failed or LibreOffice is not installed; "
            'if not installed, use "apt-get update && apt-get install -y libreoffice" to install it.'
        )
    print("Converted ppt file to pptx file.")
    try:
        text = load_pptx(pptx_path)
    finally:
        # Always clean up the intermediate .pptx, even if extraction fails.
        os.remove(pptx_path)
    return text


def load_pptx(pptx_path):
    """Load a .pptx file: slide text, table contents, and text from pictures.

    Shapes are visited top-to-bottom, left-to-right on each slide. Picture
    shapes are written to a temporary file and passed to ``load_image``
    (OCR / LVM summarization); the recognized text is appended in place.

    Args:
        pptx_path: Path to the ``.pptx`` file.

    Returns:
        The concatenated presentation text, newline-separated.
    """
    text = ""
    prs = pptx.Presentation(pptx_path)
    for slide in prs.slides:
        # shape.top / shape.left can be None for unplaced shapes; default
        # them to 0 so the sort key stays comparable and never raises.
        for shape in sorted(slide.shapes, key=lambda s: (s.top or 0, s.left or 0)):
            if shape.has_text_frame and shape.text:
                text += shape.text + "\n"
            if shape.has_table:
                rows_text = []
                for row in shape.table.rows:
                    if hasattr(row, "cells"):
                        rows_text.append(
                            "\t".join(cell.text if hasattr(cell, "text") else "" for cell in row.cells)
                        )
                table_contents = "\n".join(rows_text)
                if table_contents:
                    text += table_contents + "\n"
            # Only Picture shapes expose .image; hasattr is False elsewhere.
            if hasattr(shape, "image") and hasattr(shape.image, "blob"):
                img_path = f"./{shape.image.filename}"
                with open(img_path, "wb") as f:
                    f.write(shape.image.blob)
                try:
                    img_text = load_image(img_path)
                finally:
                    # Remove the temp image even if load_image raised.
                    os.remove(img_path)
                if img_text:
                    text += img_text + "\n"
    return text


Expand Down Expand Up @@ -214,13 +267,11 @@ def load_image(image_path):
return response.json()["text"].strip()
loader = UnstructuredImageLoader(image_path)
text = loader.load()[0].page_content
return text
return text.strip()


def load_svg(svg_path):
"""Load the svg file."""
import cairosvg

png_path = svg_path.replace(".svg", ".png")
cairosvg.svg2png(url=svg_path, write_to=png_path)
text = load_image(png_path)
Expand All @@ -239,7 +290,9 @@ def document_loader(doc_path):
return load_doc(doc_path)
elif doc_path.endswith(".docx"):
return load_docx(doc_path)
elif doc_path.endswith(".pptx") or doc_path.endswith(".ppt"):
elif doc_path.endswith(".ppt"):
return load_ppt(doc_path)
elif doc_path.endswith(".pptx"):
return load_pptx(doc_path)
elif doc_path.endswith(".md"):
return load_md(doc_path)
Expand All @@ -261,7 +314,7 @@ def document_loader(doc_path):
):
return load_image(doc_path)
elif doc_path.endswith(".svg"):
return load_image(doc_path)
return load_svg(doc_path)
else:
raise NotImplementedError(
"Current only support pdf, html, txt, doc, docx, pptx, ppt, md, xml"
Expand Down

0 comments on commit e237454

Please sign in to comment.