Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: 优化doc、ppt等旧office格式文档的全文搜索 #171

Merged
merged 2 commits into from
Nov 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>3.2.4</version>
<version>3.3.5</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<groupId>com.jmal</groupId>
Expand Down Expand Up @@ -243,6 +243,11 @@
<artifactId>poi-ooxml</artifactId>
<version>5.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>5.3.0</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
Expand Down Expand Up @@ -343,7 +348,7 @@
<dependency>
<groupId>io.minio</groupId>
<artifactId>minio</artifactId>
<version>8.5.9</version>
<version>8.5.13</version>
</dependency>
<!-- 阿里云 OSS -->
<dependency>
Expand All @@ -361,7 +366,7 @@
<dependency>
<groupId>com.qcloud</groupId>
<artifactId>cos_api</artifactId>
<version>5.6.213</version>
<version>5.6.234</version>
<exclusions>
<exclusion>
<groupId>commons-logging</groupId>
Expand Down
6 changes: 4 additions & 2 deletions src/main/java/com/jmal/clouddisk/lucene/LuceneService.java
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,9 @@ private String readFileContent(File file, String fileId) {
case "doc", "docx" -> {
return readContentService.readWordContent(file);
}
case "xls", "xlsx" -> {
return readContentService.readExcelContent(file);
}
}
if (fileProperties.getSimText().contains(type)) {
String charset = UniversalDetector.detectCharset(file);
Expand Down Expand Up @@ -707,8 +710,7 @@ public void processFilesToBeIndexed() throws IOException {
}
List<org.bson.Document> pipeline = Arrays.asList(new org.bson.Document("$match", new org.bson.Document("index", 0)), new org.bson.Document("$project", new org.bson.Document("_id", 1)), new org.bson.Document("$limit", 8));
AggregateIterable<org.bson.Document> aggregateIterable = mongoTemplate.getCollection(CommonFileService.COLLECTION_NAME).aggregate(pipeline);
while (aggregateIterable.iterator().hasNext()) {
org.bson.Document document = aggregateIterable.iterator().next();
for (org.bson.Document document : aggregateIterable) {
String fileId = document.getObjectId("_id").toHexString();
FileIntroVO fileIntroVO = getFileIntroVO(fileId);
if (fileIntroVO != null) {
Expand Down
135 changes: 115 additions & 20 deletions src/main/java/com/jmal/clouddisk/lucene/ReadContentService.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,20 @@
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hslf.usermodel.HSLFShape;
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
import org.apache.poi.hslf.usermodel.HSLFTextShape;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OLE2NotOfficeXmlFileException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFShape;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.poi.xslf.usermodel.XSLFTextShape;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.jsoup.Jsoup;
Expand All @@ -41,6 +51,7 @@
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.List;
import java.util.regex.Pattern;

@Service
@RequiredArgsConstructor
Expand All @@ -57,7 +68,8 @@ public class ReadContentService {

/**
* 将 DWG 文件转换为 MXWeb 文件
* @param file 文件
*
* @param file 文件
* @param fileId 文件 ID
* @return MXWeb 文件路径
*/
Expand Down Expand Up @@ -92,7 +104,7 @@ public String readPdfContent(File file, String fileId) {
if (!text.isEmpty()) {
content.append(text);
} else {
taskProgressService.addTaskProgress(file,TaskType.OCR, pageNumber + "/" + document.getNumberOfPages());
taskProgressService.addTaskProgress(file, TaskType.OCR, pageNumber + "/" + document.getNumberOfPages());
PDPage page = document.getPage(pageNumber - 1);
PDResources resources = page.getResources();
for (COSName xObjectName : resources.getXObjectNames()) {
Expand Down Expand Up @@ -131,7 +143,7 @@ public String readEpubContent(File file, String fileId) {

// 生成封面图像
String username = commonFileService.getUsernameByAbsolutePath(Path.of(file.getAbsolutePath()));
if (StrUtil.isNotBlank(fileId)) {
if (StrUtil.isNotBlank(fileId)) {
File coverFile = FileContentUtil.epubCoverImage(book, videoProcessService.getVideoCacheDir(username, fileId));
commonFileService.updateCoverFileDocument(fileId, coverFile);
}
Expand All @@ -158,36 +170,119 @@ public String readEpubContent(File file, String fileId) {
}

public String readPPTContent(File file) {
try (FileInputStream fis = new FileInputStream(file.getAbsolutePath());
XMLSlideShow ppt = new XMLSlideShow(fis)) {
StringBuilder stringBuilder = new StringBuilder();
for (XSLFSlide slide : ppt.getSlides()) {
for (XSLFShape shape : slide.getShapes()) {
if (shape instanceof XSLFTextShape textShape) {
stringBuilder.append(textShape.getText());
StringBuilder stringBuilder = new StringBuilder();
String fileName = file.getName().toLowerCase();

try (FileInputStream fis = new FileInputStream(file)) {
if (fileName.endsWith(".pptx")) {
// 读取 .pptx 文件
try (XMLSlideShow pptx = new XMLSlideShow(fis)) {
for (XSLFSlide slide : pptx.getSlides()) {
for (XSLFShape shape : slide.getShapes()) {
if (shape instanceof XSLFTextShape textShape) {
stringBuilder.append(textShape.getText()).append(" ");
}
}
}
}
} else if (fileName.endsWith(".ppt")) {
// 读取 .ppt 文件
try (HSLFSlideShow ppt = new HSLFSlideShow(fis)) {
for (org.apache.poi.hslf.usermodel.HSLFSlide slide : ppt.getSlides()) {
for (HSLFShape shape : slide.getShapes()) {
if (shape instanceof HSLFTextShape textShape) {
stringBuilder.append(textShape.getText()).append(" ");
}
}
}
}
} else {
throw new IllegalArgumentException("不支持的文件格式");
}
return stringBuilder.toString();
} catch (IOException e) {
FileContentUtil.readFailed(file, e);
FileContentUtil.readFailed(file, e);
}
return null;

return stringBuilder.toString().trim();
}

public String readWordContent(File file) {
try (FileInputStream fis = new FileInputStream(file.getAbsolutePath());
XWPFDocument document = new XWPFDocument(fis)) {
StringBuilder stringBuilder = new StringBuilder();
List<XWPFParagraph> paragraphs = document.getParagraphs();
for (XWPFParagraph para : paragraphs) {
stringBuilder.append(para.getText());
try (FileInputStream fis = new FileInputStream(file)) {
try {
// 尝试读取 OOXML 格式 (.docx) 文件
XWPFDocument document = new XWPFDocument(fis);
StringBuilder stringBuilder = new StringBuilder();
List<XWPFParagraph> paragraphs = document.getParagraphs();
for (XWPFParagraph para : paragraphs) {
stringBuilder.append(para.getText());
}
return stringBuilder.toString();
} catch (OLE2NotOfficeXmlFileException e) {
// 如果文件不是 OOXML 格式,尝试读取 OLE2 格式 (.doc) 文件
try (FileInputStream fis2 = new FileInputStream(file);
POIFSFileSystem poifs = new POIFSFileSystem(fis2);
HWPFDocument doc = new HWPFDocument(poifs)) {
WordExtractor extractor = new WordExtractor(doc);
return extractor.getText();
}
}
return stringBuilder.toString();
} catch (IOException e) {
FileContentUtil.readFailed(file, e);
}
return null;
}

/**
 * One or more characters, none of which is an ASCII digit ({@code 0-9}).
 * When applied with {@code Matcher.matches()}, it accepts only non-empty strings that
 * contain no digit anywhere; a string with even a single digit is rejected as a whole.
 */
private static final Pattern NON_NUMERIC_PATTERN = Pattern.compile("[^0-9]+");

/**
 * Extracts the digit-free cell text of an Excel workbook.
 * <p>
 * The format is chosen from the lower-cased file name: {@code .xlsx} is read with
 * {@link XSSFWorkbook}, {@code .xls} with {@link HSSFWorkbook}. Every cell of every sheet
 * is converted with {@code getCellValueAsString}; only values that fully match
 * {@code NON_NUMERIC_PATTERN} are kept, each followed by a single space.
 *
 * @param file the workbook to read
 * @return the collected text with leading/trailing whitespace removed; if an
 *         {@link IOException} occurs it is passed to {@code FileContentUtil.readFailed}
 *         and whatever text was collected before the failure (possibly empty) is returned
 * @throws IllegalArgumentException if the file name ends with neither {@code .xlsx} nor {@code .xls}
 */
public String readExcelContent(File file) {
    StringBuilder content = new StringBuilder();
    // Lower-case the name so upper-case extensions such as ".XLSX" are accepted,
    // matching how readPPTContent detects the format.
    String fileName = file.getName().toLowerCase();
    boolean isXlsx = fileName.endsWith(".xlsx");
    boolean isXls = fileName.endsWith(".xls");

    try (FileInputStream fis = new FileInputStream(file)) {
        if (!isXlsx && !isXls) {
            throw new IllegalArgumentException("不支持的文件格式");
        }
        // Workbook is Closeable; close it so the resources it holds are released.
        try (Workbook workbook = isXlsx ? new XSSFWorkbook(fis) : new HSSFWorkbook(fis)) {
            for (Sheet sheet : workbook) {
                for (Row row : sheet) {
                    for (Cell cell : row) {
                        String cellValue = getCellValueAsString(cell);
                        // Drop values containing digits; keep text-only values
                        if (NON_NUMERIC_PATTERN.matcher(cellValue).matches()) {
                            content.append(cellValue).append(" ");
                        }
                    }
                }
            }
        }
    } catch (IOException e) {
        FileContentUtil.readFailed(file, e);
    }
    return content.toString().trim();
}

/**
 * Renders a spreadsheet cell as a string.
 * <ul>
 *   <li>string cells: the string value</li>
 *   <li>boolean cells: {@code "true"} / {@code "false"}</li>
 *   <li>numeric cells: {@code Date.toString()} when the cell is date-formatted,
 *       otherwise {@code Double.toString()} of the numeric value</li>
 *   <li>formula cells: the formula text itself, not its evaluated result</li>
 *   <li>blank cells and any other type: the empty string</li>
 * </ul>
 *
 * @param cell the cell to render, may be {@code null}
 * @return the textual form described above; {@code ""} for a {@code null} cell
 */
private String getCellValueAsString(Cell cell) {
    if (cell == null) {
        return "";
    }
    return switch (cell.getCellType()) {
        case STRING -> cell.getStringCellValue();
        case BOOLEAN -> Boolean.toString(cell.getBooleanCellValue());
        case NUMERIC -> DateUtil.isCellDateFormatted(cell)
                ? cell.getDateCellValue().toString()
                : Double.toString(cell.getNumericCellValue());
        case FORMULA -> cell.getCellFormula();
        // BLANK and every remaining cell type yield no text
        default -> "";
    };
}
}