Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Text Extraction taskrunner #4884

Merged
merged 6 commits into from
Sep 23, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions containers/scripts/docker-compose-taskrunner.yml
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,26 @@ services:
- ../../packages/equation_extraction:/equation_extraction_task
- ../../packages/taskrunner:/taskrunner
command: /equation_extraction_task/dev.sh

# Local-development taskrunner service for the "text_extraction" request type.
text_extraction-taskrunner:
build:
context: ../..
dockerfile: ./packages/text_extraction/Dockerfile
# Stops at the first Dockerfile stage, the one that installs the dev-only JDK/OCR tooling.
target: text_extraction_taskrunner_builder
container_name: text_extraction-taskrunner
networks:
- terarium
environment:
TERARIUM_MQ_ADDRESSES: "amqp://rabbitmq:5672"
# NOTE(review): broker credentials are hard-coded here; presumably acceptable for local development only - confirm.
TERARIUM_MQ_PASSWORD: "terarium123"
TERARIUM_MQ_USERNAME: "terarium"
# Same string as the TEXT_EXTRACTION task type value used by the server ("text_extraction").
TERARIUM_TASKRUNNER_REQUEST_TYPE: "text_extraction"
depends_on:
rabbitmq:
condition: service_healthy
extra_hosts:
- "${local_host_name}:host-gateway"
# Both source trees are bind-mounted; dev.sh installs them in editable mode at container start.
volumes:
- ../../packages/text_extraction:/text_extraction_task
- ../../packages/taskrunner:/taskrunner
command: /text_extraction_task/dev.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ public static enum TaskType {
@JsonAlias("equation_extraction")
EQUATION_EXTRACTION_CPU("equation_extraction"),
@JsonAlias("equation_extraction_gpu")
EQUATION_EXTRACTION_GPU("equation_extraction_gpu");
EQUATION_EXTRACTION_GPU("equation_extraction_gpu"),
@JsonAlias("text_extraction")
TEXT_EXTRACTION("text_extraction");

private final String value;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
import software.uncharted.terarium.hmiserver.service.notification.NotificationGroupInstance;
import software.uncharted.terarium.hmiserver.service.notification.NotificationService;
import software.uncharted.terarium.hmiserver.service.tasks.ExtractEquationsResponseHandler;
import software.uncharted.terarium.hmiserver.service.tasks.ExtractTextResponseHandler;
import software.uncharted.terarium.hmiserver.service.tasks.ModelCardResponseHandler;
import software.uncharted.terarium.hmiserver.service.tasks.TaskService;
import software.uncharted.terarium.hmiserver.utils.ByteMultipartFile;
Expand Down Expand Up @@ -191,10 +192,10 @@ public ExtractPDFResponse runExtractPDF(
final ExtractPDFResponse extractionResponse = new ExtractPDFResponse();

try {
notificationInterface.sendMessage("Starting COSMOS text extraction...");
log.info("Starting COSMOS text extraction for document: {}", documentName);
notificationInterface.sendMessage("Starting text extraction...");
log.info("Starting text extraction for document: {}", documentName);

final Future<CosmosTextExtraction> cosmosTextExtractionFuture = extractTextFromPDF(
final Future<TextExtraction> textExtractionFuture = extractTextFromPDF(
notificationInterface,
documentName,
documentContents
Expand All @@ -208,14 +209,14 @@ public ExtractPDFResponse runExtractPDF(
userId
);

// wait for cosmos text extraction
final CosmosTextExtraction cosmosTextExtraction = cosmosTextExtractionFuture.get();
notificationInterface.sendMessage("COSMOS text extraction complete!");
log.info("COSMOS text extraction complete for document: {}", documentName);
extractionResponse.documentAbstract = cosmosTextExtraction.documentAbstract;
extractionResponse.documentText = cosmosTextExtraction.documentText;
extractionResponse.assets = cosmosTextExtraction.assets;
extractionResponse.files = cosmosTextExtraction.files;
// wait for text extraction
final TextExtraction textExtraction = textExtractionFuture.get();
notificationInterface.sendMessage("Text extraction complete!");
log.info("Text extraction complete for document: {}", documentName);
extractionResponse.documentAbstract = textExtraction.documentAbstract;
extractionResponse.documentText = textExtraction.documentText;
extractionResponse.assets = textExtraction.assets;
extractionResponse.files = textExtraction.files;
kbirk marked this conversation as resolved.
Show resolved Hide resolved

try {
// wait for equation extraction
Expand Down Expand Up @@ -831,21 +832,52 @@ public Future<EquationExtraction> extractEquationsFromPDF(
});
}

static class CosmosTextExtraction {
/**
 * Result holder for a PDF text-extraction run. runExtractPDF copies each of these
 * fields into the ExtractPDFResponse it returns.
 */
static class TextExtraction {

// Not set by the task-runner based extractTextFromPDF, which only fills documentText.
String documentAbstract;
// Full text extracted from the document.
String documentText;
// Starts out empty rather than null.
List<DocumentExtraction> assets = new ArrayList<>();
// Starts out empty rather than null.
List<ExtractionFile> files = new ArrayList<>();
}

public Future<CosmosTextExtraction> extractTextFromPDF(
/**
 * Submits a PDF to the {@code text_extraction} task runner and returns a future for the
 * extracted text.
 *
 * <p>The task request is assembled on the calling thread. The blocking call to the task
 * service and the parsing of its JSON output run on {@code executor}. Only
 * {@code documentText} of the returned {@link TextExtraction} is populated; the remaining
 * fields keep their defaults.
 *
 * <p>NOTE(review): which of the declared checked exceptions the request setters can actually
 * raise is not visible from this method; the throws clause is kept unchanged for callers.
 *
 * @param notificationInterface not used by this implementation
 * @param userId id recorded on the task request
 * @param pdf raw bytes of the PDF, sent as the task input
 * @return a future completing with the extraction; a failure inside the submitted work is
 *     reported through {@link Future#get()}
 */
public Future<TextExtraction> extractTextFromPDF(
	final NotificationGroupInstance<Properties> notificationInterface,
	final String userId,
	final byte[] pdf
) throws JsonProcessingException, TimeoutException, InterruptedException, ExecutionException, IOException {
	final int REQUEST_TIMEOUT_MINUTES = 5;

	final TaskRequest req = new TaskRequest();
	req.setTimeoutMinutes(REQUEST_TIMEOUT_MINUTES);
	req.setInput(pdf);
	req.setScript(ExtractTextResponseHandler.NAME);
	req.setUserId(userId);
	req.setType(TaskType.TEXT_EXTRACTION);

	return executor.submit(() -> {
		final TaskResponse resp = taskService.runTaskSync(req);

		final byte[] outputBytes = resp.getOutput();
		final ExtractTextResponseHandler.ResponseOutput output = objectMapper.readValue(
			outputBytes,
			ExtractTextResponseHandler.ResponseOutput.class
		);

		// A payload without a "response" field would otherwise end in a bare
		// NullPointerException with no hint about what went wrong.
		if (output == null || output.getResponse() == null) {
			throw new IllegalStateException(
				"Text extraction task '" + ExtractTextResponseHandler.NAME + "' returned no response"
			);
		}

		final TextExtraction extraction = new TextExtraction();

		extraction.documentText = output.getResponse().asText();

		return extraction;
	});
}

public Future<TextExtraction> extractTextFromPDFCosmos(
kbirk marked this conversation as resolved.
Show resolved Hide resolved
final NotificationGroupInstance<Properties> notificationInterface,
final String documentName,
final byte[] pdf
) {
return executor.submit(() -> {
final CosmosTextExtraction extractionResponse = new CosmosTextExtraction();
final TextExtraction extractionResponse = new TextExtraction();

final ByteMultipartFile documentFile = new ByteMultipartFile(pdf, documentName, "application/pdf");

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package software.uncharted.terarium.hmiserver.service.tasks;

import com.fasterxml.jackson.databind.JsonNode;
import lombok.Data;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;

/**
 * Spring component that identifies the text-extraction task script and describes the JSON
 * payload that script writes back.
 */
@Slf4j
@Component
@RequiredArgsConstructor
public class ExtractTextResponseHandler extends TaskResponseHandler {

	/** Script identifier set on task requests that should run the text-extraction task. */
	public static final String NAME = "text_extraction_task:extract_text";

	/** Shape of the task output: a single {@code response} node. */
	@Data
	public static class ResponseOutput {

		private JsonNode response;
	}

	/** Returns {@link #NAME}. */
	@Override
	public String getName() {
		return ExtractTextResponseHandler.NAME;
	}
}
40 changes: 40 additions & 0 deletions packages/text_extraction/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
HELP.md
.gradle
build/
!gradle/wrapper/gradle-wrapper.jar
!**/src/main/**/build/
!**/src/test/**/build/
*.egg-info

### STS ###
.apt_generated
.classpath
.factorypath
.project
.settings
.springBeans
.sts4-cache
bin/
!**/src/main/**/bin/
!**/src/test/**/bin/

### IntelliJ IDEA ###
.idea
*.iws
*.iml
*.ipr
out/
!**/src/main/**/out/
!**/src/test/**/out/

### NetBeans ###
/nbproject/private/
/nbbuild/
/dist/
/nbdist/
/.nb-gradle/

### VS Code ###
.vscode/
mira.egg-info
__pycache__
64 changes: 64 additions & 0 deletions packages/text_extraction/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Build the Spring Boot application.
# This stage is also the build target of the local docker-compose service, which is why
# it carries the dev-only runtime dependencies below in addition to the build tooling.
FROM python:3.11-slim-bookworm AS text_extraction_taskrunner_builder

###### DEV ONLY ######
#VVVVVVVVVVVVVVVVVVVVV

# These deps are installed only for use during local development

# Install OpenJDK (full JDK, this stage runs Gradle) and ocrmypdf (invoked by the extract_text task)
RUN apt-get update && apt-get install -y --no-install-recommends \
ocrmypdf \
openjdk-17-jdk-headless && \
rm -rf /var/lib/apt/lists/*
dgauldie marked this conversation as resolved.
Show resolved Hide resolved

# Install PyPDF2 (imported by tasks/extract_text.py)
RUN pip install PyPDF2

#^^^^^^^^^^^^^^^^^^^^
######################

WORKDIR /taskrunner

COPY ./packages/taskrunner .

# Produce the Spring Boot jar under /taskrunner/build/libs
RUN ./gradlew bootJar

WORKDIR /

# Copy the jar to the root directory
# After these two steps the jar lives at /taskrunner.jar and is no longer under /taskrunner/build/libs.
RUN mv /taskrunner/build/libs/*.jar .
# NOTE(review): the file name is hard-coded; presumably it has to be updated whenever the Gradle project version changes - confirm.
RUN mv /terarium-1.0.0-SNAPSHOT.jar /taskrunner.jar

# ------------------------------------------------------------------------------
# Runtime image: JRE for the taskrunner jar plus the Python task and its OCR tooling.
FROM python:3.11-slim-bookworm

WORKDIR /

# Install OpenJRE (runs taskrunner.jar) and ocrmypdf (invoked by the extract_text task)
RUN apt-get update && apt-get install -y --no-install-recommends \
    ocrmypdf \
    openjdk-17-jre-headless && \
    rm -rf /var/lib/apt/lists/*

# Install PyPDF2 (no pip cache, consistent with the other pip installs in this image)
RUN pip install --no-cache-dir PyPDF2

# Copy the Spring Boot fat JAR from the builder image.
# The builder stage moves the jar out of /taskrunner/build/libs and renames it to
# /taskrunner.jar, so that is the only location it can be copied from.
COPY --from=text_extraction_taskrunner_builder /taskrunner.jar /taskrunner.jar

# Install taskrunner
COPY ./packages/taskrunner/setup.py /taskrunner/setup.py
COPY ./packages/taskrunner/taskrunner.py /taskrunner/taskrunner.py
WORKDIR /taskrunner
RUN pip install --no-cache-dir -e .

# Install pdf tasks
COPY ./packages/text_extraction /text_extraction_task

WORKDIR /text_extraction_task
RUN pip install --no-cache-dir -e .

WORKDIR /

CMD ["java", "-jar", "taskrunner.jar"]
15 changes: 15 additions & 0 deletions packages/text_extraction/dev.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
#
# Development entrypoint for the text extraction taskrunner container:
# installs the volume-mounted Python packages in editable mode, then starts
# the taskrunner through Gradle.

# Stop at the first failing step instead of launching the taskrunner on top of
# a failed `cd` or `pip install`.
set -euo pipefail

# ensure the volume mounted python code is using editable mode
echo "Installing python tasks"
cd /text_extraction_task
pip install --no-cache-dir -e .

# the taskrunner python package is installed in editable mode as well
echo "Installing taskrunner"
cd /taskrunner
pip install --no-cache-dir -e .

# NOTE(review): a dedicated build dir and a per-process Gradle cache dir are
# presumably used so several taskrunner containers can share the /taskrunner
# mount without clobbering each other's build state - confirm.
BUILD_DIR=/taskrunner-build-text-extraction
mkdir -p "$BUILD_DIR"

# run it; exec replaces this shell so signals sent to the script reach Gradle directly
exec ./gradlew --project-cache-dir "/tmp/.gradle/$$" -PcustomBuildDir="$BUILD_DIR" bootRun
14 changes: 14 additions & 0 deletions packages/text_extraction/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from setuptools import setup, find_packages

# Packaging metadata for the text extraction task.
setup(
    name="text_extraction_task",
    version="0.1.0",
    packages=find_packages(),
    # tasks/extract_text.py imports PyPDF2, so declare it here instead of relying
    # on the image having installed it separately. The `ocrmypdf` executable the
    # task shells out to is a system package and cannot be declared here.
    install_requires=["PyPDF2"],
    entry_points={
        "console_scripts": [
            # The script name is the identifier the server sets on text extraction task requests.
            "text_extraction_task:extract_text=tasks.extract_text:main",
        ],
    },
    python_requires=">=3.10",
)
Empty file.
88 changes: 88 additions & 0 deletions packages/text_extraction/tasks/extract_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import sys
import os
import traceback
import subprocess
from taskrunner import TaskRunnerInterface
import PyPDF2

def cleanup():
    """Cancellation callback handed to the task runner; currently does nothing."""
    return None


def get_filename(id: str):
    """Return the path of the ``.pdf`` file under ``/tmp`` named after ``id``."""
    return "/tmp/{}.pdf".format(id)


def create_temp_file(name, contents):
    """Write the bytes ``contents`` to the file ``name``, replacing any existing content."""
    with open(name, mode="wb") as sink:
        sink.write(contents)


def delete_temp_file(name):
    """Best-effort removal of the file ``name``.

    Filesystem errors (missing file, permissions, ...) are ignored because a
    leftover temp file must not fail the task. Only ``OSError`` is suppressed:
    the previous bare ``except`` also swallowed ``KeyboardInterrupt`` and
    ``SystemExit``, as well as programming errors such as a wrong argument type.
    """
    try:
        os.remove(name)
    except OSError:
        # Nothing useful to do if the file is already gone or cannot be removed.
        pass


def extract_text_from_pdf(filename):
    """Return the text of every page of the PDF ``filename``, concatenated in page order."""
    pages = PyPDF2.PdfReader(filename).pages
    return "".join(page.extract_text() for page in pages)


def main():
    """Entry point of the ``text_extraction_task:extract_text`` console script.

    Reads a PDF from the task runner input, runs ``ocrmypdf --force-ocr`` on it,
    extracts the text of the OCR'd PDF and writes ``{"response": text}`` as the
    task output. The process exits with status 0 on success and 1 on any error.
    The temporary input/output PDFs are removed before exiting.
    """
    exit_code = 0
    # Pre-bind everything the cleanup and shutdown paths touch, so that a failure
    # early in the try block (for example in the TaskRunnerInterface constructor)
    # is reported as itself instead of being masked by an UnboundLocalError.
    taskrunner = None
    ifilename = None
    ofilename = None
    try:
        # The description used to read "Embedding CLI", which does not describe this task.
        taskrunner = TaskRunnerInterface(description="Text Extraction CLI")
        taskrunner.on_cancellation(cleanup)

        bs = taskrunner.read_input_bytes_with_timeout()

        ifilename = get_filename("input_" + taskrunner.id)
        ofilename = get_filename("output_" + taskrunner.id)

        create_temp_file(ifilename, bs)

        # Define the command and arguments
        command = ["ocrmypdf", "--force-ocr", ifilename, ofilename]

        # Run the command
        result = subprocess.run(command, capture_output=True, text=True)

        # Report a failed OCR run and abort
        if result.returncode != 0:
            taskrunner.log("Error running ocrmypdf")
            taskrunner.log("stderr:" + result.stderr)
            taskrunner.log("Return code:" + str(result.returncode))
            raise Exception("Error running ocrmypdf")

        # Forward whatever ocrmypdf printed on a successful run
        if result.stdout != "":
            taskrunner.log("ocrmypdf stdout:")
            taskrunner.log(result.stdout)

        if result.stderr != "":
            taskrunner.log("ocrmypdf stderr:")
            taskrunner.log(result.stderr)

        taskrunner.log("Extracting text")
        text = extract_text_from_pdf(ofilename)
        taskrunner.log("Extracted text!")

        taskrunner.write_output_dict_with_timeout({"response": text})

    except Exception as e:
        sys.stderr.write(f"Error: {str(e)}\n")
        sys.stderr.write(traceback.format_exc())
        sys.stderr.flush()
        exit_code = 1
    finally:
        # Every request used to leave both PDFs behind in /tmp.
        for temp_name in (ifilename, ofilename):
            if temp_name is not None:
                delete_temp_file(temp_name)

    if taskrunner is not None:
        taskrunner.log("Shutting down")
        taskrunner.shutdown()
    sys.exit(exit_code)


if __name__ == "__main__":
main()
Loading