Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] autotex driver using autotex for conversion #81

Draft
wants to merge 12 commits into
base: master
Choose a base branch
from
2 changes: 1 addition & 1 deletion tex2pdf-service/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

104 changes: 104 additions & 0 deletions tex2pdf-service/tex2pdf/converter_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import time
import typing
from enum import Enum
from glob import glob

from tex2pdf_tools.preflight_parser import PreflightStatusValues, generate_preflight_response
from tex2pdf_tools.tex_inspection import find_unused_toplevel_files, maybe_bbl
Expand Down Expand Up @@ -635,3 +636,106 @@ def generate_pdf(self) -> str | None:
logger.debug("Directory listing of %s is: %s", self.out_dir, os.listdir(self.out_dir))

return self.outcome.get("pdf_file")

class AutoTeXConverterDriver(ConverterDriver):
    """Converter driver that delegates the TeX-to-PDF conversion to ``autotex.pl``.

    The driver runs ``autotex.pl`` as a child process, then moves the PDF it
    produced into ``self.out_dir`` and records the run in ``self.outcome``.
    """

    # Encoding used to decode the child's stdout/stderr. The same encoding is
    # used to read autotex.log: it maps every byte value to a character, so a
    # TeX log with non-UTF-8 bytes can never abort the run with a decode error.
    AUTOTEX_ENCODING = "iso-8859-1"

    def __init__(self, work_dir: str, source: str, tag: str | None = None, max_time_budget: float | None = None):
        """Set up the driver.

        Args:
            work_dir: Working directory handed to the base driver.
            source: Name of the submitted source file.
            tag: Identifier of the submission; passed to ``autotex.pl`` as the paper ID.
            max_time_budget: Timeout in seconds for the ``autotex.pl`` run
                (``None`` waits indefinitely).
        """
        # All other base-class defaults are fine; only the addon tree is turned off.
        super().__init__(work_dir, source, use_addon_tree=False, tag=tag, max_time_budget=max_time_budget)
        self.zzrm = ZeroZeroReadMe()

    def _collect_autotex_pdf(self, arxiv_id: str) -> str | None:
        """Move the PDF written by autotex from ``in_dir/tex_cache`` into ``out_dir``.

        Returns:
            The new path of the PDF, or ``None`` when autotex produced none.

        Raises:
            RuntimeError: More than one PDF matches the ID, so the result is ambiguous.
        """
        # The file name may carry a version suffix after the ID, hence the wildcard.
        candidates = glob(f"{self.in_dir}/tex_cache/{arxiv_id}*.pdf")
        if not candidates:
            return None
        if len(candidates) > 1:
            raise RuntimeError(f"Multiple PDF files found: {candidates}")
        # Move the file to self.out_dir so that the follow-up packaging into a tarfile works.
        destination = os.path.join(self.out_dir, os.path.basename(candidates[0]))
        os.rename(candidates[0], destination)
        return destination

    def _read_autotex_log(self, arxiv_id: str) -> str | None:
        """Return the content of ``in_dir/tex_logs/autotex.log``, or ``None`` if it is missing."""
        # glob is used on purpose: the log still has to be renamed to an
        # ID-specific name later on (autotex.log is the same name for every run).
        log_files = glob(f"{self.in_dir}/tex_logs/autotex.log")
        if not log_files:
            get_logger().warning("No log files found for %s", arxiv_id, extra=self.log_extra)
            return None
        with open(log_files[0], encoding=self.AUTOTEX_ENCODING) as file:
            return file.read()

    def generate_pdf(self) -> str | None:
        """Run ``autotex.pl`` on the source and build ``self.outcome``.

        Returns:
            Path of the generated PDF inside ``self.out_dir``, or ``None`` when
            no PDF was produced.

        Raises:
            ValueError: The driver has no tag, so there is no ID to hand to autotex.
            RuntimeError: autotex left more than one candidate PDF behind.
        """
        logger = get_logger()
        t0 = time.perf_counter()

        arxiv_id = self.tag
        if not arxiv_id:
            # Without an ID the command line cannot be built (None is not a valid
            # argv entry) and the PDF glob below would match unrelated files.
            raise ValueError("AutoTeXConverterDriver needs a tag to run autotex.pl")

        # run autotex.pl on the id
        path = "/usr/local/bin:/opt_arxiv/bin:/opt_arxiv/arxiv-perl/bin:/usr/sbin:/usr/bin:/bin:/sbin"
        # SECRETS or GOOGLE_APPLICATION_CREDENTIALS is not defined at all at this point but
        # be defensive and squish it anyway.
        cmdenv = {"SECRETS": "?", "GOOGLE_APPLICATION_CREDENTIALS": "?", "PATH": path}

        worker_args = [
            "autotex.pl", "-f", "fInm", "-q",
            "-S", self.in_dir,   # here the original tarball has been dumped
            "-W", self.out_dir,  # work_dir/out where we expect files
            # TODO currently autotex.pl DOES NOT HONOR THIS!!!
            "-v", "-Z", "-p", arxiv_id,
        ]
        process_completion = False
        with subprocess.Popen(worker_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE,
                              cwd="/autotex", encoding=self.AUTOTEX_ENCODING, env=cmdenv) as child:
            try:
                out, err = child.communicate(timeout=self.max_time_budget)
                process_completion = True
            except subprocess.TimeoutExpired:
                logger.warning("Process timeout %s", shlex.join(worker_args), extra=self.log_extra)
                child.kill()
                # Collect whatever output was produced before the kill.
                out, err = child.communicate()
            elapse_time = time.perf_counter() - t0
            t1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            logger.debug("Exec result: return code: %s", child.returncode, extra=self.log_extra)

        # Files generated by autotex:
        #   self.in_dir / tex_cache / <ID>.pdf   (might have a version!)
        #   self.in_dir / tex_logs / autotex.log (same name, not good)
        pdf_file = self._collect_autotex_pdf(arxiv_id)
        log = self._read_autotex_log(arxiv_id)

        # Create an outcome structure
        # This is unfortunately not well documented and has severe duplication of entries
        # NOTE(review): "start_time" is a perf_counter reading while "end_time" is a
        # formatted local time; the two are not comparable. Confirm the expected schema.
        self.outcome = {
            ID_TAG: self.tag,
            "converters": [{
                "pdf_file": pdf_file,
                "runs": [{
                    "args": worker_args,
                    "stdout": out,
                    "stderr": err,
                    "return_code": child.returncode,
                    "run_env": cmdenv,
                    "start_time": t0, "end_time": t1,
                    "elapse_time": elapse_time,
                    "process_completion": process_completion,
                    "PATH": path,
                    "arxiv_id": arxiv_id,
                    "log": log,
                }],
            }],
            "start_time": str(t0),
            "timeout": str(self.max_time_budget),
            "total_time": elapse_time,
            "pdf_files": [pdf_file],
            "pdf_file": pdf_file,
            "status": "success" if pdf_file else "fail",
        }

        # we need to get ZZRM
        if self.zzrm is None:
            logger.debug("no self.zzrm found, that should not happen")
            self.zzrm = ZeroZeroReadMe()
        else:
            logger.debug("self.zzrm = %s", self.zzrm)

        return self.outcome.get("pdf_file")

64 changes: 63 additions & 1 deletion tex2pdf-service/tex2pdf/tex2pdf_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from starlette.responses import FileResponse, HTMLResponse

from . import MAX_APPENDING_FILES, MAX_TIME_BUDGET, MAX_TOPLEVEL_TEX_FILES, USE_ADDON_TREE
from .converter_driver import ConversionOutcomeMaker, ConverterDriver, PreflightVersion
from .converter_driver import ConversionOutcomeMaker, ConverterDriver, PreflightVersion, AutoTeXConverterDriver
from .fastapi_util import closer
from .pdf_watermark import Watermark
from .service_logger import get_logger
Expand Down Expand Up @@ -249,6 +249,68 @@ async def convert_pdf(
}
return GzipResponse(content, headers=headers, background=closer(content, filename, log_extra))

@app.post('/autotex/',
          responses={
              STATCODE.HTTP_200_OK: {"content": {"application/gzip": {}},
                                     "description": "Conversion result"},
              STATCODE.HTTP_400_BAD_REQUEST: {"model": Message},
              STATCODE.HTTP_422_UNPROCESSABLE_ENTITY: {"model": Message},
              STATCODE.HTTP_500_INTERNAL_SERVER_ERROR: {"model": Message}
          })
async def autotex_pdf(incoming: UploadFile,
                      timeout: typing.Annotated[int | None,
                                                Query(title="Time out", description="Time out in seconds.")] = None,
                      ) -> Response:
    """Get a tarball, and convert to PDF using autotex.

    Args:
        incoming: The uploaded source archive.
        timeout: Optional time budget in seconds; ``MAX_TIME_BUDGET`` is used when omitted.

    Returns:
        A gzip response carrying the outcome file on success, or a JSON
        response with status 422 / 500 when the conversion raises.
    """
    filename = incoming.filename if incoming.filename else tempfile.mktemp(prefix="download")
    log_extra = {"source_filename": filename}
    logger = get_logger()
    logger.info("%s", incoming.filename)

    # Derive the tag from the file name by peeling off every archive and
    # compression suffix (e.g. "1234.tar.gz" -> "1234").
    tag = os.path.basename(filename)
    while True:
        stem, ext = os.path.splitext(tag)
        if ext not in (".gz", ".zip", ".tar"):
            break
        tag = stem

    with tempfile.TemporaryDirectory(prefix=tag) as tempdir:
        in_dir, out_dir = prep_tempdir(tempdir)
        await save_stream(in_dir, incoming, filename, log_extra)

        # The query parameter is already validated as an int, so the
        # conversion to float cannot fail.
        timeout_secs = float(MAX_TIME_BUDGET)
        if timeout is not None:
            timeout_secs = float(timeout)

        driver = AutoTeXConverterDriver(tempdir, filename, tag=tag, max_time_budget=timeout_secs)
        try:
            _pdf_file = driver.generate_pdf()
        except RemovedSubmission:
            # TODO how can we detect this???
            logger.info("Archive is marked deleted.", extra=log_extra)
            return JSONResponse(status_code=STATCODE.HTTP_422_UNPROCESSABLE_ENTITY,
                                content={"message": "The source is marked deleted."})
        except Exception as exc:
            logger.error("Exception %s", str(exc), exc_info=True, extra=log_extra)
            # NOTE(review): the full traceback is returned to the client; confirm
            # that exposing it is acceptable for this endpoint.
            return JSONResponse(status_code=STATCODE.HTTP_500_INTERNAL_SERVER_ERROR,
                                content={"message": traceback.format_exc()})

        out_dir_files = os.listdir(out_dir)
        outcome_maker = ConversionOutcomeMaker(tempdir, tag)
        outcome_maker.create_outcome(driver, driver.outcome, outcome_files=out_dir_files)

        # The handle is deliberately left open here: it is handed to the
        # response and to the `closer` background task.
        content = open(os.path.join(tempdir, outcome_maker.outcome_file), "rb")
        filename = os.path.basename(outcome_maker.outcome_file)
        headers = {
            "Content-Type": "application/gzip",
            "Content-Disposition": f"attachment; filename={filename}",
        }
        return GzipResponse(content, headers=headers,
                            background=closer(content, filename, log_extra))



@app.get("/texlive/info")
async def texlive_info() -> FileResponse:
Expand Down