Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More fixes to webpages and LaTex #2891

Merged
merged 3 commits into from
Aug 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import re

from helm.common.optional_dependencies import handle_module_not_found_error, OptionalDependencyNotInstalled
from helm.common.hierarchical_logger import hlog

try:
from latex import build_pdf
Expand Down Expand Up @@ -220,25 +221,21 @@ def handle_latex_error(
# Error format: "LaTeX Error: Environment <env> undefined."
undefined_search = re.search(r"LaTeX Error: Environment (.*) undefined", str_e)
if undefined_search:
# If a package is missing and this is our first retry, then simply include TEX_INCLUDES
if num_try_remaining == MAX_NUM_TRIES:
fixed_code = fixed_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + "\n" + TEX_INCLUDES + "\n")
if num_try_remaining < MAX_NUM_TRIES or fixed_code == original_latex_code:
# Here we try to manually solve the missing environment.
# This is either executed on the second rety or the first if no changements
# were made in the first retry.
assert TEX_INCLUDES in fixed_code, "TEX_INCLUDES should be present in the code"
# TEX_INCLUDES is already present, so we add the missing package
# Since we cannot know the name of the package that contains the missing environment,
# we simply hope that they are named the same way.
env_undefined: str = undefined_search.group(1)

if f"\\usepackage{{{env_undefined}}}" in fixed_code:
# We already tried to include the missing package, but it probably
# does not exist, so we raise an error
raise RuntimeError(str(e)) from e

fixed_code = fixed_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + f"\n\\usepackage{{{env_undefined}}}\n")
# Here we try to manually solve the missing environment.
# This is executed either on the second retry, or on the first one if no changes
# were made in the first retry.
assert TEX_INCLUDES in fixed_code, f"TEX_INCLUDES should be present in the code. code={fixed_code}"
# TEX_INCLUDES is already present, so we add the missing package
# Since we cannot know the name of the package that contains the missing environment,
# we simply hope that they are named the same way.
env_undefined: str = undefined_search.group(1)

if f"\\usepackage{{{env_undefined}}}" in fixed_code:
# We already tried to include the missing package, but it probably
# does not exist, so we raise an error
raise RuntimeError(str(e)) from e

fixed_code = fixed_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + f"\n\\usepackage{{{env_undefined}}}\n")

# Try again with the fixed code (if the fixed code is different from the original code)
if fixed_code != original_latex_code:
Expand Down Expand Up @@ -310,20 +307,21 @@ def latex_to_image(
documentclass_search = re.search(r"\\documentclass(\[.*?\])?\{.*?\}", original_latex_code)
documentstyle_search = re.search(r"\\documentstyle(\[.*?\])?\{.*?\}", original_latex_code)
if documentclass_search:
documentclass: str = documentclass_search.group(1)
original_latex_code = original_latex_code.replace(f"\\documentclass{{{documentclass}}}", TEX_BEGIN_FILE)
matching_string = documentclass_search.group()
original_latex_code = original_latex_code.replace(matching_string, TEX_BEGIN_FILE)
elif documentstyle_search:
documentstyle: str = documentstyle_search.group(1)
original_latex_code = original_latex_code.replace(f"\\documentstyle{{{documentstyle}}}", TEX_BEGIN_FILE)
matching_string = documentstyle_search.group()
original_latex_code = original_latex_code.replace(matching_string, TEX_BEGIN_FILE)
else:
# If there is no \documentclass, we add our own
original_latex_code = TEX_BEGIN_FILE + "\n\n" + original_latex_code

# 2.2. Add includes. In this ste we remove lal includes for the default ones.
# 2.2. Add includes. In this step we remove all existing includes and add the default ones.
original_latex_code = re.sub(r"\\usepackage(\[.*?\])?\{.*\}", "", original_latex_code)
original_latex_code = original_latex_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + "\n" + TEX_INCLUDES + "\n")

latex_code: str = original_latex_code
hlog(f"Compiling LaTeX code:\n{latex_code}")
try:
pdf_stream = latex_to_pdf(latex_code, assets_path=assets_path)
image = pdf_to_image(pdf_stream, crop=crop, resize_to=resize_to)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Dict, List, Any, Optional

from helm.benchmark.annotation.image2struct.image_compiler_annotator import CompilationError
from helm.benchmark.scenarios.scenario import VALID_SPLIT
from helm.benchmark.scenarios.vision_language.image2struct.image2struct_scenario import (
Image2StructureScenario,
Expand All @@ -14,6 +15,7 @@
from helm.benchmark.scenarios.vision_language.image2struct.webpage.utils import convert_html_to_text
from helm.common.general import ensure_directory_exists
from helm.common.optional_dependencies import handle_module_not_found_error
from helm.common.hierarchical_logger import hlog

try:
from html2text import HTML2Text
Expand Down Expand Up @@ -73,31 +75,48 @@ def serve_and_take_screenshot(
if not success:
# This runs on examples that are not expected to fail
server.stop()
hlog(f"Failed to start the Jekyll server: {repo_path} on port {port}. Will raise a ValueError.")
raise ValueError(f"Jekyll server failed to start: {repo_path}")

# Take a screenshot of a random page
success = False
error: Optional[Exception] = None

for _ in range(max_tries):
MAX_TRIES_ALL_ERRORS = 3
MAX_TRIES_CONNECTION_REFUSED = 5
MAX_TRIES = max(MAX_TRIES_ALL_ERRORS, MAX_TRIES_CONNECTION_REFUSED)
for compilation_attempt in range(MAX_TRIES):
try:
infos: Dict[str, Any] = save_random_screenshot(destination_path, port=port, options=screenshot_options)
success = True
break
except Exception as e:
error = e

if "net::ERR_CONNECTION_REFUSED" in str(e):
if "net::ERR_CONNECTION_REFUSED" in str(e) and compilation_attempt < MAX_TRIES_CONNECTION_REFUSED:
hlog(
f"Failed to take a screenshot: ERR_CONNECTION_REFUSED [Attempt {compilation_attempt + 1}/"
f"{MAX_TRIES_CONNECTION_REFUSED}]. Error: {e}. Retrying..."
)
server.stop()
time.sleep(0.5)
server.start()
time.sleep(0.5)
elif compilation_attempt < MAX_TRIES_ALL_ERRORS:
hlog(
f"Failed to take a screenshot: Unknown [Attempt {compilation_attempt + 1}/{MAX_TRIES_ALL_ERRORS}]."
f" Error: {e}. Retrying..."
)
else:
# Do not retry
hlog(
f"Failed to take a screenshot: Unknown [Attempt {compilation_attempt + 1}/{MAX_TRIES_ALL_ERRORS}]."
f" Error: {e}. Raising CompilationError."
)
break

if not success:
raise ValueError(f"Failed to take a screenshot: {error}")
raise CompilationError(f"Failed to take a screenshot: {error}")

# Stop the server
server.stop()
Expand Down