Skip to content

Commit

Permalink
Merge pull request #124 from AI4WA/feature/cli
Browse files Browse the repository at this point in the history
Feature/cli
  • Loading branch information
PascalSun authored Jan 7, 2025
2 parents 5f8aedd + 504cd3d commit 7cbd714
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 27 deletions.
31 changes: 12 additions & 19 deletions Docs2KG/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,29 +38,18 @@ def get_supported_formats(cls) -> str:
return ", ".join(cls.PROCESSORS.keys())


def setup_environment(config_path: str = None):
"""Setup the environment with the config file."""
if config_path:
os.environ["CONFIG_FILE"] = str(Path(config_path).resolve())
else:
os.environ["CONFIG_FILE"] = str(Path.cwd() / "config.yml")


@click.group()
@click.option(
"--config",
"-c",
type=click.Path(exists=True),
help="Path to the configuration file (default: ./config.yml)",
)
def cli(config):
def cli():
"""Docs2KG - Document to Knowledge Graph conversion tool.
Supports multiple document formats: PDF, DOCX, HTML, and EPUB.
"""
setup_environment(config)
logger.info(f"Using configuration: {os.environ.get('CONFIG_FILE')}")
logger.info(PROJECT_CONFIG.data)
logger.info("Welcome to Docs2KG!")
logger.info(f"Configuration loaded from: {os.environ.get('CONFIG_FILE')}")
logger.info(f"Input directory: {PROJECT_CONFIG.data.input_dir}")
logger.info(f"Output directory: {PROJECT_CONFIG.data.output_dir}")
logger.info(f"Ontology directory: {PROJECT_CONFIG.data.ontology_dir}")
logger.info("---")


def process_single_file(
Expand Down Expand Up @@ -154,7 +143,11 @@ def process_document(file_path, project_id, agent_name, agent_type):


@cli.command()
@click.argument("input_dir", type=click.Path(exists=True))
@click.argument(
"input_dir",
type=click.Path(exists=True),
default=PROJECT_CONFIG.data.input_dir.as_posix(),
)
@click.option(
"--project-id",
"-p",
Expand Down
16 changes: 8 additions & 8 deletions Docs2KG/digitization/native/html_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,16 @@ class HTMLDocling(DigitizationBase):
HTMLDocling class for processing HTML content from files or URLs to markdown.
"""

def __init__(self, source: Union[str, Path]):
self.is_url = isinstance(source, str) and self._is_valid_url(source)
def __init__(self, file_path: Union[str, Path]):
self.is_url = isinstance(file_path, str) and self._is_valid_url(file_path)

if self.is_url:
# Create a filename from the URL
url_path = urlparse(source).path
url_path = urlparse(file_path).path
url_filename = (
unquote(Path(url_path).name)
if url_path and Path(url_path).name
else urlparse(source).netloc
else urlparse(file_path).netloc
)
self.html_filename = (
f"{url_filename}.html"
Expand All @@ -36,16 +36,16 @@ def __init__(self, source: Union[str, Path]):

# Download and save the HTML content
self.html_path = PROJECT_CONFIG.data.input_dir / self.html_filename
self._download_and_save_html(source)
self._download_and_save_html(file_path)

# Use the saved file path as the source
source = self.html_path
file_path = self.html_path

super().__init__(
file_path=source,
file_path=file_path,
supported_formats=["html", "htm"],
)
self.source = source
self.source = file_path

def _download_and_save_html(self, url: str) -> None:
"""
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ You have two ways to run the package:
### Command Line

```bash
# first, set the CONFIG_FILE environment variable to point to your local configuration file
export CONFIG_FILE=config.yml # or any other path for the configuration file
docs2kg # this command will tell you how to use the package

# we currently support the following commands
Expand Down
3 changes: 3 additions & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ You have two ways to run the package:
### Command Line

```bash
# first, set the CONFIG_FILE environment variable to point to your local configuration file
export CONFIG_FILE=config.yml # or any other path for the configuration file

docs2kg # this command will tell you how to use the package

# we currently support the following commands
Expand Down

0 comments on commit 7cbd714

Please sign in to comment.