Adding web page digest function to service module #84

Merged: 15 commits, Mar 27, 2024
12 changes: 11 additions & 1 deletion setup.py
@@ -24,7 +24,17 @@
"expiringdict",
]

service_requires = ["docstring_parser", "docker", "pymongo", "pymysql"]
service_requires = [
"docstring_parser",
"docker",
"pymongo",
"pymysql",
"langchain",
"langchain-text-splitters",
"beautifulsoup4",
"playwright",
"lxml",
]

doc_requires = [
"sphinx",
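For reference, a sketch of how the new optional dependencies would be pulled in after this change. The `service` extras name is an assumption inferred from the `service_requires` variable; the published extras key may differ.

# Hypothetical install command, assuming `service_requires` is exposed
# as a `service` extras group:
#   pip install agentscope[service]
#
# Quick sanity check that the new optional imports resolve:
import importlib

for mod in ("langchain", "langchain_text_splitters", "bs4", "playwright", "lxml"):
    importlib.import_module(mod)  # raises ImportError if missing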
2 changes: 2 additions & 0 deletions src/agentscope/service/__init__.py
@@ -22,6 +22,7 @@
from .text_processing.summarization import summarization
from .retrieval.retrieval_from_list import retrieve_from_list
from .service_status import ServiceExecStatus
from .web_search.web_digest import webpage_digest


def get_help() -> None:
@@ -52,4 +53,5 @@ def get_help() -> None:
"summarization",
"retrieve_from_list",
"ServiceExecStatus",
"webpage_digest",
]
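With the export in place, the new function is importable directly from `agentscope.service`. A quick check, as a sketch assuming the package is installed with the service extras:

from agentscope.service import webpage_digest, get_help

# The exported name should now show up alongside the other services;
# get_help() is defined in this module and is expected to list them.
get_help()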
193 changes: 193 additions & 0 deletions src/agentscope/service/web_search/web_digest.py
@@ -0,0 +1,193 @@
# -*- coding: utf-8 -*-
"""parsing and digesting the web pages"""
from typing import Optional, Union
import requests
from loguru import logger

from agentscope.service.service_response import ServiceResponse
from agentscope.service.service_status import ServiceExecStatus
from agentscope.models.model import ModelWrapperBase
from agentscope.prompt import PromptEngine, PromptType


DEFAULT_SYS_PROMPT = (
"You're a web page analyser. Your job is to extract important "
"and useful information from html or webpage description.\n"
)

DEFAULT_SUMMARY_PROMPT = (
"You're a web page analyser. You job is to summarize the "
"content of a webpage.\n"
)

DEFAULT_SPLIT_LEVEL = [
("h1", "h1"),
("h2", "h1"),
]


def webpage_digest(
url: str,
model: Optional[ModelWrapperBase] = None,
prompt_type: Optional[PromptType] = PromptType.LIST,
html_split_levels: Optional[tuple[str, ...]] = ("h1", "h2"),
digest_prompt: Optional[str] = DEFAULT_SYS_PROMPT,
digest_summary_prompt: Optional[str] = DEFAULT_SUMMARY_PROMPT,
) -> ServiceResponse:
"""Function for parsing and digesting the web page.
Args:
url (str): the url of the web page
model (Optional[ModelWrapperBase]): the model that is
used to digest the web page content.
prompt_type (Optional[PromptType]): how the prompt engine
composes the messages.
html_split_levels (Optional[tuple[str, ...]]):
in case the webpage is too long, this
parameter is for splitting the html file.
By default, the html file is split at the 'h1' and 'h2'
levels, and each partition is fed to the model to digest.
digest_prompt (Optional[str]): prompt for
the model to analyze the webpage content
digest_summary_prompt (Optional[str]): prompt for
the model to summarize the webpage content

Returns:
ServiceResponse: containing the following digested content in a dict.
1) "html_text_content": (str) text information on the webpage;
2) "href_links": (list[dict[str, str]]) list of hyperlinks
on the webpage:
[
{
"content": $content_in_tag_a,
"href_link": "https://xxxx"
},
{
"content": $content_in_tag_a,
"href_link": "https://yyyyy"
},
...
]
3) "model_digested": (list[dict[str, str]]) list of LLM-digested
information for each split of the html file if `model`
is not None:
[
{
"split_info": {'h1': 'Foo'},
"digested_text": "model digested info for
content under h1"
},
{
"split_info": "summary",
"digested_text": "model summarized info for
the whole page"
}
]
If `model` is None, this will be an empty list.
"split_info" is the header of the split, and "digested_text"
is the digested content of the split.
There is a special "split_info" value, "summary", for the
summary of all text information on the web page.
"""
html = requests.get(url, timeout=30)  # timeout so a slow page cannot hang the call
logger.info(f"Fetched content from {url}...")
digest_result = {}

try:
from langchain.docstore.document import Document
from bs4 import BeautifulSoup
from langchain_community.document_transformers import (
BeautifulSoupTransformer,
)
from langchain_text_splitters import HTMLHeaderTextSplitter
except ImportError as exc:
raise ImportError(
"LangChain and BeautifulSoup4 are required for processing "
"the web page. Please install them with "
"`pip install langchain langchain-text-splitters beautifulsoup4`.",
) from exc

html_doc = Document(page_content=html.text, metadata={"url": url})

# html to text
bs_transformer = BeautifulSoupTransformer()
html_content = bs_transformer.transform_documents([html_doc])
digest_result["html_text_content"] = html_content[0].page_content
logger.info(f"Parse text info from {url}, save to 'html_text_content'")

# obtain links in html
links = []
soup = BeautifulSoup(html.text, "html.parser")
for element in soup.find_all("a"):
links.append(
{
"content": element.get_text(),
"href_link": element.get("href"),
},
)
digest_result["href_links"] = links
logger.info(f"Parse href links from {url}, save to 'href_links'")

# split the webpage and feed them to a model for analysis
model_digested = []
if model is not None:
logger.info(f"Start using model to digest {url}")
prompt_engine = PromptEngine(model=model, prompt_type=prompt_type)

def compose_prompt(
instruction: Optional[str],
content: Optional[str],
) -> Union[str, list[dict]]:
"""simply compose to feed model"""
sys_msg = {
"name": "system",
"role": "system",
"content": instruction or " ",
}
info_msg = {
"name": "user",
"role": "user",
"content": content or " ",
}
return prompt_engine.join(sys_msg, info_msg)

all_text = ""
if html_split_levels is None:
# set the default splitting granularity to h1 and h2 in html
html_split_levels = ("h1", "h2")
split_levels = [(s, s) for s in html_split_levels]
html_splitter = HTMLHeaderTextSplitter(
headers_to_split_on=split_levels,
)
html_header_splits = html_splitter.split_text(html_doc.page_content)
# ask the model to analyze all split parts
for split in html_header_splits:
if len(split.page_content) == 0:
continue  # skip empty splits instead of sending them to the model
prompt = compose_prompt(digest_prompt, split.page_content)
analysis_res = model(messages=prompt)
if analysis_res.text:
model_digested.append(
{
"split_info": split.metadata,
"digested_text": analysis_res.text,
},
)
all_text += analysis_res.text
# generate a final summary
prompt = compose_prompt(digest_summary_prompt, all_text)
analysis_res = model(messages=prompt)
if analysis_res.text:
model_digested.append(
{
"split_info": "summary",
"digested_text": analysis_res.text,
},
)

digest_result["model_digested"] = model_digested

return ServiceResponse(
status=ServiceExecStatus.SUCCESS,
content=digest_result,
)
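Putting the new module together, a minimal usage sketch. The URL is a placeholder; pass a real `ModelWrapperBase` instance instead of `None` to also get the `model_digested` entries.

from agentscope.service import webpage_digest
from agentscope.service.service_status import ServiceExecStatus

# Parse-only call: with model=None, "model_digested" comes back empty.
response = webpage_digest(url="https://example.com", model=None)
if response.status == ServiceExecStatus.SUCCESS:
    page_text = response.content["html_text_content"]  # plain text of the page
    links = response.content["href_links"]  # [{"content": ..., "href_link": ...}, ...]
    digests = response.content["model_digested"]  # [] when model is None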
85 changes: 85 additions & 0 deletions tests/web_digest_test.py
@@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-
""" Python web digest test."""

import unittest
from unittest.mock import patch, MagicMock, Mock

from agentscope.service import ServiceResponse
from agentscope.service import webpage_digest
from agentscope.service.service_status import ServiceExecStatus
from agentscope.models import ModelWrapperBase, ModelResponse
from agentscope.message import Msg


class TestWebDigest(unittest.TestCase):
"""Unit tests for the webpage_digest service."""

@patch("requests.get")
def test_webpage_digest(self, mock_get: MagicMock) -> None:
"""test webpage_digest"""
# Set up the mock response
mock_response = Mock()
mock_return_text = """
<!DOCTYPE html>
<html>
<body>
<div>
<p>
Some intro text about Foo. <a href="xxx">Examples</a>
</p>
</div>
</body>
</html>
"""

expected_result = ServiceResponse(
status=ServiceExecStatus.SUCCESS,
content={
"html_text_content": "Some intro text about Foo. "
"Examples (xxx)",
"href_links": [
{
"content": "Examples",
"href_link": "xxx",
},
],
"model_digested": [
{
"split_info": {},
"digested_text": "model return",
},
{
"split_info": "summary",
"digested_text": "model return",
},
],
},
)

mock_response.text = mock_return_text
mock_get.return_value = mock_response

# set parameters
fake_url = "fake-url"

# test the case with dummy model
class DummyModel(ModelWrapperBase):
"""Dummy model for testing"""

def __init__(self) -> None:
self.max_length = 1000

def __call__(self, messages: list[Msg]) -> ModelResponse:
return ModelResponse(text="model return")

dummy_model = DummyModel()
results = webpage_digest(url=fake_url, model=dummy_model)
self.assertEqual(
results,
expected_result,
)


# This allows the tests to be run from the command line
if __name__ == "__main__":
unittest.main()
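A note on the expected result above: the first digest entry has "split_info": {} because the fixture HTML contains no h1/h2 headers, so HTMLHeaderTextSplitter yields a single split with empty metadata. A standalone sketch of that behavior, using the library API as it is called in web_digest.py:

from langchain_text_splitters import HTMLHeaderTextSplitter

splitter = HTMLHeaderTextSplitter(
    headers_to_split_on=[("h1", "h1"), ("h2", "h2")],
)
splits = splitter.split_text(
    "<html><body><div><p>Some intro text about Foo.</p></div></body></html>",
)
for split in splits:
    # Expect one split with empty metadata ({}) and the paragraph text.
    print(split.metadata, split.page_content)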