Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[community]: Render documents to graphviz #24830

Merged
Merged
1 change: 1 addition & 0 deletions libs/community/extended_testing_deps.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ gliner>=0.2.7
google-cloud-documentai>=2.20.1,<3
gql>=3.4.1,<4
gradientai>=1.4.0,<2
graphviz>=0.20.3,<0.21
hdbcli>=2.19.21,<3
hologres-vector==0.0.6
html2text>=2020.1.16
Expand Down
122 changes: 122 additions & 0 deletions libs/community/langchain_community/graph_vectorstores/visualize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
import re
from typing import TYPE_CHECKING, Dict, Iterable, Optional, Tuple

from langchain_core._api import beta
from langchain_core.documents import Document

from langchain_community.graph_vectorstores.links import get_links

if TYPE_CHECKING:
import graphviz


def _escape_id(id: str) -> str:
return id.replace(":", "_")


_EDGE_DIRECTION = {
"in": "back",
"out": "forward",
"bidir": "both",
}

_WORD_RE = re.compile("\s*\S+")


def _split_prefix(s: str, max_chars: int = 50) -> str:
words = _WORD_RE.finditer(s)

split = min(len(s), max_chars)
for word in words:
if word.end(0) > max_chars:
break
split = word.end(0)

if split == len(s):
return s
else:
return f"{s[0:split]}..."


@beta()
def render_graphviz(
    documents: Iterable[Document],
    engine: Optional[str] = None,
    node_color: Optional[str] = None,
    node_colors: Optional[Dict[str, Optional[str]]] = None,
    skip_tags: Iterable[Tuple[str, str]] = (),
) -> "graphviz.Digraph":
    """Render a collection of GraphVectorStore documents to GraphViz format.

    Every document becomes a note-shaped node labelled with its ID and a
    shortened prefix of its content. Every distinct ``(kind, tag)`` link
    becomes a tag node, created the first time it is seen, and an edge is drawn
    from the document to each of its tags.

    Args:
        documents: The documents to render.
        engine: GraphViz layout engine to use. `None` uses the default.
        node_color: Default node color.
        node_colors: Dictionary specifying colors of specific nodes. Useful for
            emphasizing nodes that were selected by MMR, or differ from other
            results. An entry here takes precedence over `node_color`, even
            when its value is `None`.
        skip_tags: Set of tags to skip when rendering the graph. Specified as
            tuples containing the kind and tag.

    Returns:
        The "graphviz.Digraph" representing the nodes. May be printed to source,
        or rendered using `dot`.

    Raises:
        ImportError: If the `graphviz` Python package cannot be imported.
        ValueError: If a document has no ID.

    Note:
        To render the generated DOT source code, you also need to install
        `Graphviz <https://www.graphviz.org/>`_
        (`download page <https://www.graphviz.org/download/>`_,
        `archived versions <https://www2.graphviz.org/Archive/stable/>`_,
        `installation procedure for Windows <https://forum.graphviz.org/t/new-simplified-installation-procedure-on-windows/224>`_).
    """
    try:
        import graphviz
    except ImportError as e:
        # `ModuleNotFoundError` is a subclass of `ImportError`, so one clause
        # covers both; chaining keeps the original failure in the traceback.
        raise ImportError(
            "Could not import graphviz python package. "
            "Please install it with `pip install graphviz`."
        ) from e

    colors: Dict[str, Optional[str]] = {} if node_colors is None else node_colors

    graph = graphviz.Digraph(engine=engine)
    graph.attr(rankdir="LR")
    graph.attr("node", style="filled")

    skipped = set(skip_tags)
    # Maps each (kind, tag) pair already rendered to its generated node name.
    tag_ids: Dict[Tuple[str, str], str] = {}

    for document in documents:
        doc_id = document.id
        if doc_id is None:
            raise ValueError(f"Illegal graph document without ID: {document}")
        escaped_id = _escape_id(doc_id)
        # An explicit per-node entry wins over the default, even if it is None.
        color = colors.get(doc_id, node_color)

        node_label = "\n".join(
            [
                graphviz.escape(doc_id),
                graphviz.escape(_split_prefix(document.page_content)),
            ]
        )
        graph.node(
            escaped_id,
            label=node_label,
            shape="note",
            fillcolor=color,
            tooltip=graphviz.escape(document.page_content),
        )

        for link in get_links(document):
            tag_key = (link.kind, link.tag)
            if tag_key in skipped:
                continue

            tag_id = tag_ids.get(tag_key)
            if tag_id is None:
                # First occurrence of this tag: name it by order of appearance
                # and emit its node exactly once.
                tag_id = f"tag_{len(tag_ids)}"
                tag_ids[tag_key] = tag_id
                graph.node(tag_id, label=graphviz.escape(f"{link.kind}:{link.tag}"))

            graph.edge(escaped_id, tag_id, dir=_EDGE_DIRECTION[link.direction])
    return graph
113 changes: 113 additions & 0 deletions libs/community/tests/unit_tests/graph_vectorstores/test_visualize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import pytest
from langchain_core.documents import Document

from langchain_community.graph_vectorstores.links import METADATA_LINKS_KEY, Link
from langchain_community.graph_vectorstores.visualize import render_graphviz


@pytest.mark.requires("graphviz")
def test_visualize_simple_graph() -> None:
    """Check the exact DOT source `render_graphviz` emits for two linked documents.

    Covers the default rendering, the `engine` pass-through, per-node and
    default colors, and `skip_tags`.
    """
    # Document "a": one incoming href link and one bidirectional keyword link.
    doc1 = Document(
        id="a",
        page_content="some content",
        metadata={
            METADATA_LINKS_KEY: [
                Link.incoming("href", "a"),
                Link.bidir("kw", "foo"),
            ]
        },
    )
    # Document "b": content containing angle brackets and a newline; it shares
    # the ("href", "a") and ("kw", "foo") tags with document "a".
    doc2 = Document(
        id="b",
        page_content="<some\n more content>",
        metadata={
            METADATA_LINKS_KEY: [
                Link.incoming("href", "b"),
                Link.outgoing("href", "a"),
                Link.bidir("kw", "foo"),
                Link.bidir("kw", "bar"),
            ]
        },
    )

    # Default rendering: tag nodes are numbered in order of first appearance,
    # and tags shared with "a" (tag_0, tag_1) are reused, not redeclared.
    assert render_graphviz([doc1, doc2]).source == (
        "digraph {\n"
        "\trankdir=LR\n"
        "\tnode [style=filled]\n"
        '\ta [label="a\nsome content" shape=note tooltip="some content"]\n'
        '\ttag_0 [label="href:a"]\n'
        "\ta -> tag_0 [dir=back]\n"
        '\ttag_1 [label="kw:foo"]\n'
        "\ta -> tag_1 [dir=both]\n"
        '\tb [label="b\n<some\n more content>" '
        'shape=note tooltip="<some\n more content>"]\n'
        '\ttag_2 [label="href:b"]\n'
        "\tb -> tag_2 [dir=back]\n"
        "\tb -> tag_0 [dir=forward]\n"
        "\tb -> tag_1 [dir=both]\n"
        '\ttag_3 [label="kw:bar"]\n'
        "\tb -> tag_3 [dir=both]\n"
        "}\n"
    )

    # The requested layout engine is carried through to the returned graph.
    assert render_graphviz([doc1, doc2], engine="fdp").engine == "fdp"

    # A per-node color applies only to the named node ("a").
    assert render_graphviz([doc1, doc2], node_colors={"a": "gold"}).source == (
        "digraph {\n"
        "\trankdir=LR\n"
        "\tnode [style=filled]\n"
        '\ta [label="a\nsome content" fillcolor=gold '
        'shape=note tooltip="some content"]\n'
        '\ttag_0 [label="href:a"]\n'
        "\ta -> tag_0 [dir=back]\n"
        '\ttag_1 [label="kw:foo"]\n'
        "\ta -> tag_1 [dir=both]\n"
        '\tb [label="b\n<some\n more content>" '
        'shape=note tooltip="<some\n more content>"]\n'
        '\ttag_2 [label="href:b"]\n'
        "\tb -> tag_2 [dir=back]\n"
        "\tb -> tag_0 [dir=forward]\n"
        "\tb -> tag_1 [dir=both]\n"
        '\ttag_3 [label="kw:bar"]\n'
        "\tb -> tag_3 [dir=both]\n"
        "}\n"
    )

    # The default color applies to "b", while the explicit `None` entry for "a"
    # overrides the default and leaves "a" without a fillcolor.
    assert render_graphviz(
        [doc1, doc2], node_color="gold", node_colors={"a": None}
    ).source == (
        "digraph {\n"
        "\trankdir=LR\n"
        "\tnode [style=filled]\n"
        '\ta [label="a\nsome content" shape=note tooltip="some content"]\n'
        '\ttag_0 [label="href:a"]\n'
        "\ta -> tag_0 [dir=back]\n"
        '\ttag_1 [label="kw:foo"]\n'
        "\ta -> tag_1 [dir=both]\n"
        '\tb [label="b\n<some\n more content>" fillcolor=gold '
        'shape=note tooltip="<some\n more content>"]\n'
        '\ttag_2 [label="href:b"]\n'
        "\tb -> tag_2 [dir=back]\n"
        "\tb -> tag_0 [dir=forward]\n"
        "\tb -> tag_1 [dir=both]\n"
        '\ttag_3 [label="kw:bar"]\n'
        "\tb -> tag_3 [dir=both]\n"
        "}\n"
    )

    # Skipping ("kw", "foo") removes its node and edges; the remaining tags are
    # numbered without a gap.
    assert render_graphviz([doc1, doc2], skip_tags=[("kw", "foo")]).source == (
        "digraph {\n"
        "\trankdir=LR\n"
        "\tnode [style=filled]\n"
        '\ta [label="a\nsome content" shape=note tooltip="some content"]\n'
        '\ttag_0 [label="href:a"]\n'
        "\ta -> tag_0 [dir=back]\n"
        '\tb [label="b\n<some\n more content>" '
        'shape=note tooltip="<some\n more content>"]\n'
        '\ttag_1 [label="href:b"]\n'
        "\tb -> tag_1 [dir=back]\n"
        "\tb -> tag_0 [dir=forward]\n"
        '\ttag_2 [label="kw:bar"]\n'
        "\tb -> tag_2 [dir=both]\n"
        "}\n"
    )
Loading