getzep · paul-paliychuk · Aug 22, 2024 · Aug 21, 2024 · Aug 21, 2024 · Aug 22, 2024
diff --git a/core/graphiti.py b/core/graphiti.py
diff --git a/core/prompts/dedupe_edges.py b/core/prompts/dedupe_edges.py
@@ -6,15 +6,16 @@
 
 class Prompt(Protocol):
 	v1: PromptVersion
-	edge_list: PromptVersion
+	v2: PromptVersion  # tab-indented to match the rest of the class body
 
 
 class Versions(TypedDict):
 	v1: PromptFunction
-	edge_list: PromptFunction
+	v2: PromptFunction
+	edge_list: PromptFunction  # tab-indented to match the rest of the class body
 
 
 def v1(context: dict[str, any]) -> list[Message]:
 	return [
 		Message(
 			role='system',
@@ -54,6 +55,48 @@
 	]
 
 
+def v2(context: dict[str, any]) -> list[Message]:
+    """Build the system and user messages for the v2 edge de-duplication prompt."""
+    # Serialise both edge lists up front so the template below stays readable.
+    existing_json = json.dumps(context['existing_edges'], indent=2)
+    extracted_json = json.dumps(context['extracted_edges'], indent=2)
+    system_text = "You are a helpful assistant that de-duplicates relationship from edge lists."
+    user_text = f"""
+        Given the following context, deduplicate edges from a list of new edges given a list of existing edges:
+
+        Existing Edges:
+        {existing_json}
+
+        New Edges:
+        {extracted_json}
+
+        Task:
+        1. start with the list of edges from New Edges
+        2. If any edge in New Edges is a duplicate of an edge in Existing Edges, replace the new edge with the existing
+            edge in the list
+        3. Respond with the resulting list of edges
+
+        Guidelines:
+        1. Use both the triplet name and fact of edges to determine if they are duplicates, 
+            duplicate edges may have different names meaning the same thing and slight variations in the facts.
+        2. If you encounter facts that are semantically equivalent or very similar, keep the original edge
+
+        Respond with a JSON object in the following format:
+        {{
+            "new_edges": [
+                {{
+                    "triplet": "source_node_name-edge_name-target_node_name",
+                    "fact": "one sentence description of the fact"
+                }}
+            ]
+        }}
+        """
+    return [
+        Message(role="system", content=system_text),
+        Message(role="user", content=user_text),
+    ]
+
+
 def edge_list(context: dict[str, any]) -> list[Message]:
 	return [
 		Message(
@@ -90,4 +133,4 @@
 	]
 
 
-versions: Versions = {'v1': v1, 'edge_list': edge_list}
+versions: Versions = {"v1": v1, "v2": v2, "edge_list": edge_list}
diff --git a/core/prompts/invalidate_edges.py b/core/prompts/invalidate_edges.py
@@ -15,34 +15,40 @@ def v1(context: dict[str, any]) -> list[Message]:
 	return [
 		Message(
 			role='system',
-			content='You are an AI assistant that helps determine which relationships in a knowledge graph should be invalidated based on newer information.',
+			content='You are an AI assistant that helps determine which relationships in a knowledge graph should be invalidated based solely on explicit contradictions in newer information.',
 		),
 		Message(
 			role='user',
 			content=f"""
-                Based on the provided existing edges and new edges with their timestamps, determine which existing relationships, if any, should be invalidated due to contradictions or updates in the new edges.
+               Based on the provided existing edges and new edges with their timestamps, determine which existing relationships, if any, should be invalidated due to contradictions or updates in the new edges.
                 Only mark a relationship as invalid if there is clear evidence from new edges that the relationship is no longer true.
-                Do not invalidate relationships merely because they weren't mentioned in new edges.
+                Do not invalidate relationships merely because they weren't mentioned in new edges. You may use the current episode and previous episodes as well as the facts of each edge to understand the context of the relationships.
+
+                Previous Episodes:
+                {context['previous_episodes']}
+
+                Current Episode:
+                {context['current_episode']}
 
                 Existing Edges (sorted by timestamp, newest first):
                 {context['existing_edges']}
 
                 New Edges:
                 {context['new_edges']}
 
-                Each edge is formatted as: "UUID | SOURCE_NODE - EDGE_NAME - TARGET_NODE (TIMESTAMP)"
+                Each edge is formatted as: "UUID | SOURCE_NODE - EDGE_NAME - TARGET_NODE (fact: EDGE_FACT), TIMESTAMP)"
 
                 For each existing edge that should be invalidated, respond with a JSON object in the following format:
                 {{
                     "invalidated_edges": [
                         {{
                             "edge_uuid": "The UUID of the edge to be invalidated (the part before the | character)",
-                            "reason": "Brief explanation of why this edge is being invalidated"
+                            "fact": "Updated fact of the edge"
                         }}
                     ]
                 }}
 
-                If no relationships need to be invalidated, return an empty list for "invalidated_edges".
+                If no relationships need to be invalidated based on these strict criteria, return an empty list for "invalidated_edges".
             """,
 		),
 	]

diff --git a/core/utils/bulk_utils.py b/core/utils/bulk_utils.py
@@ -98,7 +98,7 @@ async def dedupe_nodes_bulk(
 
 	existing_nodes = await get_relevant_nodes(compressed_nodes, driver)
 
-	nodes, partial_uuid_map = await dedupe_extracted_nodes(
+	nodes, partial_uuid_map, _ = await dedupe_extracted_nodes(
 		llm_client, compressed_nodes, existing_nodes
 	)
 

diff --git a/core/utils/maintenance/edge_operations.py b/core/utils/maintenance/edge_operations.py
@@ -8,6 +8,7 @@
 from core.llm_client import LLMClient
 from core.nodes import EntityNode, EpisodicNode
 from core.prompts import prompt_library
+from core.utils.maintenance.temporal_operations import NodeEdgeNodeTriplet
 
 logger = logging.getLogger(__name__)
 
@@ -179,6 +180,53 @@ async def extract_edges(
 	return edges
 
 
+def create_edge_identifier(source_node: EntityNode, edge: EntityEdge, target_node: EntityNode) -> str:
+    """Return the ``source-edge-target`` name triplet that identifies an edge."""
+    names = (source_node.name, edge.name, target_node.name)
+    return "{}-{}-{}".format(*names)
+
+
+async def dedupe_extracted_edges_v2(
+    llm_client: LLMClient,
+    extracted_edges: list[NodeEdgeNodeTriplet],
+    existing_edges: list[NodeEdgeNodeTriplet],
+) -> list[EntityEdge]:
+    """Deduplicate extracted edges against existing ones via the v2 LLM prompt.
+
+    Returns the edges for the triplets named in the LLM's "new_edges" reply.
+    Triplets that match no known edge are logged and skipped rather than
+    raising KeyError.
+    """
+    # Map triplet identifier -> edge; existing edges win over extracted ones.
+    edge_map = {
+        create_edge_identifier(n1, edge, n2): edge for n1, edge, n2 in existing_edges
+    }
+    for n1, edge, n2 in extracted_edges:
+        edge_map.setdefault(create_edge_identifier(n1, edge, n2), edge)
+
+    def describe(triplets: list[NodeEdgeNodeTriplet]) -> list[dict]:
+        return [
+            {"triplet": create_edge_identifier(n1, edge, n2), "fact": edge.fact}
+            for n1, edge, n2 in triplets
+        ]
+
+    # "existing_edges" must describe existing_edges, not extracted_edges again.
+    context = {
+        "extracted_edges": describe(extracted_edges),
+        "existing_edges": describe(existing_edges),
+    }
+    messages = prompt_library.dedupe_edges.v2(context)  # build the prompt once
+    logger.info(messages)
+    llm_response = await llm_client.generate_response(messages)
+    new_edges_data = llm_response.get("new_edges", [])
+    logger.info(f"Extracted new edges: {new_edges_data}")
+
+    triplets = [edge_data.get("triplet") for edge_data in new_edges_data]
+    for unknown in (t for t in triplets if t not in edge_map):
+        logger.warning("Ignoring unknown triplet from LLM: %s", unknown)
+    return [edge_map[t] for t in triplets if t in edge_map]
+
+
 async def dedupe_extracted_edges(
 	llm_client: LLMClient,
 	extracted_edges: list[EntityEdge],