Skip to content

Commit 6470423

Browse files
authored
Edge extraction efficiency (#1140)
* Add node splitting when a large number of nodes are extracted * update * add tests * update * update * update * update * update * update * update
1 parent 79712af commit 6470423

File tree

5 files changed

+553
-41
lines changed

5 files changed

+553
-41
lines changed

examples/podcast/podcast_runner.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -77,11 +77,7 @@ class IsPresidentOf(BaseModel):
7777

7878
async def main(use_bulk: bool = False):
7979
setup_logging()
80-
client = Graphiti(
81-
neo4j_uri,
82-
neo4j_user,
83-
neo4j_password,
84-
)
80+
client = Graphiti(neo4j_uri, neo4j_user, neo4j_password)
8581
await clear_data(client.driver)
8682
await client.build_indices_and_constraints()
8783
messages = parse_podcast_messages()

graphiti_core/utils/content_chunking.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,11 @@
1616

1717
import json
1818
import logging
19+
import random
1920
import re
21+
from itertools import combinations
22+
from math import comb
23+
from typing import TypeVar
2024

2125
from graphiti_core.helpers import (
2226
CHUNK_DENSITY_THRESHOLD,
@@ -700,3 +704,123 @@ def _chunk_by_lines(
700704
chunks.append('\n'.join(current_lines))
701705

702706
return chunks if chunks else [content]
707+
708+
709+
T = TypeVar('T')
710+
711+
MAX_COMBINATIONS_TO_EVALUATE = 1000
712+
713+
714+
def _random_combination(n: int, k: int) -> tuple[int, ...]:
715+
"""Generate a random combination of k items from range(n)."""
716+
return tuple(sorted(random.sample(range(n), k)))
717+
718+
719+
def generate_covering_chunks(items: list[T], k: int) -> list[tuple[list[T], list[int]]]:
720+
"""Generate chunks of items that cover all pairs using a greedy approach.
721+
722+
Based on the Handshake Flights Problem / Covering Design problem.
723+
Each chunk of K items covers C(K,2) = K(K-1)/2 pairs. We greedily select
724+
chunks to maximize coverage of uncovered pairs, minimizing the total number
725+
of chunks needed to ensure every pair of items appears in at least one chunk.
726+
727+
For large inputs where C(n,k) > MAX_COMBINATIONS_TO_EVALUATE, random sampling
728+
is used instead of exhaustive search to maintain performance.
729+
730+
Lower bound (Schönheim): F >= ceil(N/K * ceil((N-1)/(K-1)))
731+
732+
Args:
733+
items: List of items to partition into covering chunks
734+
k: Maximum number of items per chunk
735+
736+
Returns:
737+
List of tuples (chunk_items, global_indices) where global_indices maps
738+
each position in chunk_items to its index in the original items list.
739+
"""
740+
n = len(items)
741+
if n <= k:
742+
return [(items, list(range(n)))]
743+
744+
# Track uncovered pairs using frozensets of indices
745+
uncovered_pairs: set[frozenset[int]] = {
746+
frozenset([i, j]) for i in range(n) for j in range(i + 1, n)
747+
}
748+
749+
chunks: list[tuple[list[T], list[int]]] = []
750+
751+
# Determine if we need to sample or can enumerate all combinations
752+
total_combinations = comb(n, k)
753+
use_sampling = total_combinations > MAX_COMBINATIONS_TO_EVALUATE
754+
755+
while uncovered_pairs:
756+
# Greedy selection: find the chunk that covers the most uncovered pairs
757+
best_chunk_indices: tuple[int, ...] | None = None
758+
best_covered_count = 0
759+
760+
if use_sampling:
761+
# Sample random combinations when there are too many to enumerate
762+
seen_combinations: set[tuple[int, ...]] = set()
763+
# Limit total attempts (including duplicates) to prevent infinite loops
764+
max_total_attempts = MAX_COMBINATIONS_TO_EVALUATE * 3
765+
total_attempts = 0
766+
samples_evaluated = 0
767+
while samples_evaluated < MAX_COMBINATIONS_TO_EVALUATE:
768+
total_attempts += 1
769+
if total_attempts > max_total_attempts:
770+
# Too many total attempts, break to avoid infinite loop
771+
break
772+
chunk_indices = _random_combination(n, k)
773+
if chunk_indices in seen_combinations:
774+
continue
775+
seen_combinations.add(chunk_indices)
776+
samples_evaluated += 1
777+
778+
# Count how many uncovered pairs this chunk covers
779+
covered_count = sum(
780+
1
781+
for i, idx_i in enumerate(chunk_indices)
782+
for idx_j in chunk_indices[i + 1 :]
783+
if frozenset([idx_i, idx_j]) in uncovered_pairs
784+
)
785+
786+
if covered_count > best_covered_count:
787+
best_covered_count = covered_count
788+
best_chunk_indices = chunk_indices
789+
else:
790+
# Enumerate all combinations when feasible
791+
for chunk_indices in combinations(range(n), k):
792+
# Count how many uncovered pairs this chunk covers
793+
covered_count = sum(
794+
1
795+
for i, idx_i in enumerate(chunk_indices)
796+
for idx_j in chunk_indices[i + 1 :]
797+
if frozenset([idx_i, idx_j]) in uncovered_pairs
798+
)
799+
800+
if covered_count > best_covered_count:
801+
best_covered_count = covered_count
802+
best_chunk_indices = chunk_indices
803+
804+
if best_chunk_indices is None or best_covered_count == 0:
805+
# Greedy search couldn't find a chunk covering uncovered pairs.
806+
# This can happen with random sampling. Fall back to creating
807+
# small chunks that directly cover remaining pairs.
808+
break
809+
810+
# Mark pairs in this chunk as covered
811+
for i, idx_i in enumerate(best_chunk_indices):
812+
for idx_j in best_chunk_indices[i + 1 :]:
813+
uncovered_pairs.discard(frozenset([idx_i, idx_j]))
814+
815+
chunk_items = [items[idx] for idx in best_chunk_indices]
816+
chunks.append((chunk_items, list(best_chunk_indices)))
817+
818+
# Handle any remaining uncovered pairs that the greedy algorithm missed.
819+
# This can happen when random sampling fails to find covering chunks.
820+
# Create minimal chunks (size 2) to guarantee all pairs are covered.
821+
for pair in uncovered_pairs:
822+
pair_indices = sorted(pair)
823+
chunk_items = [items[idx] for idx in pair_indices]
824+
chunks.append((chunk_items, pair_indices))
825+
826+
return chunks

graphiti_core/utils/maintenance/edge_operations.py

Lines changed: 108 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -35,15 +35,18 @@
3535
from graphiti_core.nodes import CommunityNode, EntityNode, EpisodicNode
3636
from graphiti_core.prompts import prompt_library
3737
from graphiti_core.prompts.dedupe_edges import EdgeDuplicate
38+
from graphiti_core.prompts.extract_edges import Edge as ExtractedEdge
3839
from graphiti_core.prompts.extract_edges import ExtractedEdges
3940
from graphiti_core.search.search import search
4041
from graphiti_core.search.search_config import SearchResults
4142
from graphiti_core.search.search_config_recipes import EDGE_HYBRID_SEARCH_RRF
4243
from graphiti_core.search.search_filters import SearchFilters
44+
from graphiti_core.utils.content_chunking import generate_covering_chunks
4345
from graphiti_core.utils.datetime_utils import ensure_utc, utc_now
4446
from graphiti_core.utils.maintenance.dedup_helpers import _normalize_string_exact
4547

4648
DEFAULT_EDGE_NAME = 'RELATES_TO'
49+
MAX_NODES = 15
4750

4851
logger = logging.getLogger(__name__)
4952

@@ -120,27 +123,110 @@ async def extract_edges(
120123
else []
121124
)
122125

123-
# Prepare context for LLM
124-
context = {
125-
'episode_content': episode.content,
126-
'nodes': [
127-
{'id': idx, 'name': node.name, 'entity_types': node.labels}
128-
for idx, node in enumerate(nodes)
129-
],
130-
'previous_episodes': [ep.content for ep in previous_episodes],
131-
'reference_time': episode.valid_at,
132-
'edge_types': edge_types_context,
133-
'custom_extraction_instructions': custom_extraction_instructions or '',
134-
}
126+
# Generate covering chunks to ensure all node pairs are processed.
127+
# Uses a greedy approach based on the Handshake Flights Problem.
128+
covering_chunks = generate_covering_chunks(nodes, MAX_NODES)
129+
130+
# Pre-assign pairs to chunks to avoid duplicate edge extraction.
131+
# Each pair is assigned to the first chunk that contains it.
132+
processed_pairs: set[frozenset[int]] = set()
133+
chunk_assigned_pairs: list[set[frozenset[int]]] = []
134+
135+
for _, global_indices in covering_chunks:
136+
assigned_pairs: set[frozenset[int]] = set()
137+
for i, idx_i in enumerate(global_indices):
138+
for idx_j in global_indices[i + 1 :]:
139+
pair = frozenset([idx_i, idx_j])
140+
if pair not in processed_pairs:
141+
processed_pairs.add(pair)
142+
assigned_pairs.add(pair)
143+
chunk_assigned_pairs.append(assigned_pairs)
144+
145+
async def extract_edges_for_chunk(
146+
chunk: list[EntityNode],
147+
global_indices: list[int],
148+
assigned_pairs: set[frozenset[int]],
149+
) -> list[ExtractedEdge]:
150+
# Skip chunks with no assigned pairs (all pairs already processed)
151+
if not assigned_pairs:
152+
return []
153+
154+
# Prepare context for LLM
155+
context = {
156+
'episode_content': episode.content,
157+
'nodes': [
158+
{'id': idx, 'name': node.name, 'entity_types': node.labels}
159+
for idx, node in enumerate(chunk)
160+
],
161+
'previous_episodes': [ep.content for ep in previous_episodes],
162+
'reference_time': episode.valid_at,
163+
'edge_types': edge_types_context,
164+
'custom_extraction_instructions': custom_extraction_instructions or '',
165+
}
135166

136-
llm_response = await llm_client.generate_response(
137-
prompt_library.extract_edges.edge(context),
138-
response_model=ExtractedEdges,
139-
max_tokens=extract_edges_max_tokens,
140-
group_id=group_id,
141-
prompt_name='extract_edges.edge',
167+
llm_response = await llm_client.generate_response(
168+
prompt_library.extract_edges.edge(context),
169+
response_model=ExtractedEdges,
170+
max_tokens=extract_edges_max_tokens,
171+
group_id=group_id,
172+
prompt_name='extract_edges.edge',
173+
)
174+
chunk_edges_data = ExtractedEdges(**llm_response).edges
175+
176+
# Map chunk-local indices to global indices in the original nodes list
177+
# Note: global_indices are guaranteed valid by generate_covering_chunks,
178+
# but LLM-returned local indices need validation
179+
valid_edges: list[ExtractedEdge] = []
180+
chunk_size = len(global_indices)
181+
182+
for edge_data in chunk_edges_data:
183+
source_local_idx = edge_data.source_entity_id
184+
target_local_idx = edge_data.target_entity_id
185+
186+
# Validate LLM-returned indices are within chunk bounds
187+
if not (0 <= source_local_idx < chunk_size):
188+
logger.warning(
189+
f'Source index {source_local_idx} out of bounds for chunk of size '
190+
f'{chunk_size} in edge {edge_data.relation_type}'
191+
)
192+
continue
193+
194+
if not (0 <= target_local_idx < chunk_size):
195+
logger.warning(
196+
f'Target index {target_local_idx} out of bounds for chunk of size '
197+
f'{chunk_size} in edge {edge_data.relation_type}'
198+
)
199+
continue
200+
201+
# Map to global indices (guaranteed valid by generate_covering_chunks)
202+
mapped_source = global_indices[source_local_idx]
203+
mapped_target = global_indices[target_local_idx]
204+
edge_data.source_entity_id = mapped_source
205+
edge_data.target_entity_id = mapped_target
206+
207+
# Only include edges for pairs assigned to this chunk
208+
edge_pair = frozenset([mapped_source, mapped_target])
209+
if edge_pair in assigned_pairs:
210+
valid_edges.append(edge_data)
211+
212+
return valid_edges
213+
214+
# Extract edges from all chunks in parallel
215+
chunk_results: list[list[ExtractedEdge]] = list(
216+
await semaphore_gather(
217+
*[
218+
extract_edges_for_chunk(chunk, global_indices, assigned_pairs)
219+
for (chunk, global_indices), assigned_pairs in zip(
220+
covering_chunks, chunk_assigned_pairs, strict=True
221+
)
222+
]
223+
)
142224
)
143-
edges_data = ExtractedEdges(**llm_response).edges
225+
226+
# Combine results from all chunks
227+
edges_data: list[ExtractedEdge] = []
228+
for chunk_edges in chunk_results:
229+
edges_data.extend(chunk_edges)
144230

145231
end = time()
146232
logger.debug(f'Extracted new edges: {edges_data} in {(end - start) * 1000} ms')
@@ -161,22 +247,9 @@ async def extract_edges(
161247
if not edge_data.fact.strip():
162248
continue
163249

164-
source_node_idx = edge_data.source_entity_id
165-
target_node_idx = edge_data.target_entity_id
166-
167-
if len(nodes) == 0:
168-
logger.warning('No entities provided for edge extraction')
169-
continue
170-
171-
if not (0 <= source_node_idx < len(nodes) and 0 <= target_node_idx < len(nodes)):
172-
logger.warning(
173-
f'Invalid entity IDs in edge extraction for {edge_data.relation_type}. '
174-
f'source_entity_id: {source_node_idx}, target_entity_id: {target_node_idx}, '
175-
f'but only {len(nodes)} entities available (valid range: 0-{len(nodes) - 1})'
176-
)
177-
continue
178-
source_node_uuid = nodes[source_node_idx].uuid
179-
target_node_uuid = nodes[target_node_idx].uuid
250+
# Indices already validated in extract_edges_for_chunk
251+
source_node_uuid = nodes[edge_data.source_entity_id].uuid
252+
target_node_uuid = nodes[edge_data.target_entity_id].uuid
180253

181254
if valid_at:
182255
try:

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[project]
22
name = "graphiti-core"
33
description = "A temporal graph building library"
4-
version = "0.25.4"
4+
version = "0.25.5"
55
authors = [
66
{ name = "Paul Paliychuk", email = "paul@getzep.com" },
77
{ name = "Preston Rasmussen", email = "preston@getzep.com" },

0 commit comments

Comments
 (0)