3535from graphiti_core .nodes import CommunityNode , EntityNode , EpisodicNode
3636from graphiti_core .prompts import prompt_library
3737from graphiti_core .prompts .dedupe_edges import EdgeDuplicate
38+ from graphiti_core .prompts .extract_edges import Edge as ExtractedEdge
3839from graphiti_core .prompts .extract_edges import ExtractedEdges
3940from graphiti_core .search .search import search
4041from graphiti_core .search .search_config import SearchResults
4142from graphiti_core .search .search_config_recipes import EDGE_HYBRID_SEARCH_RRF
4243from graphiti_core .search .search_filters import SearchFilters
44+ from graphiti_core .utils .content_chunking import generate_covering_chunks
4345from graphiti_core .utils .datetime_utils import ensure_utc , utc_now
4446from graphiti_core .utils .maintenance .dedup_helpers import _normalize_string_exact
4547
# Default relation label — presumably applied when an edge has no explicit
# name; its use site is outside this view (confirm against callers).
DEFAULT_EDGE_NAME = 'RELATES_TO'
# Upper bound on entity nodes handed to a single LLM extraction call; larger
# node lists are split by generate_covering_chunks so that every node pair
# still appears together in at least one chunk.
MAX_NODES = 15

logger = logging .getLogger (__name__ )
4952
@@ -120,27 +123,110 @@ async def extract_edges(
120123 else []
121124 )
122125
123- # Prepare context for LLM
124- context = {
125- 'episode_content' : episode .content ,
126- 'nodes' : [
127- {'id' : idx , 'name' : node .name , 'entity_types' : node .labels }
128- for idx , node in enumerate (nodes )
129- ],
130- 'previous_episodes' : [ep .content for ep in previous_episodes ],
131- 'reference_time' : episode .valid_at ,
132- 'edge_types' : edge_types_context ,
133- 'custom_extraction_instructions' : custom_extraction_instructions or '' ,
134- }
    # Generate covering chunks to ensure all node pairs are processed.
    # Uses a greedy approach based on the Handshake Flights Problem.
    covering_chunks = generate_covering_chunks(nodes, MAX_NODES)

    # Pre-assign each unordered node pair to the first chunk that contains it,
    # so the same pair is never extracted by two different chunks.
    # frozenset makes the pair orientation-free: (a, b) and (b, a) collide.
    processed_pairs: set[frozenset[int]] = set()
    chunk_assigned_pairs: list[set[frozenset[int]]] = []

    for _, global_indices in covering_chunks:
        assigned_pairs: set[frozenset[int]] = set()
        for i, idx_i in enumerate(global_indices):
            # Only j > i is visited: each unordered pair is seen once per
            # chunk, and self-pairs (i, i) are never generated.
            for idx_j in global_indices[i + 1:]:
                pair = frozenset([idx_i, idx_j])
                if pair not in processed_pairs:
                    processed_pairs.add(pair)
                    assigned_pairs.add(pair)
        chunk_assigned_pairs.append(assigned_pairs)
    # NOTE(review): because self-pairs are never assigned, any self-referential
    # edge the LLM returns is filtered out in extract_edges_for_chunk — confirm
    # that dropping self-loop edges is intended (the pre-chunking code path
    # allowed source == target).
144+
145+ async def extract_edges_for_chunk (
146+ chunk : list [EntityNode ],
147+ global_indices : list [int ],
148+ assigned_pairs : set [frozenset [int ]],
149+ ) -> list [ExtractedEdge ]:
150+ # Skip chunks with no assigned pairs (all pairs already processed)
151+ if not assigned_pairs :
152+ return []
153+
154+ # Prepare context for LLM
155+ context = {
156+ 'episode_content' : episode .content ,
157+ 'nodes' : [
158+ {'id' : idx , 'name' : node .name , 'entity_types' : node .labels }
159+ for idx , node in enumerate (chunk )
160+ ],
161+ 'previous_episodes' : [ep .content for ep in previous_episodes ],
162+ 'reference_time' : episode .valid_at ,
163+ 'edge_types' : edge_types_context ,
164+ 'custom_extraction_instructions' : custom_extraction_instructions or '' ,
165+ }
135166
136- llm_response = await llm_client .generate_response (
137- prompt_library .extract_edges .edge (context ),
138- response_model = ExtractedEdges ,
139- max_tokens = extract_edges_max_tokens ,
140- group_id = group_id ,
141- prompt_name = 'extract_edges.edge' ,
167+ llm_response = await llm_client .generate_response (
168+ prompt_library .extract_edges .edge (context ),
169+ response_model = ExtractedEdges ,
170+ max_tokens = extract_edges_max_tokens ,
171+ group_id = group_id ,
172+ prompt_name = 'extract_edges.edge' ,
173+ )
174+ chunk_edges_data = ExtractedEdges (** llm_response ).edges
175+
176+ # Map chunk-local indices to global indices in the original nodes list
177+ # Note: global_indices are guaranteed valid by generate_covering_chunks,
178+ # but LLM-returned local indices need validation
179+ valid_edges : list [ExtractedEdge ] = []
180+ chunk_size = len (global_indices )
181+
182+ for edge_data in chunk_edges_data :
183+ source_local_idx = edge_data .source_entity_id
184+ target_local_idx = edge_data .target_entity_id
185+
186+ # Validate LLM-returned indices are within chunk bounds
187+ if not (0 <= source_local_idx < chunk_size ):
188+ logger .warning (
189+ f'Source index { source_local_idx } out of bounds for chunk of size '
190+ f'{ chunk_size } in edge { edge_data .relation_type } '
191+ )
192+ continue
193+
194+ if not (0 <= target_local_idx < chunk_size ):
195+ logger .warning (
196+ f'Target index { target_local_idx } out of bounds for chunk of size '
197+ f'{ chunk_size } in edge { edge_data .relation_type } '
198+ )
199+ continue
200+
201+ # Map to global indices (guaranteed valid by generate_covering_chunks)
202+ mapped_source = global_indices [source_local_idx ]
203+ mapped_target = global_indices [target_local_idx ]
204+ edge_data .source_entity_id = mapped_source
205+ edge_data .target_entity_id = mapped_target
206+
207+ # Only include edges for pairs assigned to this chunk
208+ edge_pair = frozenset ([mapped_source , mapped_target ])
209+ if edge_pair in assigned_pairs :
210+ valid_edges .append (edge_data )
211+
212+ return valid_edges
213+
    # Run all chunk extractions concurrently. semaphore_gather presumably
    # bounds the number of in-flight LLM calls — confirm against its
    # definition; results come back in chunk order. strict=True makes the zip
    # fail loudly if covering_chunks and chunk_assigned_pairs ever diverge in
    # length instead of silently truncating.
    chunk_results: list[list[ExtractedEdge]] = list(
        await semaphore_gather(
            *[
                extract_edges_for_chunk(chunk, global_indices, assigned_pairs)
                for (chunk, global_indices), assigned_pairs in zip(
                    covering_chunks, chunk_assigned_pairs, strict=True
                )
            ]
        )
    )

    # Flatten the per-chunk results into a single edge list; the pair
    # pre-assignment guarantees no pair was extracted by more than one chunk.
    edges_data: list[ExtractedEdge] = []
    for chunk_edges in chunk_results:
        edges_data.extend(chunk_edges)
144230
145231 end = time ()
146232 logger .debug (f'Extracted new edges: { edges_data } in { (end - start ) * 1000 } ms' )
@@ -161,22 +247,9 @@ async def extract_edges(
161247 if not edge_data .fact .strip ():
162248 continue
163249
164- source_node_idx = edge_data .source_entity_id
165- target_node_idx = edge_data .target_entity_id
166-
167- if len (nodes ) == 0 :
168- logger .warning ('No entities provided for edge extraction' )
169- continue
170-
171- if not (0 <= source_node_idx < len (nodes ) and 0 <= target_node_idx < len (nodes )):
172- logger .warning (
173- f'Invalid entity IDs in edge extraction for { edge_data .relation_type } . '
174- f'source_entity_id: { source_node_idx } , target_entity_id: { target_node_idx } , '
175- f'but only { len (nodes )} entities available (valid range: 0-{ len (nodes ) - 1 } )'
176- )
177- continue
178- source_node_uuid = nodes [source_node_idx ].uuid
179- target_node_uuid = nodes [target_node_idx ].uuid
250+ # Indices already validated in extract_edges_for_chunk
251+ source_node_uuid = nodes [edge_data .source_entity_id ].uuid
252+ target_node_uuid = nodes [edge_data .target_entity_id ].uuid
180253
181254 if valid_at :
182255 try :
0 commit comments