diff --git a/graphiti_core/utils/maintenance/dedup_helpers.py b/graphiti_core/utils/maintenance/dedup_helpers.py
index b8ce68b89..b8c5fb180 100644
--- a/graphiti_core/utils/maintenance/dedup_helpers.py
+++ b/graphiti_core/utils/maintenance/dedup_helpers.py
@@ -43,8 +43,12 @@ def _normalize_string_exact(name: str) -> str:
 
 
 def _normalize_name_for_fuzzy(name: str) -> str:
-    """Produce a fuzzier form that keeps alphanumerics and apostrophes for n-gram shingles."""
-    normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_string_exact(name))
+    """Produce a fuzzier form that keeps word characters and apostrophes for n-gram shingles.
+
+    Uses \\w (which includes Unicode letters, digits, and underscore) instead of
+    [a-z0-9] so that CJK and other non-Latin scripts are preserved.
+    """
+    normalized = re.sub(r"[^\w' ]", ' ', _normalize_string_exact(name))
     normalized = normalized.strip()
     return re.sub(r'[\s]+', ' ', normalized)
 
@@ -85,13 +89,37 @@ def _has_high_entropy(normalized_name: str) -> bool:
     return _name_entropy(normalized_name) >= _NAME_ENTROPY_THRESHOLD
 
 
+def _has_cjk(text: str) -> bool:
+    """Return True if any character falls in CJK Unified Ideographs or common CJK ranges."""
+    for ch in text:
+        cp = ord(ch)
+        if (
+            0x4E00 <= cp <= 0x9FFF  # CJK Unified Ideographs
+            or 0x3400 <= cp <= 0x4DBF  # CJK Extension A
+            or 0xF900 <= cp <= 0xFAFF  # CJK Compatibility Ideographs
+            or 0x3000 <= cp <= 0x303F  # CJK Symbols and Punctuation
+            or 0x3040 <= cp <= 0x30FF  # Hiragana + Katakana
+            or 0xAC00 <= cp <= 0xD7AF  # Hangul Syllables
+        ):
+            return True
+    return False
+
+
 def _shingles(normalized_name: str) -> set[str]:
-    """Create 3-gram shingles from the normalized name for MinHash calculations."""
+    """Create n-gram shingles from the normalized name for MinHash calculations.
+
+    Uses 2-gram for CJK text (each character carries more information than a
+    Latin letter) and 3-gram for Latin text (unchanged behaviour).
+    """
     cleaned = normalized_name.replace(' ', '')
     if len(cleaned) < 2:
         return {cleaned} if cleaned else set()
 
-    return {cleaned[i : i + 3] for i in range(len(cleaned) - 2)}
+    n = 2 if _has_cjk(cleaned) else 3
+    if len(cleaned) < n:
+        return {cleaned}
+
+    return {cleaned[i : i + n] for i in range(len(cleaned) - n + 1)}
 
 
 def _hash_shingle(shingle: str, seed: int) -> int:
@@ -251,6 +279,7 @@ def _resolve_with_similarity(
     'DedupResolutionState',
     '_normalize_string_exact',
     '_normalize_name_for_fuzzy',
+    '_has_cjk',
     '_has_high_entropy',
     '_minhash_signature',
     '_lsh_bands',
diff --git a/tests/utils/maintenance/test_node_operations.py b/tests/utils/maintenance/test_node_operations.py
index 40fab785b..37d41ed3e 100644
--- a/tests/utils/maintenance/test_node_operations.py
+++ b/tests/utils/maintenance/test_node_operations.py
@@ -13,6 +13,7 @@
     DedupResolutionState,
     _build_candidate_indexes,
     _cached_shingles,
+    _has_cjk,
     _has_high_entropy,
     _hash_shingle,
     _jaccard_similarity,
@@ -629,3 +630,83 @@ async def test_batch_summaries_calls_llm_for_long_summary():
     # LLM should have been called to condense the long summary
     llm_client.generate_response.assert_awaited_once()
     assert node.summary == 'Condensed summary'
+
+
+# --- CJK support tests ---
+
+
+def test_has_cjk_detection():
+    assert _has_cjk('中际旭创') is True
+    assert _has_cjk('Alice Smith') is False
+    assert _has_cjk('源杰半导体 Semiconductors') is True
+    assert _has_cjk('') is False
+
+
+def test_normalize_name_for_fuzzy_preserves_cjk():
+    """CJK characters must survive normalization (the old [^a-z0-9] regex stripped them)."""
+    assert '中际旭创' in _normalize_name_for_fuzzy('中际旭创')
+    assert '源杰' in _normalize_name_for_fuzzy('源杰半导体')
+    # Mixed: Latin and CJK both preserved
+    result = _normalize_name_for_fuzzy('Google 中际旭创')
+    assert 'google' in result
+    assert '中际旭创' in result
+
+
+def test_shingles_cjk_uses_bigrams():
+    """CJK text should produce 2-gram shingles (not 3-gram)."""
+    shingle_set = _shingles('中际旭创')
+    # 4 CJK chars → 3 bigrams: 中际, 际旭, 旭创
+    assert shingle_set == {'中际', '际旭', '旭创'}
+
+
+def test_shingles_latin_still_uses_trigrams():
+    """Latin text behaviour must be unchanged."""
+    shingle_set = _shingles('alice')
+    assert shingle_set == {'ali', 'lic', 'ice'}
+
+
+def test_shingles_short_cjk():
+    """Two CJK chars → single bigram."""
+    assert _shingles('中际') == {'中际'}
+
+
+def test_cjk_fuzzy_matching_end_to_end():
+    """Two similar CJK names should get non-zero Jaccard similarity after the fix."""
+    name_a = _normalize_name_for_fuzzy('中际旭创')
+    name_b = _normalize_name_for_fuzzy('中际旭创科技')
+    shingles_a = _shingles(name_a)
+    shingles_b = _shingles(name_b)
+    # Both should have non-empty shingles (the bug was empty shingles)
+    assert len(shingles_a) > 0
+    assert len(shingles_b) > 0
+    # They share common bigrams so Jaccard > 0
+    score = _jaccard_similarity(shingles_a, shingles_b)
+    assert score > 0.0
+
+
+def test_cjk_entity_resolution_deterministic():
+    """CJK entity with exact name match should resolve deterministically."""
+    # Use a longer name to pass the entropy/length filter
+    candidate = EntityNode(name='中际旭创光电科技', group_id='group', labels=['Entity'])
+    extracted = EntityNode(name='中际旭创光电科技', group_id='group', labels=['Entity'])
+
+    indexes = _build_candidate_indexes([candidate])
+    state = DedupResolutionState(resolved_nodes=[None], uuid_map={}, unresolved_indices=[])
+
+    _resolve_with_similarity([extracted], indexes, state)
+
+    assert state.resolved_nodes[0].uuid == candidate.uuid
+    assert state.uuid_map[extracted.uuid] == candidate.uuid
+
+
+def test_short_cjk_name_defers_to_llm():
+    """Short CJK names (< 6 chars, single token) should defer to LLM, not fuzzy match."""
+    extracted = EntityNode(name='中际旭创', group_id='group', labels=['Entity'])
+    indexes = _build_candidate_indexes([])
+    state = DedupResolutionState(resolved_nodes=[None], uuid_map={}, unresolved_indices=[])
+
+    _resolve_with_similarity([extracted], indexes, state)
+
+    # Short CJK name → low entropy filter → deferred to LLM
+    assert state.resolved_nodes[0] is None
+    assert state.unresolved_indices == [0]