getzep · Lucas5357 · Mar 29, 2026
diff --git a/graphiti_core/utils/maintenance/dedup_helpers.py b/graphiti_core/utils/maintenance/dedup_helpers.py
@@ -43,8 +43,12 @@ def _normalize_string_exact(name: str) -> str:
 
 
 def _normalize_name_for_fuzzy(name: str) -> str:
-    """Produce a fuzzier form that keeps alphanumerics and apostrophes for n-gram shingles."""
-    normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_string_exact(name))
+    """Produce a fuzzier form that keeps letters, digits and apostrophes for n-gram shingles.
+
+    Matches Unicode letters/digits via \\w so CJK and other non-Latin scripts survive;
+    '_' is still replaced by a space, exactly as the previous [a-z0-9] pattern did.
+    """
+    normalized = re.sub(r"[^\w' ]|_", ' ', _normalize_string_exact(name))
     normalized = normalized.strip()
     return re.sub(r'[\s]+', ' ', normalized)
 
@@ -85,13 +89,37 @@ def _has_high_entropy(normalized_name: str) -> bool:
     return _name_entropy(normalized_name) >= _NAME_ENTROPY_THRESHOLD
 
 
+def _has_cjk(text: str) -> bool:
+    """Report whether ``text`` contains at least one code point from a common CJK block."""
+    cjk_blocks = (
+        (0x4E00, 0x9FFF),  # CJK Unified Ideographs
+        (0x3400, 0x4DBF),  # CJK Extension A
+        (0xF900, 0xFAFF),  # CJK Compatibility Ideographs
+        (0x3000, 0x303F),  # CJK Symbols and Punctuation
+        (0x3040, 0x30FF),  # Hiragana + Katakana
+        (0xAC00, 0xD7AF),  # Hangul Syllables
+    )
+    # Block bounds are inclusive on both ends.
+    # any() short-circuits on the first character that falls inside a block.
+    code_points = map(ord, text)
+    return any(lo <= cp <= hi for cp in code_points for lo, hi in cjk_blocks)
+
+
 def _shingles(normalized_name: str) -> set[str]:
-    """Create 3-gram shingles from the normalized name for MinHash calculations."""
+    """Build the set of character n-grams used as MinHash input for a normalized name.
+
+    Names containing any CJK character are cut into 2-grams, all others into 3-grams;
+    spaces are dropped first and a non-empty name shorter than the window yields itself.
+    """
     cleaned = normalized_name.replace(' ', '')
     if len(cleaned) < 2:
         return {cleaned} if cleaned else set()
 
-    return {cleaned[i : i + 3] for i in range(len(cleaned) - 2)}
+    width = 2 if _has_cjk(cleaned) else 3
+    last_start = len(cleaned) - width
+    if last_start < 0:
+        return {cleaned}
+    return {cleaned[start : start + width] for start in range(last_start + 1)}
 
 
 def _hash_shingle(shingle: str, seed: int) -> int:
@@ -251,6 +279,7 @@ def _resolve_with_similarity(
     'DedupResolutionState',
     '_normalize_string_exact',
     '_normalize_name_for_fuzzy',
+    '_has_cjk',
     '_has_high_entropy',
     '_minhash_signature',
     '_lsh_bands',

diff --git a/tests/utils/maintenance/test_node_operations.py b/tests/utils/maintenance/test_node_operations.py
@@ -13,6 +13,7 @@
     DedupResolutionState,
     _build_candidate_indexes,
     _cached_shingles,
+    _has_cjk,
     _has_high_entropy,
     _hash_shingle,
     _jaccard_similarity,
@@ -629,3 +630,83 @@ async def test_batch_summaries_calls_llm_for_long_summary():
     # LLM should have been called to condense the long summary
     llm_client.generate_response.assert_awaited_once()
     assert node.summary == 'Condensed summary'
+
+
+# --- CJK support tests ---
+
+
+def test_has_cjk_detection():
+    for with_cjk in ('中际旭创', '源杰半导体 Semiconductors'):  # pure CJK and mixed-script
+        assert _has_cjk(with_cjk) is True
+    for without_cjk in ('Alice Smith', ''):  # Latin-only and empty input
+        assert _has_cjk(without_cjk) is False
+
+
+def test_normalize_name_for_fuzzy_preserves_cjk():
+    """Normalization keeps CJK characters, which the former [^a-z0-9] pattern replaced."""
+    pure_cjk = _normalize_name_for_fuzzy('中际旭创')
+    assert '中际旭创' in pure_cjk
+    assert '源杰' in _normalize_name_for_fuzzy('源杰半导体')
+    # A mixed-script name keeps its Latin part and its CJK part.
+    mixed = _normalize_name_for_fuzzy('Google 中际旭创')
+    assert all(fragment in mixed for fragment in ('google', '中际旭创'))
+
+
+def test_shingles_cjk_uses_bigrams():
+    """A CJK name is shingled with a window of two characters rather than three."""
+    # Four characters give three overlapping pairs.
+    expected_bigrams = {'中际', '际旭', '旭创'}
+    assert _shingles('中际旭创') == expected_bigrams
+
+
+def test_shingles_latin_still_uses_trigrams():
+    """A Latin-only name keeps being shingled with a window of three characters."""
+    expected_trigrams = {'ali', 'lic', 'ice'}
+    assert _shingles('alice') == expected_trigrams
+
+
+def test_shingles_short_cjk():
+    """A two-character CJK name yields exactly one shingle: the name itself."""
+    assert list(_shingles('中际')) == ['中际']
+
+
+def test_cjk_fuzzy_matching_end_to_end():
+    """Normalizing then shingling two overlapping CJK names gives a positive Jaccard score."""
+    base_name, extended_name = '中际旭创', '中际旭创科技'
+    base_shingles = _shingles(_normalize_name_for_fuzzy(base_name))
+    extended_shingles = _shingles(_normalize_name_for_fuzzy(extended_name))
+
+    # Neither set may be empty (empty shingle sets were the bug being fixed).
+    assert base_shingles
+    assert extended_shingles
+
+    # The names share their first four characters, so some shingles must coincide.
+    assert _jaccard_similarity(base_shingles, extended_shingles) > 0.0
+
+
+def test_cjk_entity_resolution_deterministic():
+    """An extracted CJK node whose name equals a candidate's resolves to that candidate."""
+    # Eight characters: presumably long enough for the entropy/length filter (per original note).
+    shared_name = '中际旭创光电科技'
+    existing = EntityNode(name=shared_name, group_id='group', labels=['Entity'])
+    incoming = EntityNode(name=shared_name, group_id='group', labels=['Entity'])
+    resolution = DedupResolutionState(resolved_nodes=[None], uuid_map={}, unresolved_indices=[])
+
+    _resolve_with_similarity([incoming], _build_candidate_indexes([existing]), resolution)
+
+    # Both the resolved slot and the uuid map must point at the existing node.
+    assert resolution.resolved_nodes[0].uuid == existing.uuid
+    assert resolution.uuid_map[incoming.uuid] == existing.uuid
+
+
+def test_short_cjk_name_defers_to_llm():
+    """A four-character CJK name stays unresolved so that it can be handed to the LLM."""
+    short_node = EntityNode(name='中际旭创', group_id='group', labels=['Entity'])
+    resolution = DedupResolutionState(resolved_nodes=[None], uuid_map={}, unresolved_indices=[])
+
+    # NOTE(review): the candidate index is empty, so confirm that the entropy filter (and
+    # not the lack of candidates) is what leaves this node unresolved.
+    _resolve_with_similarity([short_node], _build_candidate_indexes([]), resolution)
+
+    assert resolution.resolved_nodes[0] is None
+    assert resolution.unresolved_indices == [0]