Skip to content

Commit 4098fe6

Browse files
committed
Advance OSS contribution for [Bug]: IntelEmbedding RuntimeError
Nightly Codex produced a focused contribution for #19328. Constraint: automated nightly run; keep changes small and reviewable. Confidence: medium. Scope-risk: narrow. Tested: see uploaded nightly artifacts and workflow logs. Not tested: maintainer CI beyond this workflow.
1 parent d601b0f commit 4098fe6

2 files changed

Lines changed: 82 additions & 3 deletions

File tree

llama-index-integrations/embeddings/llama-index-embeddings-huggingface-optimum-intel/llama_index/embeddings/huggingface_optimum_intel/base.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Any, List, Optional
1+
from typing import Any, Dict, List, Optional
22

33
from llama_index.core.base.embeddings.base import (
44
DEFAULT_EMBED_BATCH_SIZE,
@@ -41,6 +41,8 @@ def __init__(
4141
cache_folder: Optional[str] = None,
4242
model: Optional[Any] = None,
4343
tokenizer: Optional[Any] = None,
44+
model_kwargs: Optional[Dict[str, Any]] = None,
45+
tokenizer_kwargs: Optional[Dict[str, Any]] = None,
4446
embed_batch_size: int = DEFAULT_EMBED_BATCH_SIZE,
4547
callback_manager: Optional[CallbackManager] = None,
4648
device: Optional[str] = None,
@@ -54,9 +56,19 @@ def __init__(
5456
"optimum-intel neural-compressor intel_extension_for_pytorch`"
5557
)
5658

59+
model_kwargs = model_kwargs or {}
60+
tokenizer_kwargs = tokenizer_kwargs or {}
61+
62+
if cache_folder:
63+
model_kwargs.setdefault("cache_dir", cache_folder)
64+
tokenizer_kwargs.setdefault("cache_dir", cache_folder)
65+
5766
device = device or infer_torch_device()
58-
model = model or IPEXModel.from_pretrained(folder_name).to(device)
59-
tokenizer = tokenizer or AutoTokenizer.from_pretrained(folder_name)
67+
model_kwargs.setdefault("weights_only", False)
68+
model = model or IPEXModel.from_pretrained(folder_name, **model_kwargs).to(device)
69+
tokenizer = tokenizer or AutoTokenizer.from_pretrained(
70+
folder_name, **tokenizer_kwargs
71+
)
6072

6173
if max_length is None:
6274
try:
@@ -83,6 +95,7 @@ def __init__(
8395
normalize=normalize,
8496
query_instruction=query_instruction,
8597
text_instruction=text_instruction,
98+
cache_folder=cache_folder,
8699
)
87100
self._model = model
88101
self._tokenizer = tokenizer
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,73 @@
1+
import sys
2+
import types
3+
14
from llama_index.core.base.embeddings.base import BaseEmbedding
25
from llama_index.embeddings.huggingface_optimum_intel import IntelEmbedding
6+
import llama_index.embeddings.huggingface_optimum_intel.base as optimum_intel_base
37

48

59
def test_optimum_intel_embedding_class():
610
names_of_base_classes = [b.__name__ for b in IntelEmbedding.__mro__]
711
assert BaseEmbedding.__name__ in names_of_base_classes
12+
13+
14+
def test_optimum_intel_load_kwargs(monkeypatch):
15+
model_calls = []
16+
tokenizer_calls = []
17+
18+
class MockConfig:
19+
max_position_embeddings = 512
20+
21+
class MockModel:
22+
config = MockConfig()
23+
24+
def to(self, device):
25+
self.device = device
26+
return self
27+
28+
class MockIPEXModel:
29+
@classmethod
30+
def from_pretrained(cls, folder_name, **kwargs):
31+
model_calls.append((folder_name, kwargs))
32+
return MockModel()
33+
34+
class MockTokenizer:
35+
model_max_length = 256
36+
37+
@classmethod
38+
def from_pretrained(cls, folder_name, **kwargs):
39+
tokenizer_calls.append((folder_name, kwargs))
40+
return cls()
41+
42+
optimum = types.ModuleType("optimum")
43+
optimum_intel = types.ModuleType("optimum.intel")
44+
optimum_intel.IPEXModel = MockIPEXModel
45+
monkeypatch.setitem(sys.modules, "optimum", optimum)
46+
monkeypatch.setitem(sys.modules, "optimum.intel", optimum_intel)
47+
monkeypatch.setattr(optimum_intel_base, "AutoTokenizer", MockTokenizer)
48+
49+
embed_model = IntelEmbedding(
50+
"Intel/bge-small-en-v1.5-rag-int8-static",
51+
cache_folder="/tmp/hf-cache",
52+
model_kwargs={"revision": "main"},
53+
tokenizer_kwargs={"use_fast": True},
54+
device="cpu",
55+
)
56+
57+
assert embed_model.cache_folder == "/tmp/hf-cache"
58+
assert model_calls == [
59+
(
60+
"Intel/bge-small-en-v1.5-rag-int8-static",
61+
{
62+
"revision": "main",
63+
"cache_dir": "/tmp/hf-cache",
64+
"weights_only": False,
65+
},
66+
)
67+
]
68+
assert tokenizer_calls == [
69+
(
70+
"Intel/bge-small-en-v1.5-rag-int8-static",
71+
{"use_fast": True, "cache_dir": "/tmp/hf-cache"},
72+
)
73+
]

0 commit comments

Comments
 (0)