Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 13 additions & 11 deletions chatbot-core/api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,20 @@
Main entry point for the FastAPI application.
"""

# Standard library
import asyncio
from contextlib import asynccontextmanager

# Third‑party
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

# First‑party
from api.routes import chatbot
from api.config.loader import CONFIG
from api.services.memory import cleanup_expired_sessions, reload_persisted_sessions
from utils import LoggerFactory
from pydantic import BaseModel

logger = LoggerFactory.get_logger(__name__)

class HealthResponse(BaseModel):
    """Schema returned by the container-orchestration health check endpoint."""
    # Overall service state string, e.g. "healthy".
    status: str
    # True when the LLM provider initialized successfully.
    llm_available: bool
    # True when the embedding model initialized successfully.
    embedding_available: bool


app = FastAPI(lifespan=lifespan)
Expand All @@ -86,19 +92,15 @@ async def health_check():
Health check endpoint for container orchestration (Kubernetes, Docker, etc.).
Returns:
HealthResponse: Contains the service status and LLM availability.
HealthResponse: Contains the service status and model availability.
"""
llm_available = False
try:
# pylint: disable=import-outside-toplevel
from api.models.llama_cpp_provider import llm_provider
llm_available = llm_provider is not None
except Exception: # pylint: disable=broad-except
pass

# pylint: disable=import-outside-toplevel
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you suppress this pylint warning here?

from api.models.runtime_models import get_models_status
models_status = get_models_status()
return HealthResponse(
status="healthy",
llm_available=llm_available
llm_available=models_status["llm_available"],
embedding_available=models_status["embedding_available"]
)


Expand Down
30 changes: 24 additions & 6 deletions chatbot-core/api/models/embedding_model.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,27 @@
"""Loads and exports once the sentence transformer model."""
"""
Embedding model lazy loader wrapper.

from rag.embedding.embedding_utils import load_embedding_model
from api.config.loader import CONFIG
from utils import LoggerFactory
This module provides access to the sentence transformer embedding model
through lazy initialization to avoid blocking startup with heavy model loads.

logger = LoggerFactory.instance().get_logger("api")
DEPRECATED: Direct access to EMBEDDING_MODEL global is no longer supported.
Use api.models.runtime_models.get_embedding_model() instead.
"""

EMBEDDING_MODEL = load_embedding_model(CONFIG["retrieval"]["embedding_model_name"], logger)
from api.models.runtime_models import get_embedding_model

# For backward compatibility during transition, provide a lazy property-like
# accessor, while steering callers toward the supported runtime_models API.
def _get_embedding_model_compat():
    """
    Backward-compatibility wrapper around get_embedding_model().

    Returns:
        The lazily-initialized embedding model instance.

    Raises:
        RuntimeError: If the model could not be initialized (legacy callers
            expected a hard failure rather than a None sentinel).
    """
    resolved = get_embedding_model()
    if resolved is not None:
        return resolved
    raise RuntimeError(
        "Embedding model not available. "
        "Check logs for initialization errors. "
        "Consider using get_embedding_model() for graceful degraded mode."
    )

# Module-level getter for compatibility, but do NOT eagerly load.
# NOTE: a bare `property` object assigned at module scope is never invoked by
# attribute access — descriptors only fire on classes — so importers of
# EMBEDDING_MODEL would receive the property object itself, not the model.
# PEP 562 module-level __getattr__ performs the lazy lookup correctly, and
# `from api.models.embedding_model import EMBEDDING_MODEL` triggers it too.
def __getattr__(name):
    """Lazily resolve EMBEDDING_MODEL on first attribute access (PEP 562)."""
    if name == "EMBEDDING_MODEL":
        return _get_embedding_model_compat()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
5 changes: 4 additions & 1 deletion chatbot-core/api/models/llama_cpp_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,4 +147,7 @@ def _stream_generator():
yield "Sorry, an unexpected error occurred."


llm_provider = None if CONFIG["is_test_mode"] else LlamaCppProvider()
# Lazy initialization is now handled by api.models.runtime_models.
# This module must NOT instantiate llm_provider at import time; the eager
# module-level instantiation was removed — use
# api.models.runtime_models.get_llm_provider() instead.
135 changes: 135 additions & 0 deletions chatbot-core/api/models/runtime_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
"""
Runtime lazy model loader with caching and error handling.

This module provides thread-safe, lazy initialization for heavy components
(embedding model and LLM provider) that should not be loaded at import time.
Models are initialized on first use and cached for subsequent calls.
"""

from threading import Lock
from typing import Optional, Dict, Any
from utils import LoggerFactory

logger = LoggerFactory.instance().get_logger("models")

# Caching and locking for lazy initialization
_models_cache: Dict[str, Any] = {}
_models_lock = Lock()
_models_errors: Dict[str, str] = {}


def get_embedding_model():
    """
    Return the sentence transformer embedding model, loading it on first call.

    The heavy model load is deferred until the first request that needs it;
    subsequent calls return the cached instance.

    Returns:
        Optional[SentenceTransformer]: The cached model instance, or None when
        initialization previously failed (check logs for error details).
    """
    return _get_cached_model("embedding", _load_embedding_model_impl)


def get_llm_provider():
    """
    Return the LLM provider, instantiating it lazily on first call.

    Returns:
        Optional[LlamaCppProvider]: The cached provider instance, or None when
        initialization previously failed (check logs for error details).
    """
    return _get_cached_model("llm", _load_llm_provider_impl)


def get_models_status() -> Dict[str, Any]:
    """
    Get the current status of all cached models (for health checks).

    Availability is determined from the cached VALUE, not mere key presence:
    the LLM loader legitimately caches None in test mode, and key-presence
    alone would misreport such a model as available.

    This function never triggers a model load; it only inspects the caches.

    Returns:
        dict: Status object with keys:
            - embedding_available: bool
            - llm_available: bool
            - embedding_error: Optional[str]
            - llm_error: Optional[str]
    """
    return {
        "embedding_available": _models_cache.get("embedding") is not None,
        "llm_available": _models_cache.get("llm") is not None,
        "embedding_error": _models_errors.get("embedding"),
        "llm_error": _models_errors.get("llm"),
    }


def _get_cached_model(model_name: str, loader_fn) -> Optional[Any]:
    """
    Generic lazy loader with caching and thread-safe locking.

    Uses double-checked locking: the first (unlocked) cache probe keeps the
    hot path lock-free, and the re-check under the lock guarantees the
    loader runs at most once across threads.

    A failed load is recorded in _models_errors and is NOT retried on later
    calls; a process restart is required to attempt loading again.

    Args:
        model_name: Key for caching (e.g., "embedding", "llm")
        loader_fn: Zero-argument callable that returns the loaded model
            or raises an exception on failure

    Returns:
        The cached model instance, or None if initialization failed.
    """
    if model_name in _models_cache:
        logger.debug("%s model already cached, returning existing instance", model_name)
        return _models_cache[model_name]

    # A previous attempt failed; do not hammer the loader on every call.
    if model_name in _models_errors:
        return None

    with _models_lock:
        # Double-check after acquiring lock: another thread may have
        # completed (or failed) the load while we were waiting.
        if model_name in _models_cache:
            return _models_cache[model_name]
        if model_name in _models_errors:
            return None

        try:
            logger.info("Initializing %s model for the first time...", model_name)
            model = loader_fn()
            _models_cache[model_name] = model
            logger.info("%s model initialized successfully", model_name)
            return model
        except Exception as exc:  # pylint: disable=broad-except
            # Model loading fails in many ways beyond the previous narrow
            # (ImportError, RuntimeError, ValueError) tuple — e.g. OSError /
            # FileNotFoundError for missing weight files. Any escape here
            # would defeat the documented return-None-on-failure contract,
            # so this boundary deliberately catches Exception and logs it.
            error_msg = f"Failed to initialize {model_name}: {type(exc).__name__}: {exc}"
            _models_errors[model_name] = error_msg
            logger.error(error_msg, exc_info=True)
            return None


def _load_embedding_model_impl():
    """
    Internal: construct the SentenceTransformer from the configured name.

    Runs only once, on first use and under the module lock. Imports stay
    function-local so the heavy dependencies are not pulled in at module
    import time.
    """
    # pylint: disable=import-outside-toplevel
    from sentence_transformers import SentenceTransformer
    from api.config.loader import CONFIG

    configured_name = CONFIG["retrieval"]["embedding_model_name"]
    logger.debug("Loading embedding model: %s", configured_name)
    return SentenceTransformer(configured_name)


def _load_llm_provider_impl():
    """
    Internal: instantiate the LLM provider, or skip it entirely in test mode.

    Runs only once, on first use and under the module lock. Imports stay
    function-local to keep module import cheap.
    """
    # pylint: disable=import-outside-toplevel
    from api.models.llama_cpp_provider import LlamaCppProvider
    from api.config.loader import CONFIG

    if not CONFIG.get("is_test_mode", False):
        logger.debug("Initializing LLM provider...")
        return LlamaCppProvider()

    logger.info("Test mode enabled: LLM provider will not be instantiated")
    return None
25 changes: 18 additions & 7 deletions chatbot-core/api/services/chat_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
from typing import AsyncGenerator, List, Optional

from api.config.loader import CONFIG
from api.models.embedding_model import EMBEDDING_MODEL
from api.models.llama_cpp_provider import llm_provider
from api.models.runtime_models import get_embedding_model, get_llm_provider
from api.models.schemas import ChatResponse, QueryType, try_str_to_query_type, FileAttachment
from api.prompts.prompt_builder import build_prompt
from api.prompts.prompts import (
Expand All @@ -30,6 +29,8 @@
from rag.retriever.retrieve import get_relevant_documents
from utils import LoggerFactory

llm_provider = get_llm_provider()

logger = LoggerFactory.instance().get_logger("api")
llm_config = CONFIG["llm"]
retrieval_config = CONFIG["retrieval"]
Expand Down Expand Up @@ -386,9 +387,15 @@ def retrieve_context(user_input: str) -> str:
"Dev mode enabled - skipping RAG retrieval. Build indices to enable full RAG.")
return "Dev mode: RAG indices not built. This is a placeholder context for testing."

# Lazily load embedding model on first use
embedding_model = get_embedding_model()
if embedding_model is None:
logger.warning("Embedding model unavailable - RAG retrieval disabled")
return retrieval_config["empty_context_message"]

data_retrieved, _ = get_relevant_documents(
user_input,
EMBEDDING_MODEL,
embedding_model,
logger=logger,
source_name="plugins",
top_k=retrieval_config["top_k"]
Expand Down Expand Up @@ -430,12 +437,14 @@ def generate_answer(prompt: str, max_tokens: Optional[int] = None) -> str:
Returns:
str: The model's generated text response.
"""
if llm_provider is None:
# Use the global llm_provider instance
provider = llm_provider
if provider is None:
logger.warning(
"LLM provider not available - returning fallback response")
return "LLM is not available. Please install llama-cpp-python and configure a model."
try:
return llm_provider.generate(
return provider.generate(
prompt=prompt,
max_tokens=max_tokens or llm_config["max_tokens"])
except (ImportError, AttributeError) as e:
Expand Down Expand Up @@ -464,13 +473,15 @@ async def generate_answer_stream(
Yields:
str: Individual tokens
"""
if llm_provider is None:
# Use the global llm_provider instance
provider = llm_provider
if provider is None:
logger.warning(
"LLM provider not available - returning fallback response")
yield "LLM is not available. Please install llama-cpp-python and configure a model."
return
try:
async for token in llm_provider.generate_stream(
async for token in provider.generate_stream(
prompt=prompt,
max_tokens=max_tokens or llm_config["max_tokens"]
):
Expand Down
Loading