Merged
Commits
34 commits
3e121a4
Fix #2191: sanitize embedding inputs before API call
Aroool Feb 28, 2026
351f4e0
Address CodeRabbit feedback for #2191
Aroool Feb 28, 2026
fbe94d4
Merge branch 'dev' into fix-2191-embedding-sanitize
Aroool Mar 15, 2026
cc9452a
chore: update versions
dexters1 Mar 27, 2026
4b38042
chore: format code
dexters1 Mar 27, 2026
f15d41f
feat: add embedding sanitization
dexters1 Mar 27, 2026
1e42ea0
test: update unit test
dexters1 Mar 27, 2026
182ba1b
Merge branch 'release-candidate-v0.5.6' into aroool-sanitize-embedding
dexters1 Mar 28, 2026
6cee1e7
refactor: sanitize text input for all embedding models
dexters1 Mar 28, 2026
f38c8fd
Merge branch 'aroool-sanitize-embedding' of github.com:topoteretes/co…
dexters1 Mar 28, 2026
67675b8
chore: add todo comment
dexters1 Mar 28, 2026
b2269f6
chore: add comments to Ollama embedding
dexters1 Mar 28, 2026
bf302c1
chore: update todo
dexters1 Mar 28, 2026
32d0903
fix: resolve issue with conditional auth test
dexters1 Mar 28, 2026
0839f4d
refactor: comment out unit tests
dexters1 Mar 28, 2026
7f2976f
Aroool sanitize embedding (#2508)
dexters1 Mar 28, 2026
9c6d617
refactor: return removed test
dexters1 Mar 28, 2026
95cfc36
feat: add automigrate for LanceDB
dexters1 Mar 30, 2026
69d23bc
test: add schema migration test
dexters1 Mar 30, 2026
8b9630d
fix: resolve conditional auth test issue
dexters1 Mar 30, 2026
8bd3590
feat: adds back properties that are missing
hajdul88 Mar 30, 2026
c630bd8
feat: only expected properties assert
hajdul88 Mar 30, 2026
694814d
Revert "feat: only expected properties assert"
hajdul88 Mar 30, 2026
4da584c
Merge branch 'release-candidate-v0.5.6' into automigrate-lancedb-rows
dexters1 Mar 30, 2026
a612c79
refactor: remove unnecessary code
dexters1 Mar 30, 2026
4fa0257
Merge branch 'dev' into release-candidate-v0.5.6
dexters1 Mar 30, 2026
f3d0c00
Merge branch 'release-candidate-v0.5.6' into automigrate-lancedb-rows
dexters1 Mar 30, 2026
611df62
feat: add automigrate for LanceDB (#2520)
Vasilije1990 Mar 30, 2026
e517bf5
chore: update lock files
dexters1 Mar 30, 2026
37252c4
Merge branch 'release-candidate-v0.5.6' of github.com:topoteretes/cog…
dexters1 Mar 30, 2026
f481842
Merge branch 'dev' into release-candidate-v0.5.6
dexters1 Mar 30, 2026
9bb69cf
chore: ruff format
dexters1 Mar 30, 2026
ac87032
refactor: return inherited field
dexters1 Mar 30, 2026
c506d3a
chore: reduce lanceDB version for uv lock
dexters1 Mar 30, 2026
33 changes: 33 additions & 0 deletions .github/workflows/e2e_tests.yml
@@ -1278,3 +1278,36 @@ jobs:
CACHE_BACKEND: 'redis'
CACHE_HOST: ${{ inputs.ci-image != '' && 'redis' || 'localhost' }}
run: uv run pytest cognee/tests/test_usage_logger_e2e.py -v --log-level=INFO

  run_conditional_auth_test:
    name: Conditional Authentication Test
    runs-on: ubuntu-latest
    container: ${{ inputs.ci-image != '' && fromJSON(format('{{"image":"{0}","credentials":{{"username":"{1}","password":"{2}"}}}}', inputs.ci-image, github.actor, github.token)) || null }}
    defaults:
      run:
        shell: bash
    steps:
      - name: Check out
        uses: actions/checkout@v6
        with:
          fetch-depth: 0

      - name: Cognee Setup
        uses: ./.github/actions/cognee_setup
        with:
          python-version: '3.11.x'

      - name: Run Conditional Authentication Test
        env:
          ENV: 'dev'
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
          EMBEDDING_DIMENSIONS: 300
          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
          ENABLE_BACKEND_ACCESS_CONTROL: "false"
        run: uv run python ./cognee/tests/api/test_conditional_authentication_endpoints.py
2 changes: 1 addition & 1 deletion cognee-mcp/pyproject.toml
@@ -8,7 +8,7 @@ requires-python = ">=3.10"
dependencies = [
# For local cognee repo usage remove comment bellow and add absolute path to cognee. Then run `uv sync --reinstall` in the mcp folder on local cognee changes.
#"cognee[postgres,docs,neo4j] @ file:/Users/igorilic/Desktop/cognee",
-    "cognee[postgres,docs,neo4j]==0.5.4",
+    "cognee[postgres,docs,neo4j]==0.5.5",
⚠️ Potential issue | 🔴 Critical

Critical: Pin to cognee==0.5.6 to match the main package version.

This line pins cognee to version 0.5.5, but the main package in pyproject.toml line 4 is being bumped to 0.5.6 in this same PR. This creates a version skew where:

  • The cognee-mcp package will fetch cognee 0.5.5 from PyPI
  • The main repository is releasing version 0.5.6
  • Users installing cognee-mcp after the v0.5.6 release will get an outdated cognee dependency

For a consistent release, this should be updated to cognee[postgres,docs,neo4j]==0.5.6.

📦 Proposed fix to align versions
-    "cognee[postgres,docs,neo4j]==0.5.5",
+    "cognee[postgres,docs,neo4j]==0.5.6",

After making this change, regenerate the lock file:

cd cognee-mcp && uv lock
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@cognee-mcp/pyproject.toml` at line 11, update the pinned dependency string
"cognee[postgres,docs,neo4j]==0.5.5" to "cognee[postgres,docs,neo4j]==0.5.6" in
the pyproject.toml for cognee-mcp to match the main package bump, then
regenerate the lock file (e.g., run `uv lock` in the cognee-mcp directory) so
the lockfile reflects the updated version.

"fastmcp>=2.10.0,<3.0.0",
"mcp>=1.12.0,<2.0.0",
"uv>=0.6.3,<1.0.0",
2,732 changes: 1,421 additions & 1,311 deletions cognee-mcp/uv.lock

Large diffs are not rendered by default.

@@ -26,6 +26,10 @@
TikTokenTokenizer,
)
from cognee.shared.rate_limiting import embedding_rate_limiter_context_manager
from cognee.infrastructure.databases.vector.embeddings.utils import (
sanitize_embedding_text_inputs,
handle_embedding_response,
)

litellm.set_verbose = False
logger = get_logger("FastembedEmbeddingEngine")
@@ -101,18 +105,19 @@ async def embed_text(self, text: List[str]) -> List[List[float]]:
- List[List[float]]: A list of embeddings, where each embedding is a list of floats
representing the vector form of the input text.
"""
sanitized_text_input = sanitize_embedding_text_inputs(text)
try:
if self.mock:
-                return [[0.0] * self.dimensions for _ in text]
+                return [[0.0] * self.dimensions for _ in sanitized_text_input]
else:
async with embedding_rate_limiter_context_manager():
embeddings = self.embedding_model.embed(
-                        text,
+                        sanitized_text_input,
batch_size=len(text),
parallel=None,
)

-                return list(embeddings)
+                embeddings = list(embeddings)
+                return handle_embedding_response(text, embeddings, self.dimensions)

except Exception as error:
logger.error(f"Embedding error in FastembedEmbeddingEngine: {str(error)}")
Expand Up @@ -28,6 +28,10 @@
TikTokenTokenizer,
)
from cognee.shared.rate_limiting import embedding_rate_limiter_context_manager
from cognee.infrastructure.databases.vector.embeddings.utils import (
sanitize_embedding_text_inputs,
handle_embedding_response,
)

litellm.set_verbose = False
logger = get_logger("LiteLLMEmbeddingEngine")
@@ -123,15 +127,20 @@ async def embed_text(self, text: List[str]) -> List[List[float]]:

- List[List[float]]: A list of vectors representing the embedded texts.
"""

sanitized_text_input = sanitize_embedding_text_inputs(text)

try:
if self.mock:
-                response = {"data": [{"embedding": [0.0] * self.dimensions} for _ in text]}
+                response = {
+                    "data": [{"embedding": [0.0] * self.dimensions} for _ in sanitized_text_input]
+                }
return [data["embedding"] for data in response["data"]]
else:
async with embedding_rate_limiter_context_manager():
embedding_kwargs = {
"model": self.model,
-                        "input": text,
+                        "input": sanitized_text_input,
"api_key": self.api_key,
"api_base": self.endpoint,
"api_version": self.api_version,
@@ -146,7 +155,8 @@ async def embed_text(self, text: List[str]) -> List[List[float]]:
timeout=30.0,
)

-                    return [data["embedding"] for data in response.data]
+                    embedding_response = [data["embedding"] for data in response.data]
+                    return handle_embedding_response(text, embedding_response, self.dimensions)

except litellm.exceptions.ContextWindowExceededError as error:
if isinstance(text, list) and len(text) > 1:
@@ -20,6 +20,10 @@
)
from cognee.shared.rate_limiting import embedding_rate_limiter_context_manager
from cognee.shared.utils import create_secure_ssl_context
from cognee.infrastructure.databases.vector.embeddings.utils import (
sanitize_embedding_text_inputs,
handle_embedding_response,
)

logger = get_logger("OllamaEmbeddingEngine")

@@ -90,15 +94,18 @@ async def embed_text(self, text: List[str]) -> List[List[float]]:

- List[List[float]]: A list of embedding vectors corresponding to the text prompts.
"""
sanitized_text_input = sanitize_embedding_text_inputs(text)
if self.mock:
-            return [[0.0] * self.dimensions for _ in text]
+            return [[0.0] * self.dimensions for _ in sanitized_text_input]

# Handle case when a single string is passed instead of a list
-        if not isinstance(text, list):
-            text = [text]
+        if not isinstance(sanitized_text_input, list):
+            text = [sanitized_text_input]

-        embeddings = await asyncio.gather(*[self._get_embedding(prompt) for prompt in text])
-        return embeddings
+        embeddings = await asyncio.gather(
+            *[self._get_embedding(prompt) for prompt in sanitized_text_input]
+        )
+        return handle_embedding_response(text, embeddings, self.dimensions)

def _truncate_text_to_token_limit(self, text: str, max_tokens: int = 2048) -> str:
"""
@@ -110,6 +117,12 @@ def _truncate_text_to_token_limit(self, text: str, max_tokens: int = 2048) -> st
logger.warning(
f"Text exceeds character limit ({len(text)} > {char_limit}), truncating..."
)
# TODO: Refactor to better handle truncation, handle it the same as it is handled in LiteLLMEmbeddingEngine
# when the ContextWindowExceededError happens.
# Also max_tokens is never provided to function call so it will always default to 2048, we should make
# it so that it is provided based on the model's context length.
# The char_limit is not a good estimate based on the average number of characters per token, and
# actual value should be based on actual token count using the tokenizer or when the ContextWindowExceededError happens.
return text[:char_limit]
return text

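The TODO above asks for truncation based on actual token count rather than a characters-per-token estimate. A minimal sketch of that approach, using a hypothetical stand-in tokenizer (the real engine would pass its own tokenizer, e.g. a tiktoken-backed one; this is not the PR's implementation):

```python
class WhitespaceTokenizer:
    # Hypothetical stand-in for a real tokenizer (e.g. tiktoken);
    # treats each whitespace-separated word as one token.
    def encode(self, text: str) -> list[str]:
        return text.split(" ")

    def decode(self, tokens: list[str]) -> str:
        return " ".join(tokens)


def truncate_to_token_limit(text: str, tokenizer, max_tokens: int = 2048) -> str:
    # Truncate on the actual token count instead of a character
    # heuristic, as the TODO proposes.
    tokens = tokenizer.encode(text)
    if len(tokens) <= max_tokens:
        return text
    return tokenizer.decode(tokens[:max_tokens])


tok = WhitespaceTokenizer()
print(truncate_to_token_limit("a b c d e", tok, max_tokens=3))  # a b c
```

With a model-aware tokenizer, `max_tokens` would come from the model's context length rather than a fixed default.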
51 changes: 51 additions & 0 deletions cognee/infrastructure/databases/vector/embeddings/utils.py
@@ -0,0 +1,51 @@
from typing import List, Union
from cognee.shared.logging_utils import setup_logging

logger = setup_logging()


def is_embeddable(s: str) -> bool:
    """
    Check if input string is embeddable, if not it will be replaced with a dummy value to prevent API errors.
    Empty strings and a string with only a space character are not embeddable.
    If input string contains at least one alphanumeric character, it is considered embeddable.
    """
    if not isinstance(s, str):
        return False
    # Strip whitespace to check if the string is empty or only contains spaces
    s = s.strip()
    if len(s) >= 1:
        return True
Comment on lines +8 to +18
⚠️ Potential issue | 🟡 Minor

Align the docstring with the actual embeddable rule.

The implementation accepts any non-whitespace string, but Lines 10-11 say the input must contain an alphanumeric character. That already changes how "(" is classified in the new test, so callers currently have two different contracts.

✏️ Suggested docstring fix
-    Empty strings and a string with only a space character are not embeddable.
-    If input string contains at least one alphanumeric character, it is considered embeddable.
+    Empty or all-whitespace strings are not embeddable.
+    Any non-whitespace string is considered embeddable.
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@cognee/infrastructure/databases/vector/embeddings/utils.py` around lines 8 -
18, The docstring incorrectly states an alphanumeric requirement but the
implementation simply treats any non-whitespace string as embeddable; update the
docstring to reflect the actual rule (i.e., “any non-empty string after
s.strip() is embeddable”) or change the implementation to enforce the
alphanumeric rule — in this case, modify the docstring above the function that
uses parameter s (and mentions s.strip()) so it accurately describes that the
function returns True for any string with length >= 1 after stripping
whitespace.

    logger.debug(
        "Input string was not embeddable. Skipping embedding and using dummy value instead."
    )
    return False


def sanitize_embedding_text_inputs(text: Union[str, List[str]]) -> List[str]:
    """
    Transform invalid/empty inputs into a safe dummy to prevent API 422 embedding errors while
    keeping list length consistent.
    """
    # Ensure we are working with a list
    text_list = [text] if isinstance(text, str) else text
    dummy_value = "."

    return [t if is_embeddable(t) else dummy_value for t in text_list]


def handle_embedding_response(
    original_texts: Union[List[str], str], embeddings: List[List[float]], dimensions: int
) -> List[List[float]]:
    """
    Compare the original input strings against the results.
    If the original string was 'junk' that was not embeddable, overwrite its vector with zeros.
    """
    if isinstance(original_texts, str):
        original_texts = [original_texts]

    zero_vector = [0.0] * dimensions
    return [
        embeddings[i] if is_embeddable(original_texts[i]) else zero_vector
        for i in range(len(original_texts))
    ]
Comment on lines +37 to +51
⚠️ Potential issue | 🟠 Major

Don't assume dimensions is always set.

dimensions is optional in the engine constructors, but this helper now unconditionally does [0.0] * dimensions. That turns a previously valid “use provider default size” configuration into a TypeError on every response. Please resolve the zero-vector length from embeddings when dimensions is None, and fail fast if the provider returns a different number of vectors than inputs.

🛠️ Proposed fix
-from typing import List, Union
+from typing import List, Optional, Union
@@
 def handle_embedding_response(
-    original_texts: Union[List[str], str], embeddings: List[List[float]], dimensions: int
+    original_texts: Union[List[str], str],
+    embeddings: List[List[float]],
+    dimensions: Optional[int],
 ) -> List[List[float]]:
@@
     if isinstance(original_texts, str):
         original_texts = [original_texts]
+    if len(embeddings) != len(original_texts):
+        raise ValueError(
+            f"Expected {len(original_texts)} embeddings, received {len(embeddings)}."
+        )
 
-    zero_vector = [0.0] * dimensions
+    vector_size = dimensions if dimensions is not None else len(embeddings[0]) if embeddings else 0
     return [
-        embeddings[i] if is_embeddable(original_texts[i]) else zero_vector
+        embeddings[i] if is_embeddable(original_texts[i]) else [0.0] * vector_size
         for i in range(len(original_texts))
     ]

As per coding guidelines, "Prefer explicit, structured error handling in Python code."

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@cognee/infrastructure/databases/vector/embeddings/utils.py` around lines 37 -
51, handle_embedding_response currently assumes dimensions is provided and
blindly does [0.0] * dimensions; change it to derive the zero-vector length from
the returned embeddings when dimensions is None and validate input/output
counts: if dimensions is None and embeddings is non-empty, set zero_len =
len(embeddings[0]); if embeddings is empty and dimensions is None raise a
ValueError; also fail fast by raising a ValueError if len(embeddings) !=
len(original_texts); then build zero_vector = [0.0] * zero_len and continue
using is_embeddable(original_texts[i]) to choose embeddings[i] or zero_vector.
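Taken together, the sanitize-then-zero-out contract being reviewed can be sketched in a few lines. This is a standalone re-implementation for illustration (not the PR's code), assuming the rule the implementation actually enforces: any non-whitespace string is embeddable.

```python
from typing import List, Union


def is_embeddable(s) -> bool:
    # Any non-whitespace string counts as embeddable.
    return isinstance(s, str) and len(s.strip()) >= 1


def sanitize_embedding_text_inputs(text: Union[str, List[str]]) -> List[str]:
    # Replace non-embeddable entries with a dummy "." so the provider
    # never receives an empty input (avoiding 422 errors) while the
    # list length stays stable.
    text_list = [text] if isinstance(text, str) else text
    return [t if is_embeddable(t) else "." for t in text_list]


def handle_embedding_response(original_texts, embeddings, dimensions):
    # Zero out vectors whose original input was junk, so dummy
    # embeddings never pollute similarity search.
    if isinstance(original_texts, str):
        original_texts = [original_texts]
    zero_vector = [0.0] * dimensions
    return [
        emb if is_embeddable(txt) else zero_vector
        for txt, emb in zip(original_texts, embeddings)
    ]


texts = ["hello", "", "   "]
print(sanitize_embedding_text_inputs(texts))  # ['hello', '.', '.']
fake_vectors = [[1.0, 2.0], [9.0, 9.0], [9.0, 9.0]]
print(handle_embedding_response(texts, fake_vectors, 2))
# [[1.0, 2.0], [0.0, 0.0], [0.0, 0.0]]
```

The reviewer's suggested hardening (deriving the zero-vector length from the response when `dimensions` is unset, and failing fast on a count mismatch) would slot into `handle_embedding_response` before the final comprehension.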

100 changes: 96 additions & 4 deletions cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py
@@ -10,8 +10,10 @@
from cognee.infrastructure.engine import DataPoint
from cognee.infrastructure.engine.utils import parse_id
from cognee.infrastructure.files.storage import get_file_storage
-from cognee.modules.storage.utils import copy_model, get_own_properties
+from cognee.modules.storage.utils import copy_model
from cognee.infrastructure.databases.vector.exceptions import CollectionNotFoundError
from cognee.infrastructure.databases.vector.pgvector.serialize_data import serialize_data
from cognee.shared.logging_utils import get_logger

from ..embeddings.EmbeddingEngine import EmbeddingEngine
from ..models.ScoredResult import ScoredResult
@@ -24,6 +26,8 @@
COGNEE_VECTOR_RESULT_COUNT,
)

logger = get_logger("LanceDBAdapter")


class IndexSchema(DataPoint):
"""
@@ -187,8 +191,10 @@ class LanceDataPoint(LanceModel, Generic[IdType, PayloadSchema]):
payload: PayloadSchema

def create_lance_data_point(data_point: DataPoint, vector: list[float]) -> LanceDataPoint:
-            properties = get_own_properties(data_point)
-            properties["id"] = str(properties["id"])
+            payload_model = self.get_data_point_schema(type(data_point))
+            properties = payload_model.model_validate(
+                serialize_data(data_point.model_dump())
+            ).model_dump()

return LanceDataPoint[str, self.get_data_point_schema(type(data_point))](
id=str(data_point.id),
@@ -203,14 +209,98 @@ def create_lance_data_point(data_point: DataPoint, vector: list[float]) -> Lance

lance_data_points = list({dp.id: dp for dp in lance_data_points}.values())

try:
async with self.VECTOR_DB_LOCK:
await (
collection.merge_insert("id")
.when_matched_update_all()
.when_not_matched_insert_all()
.execute(lance_data_points)
)
except (ValueError, OSError, RuntimeError) as e:
if "not found in target schema" not in str(e):
raise
logger.warning(
"Schema mismatch detected for collection '%s', migrating table: %s",
collection_name,
e,
)
await self._migrate_collection_schema(
collection_name, collection, payload_schema, lance_data_points
)

async def _migrate_collection_schema(
self,
collection_name: str,
old_collection,
payload_schema: type,
new_lance_data_points: list,
):
"""Migrate a LanceDB table to a new schema, preserving existing data."""
rows = (await old_collection.to_arrow()).to_pylist()

vector_size = self.embedding_engine.get_vector_size()
schema_model = self.get_data_point_schema(payload_schema)
data_point_types = get_type_hints(schema_model)
valid_payload_fields = set(schema_model.model_fields.keys())
defaults = self._get_payload_defaults(payload_schema)

new_ids = {dp.id for dp in new_lance_data_points}
old_rows = []
for row in rows:
if row.get("id") in new_ids:
continue
if isinstance(row.get("payload"), dict):
# Strip payload to only fields in the new schema
row["payload"] = {
k: v for k, v in row["payload"].items() if k in valid_payload_fields
}
# Fill in defaults for any new fields
for key, val in defaults.items():
row["payload"].setdefault(key, val)
old_rows.append(row)

class MigrationLanceDataPoint(LanceModel):
id: data_point_types["id"]
vector: Vector(vector_size)
payload: schema_model

async with self.VECTOR_DB_LOCK:
connection = await self.get_connection()
await connection.drop_table(collection_name)
await connection.create_table(
name=collection_name,
schema=MigrationLanceDataPoint,
)
collection = await connection.open_table(collection_name)

if old_rows:
await collection.add(old_rows)

await (
collection.merge_insert("id")
.when_matched_update_all()
.when_not_matched_insert_all()
-                .execute(lance_data_points)
+                .execute(new_lance_data_points)
)

logger.info(
"Migrated collection '%s' schema (%d existing rows preserved)",
collection_name,
len(old_rows),
)

def _get_payload_defaults(self, payload_schema: type) -> dict:
"""Extract default values from the Pydantic payload model."""
schema_model = self.get_data_point_schema(payload_schema)
defaults = {}
for name, field_info in schema_model.model_fields.items():
if field_info.default is not None and not (
hasattr(field_info, "is_required") and field_info.is_required()
):
defaults[name] = field_info.default
return defaults

async def retrieve(self, collection_name: str, data_point_ids: list[str]):
try:
collection = await self.get_collection(collection_name)
@@ -396,6 +486,7 @@ async def prune(self):

def get_data_point_schema(self, model_type: BaseModel):
related_models_fields = []

for field_name, field_config in model_type.model_fields.items():
if hasattr(field_config, "model_fields"):
related_models_fields.append(field_name)
@@ -426,6 +517,7 @@ def get_data_point_schema(self, model_type: BaseModel):
model_type,
include_fields={
"id": (str, ...),
"belongs_to_set": (Optional[List[str]], None),
},
exclude_fields=["metadata"] + related_models_fields,
)
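The row-migration loop in `_migrate_collection_schema` (drop rows superseded by incoming data points, strip each payload to the new schema's fields, backfill defaults for newly added fields) can be sketched with plain dicts. This is an illustrative simplification, not the adapter code:

```python
def migrate_rows(old_rows, valid_fields, defaults, new_ids):
    # Mirror the migration: skip rows that the incoming data points
    # will overwrite, filter payloads down to fields that exist in
    # the new schema, and backfill defaults for any new fields.
    migrated = []
    for row in old_rows:
        if row["id"] in new_ids:
            continue
        payload = {k: v for k, v in row["payload"].items() if k in valid_fields}
        for key, val in defaults.items():
            payload.setdefault(key, val)
        migrated.append({**row, "payload": payload})
    return migrated


rows = [
    {"id": "1", "payload": {"a": 1, "obsolete": True}},
    {"id": "2", "payload": {"a": 2}},
]
out = migrate_rows(rows, valid_fields={"a", "b"}, defaults={"b": 0}, new_ids={"2"})
print(out)  # [{'id': '1', 'payload': {'a': 1, 'b': 0}}]
```

In the adapter the surviving rows are then re-inserted into a freshly created table whose schema is the new `LanceModel`, followed by the `merge_insert` of the new data points.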