
Commit 16a1a03

Implement Chunking (#6)
* Chunking scripts.
* Doc comments; refactoring.
* Update required dependencies.
* Update technical doc.
* Fix format.
* Refactoring.
* Refactoring chunking.
* Fix return edge case.
* Add custom separator.
* Hardcoded placeholder fix.
* Fix hardcoded regex pattern.
* Separator fix.
* Validate that no placeholder is removed.
* Check out-of-range indexes; refactoring.
* Update technical docs.
* Global logger.
* Fix packages.
1 parent e1326b7 commit 16a1a03

21 files changed

Lines changed: 979 additions & 31 deletions

chatbot-core/data/__init__.py

Whitespace-only changes.

chatbot-core/data/chunking/__init__.py

Whitespace-only changes.
Lines changed: 14 additions & 0 deletions
"""Utility functions for extracting titles, code blocks, and logging."""

from .extract_functions import (
    extract_title,
    extract_code_blocks,
    assign_code_blocks_to_chunks
)

from .common import (
    save_chunks,
    read_json_file,
    build_chunk_dict,
    get_text_splitter
)
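With these re-exports in place, downstream code can import the helpers from the package root rather than from the individual modules. A one-line sketch, assuming this file is the chunking_utils package __init__ (the path matches the import used by the Discourse script later in this commit):

from data.chunking.chunking_utils import get_text_splitter, save_chunks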
Lines changed: 53 additions & 0 deletions
"""Shared utilities for reading/writing JSON and standardizing chunk format."""

import json
import uuid
from langchain.text_splitter import RecursiveCharacterTextSplitter

def save_chunks(output_path, all_chunks, logger):
    """Save the chunk list to a JSON file and log the outcome."""
    try:
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(all_chunks, f, ensure_ascii=False, indent=2)
        logger.info("Wrote %d chunks to %s.", len(all_chunks), output_path)
    except OSError as e:
        logger.error("File error while writing %s: %s", output_path, e)

def read_json_file(input_path, logger):
    """Load a JSON file and return its data, with proper error handling."""
    try:
        with open(input_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, OSError) as e:
        logger.error("File error while reading %s: %s", input_path, e)
    except json.JSONDecodeError as e:
        logger.error("JSON decode error in %s: %s", input_path, e)
    return []

def build_chunk_dict(chunk_text, metadata, code_blocks):
    """Create a standardized chunk dictionary."""
    return {
        "id": str(uuid.uuid4()),
        "chunk_text": chunk_text,
        "metadata": metadata,
        "code_blocks": code_blocks
    }

def get_text_splitter(chunk_size, chunk_overlap, separators=None):
    """
    Create and return a RecursiveCharacterTextSplitter with the given parameters.

    Args:
        chunk_size (int): Maximum size of each text chunk.
        chunk_overlap (int): Number of overlapping characters between chunks.
        separators (list[str], optional): Custom list of separators for splitting.
            If None or empty, a default strategy is used.

    Returns:
        RecursiveCharacterTextSplitter: Configured text splitter instance.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=separators or ["\n\n", "\n", " ", ""]
    )
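A minimal usage sketch for these helpers, assuming langchain is installed and substituting a standard logging.Logger for the project's LoggerFactory; the output path and sample text are invented for illustration:

import logging

from data.chunking.chunking_utils import (
    build_chunk_dict,
    get_text_splitter,
    save_chunks
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("chunking-demo")

# Split some sample text and wrap each piece in the standard chunk format.
splitter = get_text_splitter(chunk_size=500, chunk_overlap=100)
texts = splitter.split_text("First paragraph.\n\nSecond paragraph.\n\n" * 40)
records = [build_chunk_dict(t, {"data_source": "demo"}, []) for t in texts]

save_chunks("demo_chunks.json", records, logger)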
Lines changed: 88 additions & 0 deletions
"""Functions for extracting titles and code blocks from HTML content."""

import re
from bs4 import NavigableString

def extract_title(soup):
    """
    Extract the title from a BeautifulSoup-parsed HTML document.

    Priority:
        1. <h1> element if present
        2. <title> tag as fallback
        3. "Untitled" if neither is found

    Args:
        soup (BeautifulSoup): Parsed HTML document.

    Returns:
        str: The extracted title string.
    """
    h1 = soup.find("h1")
    if h1 and h1.get_text(strip=True):
        return h1.get_text(strip=True)
    if soup.title:
        return soup.title.get_text(strip=True)
    return "Untitled"

def extract_code_blocks(soup, tag, placeholder_template):
    """
    Extract all code blocks of a specified HTML tag (e.g., <pre>, <code>),
    replace them with numbered placeholders, and return the raw code strings.

    Args:
        soup (BeautifulSoup): Parsed HTML content.
        tag (str): HTML tag to search for (e.g., "pre", "code").
        placeholder_template (str): Format string for the numbered placeholder,
            e.g. "[[CODE_BLOCK_{}]]".

    Returns:
        list[str]: Code block strings, in the order they were found.
    """
    code_blocks = []
    for i, code_block in enumerate(soup.find_all(tag)):
        placeholder = placeholder_template.format(i)
        code_blocks.append(code_block.get_text(strip=True))
        code_block.replace_with(NavigableString(placeholder))
    return code_blocks

def assign_code_blocks_to_chunks(chunks, code_blocks, placeholder_pattern, logger):
    """
    Assign the relevant code blocks to each chunk based on placeholder references.

    Args:
        chunks (list[str]): Text chunks.
        code_blocks (list[str]): All extracted code blocks.
        placeholder_pattern (str): Regex pattern that captures placeholder indices.
        logger (logging.Logger): Logger for reporting malformed or
            out-of-range indices.

    Returns:
        list[dict]: Dicts with 'chunk_text' and the corresponding 'code_blocks'.
    """
    processed_chunks = []

    for chunk in chunks:
        matches = re.findall(placeholder_pattern, chunk)
        indices = set()

        for match in matches:
            try:
                idx = int(match)
                if idx < len(code_blocks):
                    indices.add(idx)
                else:
                    logger.warning(
                        "Placeholder index %d out of range (max index %d). Skipping.",
                        idx, len(code_blocks) - 1
                    )
            except ValueError:
                logger.warning(
                    "Malformed placeholder index: '%s'. Skipping.",
                    match
                )

        chunk_code_blocks = [code_blocks[i] for i in sorted(indices)]

        processed_chunks.append({
            "chunk_text": chunk,
            "code_blocks": chunk_code_blocks
        })

    return processed_chunks
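A short sketch of how these three functions fit together, calling bs4 directly. The HTML, logger name, and regex here are illustrative, though the [[CODE_BLOCK_n]] convention mirrors the pattern used by the Discourse script in this commit:

import logging

from bs4 import BeautifulSoup

from data.chunking.chunking_utils import (
    assign_code_blocks_to_chunks,
    extract_code_blocks,
    extract_title
)

html = (
    "<html><head><title>Docs</title></head><body>"
    "<h1>Setup</h1><p>Install the package:</p>"
    "<pre>pip install example-bot</pre>"
    "</body></html>"
)
soup = BeautifulSoup(html, "html.parser")
logger = logging.getLogger("demo")

title = extract_title(soup)  # "Setup": the <h1> wins over <title>
blocks = extract_code_blocks(soup, "pre", "[[CODE_BLOCK_{}]]")
# The <pre> element is now replaced by "[[CODE_BLOCK_0]]" in the soup.
chunks = [soup.get_text()]  # a single chunk is enough for the demo
result = assign_code_blocks_to_chunks(
    chunks, blocks, r"\[\[CODE_BLOCK_(\d+)\]\]", logger
)
# result[0]["code_blocks"] == ["pip install example-bot"]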
Lines changed: 141 additions & 0 deletions
"""Chunk Discourse threads into structured content blocks with metadata."""
# pylint: disable=R0801

import os
import re

from data.chunking.chunking_utils import (
    assign_code_blocks_to_chunks,
    save_chunks,
    read_json_file,
    build_chunk_dict,
    get_text_splitter
)
from utils import LoggerFactory

logger_factory = LoggerFactory.instance()
logger = logger_factory.get_logger("chunking")

SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
INPUT_PATH = os.path.join(SCRIPT_DIR, "..", "raw", "topics_with_posts.json")
OUTPUT_PATH = os.path.join(SCRIPT_DIR, "..", "processed", "chunks_discourse_docs.json")

CHUNK_SIZE = 500
CHUNK_OVERLAP = 100
CODE_BLOCK_PLACEHOLDER_PATTERN = r"\[\[(?:CODE_BLOCK|CODE_SNIPPET)_(\d+)\]\]"
TRIPLE_BACKTICK_CODE_PATTERN = r"```(?:\w+\n)?(.*?)```"
INLINE_BACKTICK_CODE_PATTERN = r"`([^`\n]+?)`"

def extract_code_blocks(text):
    """
    Extract code blocks and replace them with indexed placeholders.
    Supports both triple-backtick code blocks and inline code in backticks.

    Args:
        text (str): Raw text including code blocks.

    Returns:
        tuple:
            - List of extracted code blocks (in order of appearance).
            - Modified text with placeholders inserted in place of code.
    """
    code_blocks = []
    placeholder_counter = 0

    # Replace triple-backtick code blocks with indexed placeholders
    def replace_triple(match):
        nonlocal placeholder_counter
        code = match.group(1).strip()
        placeholder = f"[[CODE_BLOCK_{placeholder_counter}]]"
        code_blocks.append(code)
        placeholder_counter += 1
        return placeholder

    text = re.sub(TRIPLE_BACKTICK_CODE_PATTERN, replace_triple, text, flags=re.DOTALL)

    # Replace inline backtick code with indexed placeholders
    def replace_inline(match):
        nonlocal placeholder_counter
        code = match.group(1).strip()
        placeholder = f"[[CODE_SNIPPET_{placeholder_counter}]]"
        code_blocks.append(code)
        placeholder_counter += 1
        return placeholder

    text = re.sub(INLINE_BACKTICK_CODE_PATTERN, replace_inline, text)

    return code_blocks, text


def process_thread(thread, text_splitter):
    """
    Process a single Discourse thread into structured chunks.

    Args:
        thread (dict): Thread data including topic ID, title, and post texts.
        text_splitter (RecursiveCharacterTextSplitter): Chunking utility.

    Returns:
        list[dict]: Chunk objects with text, metadata, and associated code blocks.
    """
    topic_id = thread.get("topic_id")
    title = thread.get("title", "Untitled")
    posts = thread.get("posts", [])

    # Combine all posts into a single text block
    full_text = "\n\n".join(posts)

    code_blocks, clean_text = extract_code_blocks(full_text)
    chunks = text_splitter.split_text(clean_text)

    processed_chunks = assign_code_blocks_to_chunks(
        chunks,
        code_blocks,
        CODE_BLOCK_PLACEHOLDER_PATTERN,
        logger
    )

    return [
        build_chunk_dict(
            chunk["chunk_text"],
            {
                "data_source": "discourse_threads",
                "topic_id": topic_id,
                "title": title
            },
            chunk["code_blocks"]
        )
        for chunk in processed_chunks
    ]

def extract_chunks(threads):
    """
    Process all Discourse threads into a flat list of chunks.

    Args:
        threads (list): List of Discourse thread dicts.

    Returns:
        list[dict]: All chunks extracted from all threads.
    """
    all_chunks = []
    text_splitter = get_text_splitter(CHUNK_SIZE, CHUNK_OVERLAP)

    for thread in threads:
        thread_chunks = process_thread(thread, text_splitter)
        all_chunks.extend(thread_chunks)

    return all_chunks

def main():
    """Main entry point."""
    threads = read_json_file(INPUT_PATH, logger)
    if not threads:
        return

    logger.info("Chunking %d Discourse threads.", len(threads))
    all_chunks = extract_chunks(threads)

    save_chunks(OUTPUT_PATH, all_chunks, logger)

if __name__ == "__main__":
    main()
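For reference, a minimal thread record in the shape this script expects; the field names come from process_thread, the values are invented, and the snippet assumes it runs inside this module so the module-level logger and constants are in scope:

# Hypothetical input record; field names inferred from process_thread above.
sample_thread = {
    "topic_id": 42,
    "title": "How do I configure the bot?",
    "posts": [
        "Install it first: `pip install example-bot`",
        "Then drop this into the config:\n```yaml\nkey: value\n```"
    ]
}

splitter = get_text_splitter(CHUNK_SIZE, CHUNK_OVERLAP)
chunks = process_thread(sample_thread, splitter)
# The yaml block becomes [[CODE_BLOCK_0]] and the inline command
# [[CODE_SNIPPET_1]]; both land in the chunk's "code_blocks" list.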
