
Commit 16a1a03

Implement Chunking (#6)
* Chunking scripts.
* Doc comments; refactoring.
* Update required dependencies.
* Update technical doc.
* Fix format.
* Refactoring.
* Refactoring chunking.
* Fix return edge case.
* Add custom separator.
* Hardcoded placeholder fix.
* Fix hardcoded regex pattern.
* Separator fix.
* Validate that no placeholder is removed.
* Check out-of-range indexes; refactoring.
* Update technical docs.
* Global logger.
* Fix packages.
1 parent e1326b7 commit 16a1a03

21 files changed

Lines changed: 979 additions & 31 deletions

chatbot-core/data/__init__.py

Whitespace-only changes.

chatbot-core/data/chunking/__init__.py

Whitespace-only changes.
Lines changed: 14 additions & 0 deletions
"""Utility functions for extracting titles, code blocks, and logging."""

from .extract_functions import (
    extract_title,
    extract_code_blocks,
    assign_code_blocks_to_chunks
)

from .common import (
    save_chunks,
    read_json_file,
    build_chunk_dict,
    get_text_splitter
)
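With these re-exports in place, downstream code can import the helpers from the package root rather than from the individual modules. A one-line sketch, assuming this file is the chunking_utils package __init__ (the path matches the import used by the Discourse script later in this commit):

from data.chunking.chunking_utils import get_text_splitter, save_chunks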
Lines changed: 53 additions & 0 deletions
"""Shared utilities for reading/writing JSON and standardizing chunk format."""

import json
import uuid
from langchain.text_splitter import RecursiveCharacterTextSplitter

def save_chunks(output_path, all_chunks, logger):
    """Save the chunk list to a JSON file and log the outcome."""
    try:
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(all_chunks, f, ensure_ascii=False, indent=2)
        logger.info("Wrote %d chunks to %s.", len(all_chunks), output_path)
    except OSError as e:
        logger.error("File error while writing %s: %s", output_path, e)

def read_json_file(input_path, logger):
    """Load a JSON file and return its data, with proper error handling."""
    try:
        with open(input_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, OSError) as e:
        logger.error("File error while reading %s: %s", input_path, e)
    except json.JSONDecodeError as e:
        logger.error("JSON decode error in %s: %s", input_path, e)
    return []

def build_chunk_dict(chunk_text, metadata, code_blocks):
    """Create a standardized chunk dictionary."""
    return {
        "id": str(uuid.uuid4()),
        "chunk_text": chunk_text,
        "metadata": metadata,
        "code_blocks": code_blocks
    }

def get_text_splitter(chunk_size, chunk_overlap, separators=None):
    """
    Create and return a RecursiveCharacterTextSplitter with the given parameters.

    Args:
        chunk_size (int): Maximum size of each text chunk.
        chunk_overlap (int): Number of overlapping characters between chunks.
        separators (list[str], optional): Custom list of separators for splitting.
            If None or empty, a default strategy is used.

    Returns:
        RecursiveCharacterTextSplitter: Configured text splitter instance.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=separators or ["\n\n", "\n", " ", ""]
    )
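A minimal usage sketch for these helpers, assuming langchain is installed and substituting a standard logging.Logger for the project's LoggerFactory; the output path and sample text are invented for illustration:

import logging

from data.chunking.chunking_utils import (
    build_chunk_dict,
    get_text_splitter,
    save_chunks
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("chunking-demo")

# Split some sample text and wrap each piece in the standard chunk format.
splitter = get_text_splitter(chunk_size=500, chunk_overlap=100)
texts = splitter.split_text("First paragraph.\n\nSecond paragraph.\n\n" * 40)
records = [build_chunk_dict(t, {"data_source": "demo"}, []) for t in texts]

save_chunks("demo_chunks.json", records, logger)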
Lines changed: 88 additions & 0 deletions
"""Functions for extracting titles and code blocks from HTML content."""

import re
from bs4 import NavigableString

def extract_title(soup):
    """
    Extract the title from a BeautifulSoup-parsed HTML document.

    Priority:
        1. <h1> element if present
        2. <title> tag as fallback
        3. "Untitled" if neither is found

    Args:
        soup (BeautifulSoup): Parsed HTML document.

    Returns:
        str: The extracted title string.
    """
    h1 = soup.find("h1")
    if h1 and h1.get_text(strip=True):
        return h1.get_text(strip=True)
    if soup.title:
        return soup.title.get_text(strip=True)
    return "Untitled"

def extract_code_blocks(soup, tag, placeholder_template):
    """
    Extract all code blocks of a specified HTML tag (e.g., <pre>, <code>),
    replace them with numbered placeholders, and return the raw code strings.

    Args:
        soup (BeautifulSoup): Parsed HTML content.
        tag (str): HTML tag to search for (e.g., "pre", "code").
        placeholder_template (str): Format string for the numbered placeholder,
            e.g. "[[CODE_BLOCK_{}]]".

    Returns:
        list[str]: Code block strings, in the order they were found.
    """
    code_blocks = []
    for i, code_block in enumerate(soup.find_all(tag)):
        placeholder = placeholder_template.format(i)
        code_blocks.append(code_block.get_text(strip=True))
        code_block.replace_with(NavigableString(placeholder))
    return code_blocks

def assign_code_blocks_to_chunks(chunks, code_blocks, placeholder_pattern, logger):
    """
    Assign the relevant code blocks to each chunk based on placeholder references.

    Args:
        chunks (list[str]): Text chunks.
        code_blocks (list[str]): All extracted code blocks.
        placeholder_pattern (str): Regex pattern that captures placeholder indices.
        logger (logging.Logger): Logger for reporting malformed or
            out-of-range indices.

    Returns:
        list[dict]: Dicts with 'chunk_text' and the corresponding 'code_blocks'.
    """
    processed_chunks = []

    for chunk in chunks:
        matches = re.findall(placeholder_pattern, chunk)
        indices = set()

        for match in matches:
            try:
                idx = int(match)
                if idx < len(code_blocks):
                    indices.add(idx)
                else:
                    logger.warning(
                        "Placeholder index %d out of range (max index %d). Skipping.",
                        idx, len(code_blocks) - 1
                    )
            except ValueError:
                logger.warning(
                    "Malformed placeholder index: '%s'. Skipping.",
                    match
                )

        chunk_code_blocks = [code_blocks[i] for i in sorted(indices)]

        processed_chunks.append({
            "chunk_text": chunk,
            "code_blocks": chunk_code_blocks
        })

    return processed_chunks
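A short sketch of how these three functions fit together, calling bs4 directly. The HTML, logger name, and regex here are illustrative, though the [[CODE_BLOCK_n]] convention mirrors the pattern used by the Discourse script in this commit:

import logging

from bs4 import BeautifulSoup

from data.chunking.chunking_utils import (
    assign_code_blocks_to_chunks,
    extract_code_blocks,
    extract_title
)

html = (
    "<html><head><title>Docs</title></head><body>"
    "<h1>Setup</h1><p>Install the package:</p>"
    "<pre>pip install example-bot</pre>"
    "</body></html>"
)
soup = BeautifulSoup(html, "html.parser")
logger = logging.getLogger("demo")

title = extract_title(soup)  # "Setup": the <h1> wins over <title>
blocks = extract_code_blocks(soup, "pre", "[[CODE_BLOCK_{}]]")
# The <pre> element is now replaced by "[[CODE_BLOCK_0]]" in the soup.
chunks = [soup.get_text()]  # a single chunk is enough for the demo
result = assign_code_blocks_to_chunks(
    chunks, blocks, r"\[\[CODE_BLOCK_(\d+)\]\]", logger
)
# result[0]["code_blocks"] == ["pip install example-bot"]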
Lines changed: 141 additions & 0 deletions
"""Chunk Discourse threads into structured content blocks with metadata."""
# pylint: disable=R0801

import os
import re

from data.chunking.chunking_utils import (
    assign_code_blocks_to_chunks,
    save_chunks,
    read_json_file,
    build_chunk_dict,
    get_text_splitter
)
from utils import LoggerFactory

logger_factory = LoggerFactory.instance()
logger = logger_factory.get_logger("chunking")

SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
INPUT_PATH = os.path.join(SCRIPT_DIR, "..", "raw", "topics_with_posts.json")
OUTPUT_PATH = os.path.join(SCRIPT_DIR, "..", "processed", "chunks_discourse_docs.json")

CHUNK_SIZE = 500
CHUNK_OVERLAP = 100
CODE_BLOCK_PLACEHOLDER_PATTERN = r"\[\[(?:CODE_BLOCK|CODE_SNIPPET)_(\d+)\]\]"
TRIPLE_BACKTICK_CODE_PATTERN = r"```(?:\w+\n)?(.*?)```"
INLINE_BACKTICK_CODE_PATTERN = r"`([^`\n]+?)`"

def extract_code_blocks(text):
    """
    Extract code blocks and replace them with indexed placeholders.
    Supports both triple-backtick code blocks and inline code in backticks.

    Args:
        text (str): Raw text including code blocks.

    Returns:
        tuple:
            - List of extracted code blocks (in order of appearance).
            - Modified text with placeholders inserted in place of code.
    """
    code_blocks = []
    placeholder_counter = 0

    # Replace triple-backtick code blocks with indexed placeholders
    def replace_triple(match):
        nonlocal placeholder_counter
        code = match.group(1).strip()
        placeholder = f"[[CODE_BLOCK_{placeholder_counter}]]"
        code_blocks.append(code)
        placeholder_counter += 1
        return placeholder

    text = re.sub(TRIPLE_BACKTICK_CODE_PATTERN, replace_triple, text, flags=re.DOTALL)

    # Replace inline backtick code with indexed placeholders
    def replace_inline(match):
        nonlocal placeholder_counter
        code = match.group(1).strip()
        placeholder = f"[[CODE_SNIPPET_{placeholder_counter}]]"
        code_blocks.append(code)
        placeholder_counter += 1
        return placeholder

    text = re.sub(INLINE_BACKTICK_CODE_PATTERN, replace_inline, text)

    return code_blocks, text


def process_thread(thread, text_splitter):
    """
    Process a single Discourse thread into structured chunks.

    Args:
        thread (dict): Thread data including topic ID, title, and post texts.
        text_splitter (RecursiveCharacterTextSplitter): Chunking utility.

    Returns:
        list[dict]: Chunk objects with text, metadata, and associated code blocks.
    """
    topic_id = thread.get("topic_id")
    title = thread.get("title", "Untitled")
    posts = thread.get("posts", [])

    # Combine all posts into a single text block
    full_text = "\n\n".join(posts)

    code_blocks, clean_text = extract_code_blocks(full_text)
    chunks = text_splitter.split_text(clean_text)

    processed_chunks = assign_code_blocks_to_chunks(
        chunks,
        code_blocks,
        CODE_BLOCK_PLACEHOLDER_PATTERN,
        logger
    )

    return [
        build_chunk_dict(
            chunk["chunk_text"],
            {
                "data_source": "discourse_threads",
                "topic_id": topic_id,
                "title": title
            },
            chunk["code_blocks"]
        )
        for chunk in processed_chunks
    ]

def extract_chunks(threads):
    """
    Process all Discourse threads into a flat list of chunks.

    Args:
        threads (list): List of Discourse thread dicts.

    Returns:
        list[dict]: All chunks extracted from all threads.
    """
    all_chunks = []
    text_splitter = get_text_splitter(CHUNK_SIZE, CHUNK_OVERLAP)

    for thread in threads:
        thread_chunks = process_thread(thread, text_splitter)
        all_chunks.extend(thread_chunks)

    return all_chunks

def main():
    """Main entry point."""
    threads = read_json_file(INPUT_PATH, logger)
    if not threads:
        return

    logger.info("Chunking %d Discourse threads.", len(threads))
    all_chunks = extract_chunks(threads)

    save_chunks(OUTPUT_PATH, all_chunks, logger)

if __name__ == "__main__":
    main()
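For reference, a minimal thread record in the shape this script expects; the field names come from process_thread, the values are invented, and the snippet assumes it runs inside this module so the module-level logger and constants are in scope:

# Hypothetical input record; field names inferred from process_thread above.
sample_thread = {
    "topic_id": 42,
    "title": "How do I configure the bot?",
    "posts": [
        "Install it first: `pip install example-bot`",
        "Then drop this into the config:\n```yaml\nkey: value\n```"
    ]
}

splitter = get_text_splitter(CHUNK_SIZE, CHUNK_OVERLAP)
chunks = process_thread(sample_thread, splitter)
# The yaml block becomes [[CODE_BLOCK_0]] and the inline command
# [[CODE_SNIPPET_1]]; both land in the chunk's "code_blocks" list.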
