
Commit 64a2b6b

Standardize logging in data collection (#8)
* Standardize logging for data-collection.
* Fix lint issues.
* Update docs.
1 parent 16a1a03 commit 64a2b6b

10 files changed

Lines changed: 93 additions & 46 deletions

chatbot-core/data/collection/__init__.py

Whitespace-only changes.

chatbot-core/data/collection/collection_utils/__init__.py

Whitespace-only changes.

chatbot-core/data/collection/utils/convert_stack_threads.py renamed to chatbot-core/data/collection/collection_utils/convert_stack_threads.py

Lines changed: 4 additions & 2 deletions
@@ -1,12 +1,14 @@
 """Script to convert Stack Overflow CSV thread data to a JSON format."""
 
 import json
+import os
 import pandas as pd
 
 # The QueryResults.csv is obtained by running the desired query
 # on the data explorer tool of StackExchange
-THREADS_CSV_PATH = "../../raw/QueryResults.csv"
-OUTPUT_JSON_PATH = "../../raw/stack_overflow_threads.json"
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+THREADS_CSV_PATH = os.path.join(SCRIPT_DIR, "..", "..", "raw", "QueryResults.csv")
+OUTPUT_JSON_PATH = os.path.join(SCRIPT_DIR, "..", "..", "raw", "stack_overflow_threads.json")
 
 def convert_stack_threads():
     """Read CSV thread data and export it as a structured JSON file."""

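Beyond the rename, the diff above switches every input/output constant from a working-directory-relative string to a path anchored at the script's own location, so the script finds `raw/` regardless of where it is invoked from. A minimal sketch of that pattern (the constant mirrors the diff; the printed value is only illustrative):

```python
import os

# Anchor data paths at this file's directory instead of the shell's current directory.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
THREADS_CSV_PATH = os.path.join(SCRIPT_DIR, "..", "..", "raw", "QueryResults.csv")

# Resolves to .../chatbot-core/data/raw/QueryResults.csv no matter where the script is launched.
print(os.path.normpath(THREADS_CSV_PATH))
```
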
chatbot-core/data/collection/utils/filter_discourse_threads.py renamed to chatbot-core/data/collection/collection_utils/filter_discourse_threads.py

Lines changed: 15 additions & 6 deletions
@@ -2,9 +2,14 @@
 
 import json
 import os
+from utils import LoggerFactory
 
-DISCOURSE_TOPIC_LIST_PATH = "../../raw/discourse_topic_list.json"
-OUTPUT_PATH = "../../raw/filtered_discourse_topics.json"
+logger_factory = LoggerFactory.instance()
+logger = logger_factory.get_logger("collection")
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+DISCOURSE_TOPIC_LIST_PATH = os.path.join(SCRIPT_DIR, "..", "..", "raw", "discourse_topic_list.json")
+OUTPUT_PATH = os.path.join(SCRIPT_DIR, "..", "..", "raw", "filtered_discourse_topics.json")
 
 def filter_discourse_threads():
     """Filter topics that have accepted answers and exclude unanswered threads."""
@@ -25,10 +30,14 @@ def filter_discourse_threads():
         if topic["posts_count"] == 1:
             non_answered_topics += 1
 
-    print(f"There are {len(data.keys()) - non_answered_topics} answered "
-          f"topics over {len(data.keys())}")
-    print(f"There are {accepted_answers} topics with accepted answers "
-          f"over {len(data.keys()) - non_answered_topics} answered topics")
+    logger.info("There are %d answered topics over %d",
+                len(data.keys()) - non_answered_topics,
+                len(data.keys())
+                )
+    logger.info("There are %d topics with accepted answers over %d answered topics",
+                accepted_answers,
+                len(data.keys()) - non_answered_topics
+                )
 
     with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
         json.dump(filtered_topics, f, ensure_ascii=False, indent=2)

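All of the collection scripts now obtain their logger through `LoggerFactory` from the new top-level `utils` package, which is not part of this diff. Purely as an assumption of what such a factory might look like, a minimal singleton wrapping the standard `logging` module could be:

```python
# Hypothetical sketch of the utils.LoggerFactory used above -- the real module is not shown in this commit.
import logging


class LoggerFactory:
    """Singleton handing out named loggers with one shared configuration."""

    _instance = None

    @classmethod
    def instance(cls):
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s %(name)s %(levelname)s: %(message)s",
        )

    def get_logger(self, name):
        # The stdlib returns the same logger object for the same name, so every
        # script asking for "collection" shares handlers and level settings.
        return logging.getLogger(name)
```

Whatever the real implementation looks like, `get_logger("collection")` gives all the scripts touched by this commit a single, consistently formatted log stream.
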
chatbot-core/data/collection/discourse_fetch_posts.py

Lines changed: 14 additions & 6 deletions
@@ -1,12 +1,17 @@
 """Module to fetch posts from Jenkins Discourse topics."""
 
+import os
 import json
 import requests
+from utils import LoggerFactory
 
-BASE_URL = "https://community.jenkins.io"
-FILE_NAME = "../raw/topics_with_posts.json"
-FILTERED_TOPICS_PATH = "../raw/filtered_discourse_topics.json"
+logger_factory = LoggerFactory.instance()
+logger = logger_factory.get_logger("collection")
 
+BASE_URL = "https://community.jenkins.io"
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+FILE_NAME = os.path.join(SCRIPT_DIR, "..", "raw", "topics_with_posts.json")
+FILTERED_TOPICS_PATH = os.path.join(SCRIPT_DIR, "..", "raw", "filtered_discourse_topics.json")
 
 def fetch_topic_posts(topic_id):
     """Fetch all posts in a topic using the topic endpoint."""
@@ -32,7 +37,10 @@ def process_topics(topics):
 
     for idx, topic in enumerate(topics):
         topic_id = topic["id"]
-        print(f"Processing topicId: {topic_id}... Progress at {round((idx/len(topics)) * 100, 2)}%")
+        logger.info("Processing topicId: %d... Progress at %.2f%%",
+                    topic_id,
+                    (idx / len(topics)) * 100
+                    )
         try:
             post_ids = fetch_topic_posts(topic_id)
             posts_content = [fetch_post_content(post_id) for post_id in post_ids]
@@ -43,7 +51,7 @@ def process_topics(topics):
                 "posts": posts_content
             })
         except requests.HTTPError as e:
-            print(f"Error fetching topic {topic_id}: {e}")
+            logger.error("Error fetching topic %d: %s", topic_id, e)
 
     return result
 
@@ -57,7 +65,7 @@ def main():
 
     with open(FILE_NAME, "w", encoding="utf-8") as f:
         json.dump(data, f, ensure_ascii=False, indent=4)
-    print(f"Data saved to {FILE_NAME}")
+    logger.info("Data saved to %s", FILE_NAME)
 
 if __name__ == "__main__":
     main()

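Note that the converted calls pass values as arguments (`%d`, `%s`, `%.2f`) rather than interpolating them into f-strings; this lazy %-style formatting is what pylint's logging checks favor, and it defers string building until a record is actually emitted. A small comparison with made-up values:

```python
import logging

logger = logging.getLogger("collection")
topic_id, progress = 42, 7.5  # illustrative values

# Discouraged by pylint: the f-string is formatted even if INFO logging is disabled.
logger.info(f"Processing topicId: {topic_id}... Progress at {progress}%")

# Preferred: interpolation happens only when the message is actually logged.
logger.info("Processing topicId: %d... Progress at %.2f%%", topic_id, progress)
```
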
chatbot-core/data/collection/discourse_topics_retriever.py

Lines changed: 17 additions & 9 deletions
@@ -1,12 +1,18 @@
 """Module for retrieving and filtering topics from the 'Using Jenkins' category on Discourse."""
 
 import json
+import os
 import requests
+from utils import LoggerFactory
+
+logger_factory = LoggerFactory.instance()
+logger = logger_factory.get_logger("collection")
 
 BASE_URL = "https://community.jenkins.io"
 CATEGORY_SLUG = "using-jenkins"
 CATEGORY_ID = 7  # 'Using Jenkins' Category
-OUTPUT_PATH = "../raw/discourse_topic_list.json"
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+OUTPUT_PATH = os.path.join(SCRIPT_DIR, "..", "raw", "discourse_topic_list.json")
 
 
 def fetch_page(category_slug, category_id, page):
@@ -48,15 +54,17 @@ def get_category_topics(category_slug, category_id):
     explored_topics = {}
 
     while True:
-        print(f"Fetching page {page}...")
+        logger.info("Fetching page %d...", page)
         data = fetch_page(category_slug, category_id, page)
         topics, more_topics_url = extract_topics(data)
 
         right_category_topics, wrong_category_topics = get_wrong_and_correct_topics(topics)
 
-        print(f"Page {page} - Found {len(topics)} topics")
-        print(f"Right category Topics {len(right_category_topics)} "
-              f"- Wrong category Topics {len(wrong_category_topics)}")
+        logger.info("Page %d - Found %d topics", page, len(topics))
+        logger.info("Right category Topics %d - Wrong category Topics %d",
+                    len(right_category_topics),
+                    len(wrong_category_topics)
+                    )
 
         for topic in right_category_topics:
             id_topic = topic["id"]
@@ -66,21 +74,21 @@ def get_category_topics(category_slug, category_id):
         explored_pages.add(page)
 
         if not more_topics_url:
-            print("No more topics to explore.")
+            logger.info("No more topics to explore.")
             break
 
         # Extract the next page number from the more_topics_url
         try:
             page = int(more_topics_url.split('page=')[-1])
         except (IndexError, ValueError):
-            print("Failed to parse next page number.")
+            logger.error("Failed to parse next page number.")
             break
 
         if page in explored_pages:
-            print(f"Already explored page {page}.")
+            logger.info("Already explored page %d.", page)
             break
 
-    print(f"Explored {len(explored_topics.keys())} topics")
+    logger.info("Explored %d topics", len(explored_topics.keys()))
     with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
         json.dump(explored_topics, f, ensure_ascii=False, indent=2)
 

chatbot-core/data/collection/docs_crawler.py

Lines changed: 18 additions & 11 deletions
@@ -1,9 +1,17 @@
 """Module for crawling and collecting content from Jenkins documentation pages."""
 
 import json
+import os
 from urllib.parse import urljoin, urlparse
 import requests
 from bs4 import BeautifulSoup
+from utils import LoggerFactory
+
+logger_factory = LoggerFactory.instance()
+logger = logger_factory.get_logger("collection")
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+OUTPUT_PATH = os.path.join(SCRIPT_DIR, "..", "raw", "../raw/jenkins_docs.json")
 
 # Home URL of jenkins doc
 BASE_URL = "https://www.jenkins.io/doc/"
@@ -38,7 +46,7 @@ def crawl(url):
     if url in visited_urls:
         return
 
-    print(f"Visiting: {url}")
+    logger.info("Visiting: %s", url)
     try:
         visited_urls.add(url)
 
@@ -64,20 +72,19 @@ def crawl(url):
                 crawl(full_url)
 
     except requests.RequestException as e:
-        print(f"Error accessing {url}: {e}")
+        logger.error("Error accessing %s: %s", url, e)
 
 def start_crawl():
     """Start the crawling process from the base URL."""
-    print("Crawling started")
+    logger.info("Crawling started")
     crawl(BASE_URL)
-    print(f"Total pages found: {len(visited_urls)}")
-    print(f"Total pages with content: {len(page_content)}")
-    print("Non canonic content page structure links:")
-    print(non_canonic_content_urls)
-    print("Crawling ended")
-
-    print("Saving results in json")
-    with open("../raw/jenkins_docs.json", "w", encoding="utf-8") as f:
+    logger.info("Total pages found: %d", len(visited_urls))
+    logger.info("Total pages with content: %d", len(page_content))
+    logger.info("Non canonic content page structure links: %s", non_canonic_content_urls)
+    logger.info("Crawling ended")
+
+    logger.info("Saving results in json")
+    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
         json.dump(page_content, f, ensure_ascii=False, indent=2)
 
 if __name__ == "__main__":

chatbot-core/data/collection/fetch_list_plugins.py

Lines changed: 7 additions & 3 deletions
@@ -4,6 +4,10 @@
 import os
 import requests
 from bs4 import BeautifulSoup
+from utils import LoggerFactory
+
+logger_factory = LoggerFactory.instance()
+logger = logger_factory.get_logger("collection")
 
 URL = "https://updates.jenkins.io/experimental/latest/"
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -16,7 +20,7 @@ def fetch_plugin_names():
     Returns:
         List[str]: List of raw plugin file names (e.g., 'git.hpi', 'docker-slaves.hpi').
     """
-    print("Fetching plugin index page...")
+    logger.info("Fetching plugin index page...")
     response = requests.get(URL, timeout=10)
     response.raise_for_status()
 
@@ -33,7 +37,7 @@ def fetch_plugin_names():
         if plugin_name:
             plugin_list.append(plugin_name)
 
-    print(f"Found {len(plugin_list)} plugins.")
+    logger.info("Found %d plugins.", len(plugin_list))
     return plugin_list
 
 def save_plugin_names(plugin_names_with_extension):
@@ -46,7 +50,7 @@ def save_plugin_names(plugin_names_with_extension):
 
     with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
         json.dump(plugin_names, f, indent=2, ensure_ascii=False)
-    print(f"Saved {len(plugin_names)} plugin names to {OUTPUT_PATH}")
+    logger.info("Saved %d plugin names to %s", len(plugin_names), OUTPUT_PATH)
 
 if __name__ == "__main__":
     plugins = fetch_plugin_names()

chatbot-core/data/collection/jenkins_plugins_fetch.py

Lines changed: 9 additions & 5 deletions
@@ -4,6 +4,10 @@
 import time
 import requests
 from bs4 import BeautifulSoup
+from utils import LoggerFactory
+
+logger_factory = LoggerFactory.instance()
+logger = logger_factory.get_logger("collection")
 
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 INPUT_PATH = os.path.join(SCRIPT_DIR, "..", "raw", "plugin_names.json")
@@ -35,14 +39,14 @@ def fetch_plugin_content(plugin_name, retries=3):
             if content_div:
                 return str(content_div)
 
-            print(f"No content found for {plugin_name}")
+            logger.warning("No content found for %s", plugin_name)
             return None
 
         except requests.RequestException as e:
-            print(f"Error fetching {plugin_name} (attempt {attempt + 1}): {e}")
+            logger.error("Error fetching %s (attempt %d): %s", plugin_name, attempt + 1, e)
             time.sleep(1.5 * (attempt + 1))
 
-    print(f"Failed to fetch {plugin_name} after {retries} attempts")
+    logger.error("Failed to fetch %s after %d attempts", plugin_name, retries)
     return None
 
 def collect_plugin_docs(plugin_names):
@@ -57,7 +61,7 @@ def collect_plugin_docs(plugin_names):
     """
     result = {}
     for idx, plugin_name in enumerate(plugin_names):
-        print(f"[{idx+1}/{len(plugin_names)}] Fetching {plugin_name}...")
+        logger.info("[%d/%d] Fetching %s...", idx + 1, len(plugin_names), plugin_name)
         content = fetch_plugin_content(plugin_name)
         if content:
             result[plugin_name] = content
@@ -77,7 +81,7 @@ def main():
     with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
         json.dump(collected_docs, f, indent=2, ensure_ascii=False)
 
-    print(f"Saved {len(collected_docs)} plugins to {OUTPUT_PATH}")
+    logger.info("Saved %d plugins to %s", len(collected_docs), OUTPUT_PATH)
 
 if __name__ == "__main__":
     main()

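With logging in place, messages also carry a severity: a missing plugin description is logged as a warning, while an exhausted retry loop or a failed request is an error. How those levels are routed is left to the logger configuration; one possible, purely illustrative setup that separates them (the handler targets below are not part of the commit):

```python
import logging

logger = logging.getLogger("collection")
logger.setLevel(logging.INFO)

# Progress messages and warnings go to the console...
console = logging.StreamHandler()
console.setLevel(logging.INFO)
logger.addHandler(console)

# ...while errors are additionally captured in a file for later review (hypothetical file name).
error_file = logging.FileHandler("collection_errors.log")
error_file.setLevel(logging.ERROR)
logger.addHandler(error_file)

logger.warning("No content found for %s", "example-plugin")                 # console only
logger.error("Failed to fetch %s after %d attempts", "example-plugin", 3)   # console + file
```
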
docs/README.md

Lines changed: 9 additions & 4 deletions
@@ -12,6 +12,7 @@ Below is a brief explanation of the key subdirectories:
 - `preprocessing/`: Scripts to clean, filter, the collected data before chunking.
 - `raw/`: Output directory for collected data.
 - `processed/`: Output directory for cleaned and filtered data.
+- `utils/`: Contains utils for the chatbot-core directory(e.g. logger).
 - `requirements.txt`: Python dependencies.
 - `docs/`: Developer documentation.
 
@@ -45,6 +46,10 @@ To set up the environment and run the scripts:
 ```bash
 pip install -r requirements.txt
 ```
+5. Set the `PYTHONPATH` to the current directory(`chatbot-core/`):
+```bash
+export PYTHONPATH=$(pwd)
+```
 
 ## Data Collection
 
@@ -98,7 +103,7 @@ python data/collection/discourse_topics_retriever.py
 
 #### 2. Filter topics
 
-**Script**: `utils/filter_discourse_threads.py`
+**Script**: `collection_utils/filter_discourse_threads.py`
 
 Filters the previously collected topics, keeping only those with an accepted answer.
 
@@ -107,7 +112,7 @@ Filters the previously collected topics, keeping only those with an accepted ans
 
 **To run:**
 ```bash
-python data/collection/utils/filter_discourse_threads.py
+python data/collection/collection_utils/filter_discourse_threads.py
 ```
 
 #### 3. Fetch post content
@@ -156,7 +161,7 @@ The result can be downloaded as aCSV file and have to be placed in the following
 
 #### 2. Convert CSV to JSON
 
-**Script**: `utils/convert_stack_threads.py`
+**Script**: `collection_utils/convert_stack_threads.py`
 
 This script reads the exported CSV and converts it into a JSON format. The resulting JSON file will contain a list of question-answer pairs with metadata.
 
@@ -165,7 +170,7 @@ This script reads the exported CSV and converts it into a JSON format. The resul
 
 **To run:**
 ```bash
-python data/collection/utils/convert_stack_threads.py
+python data/collection/collection_utils/convert_stack_threads.py
 ```
 ### Jenkins Plugins
 
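The new setup step matters because every collection script now starts with `from utils import LoggerFactory`, which only resolves when `chatbot-core/` is on the Python import path. A quick way to see what `export PYTHONPATH=$(pwd)` achieves from inside Python (assuming it is run from `chatbot-core/`; the snippet degrades gracefully elsewhere):

```python
import os
import sys

# `export PYTHONPATH=$(pwd)` run from chatbot-core/ puts that directory on sys.path;
# the line below is the programmatic equivalent for an ad-hoc session.
sys.path.insert(0, os.path.abspath("."))

try:
    from utils import LoggerFactory  # resolves only inside the chatbot-core/ layout
    print("utils package importable:", LoggerFactory.__name__)
except ImportError:
    print("Run from chatbot-core/ (or set PYTHONPATH) so that `utils` is importable.")
```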