
Commit 64a2b6b

Standardize logging in data collection (#8)
* Standardize logging for data-collection.
* Fix lint issues.
* Update docs.
1 parent 16a1a03 commit 64a2b6b

10 files changed

Lines changed: 93 additions & 46 deletions

chatbot-core/data/collection/__init__.py

Whitespace-only changes.

chatbot-core/data/collection/collection_utils/__init__.py

Whitespace-only changes.

chatbot-core/data/collection/utils/convert_stack_threads.py renamed to chatbot-core/data/collection/collection_utils/convert_stack_threads.py

Lines changed: 4 additions & 2 deletions
@@ -1,12 +1,14 @@
 """Script to convert Stack Overflow CSV thread data to a JSON format."""
 
 import json
+import os
 import pandas as pd
 
 # The QueryResults.csv is obtained by running the desired query
 # on the data explorer tool of StackExchange
-THREADS_CSV_PATH = "../../raw/QueryResults.csv"
-OUTPUT_JSON_PATH = "../../raw/stack_overflow_threads.json"
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+THREADS_CSV_PATH = os.path.join(SCRIPT_DIR, "..", "..", "raw", "QueryResults.csv")
+OUTPUT_JSON_PATH = os.path.join(SCRIPT_DIR, "..", "..", "raw", "stack_overflow_threads.json")
 
 def convert_stack_threads():
     """Read CSV thread data and export it as a structured JSON file."""

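Beyond the rename, the diff above switches every input/output constant from a working-directory-relative string to a path anchored at the script's own location, so the script finds `raw/` regardless of where it is invoked from. A minimal sketch of that pattern (the constant mirrors the diff; the printed value is only illustrative):

```python
import os

# Anchor data paths at this file's directory instead of the shell's current directory.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
THREADS_CSV_PATH = os.path.join(SCRIPT_DIR, "..", "..", "raw", "QueryResults.csv")

# Resolves to .../chatbot-core/data/raw/QueryResults.csv no matter where the script is launched.
print(os.path.normpath(THREADS_CSV_PATH))
```
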
chatbot-core/data/collection/utils/filter_discourse_threads.py renamed to chatbot-core/data/collection/collection_utils/filter_discourse_threads.py

Lines changed: 15 additions & 6 deletions
@@ -2,9 +2,14 @@
 
 import json
 import os
+from utils import LoggerFactory
 
-DISCOURSE_TOPIC_LIST_PATH = "../../raw/discourse_topic_list.json"
-OUTPUT_PATH = "../../raw/filtered_discourse_topics.json"
+logger_factory = LoggerFactory.instance()
+logger = logger_factory.get_logger("collection")
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+DISCOURSE_TOPIC_LIST_PATH = os.path.join(SCRIPT_DIR, "..", "..", "raw", "discourse_topic_list.json")
+OUTPUT_PATH = os.path.join(SCRIPT_DIR, "..", "..", "raw", "filtered_discourse_topics.json")
 
 def filter_discourse_threads():
     """Filter topics that have accepted answers and exclude unanswered threads."""
@@ -25,10 +30,14 @@ def filter_discourse_threads():
         if topic["posts_count"] == 1:
             non_answered_topics += 1
 
-    print(f"There are {len(data.keys()) - non_answered_topics} answered "
-          f"topics over {len(data.keys())}")
-    print(f"There are {accepted_answers} topics with accepted answers "
-          f"over {len(data.keys()) - non_answered_topics} answered topics")
+    logger.info("There are %d answered topics over %d",
+                len(data.keys()) - non_answered_topics,
+                len(data.keys())
+                )
+    logger.info("There are %d topics with accepted answers over %d answered topics",
+                accepted_answers,
+                len(data.keys()) - non_answered_topics
+                )
 
     with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
         json.dump(filtered_topics, f, ensure_ascii=False, indent=2)

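All of the collection scripts now obtain their logger through `LoggerFactory` from the new top-level `utils` package, which is not part of this diff. Purely as an assumption of what such a factory might look like, a minimal singleton wrapping the standard `logging` module could be:

```python
# Hypothetical sketch of the utils.LoggerFactory used above -- the real module is not shown in this commit.
import logging


class LoggerFactory:
    """Singleton handing out named loggers with one shared configuration."""

    _instance = None

    @classmethod
    def instance(cls):
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s %(name)s %(levelname)s: %(message)s",
        )

    def get_logger(self, name):
        # The stdlib returns the same logger object for the same name, so every
        # script asking for "collection" shares handlers and level settings.
        return logging.getLogger(name)
```

Whatever the real implementation looks like, `get_logger("collection")` gives all the scripts touched by this commit a single, consistently formatted log stream.
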
chatbot-core/data/collection/discourse_fetch_posts.py

Lines changed: 14 additions & 6 deletions
@@ -1,12 +1,17 @@
 """Module to fetch posts from Jenkins Discourse topics."""
 
+import os
 import json
 import requests
+from utils import LoggerFactory
 
-BASE_URL = "https://community.jenkins.io"
-FILE_NAME = "../raw/topics_with_posts.json"
-FILTERED_TOPICS_PATH = "../raw/filtered_discourse_topics.json"
+logger_factory = LoggerFactory.instance()
+logger = logger_factory.get_logger("collection")
 
+BASE_URL = "https://community.jenkins.io"
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+FILE_NAME = os.path.join(SCRIPT_DIR, "..", "raw", "topics_with_posts.json")
+FILTERED_TOPICS_PATH = os.path.join(SCRIPT_DIR, "..", "raw", "filtered_discourse_topics.json")
 
 def fetch_topic_posts(topic_id):
     """Fetch all posts in a topic using the topic endpoint."""
@@ -32,7 +37,10 @@ def process_topics(topics):
 
     for idx, topic in enumerate(topics):
         topic_id = topic["id"]
-        print(f"Processing topicId: {topic_id}... Progress at {round((idx/len(topics)) * 100, 2)}%")
+        logger.info("Processing topicId: %d... Progress at %.2f%%",
+                    topic_id,
+                    (idx / len(topics)) * 100
+                    )
         try:
             post_ids = fetch_topic_posts(topic_id)
             posts_content = [fetch_post_content(post_id) for post_id in post_ids]
@@ -43,7 +51,7 @@ def process_topics(topics):
                 "posts": posts_content
             })
         except requests.HTTPError as e:
-            print(f"Error fetching topic {topic_id}: {e}")
+            logger.error("Error fetching topic %d: %s", topic_id, e)
 
     return result
 
@@ -57,7 +65,7 @@ def main():
 
     with open(FILE_NAME, "w", encoding="utf-8") as f:
         json.dump(data, f, ensure_ascii=False, indent=4)
-    print(f"Data saved to {FILE_NAME}")
+    logger.info("Data saved to %s", FILE_NAME)
 
 if __name__ == "__main__":
     main()

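Note that the converted calls pass values as arguments (`%d`, `%s`, `%.2f`) rather than interpolating them into f-strings; this lazy %-style formatting is what pylint's logging checks favor, and it defers string building until a record is actually emitted. A small comparison with made-up values:

```python
import logging

logger = logging.getLogger("collection")
topic_id, progress = 42, 7.5  # illustrative values

# Discouraged by pylint: the f-string is formatted even if INFO logging is disabled.
logger.info(f"Processing topicId: {topic_id}... Progress at {progress}%")

# Preferred: interpolation happens only when the message is actually logged.
logger.info("Processing topicId: %d... Progress at %.2f%%", topic_id, progress)
```
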
chatbot-core/data/collection/discourse_topics_retriever.py

Lines changed: 17 additions & 9 deletions
@@ -1,12 +1,18 @@
 """Module for retrieving and filtering topics from the 'Using Jenkins' category on Discourse."""
 
 import json
+import os
 import requests
+from utils import LoggerFactory
+
+logger_factory = LoggerFactory.instance()
+logger = logger_factory.get_logger("collection")
 
 BASE_URL = "https://community.jenkins.io"
 CATEGORY_SLUG = "using-jenkins"
 CATEGORY_ID = 7  # 'Using Jenkins' Category
-OUTPUT_PATH = "../raw/discourse_topic_list.json"
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+OUTPUT_PATH = os.path.join(SCRIPT_DIR, "..", "raw", "discourse_topic_list.json")
 
 
 def fetch_page(category_slug, category_id, page):
@@ -48,15 +54,17 @@ def get_category_topics(category_slug, category_id):
     explored_topics = {}
 
     while True:
-        print(f"Fetching page {page}...")
+        logger.info("Fetching page %d...", page)
         data = fetch_page(category_slug, category_id, page)
         topics, more_topics_url = extract_topics(data)
 
         right_category_topics, wrong_category_topics = get_wrong_and_correct_topics(topics)
 
-        print(f"Page {page} - Found {len(topics)} topics")
-        print(f"Right category Topics {len(right_category_topics)} "
-              f"- Wrong category Topics {len(wrong_category_topics)}")
+        logger.info("Page %d - Found %d topics", page, len(topics))
+        logger.info("Right category Topics %d - Wrong category Topics %d",
+                    len(right_category_topics),
+                    len(wrong_category_topics)
+                    )
 
         for topic in right_category_topics:
             id_topic = topic["id"]
@@ -66,21 +74,21 @@ def get_category_topics(category_slug, category_id):
         explored_pages.add(page)
 
         if not more_topics_url:
-            print("No more topics to explore.")
+            logger.info("No more topics to explore.")
             break
 
         # Extract the next page number from the more_topics_url
         try:
             page = int(more_topics_url.split('page=')[-1])
         except (IndexError, ValueError):
-            print("Failed to parse next page number.")
+            logger.error("Failed to parse next page number.")
             break
 
         if page in explored_pages:
-            print(f"Already explored page {page}.")
+            logger.info("Already explored page %d.", page)
             break
 
-    print(f"Explored {len(explored_topics.keys())} topics")
+    logger.info("Explored %d topics", len(explored_topics.keys()))
     with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
         json.dump(explored_topics, f, ensure_ascii=False, indent=2)
 

chatbot-core/data/collection/docs_crawler.py

Lines changed: 18 additions & 11 deletions
@@ -1,9 +1,17 @@
 """Module for crawling and collecting content from Jenkins documentation pages."""
 
 import json
+import os
 from urllib.parse import urljoin, urlparse
 import requests
 from bs4 import BeautifulSoup
+from utils import LoggerFactory
+
+logger_factory = LoggerFactory.instance()
+logger = logger_factory.get_logger("collection")
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+OUTPUT_PATH = os.path.join(SCRIPT_DIR, "..", "raw", "../raw/jenkins_docs.json")
 
 # Home URL of jenkins doc
 BASE_URL = "https://www.jenkins.io/doc/"
@@ -38,7 +46,7 @@ def crawl(url):
     if url in visited_urls:
         return
 
-    print(f"Visiting: {url}")
+    logger.info("Visiting: %s", url)
     try:
         visited_urls.add(url)
 
@@ -64,20 +72,19 @@ def crawl(url):
                 crawl(full_url)
 
     except requests.RequestException as e:
-        print(f"Error accessing {url}: {e}")
+        logger.error("Error accessing %s: %s", url, e)
 
 def start_crawl():
     """Start the crawling process from the base URL."""
-    print("Crawling started")
+    logger.info("Crawling started")
     crawl(BASE_URL)
-    print(f"Total pages found: {len(visited_urls)}")
-    print(f"Total pages with content: {len(page_content)}")
-    print("Non canonic content page structure links:")
-    print(non_canonic_content_urls)
-    print("Crawling ended")
-
-    print("Saving results in json")
-    with open("../raw/jenkins_docs.json", "w", encoding="utf-8") as f:
+    logger.info("Total pages found: %d", len(visited_urls))
+    logger.info("Total pages with content: %d", len(page_content))
+    logger.info("Non canonic content page structure links: %s", non_canonic_content_urls)
+    logger.info("Crawling ended")
+
+    logger.info("Saving results in json")
+    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
         json.dump(page_content, f, ensure_ascii=False, indent=2)
 
 if __name__ == "__main__":

chatbot-core/data/collection/fetch_list_plugins.py

Lines changed: 7 additions & 3 deletions
@@ -4,6 +4,10 @@
 import os
 import requests
 from bs4 import BeautifulSoup
+from utils import LoggerFactory
+
+logger_factory = LoggerFactory.instance()
+logger = logger_factory.get_logger("collection")
 
 URL = "https://updates.jenkins.io/experimental/latest/"
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -16,7 +20,7 @@ def fetch_plugin_names():
     Returns:
         List[str]: List of raw plugin file names (e.g., 'git.hpi', 'docker-slaves.hpi').
     """
-    print("Fetching plugin index page...")
+    logger.info("Fetching plugin index page...")
     response = requests.get(URL, timeout=10)
     response.raise_for_status()
 
@@ -33,7 +37,7 @@ def fetch_plugin_names():
         if plugin_name:
             plugin_list.append(plugin_name)
 
-    print(f"Found {len(plugin_list)} plugins.")
+    logger.info("Found %d plugins.", len(plugin_list))
     return plugin_list
 
 def save_plugin_names(plugin_names_with_extension):
@@ -46,7 +50,7 @@ def save_plugin_names(plugin_names_with_extension):
 
     with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
         json.dump(plugin_names, f, indent=2, ensure_ascii=False)
-    print(f"Saved {len(plugin_names)} plugin names to {OUTPUT_PATH}")
+    logger.info("Saved %d plugin names to %s", len(plugin_names), OUTPUT_PATH)
 
 if __name__ == "__main__":
     plugins = fetch_plugin_names()

chatbot-core/data/collection/jenkins_plugins_fetch.py

Lines changed: 9 additions & 5 deletions
@@ -4,6 +4,10 @@
 import time
 import requests
 from bs4 import BeautifulSoup
+from utils import LoggerFactory
+
+logger_factory = LoggerFactory.instance()
+logger = logger_factory.get_logger("collection")
 
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 INPUT_PATH = os.path.join(SCRIPT_DIR, "..", "raw", "plugin_names.json")
@@ -35,14 +39,14 @@ def fetch_plugin_content(plugin_name, retries=3):
             if content_div:
                 return str(content_div)
 
-            print(f"No content found for {plugin_name}")
+            logger.warning("No content found for %s", plugin_name)
             return None
 
         except requests.RequestException as e:
-            print(f"Error fetching {plugin_name} (attempt {attempt + 1}): {e}")
+            logger.error("Error fetching %s (attempt %d): %s", plugin_name, attempt + 1, e)
             time.sleep(1.5 * (attempt + 1))
 
-    print(f"Failed to fetch {plugin_name} after {retries} attempts")
+    logger.error("Failed to fetch %s after %d attempts", plugin_name, retries)
     return None
 
 def collect_plugin_docs(plugin_names):
@@ -57,7 +61,7 @@ def collect_plugin_docs(plugin_names):
     """
     result = {}
     for idx, plugin_name in enumerate(plugin_names):
-        print(f"[{idx+1}/{len(plugin_names)}] Fetching {plugin_name}...")
+        logger.info("[%d/%d] Fetching %s...", idx + 1, len(plugin_names), plugin_name)
         content = fetch_plugin_content(plugin_name)
         if content:
             result[plugin_name] = content
@@ -77,7 +81,7 @@ def main():
     with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
         json.dump(collected_docs, f, indent=2, ensure_ascii=False)
 
-    print(f"Saved {len(collected_docs)} plugins to {OUTPUT_PATH}")
+    logger.info("Saved %d plugins to %s", len(collected_docs), OUTPUT_PATH)
 
 if __name__ == "__main__":
     main()

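With logging in place, messages also carry a severity: a missing plugin description is logged as a warning, while an exhausted retry loop or a failed request is an error. How those levels are routed is left to the logger configuration; one possible, purely illustrative setup that separates them (the handler targets below are not part of the commit):

```python
import logging

logger = logging.getLogger("collection")
logger.setLevel(logging.INFO)

# Progress messages and warnings go to the console...
console = logging.StreamHandler()
console.setLevel(logging.INFO)
logger.addHandler(console)

# ...while errors are additionally captured in a file for later review (hypothetical file name).
error_file = logging.FileHandler("collection_errors.log")
error_file.setLevel(logging.ERROR)
logger.addHandler(error_file)

logger.warning("No content found for %s", "example-plugin")                 # console only
logger.error("Failed to fetch %s after %d attempts", "example-plugin", 3)   # console + file
```
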
docs/README.md

Lines changed: 9 additions & 4 deletions
@@ -12,6 +12,7 @@ Below is a brief explanation of the key subdirectories:
 - `preprocessing/`: Scripts to clean, filter, the collected data before chunking.
 - `raw/`: Output directory for collected data.
 - `processed/`: Output directory for cleaned and filtered data.
+- `utils/`: Contains utils for the chatbot-core directory(e.g. logger).
 - `requirements.txt`: Python dependencies.
 - `docs/`: Developer documentation.
 
@@ -45,6 +46,10 @@ To set up the environment and run the scripts:
 ```bash
 pip install -r requirements.txt
 ```
+5. Set the `PYTHONPATH` to the current directory(`chatbot-core/`):
+```bash
+export PYTHONPATH=$(pwd)
+```
 
 ## Data Collection
 
@@ -98,7 +103,7 @@ python data/collection/discourse_topics_retriever.py
 
 #### 2. Filter topics
 
-**Script**: `utils/filter_discourse_threads.py`
+**Script**: `collection_utils/filter_discourse_threads.py`
 
 Filters the previously collected topics, keeping only those with an accepted answer.
 
@@ -107,7 +112,7 @@ Filters the previously collected topics, keeping only those with an accepted ans
 
 **To run:**
 ```bash
-python data/collection/utils/filter_discourse_threads.py
+python data/collection/collection_utils/filter_discourse_threads.py
 ```
 
 #### 3. Fetch post content
@@ -156,7 +161,7 @@ The result can be downloaded as aCSV file and have to be placed in the following
 
 #### 2. Convert CSV to JSON
 
-**Script**: `utils/convert_stack_threads.py`
+**Script**: `collection_utils/convert_stack_threads.py`
 
 This script reads the exported CSV and converts it into a JSON format. The resulting JSON file will contain a list of question-answer pairs with metadata.
 
@@ -165,7 +170,7 @@ This script reads the exported CSV and converts it into a JSON format. The resul
 
 **To run:**
 ```bash
-python data/collection/utils/convert_stack_threads.py
+python data/collection/collection_utils/convert_stack_threads.py
 ```
 ### Jenkins Plugins
 
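The new setup step matters because every collection script now starts with `from utils import LoggerFactory`, which only resolves when `chatbot-core/` is on the Python import path. A quick way to see what `export PYTHONPATH=$(pwd)` achieves from inside Python (assuming it is run from `chatbot-core/`; the snippet degrades gracefully elsewhere):

```python
import os
import sys

# `export PYTHONPATH=$(pwd)` run from chatbot-core/ puts that directory on sys.path;
# the line below is the programmatic equivalent for an ad-hoc session.
sys.path.insert(0, os.path.abspath("."))

try:
    from utils import LoggerFactory  # resolves only inside the chatbot-core/ layout
    print("utils package importable:", LoggerFactory.__name__)
except ImportError:
    print("Run from chatbot-core/ (or set PYTHONPATH) so that `utils` is importable.")
```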