-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathpip.py
More file actions
97 lines (81 loc) · 4.08 KB
/
pip.py
File metadata and controls
97 lines (81 loc) · 4.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# Day 20 - 30DaysOfPython Challenge
# PIP: Preferred Installer Program
import requests
import string
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from mypackage import arithmetic
from bs4 import BeautifulSoup
# 1 - Read this url and find the 10 most frequent words. romeo_and_juliet = 'http://www.gutenberg.org/files/1112/1112.txt'
# Note: URL was outdated so had to manually retrieve the working one as of 19/08/2025: 'https://www.gutenberg.org/cache/epub/1513/pg1513.txt'
def read_and_return_common_words(url: str = 'https://www.gutenberg.org/cache/epub/1513/pg1513.txt', n: int = 10) -> list[tuple[str, int]]:
    """Fetch a Project Gutenberg plain-text book and return its n most frequent words.

    Args:
        url: Plain-text URL of the book (defaults to Romeo and Juliet).
        n: How many (word, count) pairs to return, most frequent first.

    Returns:
        List of (word, count) tuples sorted by descending count.
    """
    from collections import Counter  # stdlib; local import keeps the module's import block untouched
    text: str = requests.get(url).text
    # Gutenberg books wrap the actual work between these START/END marker lines.
    start: int = text.find('*** START OF THE PROJECT GUTENBERG EBOOK ROMEO AND JULIET ***')
    end: int = text.find('*** END OF THE PROJECT GUTENBERG EBOOK ROMEO AND JULIET ***')
    # Guard against missing markers: find() returns -1, and slicing with -1
    # would silently drop everything (text[-1:end]) instead of using the book.
    if start == -1:
        start = 0
    if end == -1:
        end = len(text)
    text = text[start:end].lower()
    # maketrans('', '', z) -> delete every char in z (punctuation + digits) in one C-level pass.
    translator: dict = str.maketrans('', '', string.punctuation + string.digits)
    words: list[str] = text.translate(translator).split()
    # Counter.most_common replaces the manual dict-count-then-sort dance.
    return Counter(words).most_common(n)
# 2 - Read the cats API and cats_api = 'https://api.thecatapi.com/v1/breeds' and find:
# - i. the min, max, mean, median, standard deviation of cats' weight in metric units.
# - ii. the min, max, mean, median, standard deviation of cats' lifespan in years.
# - iii. Create a frequency table of country and breed of cats
def _range_mean(value: str) -> float:
    """Return the midpoint of a "low - high" string (e.g. "3 - 5" -> 4.0).

    A single number with no dash maps to itself, so an API entry like "14"
    no longer crashes the unpack that a strict two-value split would require.
    """
    parts: list[float] = [float(part.strip()) for part in value.split('-')]
    return (parts[0] + parts[-1]) / 2


def analyze_cat_data(url: str = 'https://api.thecatapi.com/v1/breeds') -> dict:
    """Fetch the Cat API breed list and summarize it.

    Args:
        url: JSON endpoint returning a list of breed dicts.

    Returns:
        Dict with keys:
          'weight': min/max/mean/median/std of breed weights (metric, midpoint of range),
          'lifespan': same stats for life span in years,
          'frequency_table': mapping of origin country -> list of breed names.
    """
    cats: list[dict] = requests.get(url).json()
    # i. Each breed reports weight as a "low - high" range under ['weight']['metric'];
    # use the midpoint of the range as that breed's representative weight.
    weights: list[float] = [_range_mean(cat['weight']['metric']) for cat in cats]
    weight_stats: dict[str, float] = {
        'min': min(weights),
        'max': max(weights),
        'mean': arithmetic.calculate_mean(weights),
        'median': arithmetic.calculate_median(weights),
        'std': arithmetic.calculate_std(weights)
    }
    # ii. life_span uses the same "low - high" format, so the same helper applies.
    lifespans: list[float] = [_range_mean(cat['life_span']) for cat in cats]
    lifespan_stats: dict[str, float] = {
        'min': min(lifespans),
        'max': max(lifespans),
        'mean': arithmetic.calculate_mean(lifespans),
        'median': arithmetic.calculate_median(lifespans),
        'std': arithmetic.calculate_std(lifespans)
    }
    # iii. Group breed names by country of origin; setdefault avoids the
    # explicit "if country not in table" membership check.
    freq_table: dict[str, list[str]] = {}
    for cat in cats:
        freq_table.setdefault(cat.get('origin', 'Unknown'), []).append(cat['name'])
    return {
        'weight': weight_stats,
        'lifespan': lifespan_stats,
        'frequency_table': freq_table
    }
# 3 - Read the countries API and find
# i. the 10 largest countries
# ii. the 10 most spoken languages
# iii. the total number of languages in the countries API
"""
Countries API (https://restcountries.eu/rest/v2/all) "Not Found"
"""
# 4 - UCI is one of the most common places to get data sets for data science and machine learning. Read the content of UCI (https://archive.ics.uci.edu/ml/datasets.php). Without additional libraries it will be difficult, so you may try it with BeautifulSoup4
"""
Provided link returns Error 404.
"""