Files
pcs_live/pcslive.py

165 lines
6.3 KiB
Python
Raw Normal View History

2026-05-30 18:39:09 +01:00
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
2026-05-30 20:52:47 +01:00
# class for storing the live races
2026-05-30 18:39:09 +01:00
class LiveStats:
    """Collection of the races shown in the live stats box on the PCS homepage.

    ``races`` holds one ``Race`` object per ``<li>`` found inside the
    ``hp3-livestats`` element of the homepage. It is populated on
    construction and can be rebuilt at any time with ``refresh_races()``.
    """

    HOMEPAGE_URL = "https://www.procyclingstats.com/"
    # Seconds to wait for the server. Without a timeout requests.get() can
    # block forever on a stalled connection; the exact value is a judgement
    # call, not something the site dictates.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.races = []
        self.refresh_races()

    def refresh_races(self):
        """Re-scrape the homepage and rebuild ``self.races`` from scratch.

        If the page has no live stats box, ``self.races`` ends up empty
        rather than keeping the races from a previous refresh.

        Raises:
            requests.exceptions.RequestException: if the homepage cannot be
                fetched (including when the timeout expires).
        """
        # the entire front page of PCS loaded into soup
        response = requests.get(self.HOMEPAGE_URL, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, "html.parser")

        # narrow it down to the little green live stats boxes showing us
        # what races are currently live
        live = soup.find(attrs={"class": "hp3-livestats"})

        # always start from an empty list so races from an earlier refresh
        # don't linger once nothing is live any more
        self.races = []
        if live is None:
            return

        # drop the "inverse" elements (the bulky polygon data) so the
        # remaining markup is small and readable
        for tag in live.find_all(attrs={"class": "inverse"}):
            tag.decompose()

        # every race is a list item (<li>); the details (title, status,
        # km to go, brief situation text) are parsed inside Race
        self.races = [Race(raw_race) for raw_race in live.find_all("li")]

    def print_races(self):
        """Print the summary of every race currently held in ``self.races``."""
        for race in self.races:
            race.print_stats()
2026-05-30 20:52:47 +01:00
# class for storing information about individual live races
2026-05-30 18:39:09 +01:00
class Race:
    """A single race from the PCS homepage live box.

    Attributes parsed from the homepage ``<li>`` by ``refresh_info()``.
    All of them are strings; because ``remove_tags()`` stringifies its
    input, a missing element is stored as the literal string ``"None"``:

        title      name of the race
        status     live or finished (finished races stay online for a
                   short while after finishing)
        togo       how many kms to go ("None" if finished)
        situation  brief summary of the situation ("None" if finished,
                   "-" if complete peloton)
        url        url of the race's full live stats page

    Attributes that stay empty until explicitly requested:

        timeline        list of timeline headings from the race's page,
                        item 0 being the most recent update
                        (see ``get_timeline()``)
        situation_long  dict mapping a time gap to the list of riders in
                        that group (see ``get_situation_long()``); not to
                        be confused with ``situation``, the brief
                        homepage summary
    """

    BASE_URL = "https://www.procyclingstats.com/"
    # Seconds to wait for the server; without it requests.get() may hang
    # indefinitely. The value itself is a judgement call.
    REQUEST_TIMEOUT = 10

    def __init__(self, raw):
        # this is the unparsed html from LiveStats
        self.raw = raw
        # now go parse it
        self.refresh_info()
        # kept empty unless requested
        self.timeline = []
        # a dict from the start, matching what get_situation_long() stores
        self.situation_long = {}

    def refresh_info(self):
        """Parse the raw homepage ``<li>`` into the summary attributes."""
        find = self.raw.find
        self.title = self.remove_tags(find(attrs={"class": "title"}))
        self.status = self.remove_tags(find(attrs={"class": "status"}))
        self.togo = self.remove_tags(find(attrs={"class": "togo"}))
        self.situation = self.remove_tags(find(attrs={"class": "situ_txt"}))

        # an entry without a link must not crash on None["href"]; store the
        # same "None" sentinel the other attributes use so that
        # get_race_page() can skip it
        link = find("a", href=True)
        self.url = self.remove_tags(link["href"] if link is not None else None)

    def print_stats(self):
        """Print title and status, plus kms to go and situation if not finished."""
        print(self.title)
        print(self.status)
        # dont bother showing kms to go or situation if finished
        if self.togo != "None":
            print(self.togo, "to go")
            print(self.situation)
        print("===============")

    def _full_url(self):
        """Return the absolute URL of this race's live stats page."""
        # urljoin copes with relative paths, a leading slash and already
        # absolute links, where plain concatenation would produce "//"
        return urljoin(self.BASE_URL, self.url)

    def get_race_page(self):
        """Fetch and parse the full live stats page for this race.

        Returns:
            The parsed page as a BeautifulSoup object, or None when the
            race has no url.

        Raises:
            requests.exceptions.RequestException: if the page cannot be
                fetched (including when the timeout expires).
        """
        if self.url == "None":
            return None
        response = requests.get(self._full_url(), timeout=self.REQUEST_TIMEOUT)
        return BeautifulSoup(response.text, "html.parser")

    def get_timeline(self):
        """Fill ``self.timeline`` with all displayed timeline headings.

        Item 0 is the most recent update. If the race has no url the
        existing timeline is left untouched; if the page has no timeline
        container the timeline becomes empty.
        """
        page = self.get_race_page()
        if page is None:
            return

        # first timeline container on the page; None (instead of an
        # IndexError) when the page has none
        container = page.find(attrs={"class": "timeline3cont"})
        self.timeline = []
        if container is None:
            return

        for item in container.find_all("li"):
            stat = item.find(attrs={"class": "stat"})
            if stat is not None:
                stat_content = stat.find(attrs={"class": "textCont"})
                self.timeline.append(self.remove_tags(stat_content))

    @staticmethod
    def _group_riders(entries):
        """Group rider names under the most recent time gap seen.

        Args:
            entries: iterable of ``(time_gap, rider)`` pairs in page order;
                either element may be None when the list item lacks it.

        Returns:
            dict mapping each time gap to its riders, in order and without
            duplicates. Riders listed before any time gap are stored under
            the key None. A repeated time gap extends the existing group
            instead of discarding it.
        """
        groups = {}
        current_gap = None
        for gap, rider in entries:
            # the gap is only listed once; following riders inherit it
            if gap is not None:
                current_gap = gap
                groups.setdefault(current_gap, [])
            if rider is None:
                continue
            riders = groups.setdefault(current_gap, [])
            # the leading rider can show up twice; keep it once
            if rider not in riders:
                riders.append(rider)
        return groups

    def get_situation_long(self):
        """Fill ``self.situation_long`` with riders grouped by time gap.

        The result is a dict whose keys are time gaps and whose values are
        lists of rider names in that group. If the race has no url the
        existing value is left untouched; if the page has no situation
        container the dict becomes empty.
        """
        page = self.get_race_page()
        if page is None:
            return

        entries = []
        # first situation container on the page, if any
        container = page.find(attrs={"class": "situCont"})
        if container is not None:
            for item in container.find_all("li"):
                time_gap = item.find(attrs={"class": "time"})
                rider = item.find(attrs={"class": "maxw200"})
                entries.append((
                    self.remove_tags(time_gap) if time_gap is not None else None,
                    self.remove_tags(rider) if rider is not None else None,
                ))

        self.situation_long = self._group_riders(entries)

    def remove_tags(self, text):
        """Return the visible text of ``text`` with all html tags removed.

        ``text`` is converted with ``str()`` first, so passing None yields
        the string "None". Contents of <style> and <script> elements are
        dropped, and the remaining text fragments are stripped and joined
        with single spaces.
        """
        text_soup = BeautifulSoup(str(text), "html.parser")
        for data in text_soup(["style", "script"]):
            data.decompose()
        return " ".join(text_soup.stripped_strings)