comments

2026-05-30 20:52:47 +01:00
parent db3449fcea
commit 4396e0758e
4 changed files with 86 additions and 38 deletions
--- a/README.md
+++ b/README.md
@@ -42,6 +42,3 @@ stats.print_races()
 `example.py` posts the latest timeline of one current race (checking every 10 seconds for updates)
 `example2.py` organises the situation diagram in a dictionary of timegap keys to a list of riders in that group. it could probably be organised better but it works
--- a/example.py
+++ b/example.py
@@ -3,15 +3,21 @@ import time
 stats = LiveStats()
 # if there are live races...
 if len(stats.races) > 0:
-    race = stats.races[0]
+    race = stats.races[0] # just grab the first one for example's sake
    print("Latest timeline update from", race.title, ":")
-
+    # keep the last update so we don't spam the terminal
    last_update = ""
    while True:
        race.get_timeline()
        # only update terminal if the newest item has changed (is new)
        if last_update != race.timeline[0]:
            print(race.timeline[0])
            last_update = race.timeline[0]
        # wait ten seconds before checking again
        # maybe set to longer if running for extended time
        time.sleep(10)
--- a/example2.py
+++ b/example2.py
@@ -3,10 +3,12 @@ import time
 stats = LiveStats()
 # if there are live races...
 if len(stats.races) > 0:
    race = stats.races[0]
    race.get_situation_long()
    # what's the situation? :)
    for x in race.situation_long:
        print(x, race.situation_long[x])
--- a/pcslive.py
+++ b/pcslive.py
@@ -1,42 +1,65 @@
 from bs4 import BeautifulSoup
 import requests, time
 # class for storing the live races
 class LiveStats:
    def __init__(self):
-        self.refresh_live()
+        self.refresh_races()
        self.get_races()
-        self.timeline = []
+    # scrapes from the PCS homepage where the little green live stats boxes are
-        self.situation_long = []
+    # it also gets rid of all the horrible polygon stuff
-
+    def refresh_races(self):
-    def refresh_live(self):
+        # the entire front page of PCS loaded into soup
        req = requests.get("https://www.procyclingstats.com/")
        html = req.text
        soup = BeautifulSoup(html, "html.parser")
-        all = soup.find_all(attrs={"class":"hp3-livestats"})
+        # narrow it down to the little green live stats boxes showing us what
-        live = all[0]
+        # races are currently live
        live = soup.find(attrs={"class":"hp3-livestats"})
        # also get rid of the horrible polygon number stuff !!! YUCK
        # i imagine its how the profiles are drawn. there is A LOT
        # and it makes reading the raw html very PAINFUL
        for tag in live.find_all(attrs={"class":"inverse"}):
            tag.decompose()
-        self.live = live
+        # conveniently, every race is a list item (<li>)
-    
+        # inside is the title, status (live/finished), km to go, 
-    def get_races(self):
+        # and brief situation text, but we parse that inside Race
-        races_raw = self.live.find_all("li")
+        races = live.find_all("li")
        self.races = []
-        for race in races_raw:
+        for raw_race in races:
-            this_race = Race(race)
+            # for now just create the Race object and add it to our list
-            self.races.append(this_race)
+            race = Race(raw_race)
            self.races.append(race)
    # straightforward
    def print_races(self):
        for race in self.races:
            race.print_stats()
 # class for storing information about individual live races
 class Race:
    def __init__(self, raw):
        # this is the unparsed html from LiveStats
        self.raw = raw
        # now go parse it
        self.refresh_info()
        # keeping these empty unless requested because i feel like it
        # timeline is headings from the race's page (left side)
        # situation_long is groups from the left side and time gaps
        # (not to be confused with situation which is a brief homepage summary)
        self.timeline = []
        self.situation_long = []
    # to parse the raw data given by LiveStats
    def refresh_info(self):
        title_r = self.raw.find(attrs={"class":"title"})
        status_r = self.raw.find(attrs={"class":"status"})
@@ -44,25 +67,32 @@ class Race:
        situation_r = self.raw.find(attrs={"class":"situ_txt"})
        url_r = self.raw.find("a", href=True)
-        self.title = self.remove_tags(title_r)
+        self.title = self.remove_tags(title_r) # name of the race
-        self.status = self.remove_tags(status_r)
+        self.status = self.remove_tags(status_r) # live or finished (finished races stay online for a short while after finishing)
-        self.togo = self.remove_tags(togo_r)
+        self.togo = self.remove_tags(togo_r) # how many kms to go (None if finished)
-        self.situation = self.remove_tags(situation_r)
+        self.situation = self.remove_tags(situation_r) # a brief summary of the situation - None if finished, and - if complete peloton
-        self.url = self.remove_tags(url_r["href"])
+        self.url = self.remove_tags(url_r["href"]) # the url for the race's full live stats page
    # self explanatory
    def print_stats(self):
        print(self.title)
        print(self.status)
        # don't bother showing kms to go or situation if finished
        if self.togo != "None":
            print(self.togo, "to go")
            print(self.situation)
        print("===============")
-    def print_raw(self):
+    # for testing
-        print(self.raw)
+    # def print_raw(self):
-        print("")
+    #     print(self.raw)
-        print(self.url)
+    #     print("")
    #     print(self.url)
    # this gets the html of the entire page of stats for this particular race
    # for use with get_timeline() and get_situation_long()
    def get_race_page(self):
        if self.url != "None":
            full_url = "https://www.procyclingstats.com/" + self.url
@@ -73,6 +103,8 @@ class Race:
            return soup
        return None
    # fills a list with all displayed timeline items (headings only)
    # item 0 is the most recent update
    def get_timeline(self):
        page = self.get_race_page()
        if page:
@@ -86,31 +118,42 @@ class Race:
                    stat_content = stat.find(attrs={"class":"textCont"})
                    self.timeline.append(self.remove_tags(stat_content))
    # creates a dictionary containing time gaps as keys and each
    # timegap points to a list of riders in that group
    def get_situation_long(self):
        page = self.get_race_page()
        if page:
            all = page.find_all(attrs={"class":"situCont"})
            live = all[0]
            situation_long = live.find_all("li")
            # create the dictionary
            self.situation_long = {}
            # last_timegap is used for grouping riders together
            last_timegap = None
            for item in situation_long:
                #print(item)
                time_gap = item.find(attrs={"class":"time"})
                group_name = item.find(attrs={"class":"groupname"})
                rider = item.find(attrs={"class":"maxw200"})
                # time gap is only listed once and subsequent riders in that
                # group don't have one (None)
                if time_gap:
-                    tg = self.remove_tags(time_gap)
+                    tg = self.remove_tags(time_gap) # get rid of html tags
-                    self.situation_long[tg] = []
+                    self.situation_long[tg] = [] # create the list inside the dict
-                    last_timegap = tg
+                    last_timegap = tg # set last timegap for the loop
-                rider_name = self.remove_tags(rider)
+
                rider_name = self.remove_tags(rider) # remove tags from rider name
                # the leading rider is basically the group name i guess?
                # anyway we don't need it twice
                if rider_name not in self.situation_long[last_timegap]:
                    # add rider to list of riders under that timegap/group
                    self.situation_long[last_timegap].append(rider_name)
-                #print(self.remove_tags(time_gap), self.remove_tags(group_name), self.remove_tags(riders))
+    # remove surrounding html tags from final data points,
-
+    # like rider names, race names, etc.
    def remove_tags(self, text):
        text = str(text)
        text_soup = BeautifulSoup(text, "html.parser")
`@@ -42,6 +42,3 @@ stats.print_races()`
	`example.py` posts the latest timeline of one current race (checking every 10 seconds for updates)

	`example2.py` organises the situation diagram in a dictionary of timegap keys to a list of riders in that group. it could probably be organised better but it works