improvement to situation_long based on todays ethias race

oops
Update README.md
2026-06-02 15:04:42 +01:00 · 2026-06-02 14:36:35 +01:00 · 2026-06-01 19:28:23 +00:00 · 2026-06-01 20:27:15 +01:00 · 2026-06-01 20:23:28 +01:00 · 2026-06-01 20:21:22 +01:00
5 changed files with 181 additions and 51 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -174,3 +174,4 @@ cython_debug/
 # PyPI configuration file
 .pypirc
 test.py
--- a/README.md
+++ b/README.md
@@ -2,16 +2,16 @@
 scrapes procyclingstats livestats homepage and timeline
-timeline updates are simply the titles, so sometimes you get something like "ranking after x km" or "present riders today from last years top 20" with no details. for now i am not getting the details from underneath, because for the most part they can be ignored. the action timeline headings like "wheel change for x rider" are good enough when they come through.
+if running on a vps (or a vps-like system such as a raspberry pi), requests should no longer be rejected with a 403 error
 # todo
- [ ] timeline items with details
+- [x] timeline items with details
 - [ ] pypi package ??
 # setup (windows)
-after cloning set up a virtualenv
+after cloning, cd into directory and set up a virtualenv
 ```
 py -3 -m venv .venv
@@ -24,18 +24,31 @@ if you get an error while activating the virtualenv, you may need to run this fi
 Set-ExecutionPolicy Unrestricted -Force
 ```
-install dependencies
+# setup linux
 after cloning, cd into directory and create a virtualenv
 ```
 python -m venv venv
 source venv/bin/activate
 ```
 # install dependencies
 ```
 pip install beautifulsoup4
 pip install requests
 pip install cloudscraper
 ```
-# usage
+# examples
-```
+in any looping example, use a suitable delay to avoid spamming the site with requests
 from pcslive import LiveStats
-stats = LiveStats()
+`example_latest_timeline.py` : uses a simple infinite loop (async would be better in a full application) to skim the top of the timeline for live updates. it will not display the same update twice
-stats.print_races()
+
-```
+`example_live_timeline.py` : an older way to show live timeline updates; less compatible with async routines because it relies on checking the last item
 `example_situation.py` : display the provided time gaps (if any) in a readable format
 `example_timeline_all.py` : display the entire timeline of a race (at this moment) - in reverse so that the latest update is at the bottom for console convenience
--- a/example_latest_timeline.py
+++ b/example_latest_timeline.py
@@ -0,0 +1,11 @@
 # example: poll the top of a race timeline and print each new update once
 from pcslive import LiveStats
 import time

 # seconds to wait between polls, so the site is not spammed with requests
 POLL_DELAY = 10

 stats = LiveStats()
 race = stats.find_race("giro")

 # find_race returns None when no live race title contains the query,
 # so bail out instead of crashing on race.get_timeline()
 if race is None:
    print("No matching live race found")
 else:
    while True:
        # re-scrape the timeline, then ask for the newest unseen update
        race.get_timeline()
        latest = race.timeline_latest()
        if latest:
            print(latest)
        time.sleep(POLL_DELAY)
--- a/example_situation.py
+++ b/example_situation.py
@@ -1,14 +1,17 @@
 from pcslive import LiveStats
 import time
 stats = LiveStats()
-# if there are live races...
+from pcslive import LiveStats
 if len(stats.races) > 0:
    race = stats.races[0]
-    race.get_situation_long()
+stats = LiveStats()
 race = stats.find_race("ethias")
 race.get_situation_long()
-    # whats the situation? :)
+if race.situation_long:
-    for x in race.situation_long:
+    for group in race.situation_long:
-        print(x, race.situation_long[x])
+        for item in group:
            print(item)
        print("================")
 else:
    print("No situation data")
--- a/pcslive.py
+++ b/pcslive.py
@@ -1,5 +1,5 @@
 from bs4 import BeautifulSoup
-import requests, time
+import requests, time, cloudscraper
 # class for storing the live races
 class LiveStats:
@@ -11,9 +11,13 @@ class LiveStats:
    # it also gets rid of all the horrible polygon stuff
    def refresh_races(self):
        # the entire front page of PCS loaded into soup
-        req = requests.get("https://www.procyclingstats.com/")
+        #self.req = requests.get("https://www.procyclingstats.com/")
-        html = req.text
+        #self.html = self.req.text
-        soup = BeautifulSoup(html, "html.parser")
+
        self.scraper = cloudscraper.create_scraper()
        self.req = self.scraper.get("https://www.procyclingstats.com/")
        self.html = self.req.text
        soup = BeautifulSoup(self.html, "html.parser")
        # narrow it down to the little green live stats boxes showing us what
        # races are currently live
@@ -42,6 +46,13 @@ class LiveStats:
        for race in self.races:
            race.print_stats()
    # finds a race by its title
    def find_race(self, query):
        """Return the first race whose title contains *query*, ignoring case.

        Races are checked in the order they appear in ``self.races``.
        Returns ``None`` when no title matches.
        """
        # lazily scan the races and stop at the first title that matches
        matching = (
            candidate
            for candidate in self.races
            if query.lower() in candidate.title.lower()
        )
        return next(matching, None)
@@ -62,6 +73,10 @@ class Race:
        self.timeline = []
        self.situation_long = []
        # when using timeline_latest() put already returned updates
        # inside here so that they dont get repeated
        self.timeline_latest_store = []
    # to parse the raw data given by LiveStats
    def refresh_info(self):
        title_r = self.raw.find(attrs={"class":"title"})
@@ -100,7 +115,8 @@ class Race:
        if self.url != "None":
            full_url = "https://www.procyclingstats.com/" + self.url
-            req = requests.get(full_url)
+            self.scraper = cloudscraper.create_scraper()
            req = self.scraper.get(full_url)
            html = req.text
            soup = BeautifulSoup(html, "html.parser")
            return soup
@@ -112,48 +128,134 @@ class Race:
        page = self.get_race_page()
        if page:
            all = page.find_all(attrs={"class":"timeline3cont"})
-            live = all[0]
+            self.timeline_live = all[0]
-            timeline = live.find_all("li")
+            timeline = self.timeline_live.find_all("li")
            self.timeline = []
            for item in timeline:
                stat = item.find(attrs={"class":"stat"})
                if stat:
-                    stat_content = stat.find(attrs={"class":"textCont"})
+                    stat_content, is_data, has_info_number = self.timeline_stats(stat)
-                    self.timeline.append(self.remove_tags(stat_content))
+                    if is_data:
                        pass
                    elif has_info_number:
                        # same as in the timeline_latest function
                        number = self.remove_tags(has_info_number)
                        text = self.remove_tags(stat_content)
                        update = number + " " + text
                        self.timeline.append(update)
                    else:
                        self.timeline.append(self.remove_tags(stat_content))
    # a function for getting only the latest timeline updates!
    # useful for making an async timeline feed
    def timeline_latest(self):
        """Return the newest timeline update that has not been returned yet.

        On the first call (nothing stored in ``self.timeline_latest_store``)
        the first entry of ``self.timeline`` is returned, so the caller has
        something to show straight away. On later calls the first ``li`` of
        ``self.timeline_live`` is inspected instead. Every returned update is
        appended to ``self.timeline_latest_store`` so it is never returned
        twice.

        Returns:
            The update text, or ``None`` when there is nothing new: no
            timeline data available, a chart-only entry, or an update that
            was already returned.
        """
        seen = self.timeline_latest_store

        # first run: seed from the full timeline. an empty timeline falls
        # through to the live check instead of raising IndexError
        if not seen and self.timeline:
            update = self.timeline[0]
            seen.append(update)
            return update

        # timeline_live is only assigned by get_timeline(), so it can be
        # missing if that has not run successfully yet
        live = getattr(self, "timeline_live", None)
        if live is None:
            return None

        # find() returns the first match, i.e. the top item of the timeline
        latest = live.find("li")
        if latest is None:
            return None

        stat = latest.find(attrs={"class": "stat"})
        if not stat:
            return None

        stat_content, is_data, has_info_number = self.timeline_stats(stat)
        # chart-only entries carry no readable update text
        if is_data:
            return None

        update = self.remove_tags(stat_content)
        if has_info_number:
            # some updates use a big number ("150" + "kilometers to the
            # finish") that is not part of the text, so prepend it.
            # drawback: this happens for every big number
            update = self.remove_tags(has_info_number) + " " + update

        # only hand out updates that have not been seen before
        if update in seen:
            return None
        seen.append(update)
        return update
    # function for getting specific things from the timeline
    # it seemed like a good idea to put it here at the time
    # shrug
    def timeline_stats(self, stat):
        """Look up the three parts of a timeline ``stat`` element.

        Returns a 3-tuple ``(stat_content, is_data, has_info_number)`` holding
        the results of ``stat.find`` for the ``textCont``, ``chartCont`` and
        ``number`` classes, in that order. Each entry is whatever ``find``
        returns for that class.
        """
        wanted_classes = ("textCont", "chartCont", "number")
        return tuple(
            stat.find(attrs={"class": css_class}) for css_class in wanted_classes
        )
    # creates a dictionary containing time gaps as keys and each
    # timegap points to a list of riders in that group
    # def get_situation_long(self):
    #     page = self.get_race_page()
    #     if page:
    #         all = page.find_all(attrs={"class":"situCont"})
    #         live = all[0]
    #         situation_long = live.find_all("li")
    #         # create the dictionary
    #         self.situation_long = {}
    #         # last_timegap is used for grouping riders together
    #         last_timegap = None
    #         for item in situation_long:
    #             time_gap = item.find(attrs={"class":"time"})
    #             rider = item.find(attrs={"class":"maxw200"})
    #             # time gap is only listed once and subsequent riders in that
    #             # group don't have one (None)
    #             if time_gap:
    #                 tg = self.remove_tags(time_gap) # get rid of html tags
    #                 self.situation_long[tg] = [] # create the list inside the dict
    #                 last_timegap = tg # set last timegap for the loop
    #             rider_name = self.remove_tags(rider) # remove tags from rider name
    #             # the leading rider is basically the group name i guess?
    #             # anyway we dont need it twice
    #             if rider_name not in self.situation_long[last_timegap]:
    #                 # add rider to list of riders under that timegap/group
    #                 self.situation_long[last_timegap].append(rider_name)
    def get_situation_long(self):
        page = self.get_race_page()
        if page:
            all = page.find_all(attrs={"class":"situCont"})
-            live = all[0]
+            if all:
-            situation_long = live.find_all("li")
+                live = all[0]
                groups = live.find_all(attrs={"class":"group"})
                self.situation_long = []
                for group in groups:
                    group_name = group.find(attrs={"class":"groupname"})
                    time = group.find(attrs={"class":"time"})
                    riders = group.find_all("li")
                    riders_clean = []
                    for rider in riders:
                        rider_name = rider.find(attrs={"class":"maxw180"})
                        riders_clean.append(self.remove_tags(rider_name))
                    group_name_clean = self.remove_tags(group_name)
                    time_clean = self.remove_tags(time).strip("??")
                    self.situation_long.append([group_name_clean, time_clean, riders_clean])
            else:
                self.situation_long = None
            # create the dictionary
            self.situation_long = {}
            # last_timegap is used for grouping riders together
            last_timegap = None
            for item in situation_long:
                time_gap = item.find(attrs={"class":"time"})
                rider = item.find(attrs={"class":"maxw200"})
                # time gap is only listed once and subsequent riders in that
                # group don't have one (None)
                if time_gap:
                    tg = self.remove_tags(time_gap) # get rid of html tags
                    self.situation_long[tg] = [] # create the list inside the dict
                    last_timegap = tg # set last timegap for the loop
                rider_name = self.remove_tags(rider) # remove tags from rider name
                # the leading rider is basically the group name i guess?
                # anyway we dont need it twice
                if rider_name not in self.situation_long[last_timegap]:
                    # add rider to list of riders under that timegap/group
                    self.situation_long[last_timegap].append(rider_name)
    # remove surrounding html tags from final data points,
    # like rider names, race names, etc.
Author	SHA1	Message	Date
cube	c65fbbb476	improvement to situation_long based on todays ethias race	2026-06-02 15:04:42 +01:00
cube	9c27da9265	oops	2026-06-02 14:36:35 +01:00
cube	bc69b237be	Update README.md	2026-06-01 19:28:23 +00:00
cube	7b6c17e2bf	cloudscraper readme update	2026-06-01 20:27:15 +01:00
cube	04bceacc60	cloudscraper attempt for vps use	2026-06-01 20:23:28 +01:00
cube	331c386bce	cloudscraper attempt for vps use	2026-06-01 20:21:22 +01:00
cube	937b07a61d	some updates from todays racing and functions added for easier timeline use	2026-06-01 16:56:43 +01:00
cube	f16e75731c	Merge branch 'main' of https://tea.cubes.link/cube/pcs_live	2026-05-31 15:01:54 +01:00
cube	eab1caa325	for testing	2026-05-31 15:01:53 +01:00