improvement to situation_long based on todays ethias race

oops
Update README.md
2026-06-02 15:04:42 +01:00 · 2026-06-02 14:36:35 +01:00 · 2026-06-01 19:28:23 +00:00 · 2026-06-01 20:27:15 +01:00 · 2026-06-01 20:23:28 +01:00 · 2026-06-01 20:21:22 +01:00
5 changed files with 181 additions and 51 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -174,3 +174,4 @@ cython_debug/
 # PyPI configuration file
 .pypirc

+test.py
--- a/README.md
+++ b/README.md
@@ -2,16 +2,16 @@

 scrapes procyclingstats livestats homepage and timeline

-timeline updates are simply the titles, so sometimes you get something like "ranking after x km" or "present riders today from last years top 20" with no details. for now i am not getting the details from underneath, because for the most part they can be ignored. the action timeline headings like "wheel change for x rider" are good enough when they come through.
+if running on a vps (or a vps-like system such as a raspberry pi), requests to the site should no longer fail with a 403 error

 # todo

- [ ] timeline items with details
+- [x] timeline items with details
 - [ ] pypi package ??

 # setup (windows)

-after cloning set up a virtualenv
+after cloning, cd into directory and set up a virtualenv

 ```
 py -3 -m venv .venv
@@ -24,18 +24,31 @@ if you get an error while activating the virtualenv, you may need to run this fi
 Set-ExecutionPolicy Unrestricted -Force
 ```

-install dependencies
+# setup (linux)
+
+after cloning, cd into directory and create a virtualenv
+
+```
+python -m venv venv
+source venv/bin/activate
+```
+
+# install dependencies

 ```
 pip install beautifulsoup4
 pip install requests
+pip install cloudscraper
 ```

-# usage
+# examples

-```
-from pcslive import LiveStats
+in any looping example, use a suitable delay to avoid spamming the site with requests

-stats = LiveStats()
-stats.print_races()
-```
+`example_latest_timeline.py` : uses a simple infinite loop (async would be better in a full application) to skim the top of the timeline for live updates. it will not display the same update twice
+
+`example_live_timeline.py` : an older way to show live timeline updates; it is less compatible with async routines because it checks the last item
+
+`example_situation.py` : display the provided time gaps (if any) in a readable format
+
+`example_timeline_all.py` : display the entire timeline of a race (at this moment) - in reverse so that the latest update is at the bottom for console convenience
--- a/example_latest_timeline.py
+++ b/example_latest_timeline.py
@@ -0,0 +1,11 @@
+from pcslive import LiveStats
+import time
+
+stats = LiveStats()
+race = stats.find_race("giro")
+while True:
+    race.get_timeline()
+    latest = race.timeline_latest()
+    if latest:
+        print(latest)
+    time.sleep(10)
--- a/example_situation.py
+++ b/example_situation.py
@@ -1,14 +1,17 @@
 from pcslive import LiveStats
-import time

 stats = LiveStats()

-# if there are live races...
-if len(stats.races) > 0:
-    race = stats.races[0]
+from pcslive import LiveStats

+stats = LiveStats()
+race = stats.find_race("ethias")
 race.get_situation_long()

-    # whats the situation? :)
-    for x in race.situation_long:
-        print(x, race.situation_long[x])
+if race.situation_long:
+    for group in race.situation_long:
+        for item in group:
+            print(item)
+        print("================")
+else:
+    print("No situation data")
--- a/pcslive.py
+++ b/pcslive.py
@@ -1,5 +1,5 @@
 from bs4 import BeautifulSoup
-import requests, time
+import requests, time, cloudscraper

 # class for storing the live races
 class LiveStats:
@@ -11,9 +11,13 @@ class LiveStats:
    # it also gets rid of all the horrible polygon stuff
    def refresh_races(self):
        # the entire front page of PCS loaded into soup
-        req = requests.get("https://www.procyclingstats.com/")
-        html = req.text
-        soup = BeautifulSoup(html, "html.parser")
+        #self.req = requests.get("https://www.procyclingstats.com/")
+        #self.html = self.req.text
+
+        self.scraper = cloudscraper.create_scraper()
+        self.req = self.scraper.get("https://www.procyclingstats.com/")
+        self.html = self.req.text
+        soup = BeautifulSoup(self.html, "html.parser")

        # narrow it down to the little green live stats boxes showing us what
        # races are currently live
@@ -42,6 +46,13 @@ class LiveStats:
        for race in self.races:
            race.print_stats()

+    # finds a race by its title
+    def find_race(self, query):
+        for race in self.races:
+            if query.lower() in race.title.lower():
+                return race
+        return None
+



@@ -62,6 +73,10 @@ class Race:
        self.timeline = []
        self.situation_long = []

+        # when using timeline_latest() put already returned updates
+        # inside here so that they dont get repeated
+        self.timeline_latest_store = []
+
    # to parse the raw data given by LiveStats
    def refresh_info(self):
        title_r = self.raw.find(attrs={"class":"title"})
@@ -100,7 +115,8 @@ class Race:
        if self.url != "None":
            full_url = "https://www.procyclingstats.com/" + self.url

-            req = requests.get(full_url)
+            self.scraper = cloudscraper.create_scraper()
+            req = self.scraper.get(full_url)
            html = req.text
            soup = BeautifulSoup(html, "html.parser")
            return soup
@@ -112,48 +128,134 @@ class Race:
        page = self.get_race_page()
        if page:
            all = page.find_all(attrs={"class":"timeline3cont"})
-            live = all[0]
-            timeline = live.find_all("li")
+            self.timeline_live = all[0]
+            timeline = self.timeline_live.find_all("li")
            self.timeline = []
            for item in timeline:
                stat = item.find(attrs={"class":"stat"})
                if stat:
-                    stat_content = stat.find(attrs={"class":"textCont"})
+                    stat_content, is_data, has_info_number = self.timeline_stats(stat)
+                    if is_data:
+                        pass
+                    elif has_info_number:
+                        # same as in the timeline_latest function
+                        number = self.remove_tags(has_info_number)
+                        text = self.remove_tags(stat_content)
+                        update = number + " " + text
+                        self.timeline.append(update)
+                    else:
                        self.timeline.append(self.remove_tags(stat_content))

+    # a function for getting only the latest timeline updates!
+    # useful for making an async timeline feed
+    def timeline_latest(self):
+        # its a bit weird if it doesnt display Anything at first run
+        # so if the seen list is empty, just show the latest update 
+        # from the full timeline (then add that to seen)
+        # self.timeline_latest_store  is the list for storing seen updates
+        if len(self.timeline_latest_store) == 0:
+            update = self.timeline[0]
+            self.timeline_latest_store.append(update)
+            return update
+        
+        # now most of this code is identical to the full timeline
+        # except that it just uses find instead of find_all 
+        # assuming it finds the first one... which it does
+        latest = self.timeline_live.find("li")
+        stat = latest.find(attrs={"class":"stat"})
+        if stat:
+            stat_content, is_data, has_info_number = self.timeline_stats(stat)
+            if is_data:
+                pass
+            elif has_info_number:
+                # some timeline updates use a big number
+                # like 150 kilometers to the finish
+                # and scraper only finds "kilometers to the finish"
+                # so if there is a big number, get it and add it
+                # there is a drawback of it doing this for every Big Number
+                # but i sorta dont care rn haha
+                number = self.remove_tags(has_info_number)
+                text = self.remove_tags(stat_content)
+                update = number + " " + text
+
+                # only show the update if it hasnt been seen before
+                if update not in self.timeline_latest_store:
+                    # then store it so we know we've already seen it
+                    self.timeline_latest_store.append(update)
+                    return update
+            else:
+                # same as above but without the big number
+                update = self.remove_tags(stat_content)
+                if update not in self.timeline_latest_store:
+                    self.timeline_latest_store.append(update)
+                    return update
+        return None
+    
+    # function for getting specific things from the timeline
+    # it seemed like a good idea to put it here at the time
+    # shrug
+    def timeline_stats(self, stat):
+        stat_content = stat.find(attrs={"class":"textCont"})
+        is_data = stat.find(attrs={"class":"chartCont"})
+        has_info_number = stat.find(attrs={"class":"number"})
+        return stat_content, is_data, has_info_number

    # creates a dictionary containing time gaps as keys and each
    # timegap points to a list of riders in that group
+    # def get_situation_long(self):
+    #     page = self.get_race_page()
+    #     if page:
+    #         all = page.find_all(attrs={"class":"situCont"})
+    #         live = all[0]
+    #         situation_long = live.find_all("li")
+
+    #         # create the dictionary
+    #         self.situation_long = {}
+
+    #         # last_timegap is used for grouping riders together
+    #         last_timegap = None
+
+    #         for item in situation_long:
+    #             time_gap = item.find(attrs={"class":"time"})
+    #             rider = item.find(attrs={"class":"maxw200"})
+
+    #             # time gap is only listed once and subsequent riders in that
+    #             # group don't have one (None)
+    #             if time_gap:
+    #                 tg = self.remove_tags(time_gap) # get rid of html tags
+    #                 self.situation_long[tg] = [] # create the list inside the dict
+    #                 last_timegap = tg # set last timegap for the loop
+
+    #             rider_name = self.remove_tags(rider) # remove tags from rider name
+    #             # the leading rider is basically the group name i guess?
+    #             # anyway we dont need it twice
+    #             if rider_name not in self.situation_long[last_timegap]:
+    #                 # add rider to list of riders under that timegap/group
+    #                 self.situation_long[last_timegap].append(rider_name)
+
    def get_situation_long(self):
        page = self.get_race_page()
        if page:
            all = page.find_all(attrs={"class":"situCont"})
+            if all:
                live = all[0]
-            situation_long = live.find_all("li")
+                groups = live.find_all(attrs={"class":"group"})
+                self.situation_long = []
+                for group in groups:
+                    group_name = group.find(attrs={"class":"groupname"})
+                    time = group.find(attrs={"class":"time"})
+                    riders = group.find_all("li")
+                    riders_clean = []
+                    for rider in riders:
+                        rider_name = rider.find(attrs={"class":"maxw180"})
+                        riders_clean.append(self.remove_tags(rider_name))
+                    group_name_clean = self.remove_tags(group_name)
+                    time_clean = self.remove_tags(time).strip("??")
+                    self.situation_long.append([group_name_clean, time_clean, riders_clean])
+            else:
+                self.situation_long = None

-            # create the dictionary
-            self.situation_long = {}

-            # last_timegap is used for grouping riders together
-            last_timegap = None
-
-            for item in situation_long:
-                time_gap = item.find(attrs={"class":"time"})
-                rider = item.find(attrs={"class":"maxw200"})
-
-                # time gap is only listed once and subsequent riders in that
-                # group don't have one (None)
-                if time_gap:
-                    tg = self.remove_tags(time_gap) # get rid of html tags
-                    self.situation_long[tg] = [] # create the list inside the dict
-                    last_timegap = tg # set last timegap for the loop
-
-                rider_name = self.remove_tags(rider) # remove tags from rider name
-                # the leading rider is basically the group name i guess?
-                # anyway we dont need it twice
-                if rider_name not in self.situation_long[last_timegap]:
-                    # add rider to list of riders under that timegap/group
-                    self.situation_long[last_timegap].append(rider_name)

    # remove surrounding html tags from final data points,
    # like rider names, race names, etc.
Author	SHA1	Message	Date
cube	c65fbbb476	improvement to situation_long based on todays ethias race	2026-06-02 15:04:42 +01:00
cube	9c27da9265	oops	2026-06-02 14:36:35 +01:00
cube	bc69b237be	Update README.md	2026-06-01 19:28:23 +00:00
cube	7b6c17e2bf	cloudscraper readme update	2026-06-01 20:27:15 +01:00
cube	04bceacc60	cloudscraper attempt for vps use	2026-06-01 20:23:28 +01:00
cube	331c386bce	cloudscraper attempt for vps use	2026-06-01 20:21:22 +01:00
cube	937b07a61d	some updates from todays racing and functions added for easier timeline use	2026-06-01 16:56:43 +01:00
cube	f16e75731c	Merge branch 'main' of https://tea.cubes.link/cube/pcs_live	2026-05-31 15:01:54 +01:00
cube	eab1caa325	for testing	2026-05-31 15:01:53 +01:00