Compare commits

...

9 Commits

Author SHA1 Message Date
cube
c65fbbb476 improvement to situation_long based on todays ethias race 2026-06-02 15:04:42 +01:00
cube
9c27da9265 oops 2026-06-02 14:36:35 +01:00
bc69b237be Update README.md 2026-06-01 19:28:23 +00:00
cube
7b6c17e2bf cloudscraper readme update 2026-06-01 20:27:15 +01:00
cube
04bceacc60 cloudscraper attempt for vps use 2026-06-01 20:23:28 +01:00
cube
331c386bce cloudscraper attempt for vps use 2026-06-01 20:21:22 +01:00
cube
937b07a61d some updates from todays racing and functions added for easier timeline use 2026-06-01 16:56:43 +01:00
cube
f16e75731c Merge branch 'main' of https://tea.cubes.link/cube/pcs_live 2026-05-31 15:01:54 +01:00
cube
eab1caa325 for testing 2026-05-31 15:01:53 +01:00
5 changed files with 181 additions and 51 deletions

1
.gitignore vendored
View File

@@ -174,3 +174,4 @@ cython_debug/
# PyPI configuration file # PyPI configuration file
.pypirc .pypirc
test.py

View File

@@ -2,16 +2,16 @@
scrapes procyclingstats livestats homepage and timeline scrapes procyclingstats livestats homepage and timeline
timeline updates are simply the titles, so sometimes you get something like "ranking after x km" or "present riders today from last years top 20" with no details. for now i am not getting the details from underneath, because for the most part they can be ignored. the action timeline headings like "wheel change for x rider" are good enough when they come through. if running on a vps (or vps-like system such as raspberry pi), it shouldnt 403 anymore
# todo # todo
- [ ] timeline items with details - [x] timeline items with details
- [ ] pypi package ?? - [ ] pypi package ??
# setup (windows) # setup (windows)
after cloning set up a virtualenv after cloning, cd into directory and set up a virtualenv
``` ```
py -3 -m venv .venv py -3 -m venv .venv
@@ -24,18 +24,31 @@ if you get an error while activating the virtualenv, you may need to run this fi
Set-ExecutionPolicy Unrestricted -Force Set-ExecutionPolicy Unrestricted -Force
``` ```
install dependencies # setup linux
after cloning, cd into directory and create a virtualenv
```
python -m venv venv
source venv/bin/activate
```
# install dependencies
``` ```
pip install beautifulsoup4 pip install beautifulsoup4
pip install requests pip install requests
pip install cloudscraper
``` ```
# usage # examples
``` in any looping example, use a suitable delay to avoid spamming the site with requests
from pcslive import LiveStats
stats = LiveStats() `example_latest_timeline.py` : uses a simple infinite loop (async would be better in a full application) to skim the top of the timeline for live updates. it will not display the same update twice
stats.print_races()
``` `example_live_timeline.py` : another older way to show a live timeline update, less compatible with async routines due to the checking of the last item
`example_situation.py` : display the provided time gaps (if any) in a readable format
`example_timeline_all.py` : display the entire timeline of a race (at this moment) - in reverse so that the latest update is at the bottom for console convenience

View File

@@ -0,0 +1,11 @@
# Example: follow the live timeline of one race and print each new update once.
from pcslive import LiveStats
import time

# seconds to wait between polls, so the site is not spammed with requests
POLL_DELAY = 10

stats = LiveStats()
race = stats.find_race("giro")

# find_race() returns None when no live race title contains the query;
# stop with a readable message instead of crashing inside the loop
if race is None:
    raise SystemExit("No live race matching 'giro' was found")

while True:
    # re-scrape the race page, then ask only for the update not shown yet
    race.get_timeline()
    latest = race.timeline_latest()
    if latest:
        print(latest)
    time.sleep(POLL_DELAY)

View File

@@ -1,14 +1,17 @@
from pcslive import LiveStats from pcslive import LiveStats
import time
stats = LiveStats() stats = LiveStats()
# if there are live races... from pcslive import LiveStats
if len(stats.races) > 0:
race = stats.races[0]
race.get_situation_long() stats = LiveStats()
race = stats.find_race("ethias")
race.get_situation_long()
# whats the situation? :) if race.situation_long:
for x in race.situation_long: for group in race.situation_long:
print(x, race.situation_long[x]) for item in group:
print(item)
print("================")
else:
print("No situation data")

View File

@@ -1,5 +1,5 @@
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import requests, time import requests, time, cloudscraper
# class for storing the live races # class for storing the live races
class LiveStats: class LiveStats:
@@ -11,9 +11,13 @@ class LiveStats:
# it also gets rid of all the horrible polygon stuff # it also gets rid of all the horrible polygon stuff
def refresh_races(self): def refresh_races(self):
# the entire front page of PCS loaded into soup # the entire front page of PCS loaded into soup
req = requests.get("https://www.procyclingstats.com/") #self.req = requests.get("https://www.procyclingstats.com/")
html = req.text #self.html = self.req.text
soup = BeautifulSoup(html, "html.parser")
self.scraper = cloudscraper.create_scraper()
self.req = self.scraper.get("https://www.procyclingstats.com/")
self.html = self.req.text
soup = BeautifulSoup(self.html, "html.parser")
# narrow it down to the little green live stats boxes showing us what # narrow it down to the little green live stats boxes showing us what
# races are currently live # races are currently live
@@ -42,6 +46,13 @@ class LiveStats:
for race in self.races: for race in self.races:
race.print_stats() race.print_stats()
# finds a race by its title
def find_race(self, query):
    """Return the first live race whose title contains ``query``.

    The comparison is a case-insensitive substring test, applied to the
    entries of ``self.races`` in order.

    Args:
        query: text to look for inside each race title.

    Returns:
        The first matching race object, or ``None`` when no title matches.
    """
    # lower-case the query once instead of on every iteration
    needle = query.lower()
    for race in self.races:
        # assumption: a race's title may be missing/None if its box could
        # not be parsed -- such a race can never match and is skipped
        title = getattr(race, "title", None)
        if isinstance(title, str) and needle in title.lower():
            return race
    return None
@@ -62,6 +73,10 @@ class Race:
self.timeline = [] self.timeline = []
self.situation_long = [] self.situation_long = []
# when using timeline_latest() put already returned updates
# inside here so that they dont get repeated
self.timeline_latest_store = []
# to parse the raw data given by LiveStats # to parse the raw data given by LiveStats
def refresh_info(self): def refresh_info(self):
title_r = self.raw.find(attrs={"class":"title"}) title_r = self.raw.find(attrs={"class":"title"})
@@ -100,7 +115,8 @@ class Race:
if self.url != "None": if self.url != "None":
full_url = "https://www.procyclingstats.com/" + self.url full_url = "https://www.procyclingstats.com/" + self.url
req = requests.get(full_url) self.scraper = cloudscraper.create_scraper()
req = self.scraper.get(full_url)
html = req.text html = req.text
soup = BeautifulSoup(html, "html.parser") soup = BeautifulSoup(html, "html.parser")
return soup return soup
@@ -112,48 +128,134 @@ class Race:
page = self.get_race_page() page = self.get_race_page()
if page: if page:
all = page.find_all(attrs={"class":"timeline3cont"}) all = page.find_all(attrs={"class":"timeline3cont"})
live = all[0] self.timeline_live = all[0]
timeline = live.find_all("li") timeline = self.timeline_live.find_all("li")
self.timeline = [] self.timeline = []
for item in timeline: for item in timeline:
stat = item.find(attrs={"class":"stat"}) stat = item.find(attrs={"class":"stat"})
if stat: if stat:
stat_content = stat.find(attrs={"class":"textCont"}) stat_content, is_data, has_info_number = self.timeline_stats(stat)
self.timeline.append(self.remove_tags(stat_content)) if is_data:
pass
elif has_info_number:
# same as in the timeline_latest function
number = self.remove_tags(has_info_number)
text = self.remove_tags(stat_content)
update = number + " " + text
self.timeline.append(update)
else:
self.timeline.append(self.remove_tags(stat_content))
# a function for getting only the latest timeline updates!
# useful for making an async timeline feed
def timeline_latest(self):
    """Return the newest timeline update that has not been returned yet.

    Call ``get_timeline()`` first so that ``self.timeline`` and
    ``self.timeline_live`` hold fresh data. Every update handed out is
    recorded in ``self.timeline_latest_store`` so it is never repeated.

    Returns:
        The update text, or ``None`` when there is nothing new, the top
        item is a chart/data entry, or no timeline has been loaded yet.
    """
    seen = self.timeline_latest_store

    # on the very first call show the newest entry of the full timeline,
    # so the feed does not start out empty
    if not seen and self.timeline:
        update = self.timeline[0]
        seen.append(update)
        return update

    # timeline_live is only set once get_timeline() has parsed a page
    live = getattr(self, "timeline_live", None)
    if live is None:
        return None

    # find() returns only the first <li>, i.e. the top of the timeline
    latest = live.find("li")
    if latest is None:
        return None

    stat = latest.find(attrs={"class": "stat"})
    if not stat:
        return None

    stat_content, is_data, has_info_number = self.timeline_stats(stat)
    if is_data:
        # chart entries carry no headline text worth showing
        return None

    update = self.remove_tags(stat_content)
    if has_info_number:
        # some updates put a big number in its own element
        # ("150" + "kilometers to the finish"); glue it back on the front
        update = self.remove_tags(has_info_number) + " " + update

    # only hand out updates that have not been seen before
    if update in seen:
        return None
    seen.append(update)
    return update
# function for getting specific things from the timeline
# it seemed like a good idea to put it here at the time
# shrug
def timeline_stats(self, stat):
    """Pick the three sub-elements of a timeline ``stat`` element.

    Args:
        stat: the element with class ``stat`` taken from one timeline item.

    Returns:
        A ``(stat_content, is_data, has_info_number)`` tuple holding the
        results of looking up the ``textCont``, ``chartCont`` and ``number``
        classes inside ``stat``, in that order. Each entry is whatever
        ``stat.find`` returned for that class.
    """
    stat_content = stat.find(attrs={"class": "textCont"})
    is_data = stat.find(attrs={"class": "chartCont"})
    has_info_number = stat.find(attrs={"class": "number"})
    return stat_content, is_data, has_info_number
# creates a dictionary containing time gaps as keys and each # creates a dictionary containing time gaps as keys and each
# timegap points to a list of riders in that group # timegap points to a list of riders in that group
# def get_situation_long(self):
# page = self.get_race_page()
# if page:
# all = page.find_all(attrs={"class":"situCont"})
# live = all[0]
# situation_long = live.find_all("li")
# # create the dictionary
# self.situation_long = {}
# # last_timegap is used for grouping riders together
# last_timegap = None
# for item in situation_long:
# time_gap = item.find(attrs={"class":"time"})
# rider = item.find(attrs={"class":"maxw200"})
# # time gap is only listed once and subsequent riders in that
# # group don't have one (None)
# if time_gap:
# tg = self.remove_tags(time_gap) # get rid of html tags
# self.situation_long[tg] = [] # create the list inside the dict
# last_timegap = tg # set last timegap for the loop
# rider_name = self.remove_tags(rider) # remove tags from rider name
# # the leading rider is basically the group name i guess?
# # anyway we dont need it twice
# if rider_name not in self.situation_long[last_timegap]:
# # add rider to list of riders under that timegap/group
# self.situation_long[last_timegap].append(rider_name)
def get_situation_long(self): def get_situation_long(self):
page = self.get_race_page() page = self.get_race_page()
if page: if page:
all = page.find_all(attrs={"class":"situCont"}) all = page.find_all(attrs={"class":"situCont"})
live = all[0] if all:
situation_long = live.find_all("li") live = all[0]
groups = live.find_all(attrs={"class":"group"})
self.situation_long = []
for group in groups:
group_name = group.find(attrs={"class":"groupname"})
time = group.find(attrs={"class":"time"})
riders = group.find_all("li")
riders_clean = []
for rider in riders:
rider_name = rider.find(attrs={"class":"maxw180"})
riders_clean.append(self.remove_tags(rider_name))
group_name_clean = self.remove_tags(group_name)
time_clean = self.remove_tags(time).strip("??")
self.situation_long.append([group_name_clean, time_clean, riders_clean])
else:
self.situation_long = None
# create the dictionary
self.situation_long = {}
# last_timegap is used for grouping riders together
last_timegap = None
for item in situation_long:
time_gap = item.find(attrs={"class":"time"})
rider = item.find(attrs={"class":"maxw200"})
# time gap is only listed once and subsequent riders in that
# group don't have one (None)
if time_gap:
tg = self.remove_tags(time_gap) # get rid of html tags
self.situation_long[tg] = [] # create the list inside the dict
last_timegap = tg # set last timegap for the loop
rider_name = self.remove_tags(rider) # remove tags from rider name
# the leading rider is basically the group name i guess?
# anyway we dont need it twice
if rider_name not in self.situation_long[last_timegap]:
# add rider to list of riders under that timegap/group
self.situation_long[last_timegap].append(rider_name)
# remove surrounding html tags from final data points, # remove surrounding html tags from final data points,
# like rider names, race names, etc. # like rider names, race names, etc.