Compare commits
9 Commits
95d14dd48e
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c65fbbb476 | ||
|
|
9c27da9265 | ||
| bc69b237be | |||
|
|
7b6c17e2bf | ||
|
|
04bceacc60 | ||
|
|
331c386bce | ||
|
|
937b07a61d | ||
|
|
f16e75731c | ||
|
|
eab1caa325 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -174,3 +174,4 @@ cython_debug/
|
|||||||
# PyPI configuration file
|
# PyPI configuration file
|
||||||
.pypirc
|
.pypirc
|
||||||
|
|
||||||
|
test.py
|
||||||
|
|||||||
33
README.md
33
README.md
@@ -2,16 +2,16 @@
|
|||||||
|
|
||||||
scrapes procyclingstats livestats homepage and timeline
|
scrapes procyclingstats livestats homepage and timeline
|
||||||
|
|
||||||
timeline updates are simply the titles, so sometimes you get something like "ranking after x km" or "present riders today from last years top 20" with no details. for now i am not getting the details from underneath, because for the most part they can be ignored. the action timeline headings like "wheel change for x rider" are good enough when they come through.
|
if running on a vps (or a vps-like system such as a raspberry pi), it shouldn't return a 403 anymore
|
||||||
|
|
||||||
# todo
|
# todo
|
||||||
|
|
||||||
- [ ] timeline items with details
|
- [x] timeline items with details
|
||||||
- [ ] pypi package ??
|
- [ ] pypi package ??
|
||||||
|
|
||||||
# setup (windows)
|
# setup (windows)
|
||||||
|
|
||||||
after cloning set up a virtualenv
|
after cloning, cd into directory and set up a virtualenv
|
||||||
|
|
||||||
```
|
```
|
||||||
py -3 -m venv .venv
|
py -3 -m venv .venv
|
||||||
@@ -24,18 +24,31 @@ if you get an error while activating the virtualenv, you may need to run this fi
|
|||||||
Set-ExecutionPolicy Unrestricted -Force
|
Set-ExecutionPolicy Unrestricted -Force
|
||||||
```
|
```
|
||||||
|
|
||||||
install dependencies
|
# setup (linux)
|
||||||
|
|
||||||
|
after cloning, cd into directory and create a virtualenv
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m venv venv
|
||||||
|
source venv/bin/activate
|
||||||
|
```
|
||||||
|
|
||||||
|
# install dependencies
|
||||||
|
|
||||||
```
|
```
|
||||||
pip install beautifulsoup4
|
pip install beautifulsoup4
|
||||||
pip install requests
|
pip install requests
|
||||||
|
pip install cloudscraper
|
||||||
```
|
```
|
||||||
|
|
||||||
# usage
|
# examples
|
||||||
|
|
||||||
```
|
in any looping example, use a suitable delay to avoid spamming the site with requests
|
||||||
from pcslive import LiveStats
|
|
||||||
|
|
||||||
stats = LiveStats()
|
`example_latest_timeline.py` : uses a simple infinite loop (async would be better in a full application) to skim the top of the timeline for live updates. it will not display the same update twice
|
||||||
stats.print_races()
|
|
||||||
```
|
`example_live_timeline.py` : another older way to show a live timeline update, less compatible with async routines due to the checking of the last item
|
||||||
|
|
||||||
|
`example_situation.py` : display the provided time gaps (if any) in a readable format
|
||||||
|
|
||||||
|
`example_timeline_all.py` : display the entire timeline of a race (at this moment) - in reverse so that the latest update is at the bottom for console convenience
|
||||||
11
example_latest_timeline.py
Normal file
11
example_latest_timeline.py
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
from pcslive import LiveStats
|
||||||
|
import time
|
||||||
|
|
||||||
|
stats = LiveStats()
|
||||||
|
race = stats.find_race("giro")
|
||||||
|
while True:
|
||||||
|
race.get_timeline()
|
||||||
|
latest = race.timeline_latest()
|
||||||
|
if latest:
|
||||||
|
print(latest)
|
||||||
|
time.sleep(10)
|
||||||
@@ -1,14 +1,17 @@
|
|||||||
from pcslive import LiveStats
|
from pcslive import LiveStats
|
||||||
import time
|
|
||||||
|
|
||||||
stats = LiveStats()
|
stats = LiveStats()
|
||||||
|
|
||||||
# if there are live races...
|
from pcslive import LiveStats
|
||||||
if len(stats.races) > 0:
|
|
||||||
race = stats.races[0]
|
|
||||||
|
|
||||||
race.get_situation_long()
|
stats = LiveStats()
|
||||||
|
race = stats.find_race("ethias")
|
||||||
|
race.get_situation_long()
|
||||||
|
|
||||||
# whats the situation? :)
|
if race.situation_long:
|
||||||
for x in race.situation_long:
|
for group in race.situation_long:
|
||||||
print(x, race.situation_long[x])
|
for item in group:
|
||||||
|
print(item)
|
||||||
|
print("================")
|
||||||
|
else:
|
||||||
|
print("No situation data")
|
||||||
168
pcslive.py
168
pcslive.py
@@ -1,5 +1,5 @@
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import requests, time
|
import requests, time, cloudscraper
|
||||||
|
|
||||||
# class for storing the live races
|
# class for storing the live races
|
||||||
class LiveStats:
|
class LiveStats:
|
||||||
@@ -11,9 +11,13 @@ class LiveStats:
|
|||||||
# it also gets rid of all the horrible polygon stuff
|
# it also gets rid of all the horrible polygon stuff
|
||||||
def refresh_races(self):
|
def refresh_races(self):
|
||||||
# the entire front page of PCS loaded into soup
|
# the entire front page of PCS loaded into soup
|
||||||
req = requests.get("https://www.procyclingstats.com/")
|
#self.req = requests.get("https://www.procyclingstats.com/")
|
||||||
html = req.text
|
#self.html = self.req.text
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
|
||||||
|
self.scraper = cloudscraper.create_scraper()
|
||||||
|
self.req = self.scraper.get("https://www.procyclingstats.com/")
|
||||||
|
self.html = self.req.text
|
||||||
|
soup = BeautifulSoup(self.html, "html.parser")
|
||||||
|
|
||||||
# narrow it down to the little green live stats boxes showing us what
|
# narrow it down to the little green live stats boxes showing us what
|
||||||
# races are currently live
|
# races are currently live
|
||||||
@@ -42,6 +46,13 @@ class LiveStats:
|
|||||||
for race in self.races:
|
for race in self.races:
|
||||||
race.print_stats()
|
race.print_stats()
|
||||||
|
|
||||||
|
# finds a race by its title
|
||||||
|
def find_race(self, query):
|
||||||
|
for race in self.races:
|
||||||
|
if query.lower() in race.title.lower():
|
||||||
|
return race
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -62,6 +73,10 @@ class Race:
|
|||||||
self.timeline = []
|
self.timeline = []
|
||||||
self.situation_long = []
|
self.situation_long = []
|
||||||
|
|
||||||
|
# when using timeline_latest() put already returned updates
|
||||||
|
# inside here so that they dont get repeated
|
||||||
|
self.timeline_latest_store = []
|
||||||
|
|
||||||
# to parse the raw data given by LiveStats
|
# to parse the raw data given by LiveStats
|
||||||
def refresh_info(self):
|
def refresh_info(self):
|
||||||
title_r = self.raw.find(attrs={"class":"title"})
|
title_r = self.raw.find(attrs={"class":"title"})
|
||||||
@@ -100,7 +115,8 @@ class Race:
|
|||||||
if self.url != "None":
|
if self.url != "None":
|
||||||
full_url = "https://www.procyclingstats.com/" + self.url
|
full_url = "https://www.procyclingstats.com/" + self.url
|
||||||
|
|
||||||
req = requests.get(full_url)
|
self.scraper = cloudscraper.create_scraper()
|
||||||
|
req = self.scraper.get(full_url)
|
||||||
html = req.text
|
html = req.text
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
return soup
|
return soup
|
||||||
@@ -112,48 +128,134 @@ class Race:
|
|||||||
page = self.get_race_page()
|
page = self.get_race_page()
|
||||||
if page:
|
if page:
|
||||||
all = page.find_all(attrs={"class":"timeline3cont"})
|
all = page.find_all(attrs={"class":"timeline3cont"})
|
||||||
live = all[0]
|
self.timeline_live = all[0]
|
||||||
timeline = live.find_all("li")
|
timeline = self.timeline_live.find_all("li")
|
||||||
self.timeline = []
|
self.timeline = []
|
||||||
for item in timeline:
|
for item in timeline:
|
||||||
stat = item.find(attrs={"class":"stat"})
|
stat = item.find(attrs={"class":"stat"})
|
||||||
if stat:
|
if stat:
|
||||||
stat_content = stat.find(attrs={"class":"textCont"})
|
stat_content, is_data, has_info_number = self.timeline_stats(stat)
|
||||||
self.timeline.append(self.remove_tags(stat_content))
|
if is_data:
|
||||||
|
pass
|
||||||
|
elif has_info_number:
|
||||||
|
# same as in the timeline_latest function
|
||||||
|
number = self.remove_tags(has_info_number)
|
||||||
|
text = self.remove_tags(stat_content)
|
||||||
|
update = number + " " + text
|
||||||
|
self.timeline.append(update)
|
||||||
|
else:
|
||||||
|
self.timeline.append(self.remove_tags(stat_content))
|
||||||
|
|
||||||
|
# a function for getting only the latest timeline updates!
|
||||||
|
# useful for making an async timeline feed
|
||||||
|
def timeline_latest(self):
|
||||||
|
# its a bit weird if it doesnt display Anything at first run
|
||||||
|
# so if the seen list is empty, just show the latest update
|
||||||
|
# from the full timeline (then add that to seen)
|
||||||
|
# self.timeline_latest_store is the list for storing seen updates
|
||||||
|
if len(self.timeline_latest_store) == 0:
|
||||||
|
update = self.timeline[0]
|
||||||
|
self.timeline_latest_store.append(update)
|
||||||
|
return update
|
||||||
|
|
||||||
|
# now most of this code is identical to the full timeline
|
||||||
|
# except that it just uses find instead of find_all
|
||||||
|
# assuming it finds the first one... which it does
|
||||||
|
latest = self.timeline_live.find("li")
|
||||||
|
stat = latest.find(attrs={"class":"stat"})
|
||||||
|
if stat:
|
||||||
|
stat_content, is_data, has_info_number = self.timeline_stats(stat)
|
||||||
|
if is_data:
|
||||||
|
pass
|
||||||
|
elif has_info_number:
|
||||||
|
# some timeline updates use a big number
|
||||||
|
# like 150 kilometers to the finish
|
||||||
|
# and scraper only finds "kilometers to the finish"
|
||||||
|
# so if there is a big number, get it and add it
|
||||||
|
# there is a drawback of it doing this for every Big Number
|
||||||
|
# but i sorta dont care rn haha
|
||||||
|
number = self.remove_tags(has_info_number)
|
||||||
|
text = self.remove_tags(stat_content)
|
||||||
|
update = number + " " + text
|
||||||
|
|
||||||
|
# only show the update if it hasnt been seen before
|
||||||
|
if update not in self.timeline_latest_store:
|
||||||
|
# then store it so we know we've already seen it
|
||||||
|
self.timeline_latest_store.append(update)
|
||||||
|
return update
|
||||||
|
else:
|
||||||
|
# same as above but without the big number
|
||||||
|
update = self.remove_tags(stat_content)
|
||||||
|
if update not in self.timeline_latest_store:
|
||||||
|
self.timeline_latest_store.append(update)
|
||||||
|
return update
|
||||||
|
return None
|
||||||
|
|
||||||
|
# function for getting specific things from the timeline
|
||||||
|
# it seemed like a good idea to put it here at the time
|
||||||
|
# shrug
|
||||||
|
def timeline_stats(self, stat):
|
||||||
|
stat_content = stat.find(attrs={"class":"textCont"})
|
||||||
|
is_data = stat.find(attrs={"class":"chartCont"})
|
||||||
|
has_info_number = stat.find(attrs={"class":"number"})
|
||||||
|
return stat_content, is_data, has_info_number
|
||||||
|
|
||||||
# creates a dictionary containing time gaps as keys and each
|
# creates a dictionary containing time gaps as keys and each
|
||||||
# timegap points to a list of riders in that group
|
# timegap points to a list of riders in that group
|
||||||
|
# def get_situation_long(self):
|
||||||
|
# page = self.get_race_page()
|
||||||
|
# if page:
|
||||||
|
# all = page.find_all(attrs={"class":"situCont"})
|
||||||
|
# live = all[0]
|
||||||
|
# situation_long = live.find_all("li")
|
||||||
|
|
||||||
|
# # create the dictionary
|
||||||
|
# self.situation_long = {}
|
||||||
|
|
||||||
|
# # last_timegap is used for grouping riders together
|
||||||
|
# last_timegap = None
|
||||||
|
|
||||||
|
# for item in situation_long:
|
||||||
|
# time_gap = item.find(attrs={"class":"time"})
|
||||||
|
# rider = item.find(attrs={"class":"maxw200"})
|
||||||
|
|
||||||
|
# # time gap is only listed once and subsequent riders in that
|
||||||
|
# # group don't have one (None)
|
||||||
|
# if time_gap:
|
||||||
|
# tg = self.remove_tags(time_gap) # get rid of html tags
|
||||||
|
# self.situation_long[tg] = [] # create the list inside the dict
|
||||||
|
# last_timegap = tg # set last timegap for the loop
|
||||||
|
|
||||||
|
# rider_name = self.remove_tags(rider) # remove tags from rider name
|
||||||
|
# # the leading rider is basically the group name i guess?
|
||||||
|
# # anyway we dont need it twice
|
||||||
|
# if rider_name not in self.situation_long[last_timegap]:
|
||||||
|
# # add rider to list of riders under that timegap/group
|
||||||
|
# self.situation_long[last_timegap].append(rider_name)
|
||||||
|
|
||||||
def get_situation_long(self):
|
def get_situation_long(self):
|
||||||
page = self.get_race_page()
|
page = self.get_race_page()
|
||||||
if page:
|
if page:
|
||||||
all = page.find_all(attrs={"class":"situCont"})
|
all = page.find_all(attrs={"class":"situCont"})
|
||||||
live = all[0]
|
if all:
|
||||||
situation_long = live.find_all("li")
|
live = all[0]
|
||||||
|
groups = live.find_all(attrs={"class":"group"})
|
||||||
|
self.situation_long = []
|
||||||
|
for group in groups:
|
||||||
|
group_name = group.find(attrs={"class":"groupname"})
|
||||||
|
time = group.find(attrs={"class":"time"})
|
||||||
|
riders = group.find_all("li")
|
||||||
|
riders_clean = []
|
||||||
|
for rider in riders:
|
||||||
|
rider_name = rider.find(attrs={"class":"maxw180"})
|
||||||
|
riders_clean.append(self.remove_tags(rider_name))
|
||||||
|
group_name_clean = self.remove_tags(group_name)
|
||||||
|
time_clean = self.remove_tags(time).strip("??")
|
||||||
|
self.situation_long.append([group_name_clean, time_clean, riders_clean])
|
||||||
|
else:
|
||||||
|
self.situation_long = None
|
||||||
|
|
||||||
# create the dictionary
|
|
||||||
self.situation_long = {}
|
|
||||||
|
|
||||||
# last_timegap is used for grouping riders together
|
|
||||||
last_timegap = None
|
|
||||||
|
|
||||||
for item in situation_long:
|
|
||||||
time_gap = item.find(attrs={"class":"time"})
|
|
||||||
rider = item.find(attrs={"class":"maxw200"})
|
|
||||||
|
|
||||||
# time gap is only listed once and subsequent riders in that
|
|
||||||
# group don't have one (None)
|
|
||||||
if time_gap:
|
|
||||||
tg = self.remove_tags(time_gap) # get rid of html tags
|
|
||||||
self.situation_long[tg] = [] # create the list inside the dict
|
|
||||||
last_timegap = tg # set last timegap for the loop
|
|
||||||
|
|
||||||
rider_name = self.remove_tags(rider) # remove tags from rider name
|
|
||||||
# the leading rider is basically the group name i guess?
|
|
||||||
# anyway we dont need it twice
|
|
||||||
if rider_name not in self.situation_long[last_timegap]:
|
|
||||||
# add rider to list of riders under that timegap/group
|
|
||||||
self.situation_long[last_timegap].append(rider_name)
|
|
||||||
|
|
||||||
# remove surrounding html tags from final data points,
|
# remove surrounding html tags from final data points,
|
||||||
# like rider names, race names, etc.
|
# like rider names, race names, etc.
|
||||||
|
|||||||
Reference in New Issue
Block a user