267 lines
11 KiB
Python
267 lines
11 KiB
Python
from bs4 import BeautifulSoup
|
|
import requests, time, cloudscraper
|
|
|
|
# class for storing the live races
|
|
class LiveStats:
    """Collection of the races shown in the live stats boxes on the PCS homepage.

    Attributes:
        races: list of Race objects built by the most recent refresh_races().
        scraper, req, html: the cloudscraper session, response and raw HTML
            of the most recent homepage fetch.
    """

    def __init__(self):
        """Create the collection and immediately fetch the live races."""
        self.races = []
        self.refresh_races()

    def refresh_races(self):
        """Re-scrape the PCS homepage and rebuild self.races.

        After this call self.races holds one Race per <li> found inside the
        "hp3-livestats" element, or is empty when that element is absent.
        """
        self.scraper = cloudscraper.create_scraper()
        self.req = self.scraper.get("https://www.procyclingstats.com/")
        self.html = self.req.text
        soup = BeautifulSoup(self.html, "html.parser")

        # reset up front so a refresh that finds no live box does not leave
        # races from a previous refresh behind
        self.races = []

        # narrow it down to the little green live stats boxes
        live = soup.find(attrs={"class": "hp3-livestats"})
        if live is None:
            return

        # get rid of the polygon number elements (class "inverse"); there are
        # a lot of them and they are not needed for parsing
        for tag in live.find_all(attrs={"class": "inverse"}):
            tag.decompose()

        # every race is a list item (<li>); the details are parsed inside Race
        self.races = [Race(raw_race) for raw_race in live.find_all("li")]

    def print_races(self):
        """Print the stats of every stored race via Race.print_stats()."""
        for race in self.races:
            race.print_stats()

    def find_race(self, query):
        """Return the first race whose title contains query, ignoring case.

        Args:
            query: text to look for inside the race titles.

        Returns:
            The first matching race from self.races, or None if none match.
        """
        needle = query.lower()
        for race in self.races:
            if needle in race.title.lower():
                return race
        return None
|
|
|
|
|
|
|
|
|
|
|
|
# class for storing information about individual live races
|
|
class Race:
    """Information about one live race, parsed from its homepage <li> element.

    Text fields are produced by remove_tags(), so a missing element is stored
    as the string "None" (not the None object); the methods below compare
    against that string.

    Attributes:
        raw: the unparsed element handed over by LiveStats.
        title: name of the race.
        status: live or finished (per the homepage box).
        togo: kilometres to go ("None" if finished).
        situation: brief homepage summary of the situation.
        url: href of the race's full live stats page ("None" if no link).
        timeline: headings from the race page, item 0 is the most recent;
            empty until get_timeline() is called.
        timeline_live: the timeline container element from the last
            get_timeline() call, or None.
        timeline_latest_store: updates already returned by timeline_latest().
        situation_long: groups and time gaps from the race page; empty until
            get_situation_long() is called.
    """

    def __init__(self, raw):
        """Store the raw element and parse the homepage fields from it."""
        # this is the unparsed html from LiveStats
        self.raw = raw
        self.refresh_info()

        # kept empty until explicitly requested
        # (situation_long is not to be confused with situation, which is the
        # brief homepage summary)
        self.timeline = []
        self.situation_long = []

        # set by get_timeline(); initialised here so timeline_latest() can be
        # called safely before the first get_timeline()
        self.timeline_live = None

        # updates already returned by timeline_latest(), so they are not
        # repeated
        self.timeline_latest_store = []

    def refresh_info(self):
        """Parse title, status, km to go, situation and url out of self.raw."""
        find = self.raw.find
        self.title = self.remove_tags(find(attrs={"class": "title"}))
        self.status = self.remove_tags(find(attrs={"class": "status"}))
        self.togo = self.remove_tags(find(attrs={"class": "togo"}))
        self.situation = self.remove_tags(find(attrs={"class": "situ_txt"}))

        # an item without a link used to crash on link["href"]; store the
        # same "None" sentinel that get_race_page() already checks for
        link = find("a", href=True)
        self.url = self.remove_tags(link["href"]) if link is not None else "None"

    def print_stats(self):
        """Print title and status, plus km to go and situation when available."""
        print(self.title)
        print(self.status)

        # dont bother showing kms to go or situation if finished
        if self.togo != "None":
            print(self.togo, "to go")
            print(self.situation)

        print("===============")

    def get_race_page(self):
        """Fetch and parse the full live stats page of this race.

        Used by get_timeline() and get_situation_long().

        Returns:
            A BeautifulSoup document, or None when the race has no url.
        """
        if self.url == "None":
            return None

        from urllib.parse import urljoin

        # urljoin gives the same result as plain concatenation for a relative
        # href and also copes with a leading slash or an absolute href
        full_url = urljoin("https://www.procyclingstats.com/", self.url)

        self.scraper = cloudscraper.create_scraper()
        req = self.scraper.get(full_url)
        return BeautifulSoup(req.text, "html.parser")

    def _format_update(self, item):
        """Return the text of one timeline <li>, or None if it has none.

        None is returned when the item has no "stat" element or when the
        stat holds a chart ("chartCont") instead of text.
        """
        stat = item.find(attrs={"class": "stat"})
        if stat is None:
            return None

        stat_content, is_data, has_info_number = self.timeline_stats(stat)
        if is_data is not None:
            return None

        text = self.remove_tags(stat_content)
        if has_info_number is not None:
            # some updates show a big number separately from the text
            # (e.g. "150" + "kilometers to the finish"), so glue them together;
            # this is done for every such number
            return self.remove_tags(has_info_number) + " " + text
        return text

    def get_timeline(self):
        """Fill self.timeline with all displayed timeline headings.

        Item 0 is the most recent update. Nothing changes when the race page
        cannot be fetched; when the page has no "timeline3cont" element the
        timeline is cleared instead of raising IndexError.
        """
        page = self.get_race_page()
        if page is None:
            return

        containers = page.find_all(attrs={"class": "timeline3cont"})
        if not containers:
            self.timeline_live = None
            self.timeline = []
            return

        self.timeline_live = containers[0]
        updates = []
        for item in self.timeline_live.find_all("li"):
            update = self._format_update(item)
            if update is not None:
                updates.append(update)
        self.timeline = updates

    def timeline_latest(self):
        """Return the newest timeline update not returned before, else None.

        Useful for building a timeline feed. Relies on the data gathered by
        the last get_timeline() call and does not fetch anything itself.
        """
        # its a bit weird if it doesnt display anything at first run, so if
        # nothing has been seen yet just show the latest update from the
        # full timeline (then add that to seen)
        if not self.timeline_latest_store:
            if not self.timeline:
                return None
            update = self.timeline[0]
            self.timeline_latest_store.append(update)
            return update

        if self.timeline_live is None:
            return None

        # find() returns the first <li>, which is the most recent item
        latest = self.timeline_live.find("li")
        if latest is None:
            return None

        update = self._format_update(latest)
        # only show the update if it hasnt been seen before
        if update is None or update in self.timeline_latest_store:
            return None

        self.timeline_latest_store.append(update)
        return update

    def timeline_stats(self, stat):
        """Look up the three parts of a timeline "stat" element.

        Returns:
            A tuple (stat_content, is_data, has_info_number): the results of
            searching stat for the classes "textCont", "chartCont" and
            "number" respectively.
        """
        stat_content = stat.find(attrs={"class": "textCont"})
        is_data = stat.find(attrs={"class": "chartCont"})
        has_info_number = stat.find(attrs={"class": "number"})
        return stat_content, is_data, has_info_number

    def get_situation_long(self):
        """Fill self.situation_long with the groups listed on the race page.

        The result is a list of [group_name, time_gap, rider_names] entries.
        It is set to None when the page has no "situCont" element and left
        untouched when the race page cannot be fetched.
        """
        page = self.get_race_page()
        if page is None:
            return

        containers = page.find_all(attrs={"class": "situCont"})
        if not containers:
            self.situation_long = None
            return

        situation = []
        for group in containers[0].find_all(attrs={"class": "group"}):
            group_name = self.remove_tags(group.find(attrs={"class": "groupname"}))
            # NOTE(review): the "??" literal looks like mis-encoded characters
            # from the original file; kept byte-for-byte - confirm intent
            gap = self.remove_tags(group.find(attrs={"class": "time"})).strip("??")

            riders = []
            for rider in group.find_all("li"):
                name_tag = rider.find(attrs={"class": "maxw180"})
                # skip items without a name element rather than storing the
                # literal string "None" as a rider
                if name_tag is not None:
                    riders.append(self.remove_tags(name_tag))

            situation.append([group_name, gap, riders])

        self.situation_long = situation

    def remove_tags(self, text):
        """Return the visible text of an html fragment, space separated.

        Used on final data points like rider names, race names, etc. The
        argument is converted with str() first, so passing None yields the
        string "None". <style> and <script> contents are dropped.
        """
        text_soup = BeautifulSoup(str(text), "html.parser")
        for data in text_soup(["style", "script"]):
            data.decompose()
        return " ".join(text_soup.stripped_strings)