"""Scrape live race information from the ProCyclingStats (PCS) website.

``LiveStats`` reads the PCS homepage and builds one ``Race`` object per entry
in the "live stats" box.  Each ``Race`` can then fetch its own live page to
obtain the timeline of updates and the detailed race situation (groups and
time gaps).
"""

import time
from urllib.parse import urljoin

import cloudscraper
import requests
from bs4 import BeautifulSoup

# Root of the site; race URLs found on the homepage are resolved against it.
PCS_BASE_URL = "https://www.procyclingstats.com/"


class LiveStats:
    """Collection of the races currently shown in the PCS live stats box."""

    def __init__(self):
        self.races = []
        self.refresh_races()

    def refresh_races(self):
        """Re-download the PCS homepage and rebuild ``self.races``.

        After the call ``self.races`` holds one ``Race`` per ``<li>`` found in
        the live stats box, or is empty when the box is not on the page.
        """
        # cloudscraper replaced a plain requests.get() here, presumably
        # because the site rejects ordinary clients -- TODO confirm.
        self.scraper = cloudscraper.create_scraper()
        self.req = self.scraper.get(PCS_BASE_URL)
        self.html = self.req.text
        soup = BeautifulSoup(self.html, "html.parser")

        # Start from a clean slate so races that disappeared from the page
        # do not linger from an earlier refresh.
        self.races = []

        # Narrow the page down to the box listing the currently live races.
        live = soup.find(attrs={"class": "hp3-livestats"})
        if not live:
            return

        # Drop the elements of class "inverse": they contain a large amount
        # of polygon number data (apparently used to draw the profiles) that
        # is of no use here and makes the raw HTML hard to read.
        for tag in live.find_all(attrs={"class": "inverse"}):
            tag.decompose()

        # Every race is a list item (<li>) holding the title, status
        # (live/finished), km to go and a brief situation text; the parsing
        # of those fields happens inside Race.
        self.races = [
            Race(raw_race, scraper=self.scraper)
            for raw_race in live.find_all("li")
        ]

    def print_races(self):
        """Print the summary of every known race."""
        for race in self.races:
            race.print_stats()

    def find_race(self, query):
        """Return the first race whose title contains ``query``.

        The match is case-insensitive.  Returns ``None`` when nothing matches.
        """
        needle = query.lower()
        for race in self.races:
            if needle in race.title.lower():
                return race
        return None


class Race:
    """Information about one individual live race.

    Text fields are produced by ``remove_tags``; a field whose element is
    missing from the HTML therefore holds the string ``"None"`` (not the
    ``None`` object).
    """

    def __init__(self, raw, scraper=None):
        """Parse ``raw``, the unparsed ``<li>`` element from ``LiveStats``.

        ``scraper`` is the HTTP client used to download the race page.  When
        omitted, one is created the first time a page is requested.
        """
        self.raw = raw
        self.scraper = scraper
        self.refresh_info()

        # These stay empty until explicitly requested.
        # timeline: headings from the race page, most recent first.
        # situation_long: groups and time gaps from the race page (not to be
        # confused with ``situation``, the brief homepage summary).
        self.timeline = []
        self.situation_long = {}

        # Timeline container found by the last get_timeline() call.
        self.timeline_live = None

        # Updates already returned by timeline_latest(), so that they are
        # not repeated.
        self.timeline_latest_store = []

    def refresh_info(self):
        """Parse the homepage fields out of ``self.raw``."""
        title_r = self.raw.find(attrs={"class": "title"})
        status_r = self.raw.find(attrs={"class": "status"})
        togo_r = self.raw.find(attrs={"class": "togo"})
        situation_r = self.raw.find(attrs={"class": "situ_txt"})
        url_r = self.raw.find("a", href=True)

        # Name of the race.
        self.title = self.remove_tags(title_r)
        # Live or finished (finished races stay online for a short while).
        self.status = self.remove_tags(status_r)
        # How many km to go ("None" if finished).
        self.togo = self.remove_tags(togo_r)
        # Brief summary of the situation ("None" if finished).
        self.situation = self.remove_tags(situation_r)
        # URL of the race's full live stats page; "None" when the list item
        # contains no link.
        self.url = self.remove_tags(url_r["href"]) if url_r else "None"

    def print_stats(self):
        """Print the title, status and (while running) km to go/situation."""
        print(self.title)
        print(self.status)
        # Kilometres to go and situation are not shown for finished races.
        if self.togo != "None":
            print(self.togo, "to go")
            print(self.situation)
        print("===============")

    def get_race_page(self):
        """Download and parse this race's live stats page.

        Returns the parsed page, or ``None`` when the race has no URL.  Used
        by ``get_timeline()`` and ``get_situation_long()``.
        """
        if self.url == "None":
            return None
        if self.scraper is None:
            self.scraper = cloudscraper.create_scraper()
        # urljoin copes with relative, root-relative and absolute hrefs.
        full_url = urljoin(PCS_BASE_URL, self.url)
        req = self.scraper.get(full_url)
        return BeautifulSoup(req.text, "html.parser")

    def get_timeline(self):
        """Fill ``self.timeline`` with the displayed timeline headings.

        Item 0 is the most recent update.  When the page or its timeline
        container is unavailable the previous data is left untouched.
        """
        page = self.get_race_page()
        if not page:
            return
        container = page.find(attrs={"class": "timeline3cont"})
        if container is None:
            return

        self.timeline_live = container
        self.timeline = []
        for item in container.find_all("li"):
            update = self._format_update(item)
            if update is not None:
                self.timeline.append(update)

    def timeline_latest(self):
        """Return the newest timeline update not returned before, else None.

        Works on the data stored by the last ``get_timeline()`` call, so call
        that first to pick up new updates.  On the very first call the most
        recent entry of the full timeline is returned, so that a feed has
        something to show straight away.
        """
        if not self.timeline_latest_store:
            if not self.timeline:
                return None
            update = self.timeline[0]
            self.timeline_latest_store.append(update)
            return update

        if self.timeline_live is None:
            return None
        # find() returns the first <li>, i.e. the most recent entry.
        latest = self.timeline_live.find("li")
        if latest is None:
            return None

        update = self._format_update(latest)
        # Only report the update if it has not been seen before.
        if update is None or update in self.timeline_latest_store:
            return None
        self.timeline_latest_store.append(update)
        return update

    def _format_update(self, item):
        """Turn one timeline ``<li>`` into its text, or None to skip it."""
        stat = item.find(attrs={"class": "stat"})
        if stat is None:
            return None
        stat_content, is_data, has_info_number = self.timeline_stats(stat)
        if is_data:
            # Chart entries carry no heading text.
            return None
        text = self.remove_tags(stat_content)
        if has_info_number:
            # Some updates show a big number separately from the text, e.g.
            # "150" next to "kilometers to the finish"; put it back in front.
            # Known drawback: this is done for every such number.
            return self.remove_tags(has_info_number) + " " + text
        return text

    def timeline_stats(self, stat):
        """Return the (textCont, chartCont, number) children of ``stat``.

        Each element of the tuple is ``None`` when the child is absent.
        """
        stat_content = stat.find(attrs={"class": "textCont"})
        is_data = stat.find(attrs={"class": "chartCont"})
        has_info_number = stat.find(attrs={"class": "number"})
        return stat_content, is_data, has_info_number

    def get_situation_long(self):
        """Fill ``self.situation_long`` with ``{time gap: [rider names]}``.

        When the page or its situation container is unavailable the previous
        data is left untouched.
        """
        page = self.get_race_page()
        if not page:
            return
        live = page.find(attrs={"class": "situCont"})
        if live is None:
            return

        self.situation_long = {}
        # The time gap is listed only once per group; following riders of the
        # same group have none, so remember the last one seen.
        last_timegap = None
        for item in live.find_all("li"):
            time_gap = item.find(attrs={"class": "time"})
            rider = item.find(attrs={"class": "maxw200"})

            if time_gap is not None:
                last_timegap = self.remove_tags(time_gap)
                self.situation_long.setdefault(last_timegap, [])

            # Skip entries without a rider, or listed before any time gap.
            if rider is None or last_timegap is None:
                continue

            rider_name = self.remove_tags(rider)
            group = self.situation_long[last_timegap]
            # The leading rider also acts as the group name; do not list
            # the same name twice.
            if rider_name not in group:
                group.append(rider_name)

    def remove_tags(self, text):
        """Return the visible text of ``text`` with all HTML tags removed.

        ``text`` is converted with ``str()`` first, so passing ``None``
        yields the string ``"None"``.  ``<style>`` and ``<script>`` contents
        are discarded; the remaining strings are joined with single spaces.
        """
        text_soup = BeautifulSoup(str(text), "html.parser")
        for data in text_soup(["style", "script"]):
            data.decompose()
        return " ".join(text_soup.stripped_strings)