"""Scrape live race information from the ProCyclingStats (PCS) website."""

import time
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests

# Root of the PCS site; race URLs found on the homepage are resolved against it.
PCS_BASE_URL = "https://www.procyclingstats.com/"

# Seconds to wait for PCS to answer before giving up. Without a timeout,
# requests.get() can block forever on an unresponsive server.
REQUEST_TIMEOUT = 30


class LiveStats:
    """Collection of the races currently shown in the PCS homepage live box.

    Attributes:
        races: list of Race objects, one per <li> in the live stats box.
        req: the requests.Response of the most recent homepage fetch.
        html: the text of that response.
    """

    def __init__(self):
        self.races = []
        self.refresh_races()

    def refresh_races(self):
        """Re-download the PCS homepage and rebuild ``self.races``.

        Raises:
            requests.RequestException: if the homepage cannot be fetched
                (including a timeout after REQUEST_TIMEOUT seconds).
        """
        # the entire front page of PCS loaded into soup
        self.req = requests.get(PCS_BASE_URL, timeout=REQUEST_TIMEOUT)
        self.html = self.req.text
        soup = BeautifulSoup(self.html, "html.parser")

        # Always start from an empty list so races from a previous refresh
        # do not linger when the live box has disappeared from the page.
        self.races = []

        # narrow it down to the little green live stats boxes showing us
        # what races are currently live
        live = soup.find(attrs={"class": "hp3-livestats"})
        if live is None:
            return

        # Drop the elements with class "inverse". They hold long runs of
        # polygon numbers (presumably how the profiles are drawn) and are
        # not needed for any of the fields parsed by Race.
        for tag in live.find_all(attrs={"class": "inverse"}):
            tag.decompose()

        # conveniently, every race is a list item (<li>). inside is the
        # title, status (live/finished), km to go and brief situation text,
        # but that is parsed inside Race
        self.races = [Race(raw_race) for raw_race in live.find_all("li")]

    def print_races(self):
        """Print the summary of every race in ``self.races``."""
        for race in self.races:
            race.print_stats()


class Race:
    """Information about one race taken from the PCS live stats box.

    Text fields are produced by remove_tags(), which turns a missing element
    into the literal string ``"None"`` (not the ``None`` object); the checks
    in print_stats() and get_race_page() rely on that sentinel.

    Attributes:
        raw: the unparsed <li> element handed over by LiveStats.
        title: name of the race.
        status: live or finished (finished races stay online for a short
            while after finishing).
        togo: kms to go, ``"None"`` if finished.
        situation: brief homepage summary of the situation, ``"None"`` if
            finished.
        url: href of the race's full live stats page, ``"None"`` if the
            list item contains no link.
        timeline: headings from the race's page, filled by get_timeline().
        situation_long: mapping of time gap -> list of rider names, filled
            by get_situation_long() (not to be confused with ``situation``).
    """

    def __init__(self, raw):
        # this is the unparsed html from LiveStats
        self.raw = raw

        # kept empty until explicitly requested through get_timeline() /
        # get_situation_long(), because each needs an extra page download
        self.timeline = []
        self.situation_long = {}

        # now go parse it
        self.refresh_info()

    def refresh_info(self):
        """Parse ``self.raw`` into title, status, togo, situation and url."""
        title_r = self.raw.find(attrs={"class": "title"})
        status_r = self.raw.find(attrs={"class": "status"})
        togo_r = self.raw.find(attrs={"class": "togo"})
        situation_r = self.raw.find(attrs={"class": "situ_txt"})
        url_r = self.raw.find("a", href=True)

        self.title = self.remove_tags(title_r)
        self.status = self.remove_tags(status_r)
        self.togo = self.remove_tags(togo_r)
        self.situation = self.remove_tags(situation_r)

        # A list item without a link must not crash the parse; passing None
        # yields the same "None" sentinel used for every other missing field.
        href = url_r["href"] if url_r is not None else None
        self.url = self.remove_tags(href)

    def print_stats(self):
        """Print title and status, plus kms to go and situation if present."""
        print(self.title)
        print(self.status)
        # dont bother showing kms to go or situation if finished
        if self.togo != "None":
            print(self.togo, "to go")
            print(self.situation)
        print("===============")

    def get_race_page(self):
        """Download and parse this race's full live stats page.

        Used by get_timeline() and get_situation_long().

        Returns:
            A BeautifulSoup object for the page, or None when the race has
            no url.

        Raises:
            requests.RequestException: if the page cannot be fetched
                (including a timeout after REQUEST_TIMEOUT seconds).
        """
        if self.url == "None":
            return None

        # urljoin copes with relative hrefs, hrefs starting with "/" and
        # absolute hrefs alike; plain concatenation only handled the first.
        full_url = urljoin(PCS_BASE_URL, self.url)
        response = requests.get(full_url, timeout=REQUEST_TIMEOUT)
        return BeautifulSoup(response.text, "html.parser")

    def get_timeline(self):
        """Fill ``self.timeline`` with the displayed timeline headings.

        Items are stored in page order; per the original author's note,
        item 0 is the most recent update. Leaves ``self.timeline`` untouched
        when the race has no url, and sets it to an empty list when the page
        has no timeline container.
        """
        page = self.get_race_page()
        if page is None:
            return

        # only the first timeline container on the page is used
        container = page.find(attrs={"class": "timeline3cont"})
        headings = []
        if container is not None:
            for item in container.find_all("li"):
                stat = item.find(attrs={"class": "stat"})
                # list items without a "stat" element are skipped
                if stat is not None:
                    heading = stat.find(attrs={"class": "textCont"})
                    headings.append(self.remove_tags(heading))
        self.timeline = headings

    def get_situation_long(self):
        """Fill ``self.situation_long`` with riders grouped by time gap.

        The result is a dictionary whose keys are time gap strings and whose
        values are lists of rider names in that group. Leaves
        ``self.situation_long`` untouched when the race has no url, and sets
        it to an empty dictionary when the page has no situation container.
        """
        page = self.get_race_page()
        if page is None:
            return

        # only the first situation container on the page is used
        container = page.find(attrs={"class": "situCont"})
        groups = {}
        if container is not None:
            # the time gap is only listed once; subsequent riders in that
            # group don't have one, so remember the last gap seen
            current_gap = None
            for item in container.find_all("li"):
                time_gap = item.find(attrs={"class": "time"})
                rider = item.find(attrs={"class": "maxw200"})

                if time_gap is not None:
                    current_gap = self.remove_tags(time_gap)
                    # setdefault keeps riders already collected should the
                    # same gap label show up a second time
                    groups.setdefault(current_gap, [])

                # nothing to record without a rider, or before the first
                # time gap has established a group to put the rider in
                if rider is None or current_gap is None:
                    continue

                rider_name = self.remove_tags(rider)
                # the leading rider can appear again inside its own group;
                # it is not needed twice
                if rider_name not in groups[current_gap]:
                    groups[current_gap].append(rider_name)
        self.situation_long = groups

    def remove_tags(self, text):
        """Return the visible text of ``text`` with all html tags removed.

        Used on final data points such as rider names and race names.

        Args:
            text: a BeautifulSoup element, a string, or None. It is passed
                through str() first, so None becomes the string ``"None"``.

        Returns:
            The stripped text fragments joined by single spaces, with the
            contents of <style> and <script> elements left out.
        """
        text_soup = BeautifulSoup(str(text), "html.parser")
        for data in text_soup(["style", "script"]):
            data.decompose()
        return " ".join(text_soup.stripped_strings)