diff --git a/README.md b/README.md index e57612a..aee32ec 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,3 @@ stats.print_races() `example.py` posts the latest timeline of one current race (checking every 10 seconds for updates) `example2.py` organises the situation diagram in a dictionary of timegap keys to a list of riders in that group. it could probably be organised better but it works - - - diff --git a/example.py b/example.py index d45f677..4d22f30 100644 --- a/example.py +++ b/example.py @@ -3,15 +3,21 @@ import time stats = LiveStats() +# if there are live races... if len(stats.races) > 0: - race = stats.races[0] + race = stats.races[0] # just grab the first one for example's sake print("Latest timeline update from", race.title, ":") - + # keep the last update so we don't spam the terminal last_update = "" while True: race.get_timeline() + + # only update terminal if the newest item has changed (is new) if last_update != race.timeline[0]: print(race.timeline[0]) last_update = race.timeline[0] + + # wait ten seconds before checking again + # maybe set to longer if running for extended time time.sleep(10) \ No newline at end of file diff --git a/example2.py b/example2.py index ba53c3a..28fc1d7 100644 --- a/example2.py +++ b/example2.py @@ -3,10 +3,12 @@ import time stats = LiveStats() +# if there are live races... if len(stats.races) > 0: race = stats.races[0] race.get_situation_long() + # whats the situation? :) for x in race.situation_long: print(x, race.situation_long[x]) \ No newline at end of file diff --git a/pcslive.py b/pcslive.py index 784f205..3dc5fdf 100644 --- a/pcslive.py +++ b/pcslive.py @@ -1,42 +1,65 @@ from bs4 import BeautifulSoup import requests, time +# class for storing the live races class LiveStats: def __init__(self): - self.refresh_live() - self.get_races() + self.refresh_races() - self.timeline = [] - self.situation_long = [] - - def refresh_live(self): + # scrapes from the PCS homepage where the little green live stats boxes are + # it also gets rid of all the horrible polygon stuff + def refresh_races(self): + # the entire front page of PCS loaded into soup req = requests.get("https://www.procyclingstats.com/") html = req.text soup = BeautifulSoup(html, "html.parser") - all = soup.find_all(attrs={"class":"hp3-livestats"}) - live = all[0] + # narrow it down to the little green live stats boxes showing us what + # races are currently live + live = soup.find(attrs={"class":"hp3-livestats"}) + + # also get rid of the horrible polygon number stuff !!! YUCK + # i imagine its how the profiles are drawn. there is A LOT + # and it makes reading the raw html very PAINFUL for tag in live.find_all(attrs={"class":"inverse"}): tag.decompose() - self.live = live - - def get_races(self): - races_raw = self.live.find_all("li") + # conveniently, every race is a list item (
  • ) + # inside is the title, status (live/finished), km to go, + # and brief situation text, but we parse that inside Race + races = live.find_all("li") self.races = [] - for race in races_raw: - this_race = Race(race) - self.races.append(this_race) + for raw_race in races: + # for now just create the Race object and add it to our list + race = Race(raw_race) + self.races.append(race) + # straightforward def print_races(self): for race in self.races: race.print_stats() + + + + +# class for storing information about individual live races class Race: def __init__(self, raw): + # this is the unparsed html from LiveStats self.raw = raw + + # now go parse it self.refresh_info() + # keeping these empty unless requested becauses i feel like it + # timeline is headings from the race's page (left side) + # situation_long is groups from the left side and time gaps + # (not to be confused with situation which is a brief homepage summary) + self.timeline = [] + self.situation_long = [] + + # to parse the raw data given by LiveStats def refresh_info(self): title_r = self.raw.find(attrs={"class":"title"}) status_r = self.raw.find(attrs={"class":"status"}) @@ -44,25 +67,32 @@ class Race: situation_r = self.raw.find(attrs={"class":"situ_txt"}) url_r = self.raw.find("a", href=True) - self.title = self.remove_tags(title_r) - self.status = self.remove_tags(status_r) - self.togo = self.remove_tags(togo_r) - self.situation = self.remove_tags(situation_r) - self.url = self.remove_tags(url_r["href"]) + self.title = self.remove_tags(title_r) # name of the race + self.status = self.remove_tags(status_r) # live or finished (finished races stay online for a short while after finishing) + self.togo = self.remove_tags(togo_r) # how many kms to go (None if finished) + self.situation = self.remove_tags(situation_r) # a brief summary of the situation - None if finished, and - if complete peloton + self.url = self.remove_tags(url_r["href"]) # the url for the race's full live stats page + # self explanatory def print_stats(self): print(self.title) print(self.status) + + # dont bother showing kms to go or situation if finished if self.togo != "None": print(self.togo, "to go") print(self.situation) + print("===============") - def print_raw(self): - print(self.raw) - print("") - print(self.url) + # for testing + # def print_raw(self): + # print(self.raw) + # print("") + # print(self.url) + # this gets the html of the entire page of stats for this particular race + # for use with get_timeline() and get_situation_long() def get_race_page(self): if self.url != "None": full_url = "https://www.procyclingstats.com/" + self.url @@ -73,6 +103,8 @@ class Race: return soup return None + # fills a list with all displayed timeline items (headings only) + # item 0 is the most recent update def get_timeline(self): page = self.get_race_page() if page: @@ -86,31 +118,42 @@ class Race: stat_content = stat.find(attrs={"class":"textCont"}) self.timeline.append(self.remove_tags(stat_content)) + + # creates a dictionary containing time gaps as keys and each + # timegap points to a list of riders in that group def get_situation_long(self): page = self.get_race_page() if page: all = page.find_all(attrs={"class":"situCont"}) live = all[0] situation_long = live.find_all("li") + + # create the dictionary self.situation_long = {} + + # last_timegap is used for grouping riders together last_timegap = None + for item in situation_long: - #print(item) time_gap = item.find(attrs={"class":"time"}) - group_name = item.find(attrs={"class":"groupname"}) rider = item.find(attrs={"class":"maxw200"}) + # time gap is only listed once and subsequent riders in that + # group don't have one (None) if time_gap: - tg = self.remove_tags(time_gap) - self.situation_long[tg] = [] - last_timegap = tg - rider_name = self.remove_tags(rider) + tg = self.remove_tags(time_gap) # get rid of html tags + self.situation_long[tg] = [] # create the list inside the dict + last_timegap = tg # set last timegap for the loop + + rider_name = self.remove_tags(rider) # remove tags from rider name + # the leading rider is basically the group name i guess? + # anyway we dont need it twice if rider_name not in self.situation_long[last_timegap]: + # add rider to list of riders under that timegap/group self.situation_long[last_timegap].append(rider_name) - #print(self.remove_tags(time_gap), self.remove_tags(group_name), self.remove_tags(riders)) - - + # remove surrounding html tags from final data points, + # like rider names, race names, etc. def remove_tags(self, text): text = str(text) text_soup = BeautifulSoup(text, "html.parser")