diff --git a/README.md b/README.md
index e57612a..aee32ec 100644
--- a/README.md
+++ b/README.md
@@ -42,6 +42,3 @@ stats.print_races()
`example.py` posts the latest timeline of one current race (checking every 10 seconds for updates)
`example2.py` organises the situation diagram in a dictionary of timegap keys to a list of riders in that group. it could probably be organised better but it works
-
-
-
diff --git a/example.py b/example.py
index d45f677..4d22f30 100644
--- a/example.py
+++ b/example.py
@@ -3,15 +3,21 @@ import time
stats = LiveStats()
+# if there are live races...
if len(stats.races) > 0:
- race = stats.races[0]
+ race = stats.races[0] # just grab the first one for example's sake
print("Latest timeline update from", race.title, ":")
-
+ # keep the last update so we don't spam the terminal
last_update = ""
while True:
race.get_timeline()
+
+ # only update terminal if the newest item has changed (is new)
if last_update != race.timeline[0]:
print(race.timeline[0])
last_update = race.timeline[0]
+
+ # wait ten seconds before checking again
+ # maybe set to longer if running for extended time
time.sleep(10)
\ No newline at end of file
diff --git a/example2.py b/example2.py
index ba53c3a..28fc1d7 100644
--- a/example2.py
+++ b/example2.py
@@ -3,10 +3,12 @@ import time
stats = LiveStats()
+# if there are live races...
if len(stats.races) > 0:
race = stats.races[0]
race.get_situation_long()
+ # what's the situation? :)
for x in race.situation_long:
print(x, race.situation_long[x])
\ No newline at end of file
diff --git a/pcslive.py b/pcslive.py
index 784f205..3dc5fdf 100644
--- a/pcslive.py
+++ b/pcslive.py
@@ -1,42 +1,65 @@
from bs4 import BeautifulSoup
import requests, time
+# class for storing the live races
class LiveStats:
def __init__(self):
- self.refresh_live()
- self.get_races()
+ self.refresh_races()
- self.timeline = []
- self.situation_long = []
-
- def refresh_live(self):
+ # scrapes from the PCS homepage where the little green live stats boxes are
+ # it also gets rid of all the horrible polygon stuff
+ def refresh_races(self):
+ # the entire front page of PCS loaded into soup
req = requests.get("https://www.procyclingstats.com/")
html = req.text
soup = BeautifulSoup(html, "html.parser")
- all = soup.find_all(attrs={"class":"hp3-livestats"})
- live = all[0]
+ # narrow it down to the little green live stats boxes showing us what
+ # races are currently live
+ live = soup.find(attrs={"class":"hp3-livestats"})
+
+ # also get rid of the horrible polygon number stuff !!! YUCK
+ # i imagine its how the profiles are drawn. there is A LOT
+ # and it makes reading the raw html very PAINFUL
for tag in live.find_all(attrs={"class":"inverse"}):
tag.decompose()
- self.live = live
-
- def get_races(self):
- races_raw = self.live.find_all("li")
+ # conveniently, every race is a list item (<li>)
+ # inside is the title, status (live/finished), km to go,
+ # and brief situation text, but we parse that inside Race
+ races = live.find_all("li")
self.races = []
- for race in races_raw:
- this_race = Race(race)
- self.races.append(this_race)
+ for raw_race in races:
+ # for now just create the Race object and add it to our list
+ race = Race(raw_race)
+ self.races.append(race)
+ # straightforward
def print_races(self):
for race in self.races:
race.print_stats()
+
+
+
+
+# class for storing information about individual live races
class Race:
def __init__(self, raw):
+ # this is the unparsed html from LiveStats
self.raw = raw
+
+ # now go parse it
self.refresh_info()
+ # keeping these empty unless requested because i feel like it
+ # timeline is headings from the race's page (left side)
+ # situation_long is groups from the left side and time gaps
+ # (not to be confused with situation which is a brief homepage summary)
+ self.timeline = []
+ self.situation_long = []
+
+ # to parse the raw data given by LiveStats
def refresh_info(self):
title_r = self.raw.find(attrs={"class":"title"})
status_r = self.raw.find(attrs={"class":"status"})
@@ -44,25 +67,32 @@ class Race:
situation_r = self.raw.find(attrs={"class":"situ_txt"})
url_r = self.raw.find("a", href=True)
- self.title = self.remove_tags(title_r)
- self.status = self.remove_tags(status_r)
- self.togo = self.remove_tags(togo_r)
- self.situation = self.remove_tags(situation_r)
- self.url = self.remove_tags(url_r["href"])
+ self.title = self.remove_tags(title_r) # name of the race
+ self.status = self.remove_tags(status_r) # live or finished (finished races stay online for a short while after finishing)
+ self.togo = self.remove_tags(togo_r) # how many kms to go (None if finished)
+ self.situation = self.remove_tags(situation_r) # a brief summary of the situation - "None" if finished, and "-" if complete peloton
+ self.url = self.remove_tags(url_r["href"]) # the url for the race's full live stats page
+ # self explanatory
def print_stats(self):
print(self.title)
print(self.status)
+
+ # don't bother showing kms to go or situation if finished
if self.togo != "None":
print(self.togo, "to go")
print(self.situation)
+
print("===============")
- def print_raw(self):
- print(self.raw)
- print("")
- print(self.url)
+ # for testing
+ # def print_raw(self):
+ # print(self.raw)
+ # print("")
+ # print(self.url)
+ # this gets the html of the entire page of stats for this particular race
+ # for use with get_timeline() and get_situation_long()
def get_race_page(self):
if self.url != "None":
full_url = "https://www.procyclingstats.com/" + self.url
@@ -73,6 +103,8 @@ class Race:
return soup
return None
+ # fills a list with all displayed timeline items (headings only)
+ # item 0 is the most recent update
def get_timeline(self):
page = self.get_race_page()
if page:
@@ -86,31 +118,42 @@ class Race:
stat_content = stat.find(attrs={"class":"textCont"})
self.timeline.append(self.remove_tags(stat_content))
+
+ # creates a dictionary containing time gaps as keys and each
+ # timegap points to a list of riders in that group
def get_situation_long(self):
page = self.get_race_page()
if page:
all = page.find_all(attrs={"class":"situCont"})
live = all[0]
situation_long = live.find_all("li")
+
+ # create the dictionary
self.situation_long = {}
+
+ # last_timegap is used for grouping riders together
last_timegap = None
+
for item in situation_long:
- #print(item)
time_gap = item.find(attrs={"class":"time"})
- group_name = item.find(attrs={"class":"groupname"})
rider = item.find(attrs={"class":"maxw200"})
+ # time gap is only listed once and subsequent riders in that
+ # group don't have one (None)
if time_gap:
- tg = self.remove_tags(time_gap)
- self.situation_long[tg] = []
- last_timegap = tg
- rider_name = self.remove_tags(rider)
+ tg = self.remove_tags(time_gap) # get rid of html tags
+ self.situation_long[tg] = [] # create the list inside the dict
+ last_timegap = tg # set last timegap for the loop
+
+ rider_name = self.remove_tags(rider) # remove tags from rider name
+ # the leading rider is basically the group name i guess?
+ # anyway we dont need it twice
if rider_name not in self.situation_long[last_timegap]:
+ # add rider to list of riders under that timegap/group
self.situation_long[last_timegap].append(rider_name)
- #print(self.remove_tags(time_gap), self.remove_tags(group_name), self.remove_tags(riders))
-
-
+ # remove surrounding html tags from final data points,
+ # like rider names, race names, etc.
def remove_tags(self, text):
text = str(text)
text_soup = BeautifulSoup(text, "html.parser")