# pcs_live/pcslive.py

from bs4 import BeautifulSoup
import requests, time

# class for storing the live races
class LiveStats:
    def __init__(self):
        self.races = []
        self.refresh_races()

    # scrapes from the PCS homepage where the little green live stats boxes are
    # it also gets rid of all the horrible polygon stuff
    def refresh_races(self):
        # the entire front page of PCS loaded into soup
        req = requests.get("https://www.procyclingstats.com/")
        html = req.text
        soup = BeautifulSoup(html, "html.parser")

        # narrow it down to the little green live stats boxes showing us what
        # races are currently live
        live = soup.find(attrs={"class":"hp3-livestats"})

        # if there are even any races:
        if live:
            # also get rid of the horrible polygon number stuff !!! YUCK
            # i imagine its how the profiles are drawn. there is A LOT
            # and it makes reading the raw html very PAINFUL
            for tag in live.find_all(attrs={"class":"inverse"}):
                tag.decompose()

            # conveniently, every race is a list item (<li>)
            # inside is the title, status (live/finished), km to go,
            # and brief situation text, but we parse that inside Race
            races = live.find_all("li")
            self.races = []
            for raw_race in races:
                # for now just create the Race object and add it to our list
                race = Race(raw_race)
                self.races.append(race)

    # straightforward
    def print_races(self):
        for race in self.races:
            race.print_stats()


# class for storing information about individual live races
class Race:
    def __init__(self, raw):
        # this is the unparsed html from LiveStats
        self.raw = raw

        # now go parse it
        self.refresh_info()

        # keeping these empty unless requested becauses i feel like it
        # timeline is headings from the race's page (left side)
        # situation_long is groups from the left side and time gaps
        # (not to be confused with situation which is a brief homepage summary)
        self.timeline = []
        self.situation_long = []

    # to parse the raw data given by LiveStats
    def refresh_info(self):
        title_r = self.raw.find(attrs={"class":"title"})
        status_r = self.raw.find(attrs={"class":"status"})
        togo_r = self.raw.find(attrs={"class":"togo"})
        situation_r = self.raw.find(attrs={"class":"situ_txt"})
        url_r = self.raw.find("a", href=True)

        self.title = self.remove_tags(title_r) # name of the race
        self.status = self.remove_tags(status_r) # live or finished (finished races stay online for a short while after finishing)
        self.togo = self.remove_tags(togo_r) # how many kms to go (None if finished)
        self.situation = self.remove_tags(situation_r) # a brief summary of the situation - None if finished, and - if complete peloton
        self.url = self.remove_tags(url_r["href"]) # the url for the race's full live stats page

    # self explanatory
    def print_stats(self):
        print(self.title)
        print(self.status)

        # dont bother showing kms to go or situation if finished
        if self.togo != "None":
            print(self.togo, "to go")
            print(self.situation)

        print("===============")

    # for testing
    # def print_raw(self):
    #     print(self.raw)
    #     print("")
    #     print(self.url)

    # this gets the html of the entire page of stats for this particular race
    # for use with get_timeline() and get_situation_long()
    def get_race_page(self):
        if self.url != "None":
            full_url = "https://www.procyclingstats.com/" + self.url

            req = requests.get(full_url)
            html = req.text
            soup = BeautifulSoup(html, "html.parser")
            return soup
        return None

    # fills a list with all displayed timeline items (headings only)
    # item 0 is the most recent update
    def get_timeline(self):
        page = self.get_race_page()
        if page:
            all = page.find_all(attrs={"class":"timeline3cont"})
            live = all[0]
            timeline = live.find_all("li")
            self.timeline = []
            for item in timeline:
                stat = item.find(attrs={"class":"stat"})
                if stat:
                    stat_content = stat.find(attrs={"class":"textCont"})
                    self.timeline.append(self.remove_tags(stat_content))


    # creates a dictionary containing time gaps as keys and each
    # timegap points to a list of riders in that group
    def get_situation_long(self):
        page = self.get_race_page()
        if page:
            all = page.find_all(attrs={"class":"situCont"})
            live = all[0]
            situation_long = live.find_all("li")

            # create the dictionary
            self.situation_long = {}

            # last_timegap is used for grouping riders together
            last_timegap = None

            for item in situation_long:
                time_gap = item.find(attrs={"class":"time"})
                rider = item.find(attrs={"class":"maxw200"})

                # time gap is only listed once and subsequent riders in that
                # group don't have one (None)
                if time_gap:
                    tg = self.remove_tags(time_gap) # get rid of html tags
                    self.situation_long[tg] = [] # create the list inside the dict
                    last_timegap = tg # set last timegap for the loop

                rider_name = self.remove_tags(rider) # remove tags from rider name
                # the leading rider is basically the group name i guess?
                # anyway we dont need it twice
                if rider_name not in self.situation_long[last_timegap]:
                    # add rider to list of riders under that timegap/group
                    self.situation_long[last_timegap].append(rider_name)

    # remove surrounding html tags from final data points,
    # like rider names, race names, etc.
    def remove_tags(self, text):
        text = str(text)
        text_soup = BeautifulSoup(text, "html.parser")
        for data in text_soup(["style", "script"]):
            data.decompose()
        return " ".join(text_soup.stripped_strings)