# pcs_live/pcslive.py

from bs4 import BeautifulSoup
import requests, time, cloudscraper

class LiveStats:
    """Store the races listed in the live stats box of the PCS homepage.

    ``races`` holds one ``Race`` per ``<li>`` of that box and is rebuilt on
    every ``refresh_races()`` call (the constructor performs the first one).
    """

    # front page of PCS, where the little green live stats boxes are
    HOMEPAGE_URL = "https://www.procyclingstats.com/"

    def __init__(self):
        self.races = []
        self.refresh_races()

    def refresh_races(self):
        """Scrape the PCS homepage again and rebuild ``self.races``.

        The scraper, the response and the raw html are kept on the instance
        as ``self.scraper``, ``self.req`` and ``self.html``. When the page
        has no live stats box, ``self.races`` becomes an empty list.
        """
        self.scraper = cloudscraper.create_scraper()
        self.req = self.scraper.get(self.HOMEPAGE_URL)
        self.html = self.req.text
        soup = BeautifulSoup(self.html, "html.parser")

        # narrow it down to the live stats box showing which races are
        # currently live
        live = soup.find(attrs={"class": "hp3-livestats"})
        if not live:
            # no box at all: nothing is live any more, so drop whatever an
            # earlier refresh stored instead of keeping stale races around
            self.races = []
            return

        # get rid of the bulky "inverse" elements (long polygon number
        # lists; the original author guesses they draw the profiles) so the
        # raw html kept by each Race stays readable
        for tag in live.find_all(attrs={"class": "inverse"}):
            tag.decompose()

        # every race is a list item (<li>); the details (title, status,
        # km to go, brief situation text) are parsed inside Race
        self.races = [Race(raw_race) for raw_race in live.find_all("li")]

    def print_races(self):
        """Print the stats of every stored race."""
        for race in self.races:
            race.print_stats()

    def find_race(self, query):
        """Return the first race whose title contains ``query``.

        The match is a case-insensitive substring test; ``None`` is returned
        when no stored race matches.
        """
        return next(
            (
                race
                for race in self.races
                if query.lower() in race.title.lower()
            ),
            None,
        )


class Race:
    """Store information about one race from the PCS live stats box.

    Text fields are produced by ``remove_tags``, which turns a missing
    element into the string ``"None"`` (not the ``None`` object);
    ``print_stats`` and ``get_race_page`` compare against that marker.
    """

    # race urls found on the homepage are appended to this
    BASE_URL = "https://www.procyclingstats.com/"

    def __init__(self, raw):
        # this is the unparsed html from LiveStats
        self.raw = raw

        # kept empty until requested:
        # timeline is the list of update texts from the race's page,
        # situation_long is the list of groups with their time gaps
        # (not to be confused with situation, the brief homepage summary)
        self.timeline = []
        self.situation_long = []

        # timeline container found by the last successful get_timeline();
        # None until then, so timeline_latest() can tell it is missing
        self.timeline_live = None

        # updates already returned by timeline_latest(), so that they
        # don't get repeated
        self.timeline_latest_store = []

        # now go parse the raw html
        self.refresh_info()

    def refresh_info(self):
        """Parse ``self.raw`` into title, status, togo, situation and url."""
        title_r = self.raw.find(attrs={"class": "title"})
        status_r = self.raw.find(attrs={"class": "status"})
        togo_r = self.raw.find(attrs={"class": "togo"})
        situation_r = self.raw.find(attrs={"class": "situ_txt"})
        url_r = self.raw.find("a", href=True)

        # name of the race
        self.title = self.remove_tags(title_r)
        # live or finished
        self.status = self.remove_tags(status_r)
        # how many kms to go ("None" when the element is missing)
        self.togo = self.remove_tags(togo_r)
        # brief summary of the situation ("None" when the element is missing)
        self.situation = self.remove_tags(situation_r)

        # url of the race's full live stats page; an entry without a link
        # gets the "None" marker that get_race_page() checks for, instead of
        # failing on url_r["href"]
        if url_r is None:
            self.url = "None"
        else:
            self.url = self.remove_tags(url_r["href"])

    def print_stats(self):
        """Print title and status, plus km to go and situation if known."""
        print(self.title)
        print(self.status)

        # dont bother showing kms to go or situation when togo is missing
        if self.togo != "None":
            print(self.togo, "to go")
            print(self.situation)

        print("===============")

    def get_race_page(self):
        """Download this race's full live stats page.

        Used by ``get_timeline()`` and ``get_situation_long()``. Returns the
        parsed page as a BeautifulSoup object, or ``None`` when the race has
        no url. The scraper is kept as ``self.scraper``.
        """
        if self.url == "None":
            return None

        self.scraper = cloudscraper.create_scraper()
        response = self.scraper.get(self.BASE_URL + self.url)
        return BeautifulSoup(response.text, "html.parser")

    def get_timeline(self):
        """Fill ``self.timeline`` with the update texts of the race page.

        Items are stored in page order (the original author notes item 0 is
        the most recent update); chart entries are skipped. The timeline
        container itself is kept as ``self.timeline_live`` for
        ``timeline_latest()``. Nothing changes when the race has no url; a
        page without a timeline container resets both attributes.
        """
        page = self.get_race_page()
        if not page:
            return

        containers = page.find_all(attrs={"class": "timeline3cont"})
        if not containers:
            # page without a timeline: nothing to index into
            self.timeline_live = None
            self.timeline = []
            return

        self.timeline_live = containers[0]
        self.timeline = []
        for item in self.timeline_live.find_all("li"):
            stat = item.find(attrs={"class": "stat"})
            if not stat:
                continue
            update = self._format_update(stat)
            if update is not None:
                self.timeline.append(update)

    def timeline_latest(self):
        """Return the newest timeline update once, otherwise ``None``.

        Useful for making an async timeline feed. This method does not
        download anything: call ``get_timeline()`` first to refresh the data
        it reads. Returned updates are remembered in
        ``self.timeline_latest_store`` and never returned again.
        """
        # its a bit weird if it doesnt display anything at first run, so if
        # nothing has been seen yet just show the latest update from the
        # full timeline (then remember it)
        if not self.timeline_latest_store:
            if not self.timeline:
                return None
            update = self.timeline[0]
            self.timeline_latest_store.append(update)
            return update

        # get_timeline() has not found a timeline container (yet)
        if self.timeline_live is None:
            return None

        # only the first <li> of the stored timeline is looked at
        latest = self.timeline_live.find("li")
        if latest is None:
            return None
        stat = latest.find(attrs={"class": "stat"})
        if not stat:
            return None

        update = self._format_update(stat)
        # only show the update if it is real text and hasnt been seen before
        if update is None or update in self.timeline_latest_store:
            return None
        self.timeline_latest_store.append(update)
        return update

    def _format_update(self, stat):
        """Return the display text for one ``stat`` element.

        Returns ``None`` for chart entries. Some updates use a big number
        (like "150" in "150 kilometers to the finish") that lives in its own
        element, so when one is present it is put in front of the text.
        """
        stat_content, is_data, has_info_number = self.timeline_stats(stat)
        if is_data:
            return None
        text = self.remove_tags(stat_content)
        if has_info_number:
            return self.remove_tags(has_info_number) + " " + text
        return text

    def timeline_stats(self, stat):
        """Find the text, chart and number elements inside ``stat``.

        Returns the tuple ``(stat_content, is_data, has_info_number)``; each
        entry is whatever ``stat.find`` returned for the classes
        ``textCont``, ``chartCont`` and ``number``.
        """
        stat_content = stat.find(attrs={"class": "textCont"})
        is_data = stat.find(attrs={"class": "chartCont"})
        has_info_number = stat.find(attrs={"class": "number"})
        return stat_content, is_data, has_info_number

    def get_situation_long(self):
        """Fill ``self.situation_long`` from the race page.

        The result is a list with one ``[group_name, time, riders]`` entry
        per group, where ``riders`` is a list of rider names. It is set to
        ``None`` when the page has no situation container, and left
        untouched when the race has no url.
        """
        page = self.get_race_page()
        if not page:
            return

        containers = page.find_all(attrs={"class": "situCont"})
        if not containers:
            self.situation_long = None
            return

        self.situation_long = []
        for group in containers[0].find_all(attrs={"class": "group"}):
            group_name = self.remove_tags(
                group.find(attrs={"class": "groupname"})
            )
            # drop "?" characters around the gap text
            # NOTE(review): presumably a mis-encoded symbol - confirm
            gap = self.remove_tags(
                group.find(attrs={"class": "time"})
            ).strip("??")
            riders = [
                self.remove_tags(rider.find(attrs={"class": "maxw180"}))
                for rider in group.find_all("li")
            ]
            self.situation_long.append([group_name, gap, riders])

    def remove_tags(self, text):
        """Return the visible text of ``text`` with html tags removed.

        ``text`` is converted with ``str()`` and parsed again, so a missing
        element (``None``) comes back as the string ``"None"``. The contents
        of style and script elements are dropped, and the remaining strings
        are stripped and joined with single spaces.
        """
        text_soup = BeautifulSoup(str(text), "html.parser")
        for data in text_soup(["style", "script"]):
            data.decompose()
        return " ".join(text_soup.stripped_strings)