# pcslive.py

import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# class for storing the live races
class LiveStats:
    """The races currently shown in the live stats box on the PCS homepage.

    Attributes:
        races: list of Race objects built by the most recent call to
            refresh_races(); empty when the homepage has no live stats box.
    """

    # the PCS front page, which carries the little green live stats boxes
    HOME_URL = "https://www.procyclingstats.com/"
    # seconds to wait on the server; without this a stalled connection
    # would block the caller forever
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.races = []
        self.refresh_races()

    def refresh_races(self):
        """Re-scrape the PCS homepage and rebuild self.races from it.

        Every <li> inside the live stats box becomes one Race. When the
        box is absent, self.races is emptied so that races found by an
        earlier refresh are not reported as still being live.

        Raises:
            requests.exceptions.RequestException: the homepage could not be
                fetched (including hitting REQUEST_TIMEOUT).
        """
        # the entire front page of PCS loaded into soup
        req = requests.get(self.HOME_URL, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(req.text, "html.parser")

        # narrow it down to the little green live stats boxes showing us what
        # races are currently live
        live = soup.find(attrs={"class": "hp3-livestats"})

        # no box at all means no live races: drop whatever we had before
        if live is None:
            self.races = []
            return

        # get rid of the polygon number stuff (presumably how the profiles
        # are drawn); there is a lot of it and it is not needed for parsing
        for tag in live.find_all(attrs={"class": "inverse"}):
            tag.decompose()

        # conveniently, every race is a list item (<li>); the details are
        # parsed inside Race. Building the list in one assignment means a
        # failure while parsing one race cannot leave a half-filled list.
        self.races = [Race(raw_race) for raw_race in live.find_all("li")]

    def print_races(self):
        """Print the stats of every race in self.races, in order."""
        for race in self.races:
            race.print_stats()


# class for storing information about individual live races
class Race:
    """One race from the PCS live stats box, plus optional detail data.

    All text fields come out of remove_tags(), so a piece of markup that is
    missing from the page is stored as the string "None", not the None
    object. The rest of the class (and print_stats) relies on that.

    Attributes:
        raw: the unparsed <li> element handed over by LiveStats.
        title: name of the race.
        status: live or finished (finished races stay online for a short
            while after finishing).
        togo: how many kms to go ("None" if finished).
        situation: brief homepage summary of the situation ("None" if
            finished, "-" if complete peloton).
        url: href of the race's full live stats page ("None" if the
            element has no link).
        timeline: headings from the race's page, newest first; empty until
            get_timeline() is called.
        situation_long: dict mapping a time gap to the list of riders in
            that group; empty until get_situation_long() is called.
    """

    # race links from the homepage are resolved against this address
    BASE_URL = "https://www.procyclingstats.com/"
    # seconds to wait on the server; without this a stalled connection
    # would block the caller forever
    REQUEST_TIMEOUT = 10

    def __init__(self, raw):
        # this is the unparsed html from LiveStats
        self.raw = raw

        # kept empty unless requested; situation_long starts as a dict so
        # it has the same type before and after get_situation_long()
        # (not to be confused with situation, the brief homepage summary)
        self.timeline = []
        self.situation_long = {}

        # now go parse it
        self.refresh_info()

    def refresh_info(self):
        """Parse self.raw into title, status, togo, situation and url."""
        find = self.raw.find
        self.title = self.remove_tags(find(attrs={"class": "title"}))
        self.status = self.remove_tags(find(attrs={"class": "status"}))
        self.togo = self.remove_tags(find(attrs={"class": "togo"}))
        self.situation = self.remove_tags(find(attrs={"class": "situ_txt"}))

        link = find("a", href=True)
        if link is None:
            # same sentinel the other fields get when their markup is
            # missing; get_race_page() checks for it
            self.url = "None"
        else:
            # the href is already plain text, so it is used as-is: parsing
            # it as html again could rewrite characters such as "&"
            self.url = str(link["href"]).strip()

    def print_stats(self):
        """Print title and status, plus kms to go and situation if known."""
        print(self.title)
        print(self.status)

        # dont bother showing kms to go or situation if finished
        if self.togo != "None":
            print(self.togo, "to go")
            print(self.situation)

        print("===============")

    def get_race_page(self):
        """Fetch and parse the full live stats page of this race.

        For use with get_timeline() and get_situation_long().

        Returns:
            The page as a BeautifulSoup object, or None when the race has
            no url.

        Raises:
            requests.exceptions.RequestException: the page could not be
                fetched (including hitting REQUEST_TIMEOUT).
        """
        if self.url == "None":
            return None

        # urljoin copes with hrefs that start with "/" or are already
        # absolute, which plain string concatenation would mangle
        full_url = urljoin(self.BASE_URL, self.url)
        req = requests.get(full_url, timeout=self.REQUEST_TIMEOUT)
        return BeautifulSoup(req.text, "html.parser")

    def get_timeline(self):
        """Fill self.timeline with the displayed timeline headings.

        Item 0 is the most recent update. Does nothing when the race has
        no url; leaves an empty list when the page has no timeline.
        """
        page = self.get_race_page()
        if page is None:
            return

        self.timeline = []
        # only the first timeline container is used
        container = page.find(attrs={"class": "timeline3cont"})
        if container is None:
            return

        for item in container.find_all("li"):
            stat = item.find(attrs={"class": "stat"})
            if stat is not None:
                heading = stat.find(attrs={"class": "textCont"})
                self.timeline.append(self.remove_tags(heading))

    def get_situation_long(self):
        """Fill self.situation_long with riders grouped by time gap.

        The result is a dict whose keys are time gaps and whose values are
        lists of the riders in that group. Riders listed before any time
        gap has been seen are filed under the key None. Does nothing when
        the race has no url; leaves an empty dict when the page has no
        situation section.
        """
        page = self.get_race_page()
        if page is None:
            return

        groups = {}
        # only the first situation container is used
        container = page.find(attrs={"class": "situCont"})
        if container is not None:
            # last_timegap is used for grouping riders together: the time
            # gap is only listed once and subsequent riders in that group
            # don't have one
            last_timegap = None

            for item in container.find_all("li"):
                time_gap = item.find(attrs={"class": "time"})
                rider = item.find(attrs={"class": "maxw200"})

                if time_gap is not None:
                    last_timegap = self.remove_tags(time_gap)
                    # setdefault rather than assignment: a gap text that
                    # shows up twice must not wipe the riders already
                    # collected for it
                    groups.setdefault(last_timegap, [])

                # an item without a rider element has no name to record
                if rider is None:
                    continue

                rider_name = self.remove_tags(rider)
                group = groups.setdefault(last_timegap, [])
                # the leading rider is basically the group name,
                # we dont need it twice
                if rider_name not in group:
                    group.append(rider_name)

        self.situation_long = groups

    def remove_tags(self, text):
        """Return the visible text of an html fragment.

        The argument is converted with str() and parsed again, <style> and
        <script> elements are dropped, and the remaining strings are
        stripped and joined with single spaces. Because of the str() call,
        passing None gives back the string "None".
        """
        text = str(text)
        text_soup = BeautifulSoup(text, "html.parser")
        for data in text_soup(["style", "script"]):
            data.decompose()
        return " ".join(text_soup.stripped_strings)
ayaya 2026-05-30 18:39:09 +01:00			`from bs4 import BeautifulSoup`
			`import requests, time`

comments 2026-05-30 20:52:47 +01:00			`# class for storing the live races`
ayaya 2026-05-30 18:39:09 +01:00			`class LiveStats:`
			`def __init__(self):`
in case there are no live races dont error 2026-05-30 22:00:59 +01:00			`self.races = []`
comments 2026-05-30 20:52:47 +01:00			`self.refresh_races()`
ayaya 2026-05-30 18:39:09 +01:00
comments 2026-05-30 20:52:47 +01:00			`# scrapes from the PCS homepage where the little green live stats boxes are`
			`# it also gets rid of all the horrible polygon stuff`
			`def refresh_races(self):`
			`# the entire front page of PCS loaded into soup`
ayaya 2026-05-30 18:39:09 +01:00			`req = requests.get("https://www.procyclingstats.com/")`
			`html = req.text`
			`soup = BeautifulSoup(html, "html.parser")`

comments 2026-05-30 20:52:47 +01:00			`# narrow it down to the little green live stats boxes showing us what`
			`# races are currently live`
			`live = soup.find(attrs={"class":"hp3-livestats"})`

in case there are no live races dont error 2026-05-30 22:00:59 +01:00			`# if there are even any races:`
			`if live:`
			`# also get rid of the horrible polygon number stuff !!! YUCK`
			`# i imagine its how the profiles are drawn. there is A LOT`
			`# and it makes reading the raw html very PAINFUL`
			`for tag in live.find_all(attrs={"class":"inverse"}):`
			`tag.decompose()`
ayaya 2026-05-30 18:39:09 +01:00
in case there are no live races dont error 2026-05-30 22:00:59 +01:00			`# conveniently, every race is a list item (<li>)`
			`# inside is the title, status (live/finished), km to go,`
			`# and brief situation text, but we parse that inside Race`
			`races = live.find_all("li")`
			`self.races = []`
			`for raw_race in races:`
			`# for now just create the Race object and add it to our list`
			`race = Race(raw_race)`
			`self.races.append(race)`
ayaya 2026-05-30 18:39:09 +01:00
comments 2026-05-30 20:52:47 +01:00			`# straightforward`
ayaya 2026-05-30 18:39:09 +01:00			`def print_races(self):`
			`for race in self.races:`
			`race.print_stats()`

comments 2026-05-30 20:52:47 +01:00



			`# class for storing information about individual live races`
ayaya 2026-05-30 18:39:09 +01:00			`class Race:`
			`def __init__(self, raw):`
comments 2026-05-30 20:52:47 +01:00			`# this is the unparsed html from LiveStats`
ayaya 2026-05-30 18:39:09 +01:00			`self.raw = raw`
comments 2026-05-30 20:52:47 +01:00
			`# now go parse it`
ayaya 2026-05-30 18:39:09 +01:00			`self.refresh_info()`

comments 2026-05-30 20:52:47 +01:00			`# keeping these empty unless requested becauses i feel like it`
			`# timeline is headings from the race's page (left side)`
			`# situation_long is groups from the left side and time gaps`
			`# (not to be confused with situation which is a brief homepage summary)`
			`self.timeline = []`
			`self.situation_long = []`

			`# to parse the raw data given by LiveStats`
ayaya 2026-05-30 18:39:09 +01:00			`def refresh_info(self):`
			`title_r = self.raw.find(attrs={"class":"title"})`
			`status_r = self.raw.find(attrs={"class":"status"})`
			`togo_r = self.raw.find(attrs={"class":"togo"})`
			`situation_r = self.raw.find(attrs={"class":"situ_txt"})`
			`url_r = self.raw.find("a", href=True)`

comments 2026-05-30 20:52:47 +01:00			`self.title = self.remove_tags(title_r) # name of the race`
			`self.status = self.remove_tags(status_r) # live or finished (finished races stay online for a short while after finishing)`
			`self.togo = self.remove_tags(togo_r) # how many kms to go (None if finished)`
			`self.situation = self.remove_tags(situation_r) # a brief summary of the situation - None if finished, and - if complete peloton`
			`self.url = self.remove_tags(url_r["href"]) # the url for the race's full live stats page`
ayaya 2026-05-30 18:39:09 +01:00
comments 2026-05-30 20:52:47 +01:00			`# self explanatory`
ayaya 2026-05-30 18:39:09 +01:00			`def print_stats(self):`
			`print(self.title)`
			`print(self.status)`
comments 2026-05-30 20:52:47 +01:00
			`# dont bother showing kms to go or situation if finished`
ayaya 2026-05-30 18:39:09 +01:00			`if self.togo != "None":`
			`print(self.togo, "to go")`
			`print(self.situation)`
comments 2026-05-30 20:52:47 +01:00
ayaya 2026-05-30 18:39:09 +01:00			`print("===============")`

comments 2026-05-30 20:52:47 +01:00			`# for testing`
			`# def print_raw(self):`
			`# print(self.raw)`
			`# print("")`
			`# print(self.url)`
ayaya 2026-05-30 18:39:09 +01:00
comments 2026-05-30 20:52:47 +01:00			`# this gets the html of the entire page of stats for this particular race`
			`# for use with get_timeline() and get_situation_long()`
can now get the situation diagram from the race stats page 2026-05-30 19:46:32 +01:00			`def get_race_page(self):`
ayaya 2026-05-30 18:39:09 +01:00			`if self.url != "None":`
			`full_url = "https://www.procyclingstats.com/" + self.url`

			`req = requests.get(full_url)`
			`html = req.text`
			`soup = BeautifulSoup(html, "html.parser")`
can now get the situation diagram from the race stats page 2026-05-30 19:46:32 +01:00			`return soup`
			`return None`
ayaya 2026-05-30 18:39:09 +01:00
comments 2026-05-30 20:52:47 +01:00			`# fills a list with all displayed timeline items (headings only)`
			`# item 0 is the most recent update`
can now get the situation diagram from the race stats page 2026-05-30 19:46:32 +01:00			`def get_timeline(self):`
			`page = self.get_race_page()`
			`if page:`
			`all = page.find_all(attrs={"class":"timeline3cont"})`
ayaya 2026-05-30 18:39:09 +01:00			`live = all[0]`
			`timeline = live.find_all("li")`
			`self.timeline = []`
			`for item in timeline:`
			`stat = item.find(attrs={"class":"stat"})`
			`if stat:`
			`stat_content = stat.find(attrs={"class":"textCont"})`
			`self.timeline.append(self.remove_tags(stat_content))`

comments 2026-05-30 20:52:47 +01:00
			`# creates a dictionary containing time gaps as keys and each`
			`# timegap points to a list of riders in that group`
can now get the situation diagram from the race stats page 2026-05-30 19:46:32 +01:00			`def get_situation_long(self):`
			`page = self.get_race_page()`
			`if page:`
			`all = page.find_all(attrs={"class":"situCont"})`
			`live = all[0]`
			`situation_long = live.find_all("li")`
comments 2026-05-30 20:52:47 +01:00
			`# create the dictionary`
can now get the situation diagram from the race stats page 2026-05-30 19:46:32 +01:00			`self.situation_long = {}`
comments 2026-05-30 20:52:47 +01:00
			`# last_timegap is used for grouping riders together`
can now get the situation diagram from the race stats page 2026-05-30 19:46:32 +01:00			`last_timegap = None`
comments 2026-05-30 20:52:47 +01:00
can now get the situation diagram from the race stats page 2026-05-30 19:46:32 +01:00			`for item in situation_long:`
			`time_gap = item.find(attrs={"class":"time"})`
			`rider = item.find(attrs={"class":"maxw200"})`

comments 2026-05-30 20:52:47 +01:00			`# time gap is only listed once and subsequent riders in that`
			`# group don't have one (None)`
can now get the situation diagram from the race stats page 2026-05-30 19:46:32 +01:00			`if time_gap:`
comments 2026-05-30 20:52:47 +01:00			`tg = self.remove_tags(time_gap) # get rid of html tags`
			`self.situation_long[tg] = [] # create the list inside the dict`
			`last_timegap = tg # set last timegap for the loop`

			`rider_name = self.remove_tags(rider) # remove tags from rider name`
			`# the leading rider is basically the group name i guess?`
			`# anyway we dont need it twice`
can now get the situation diagram from the race stats page 2026-05-30 19:46:32 +01:00			`if rider_name not in self.situation_long[last_timegap]:`
comments 2026-05-30 20:52:47 +01:00			`# add rider to list of riders under that timegap/group`
can now get the situation diagram from the race stats page 2026-05-30 19:46:32 +01:00			`self.situation_long[last_timegap].append(rider_name)`

comments 2026-05-30 20:52:47 +01:00			`# remove surrounding html tags from final data points,`
			`# like rider names, race names, etc.`
ayaya 2026-05-30 18:39:09 +01:00			`def remove_tags(self, text):`
			`text = str(text)`
			`text_soup = BeautifulSoup(text, "html.parser")`
			`for data in text_soup(["style", "script"]):`
			`data.decompose()`
			`return " ".join(text_soup.stripped_strings)`