With this webscraping project in R, I webscrape the English Premier League football table. The setup below features loading the necessary libraries and using read_html()
to load in the Premier League table page.
Setup
# English Premier League Football League Table
# Load libraries:
library(dplyr)
library(tidyr)
library(rvest)
library(stringr)
library(writexl)
## Extract English Premier League Soccer League tables page.
page <- read_html("https://www.premierleague.com/tables")
For extracting the parts of the table, I use a combination of html_nodes()
and xpaths. This is used for the ranks, names, number of matches played, wins, draws, losses, goals for, goals against, goals difference and league points.
I had to use filtering such as played <- played[1:20]
as using html_nodes()
extracts more than it should.
Rank
# Rank:
rank <- page %>%
html_nodes(xpath='//*[@class="tableContainer"]/div/table/tbody/tr/td[2]/span[1]') %>%
html_text2() %>%
readr::parse_integer()
Team Name In Premier League
# EPL Club Name:
name <- page %>%
html_nodes(xpath='//*[@class="tableContainer"]/div/table/tbody/tr/td[3]/a/span[2]') %>%
html_text2()
Matches Played
# Played Matches Amount, only take 20 as there are 20 teams
played <- page %>%
html_nodes(xpath='//*[@class="tableContainer"]/div/table/tbody/tr/td[4]') %>%
html_text2() %>%
readr::parse_integer()
played <- played[1:20]
Wins
# Won
wins <- page %>%
html_nodes(xpath='//*[@class="tableContainer"]/div/table/tbody/tr/td[5]') %>%
html_text2() %>%
readr::parse_integer()
wins <- wins[1:20]
Draws
# Draws
draws <- page %>%
html_nodes(xpath='//*[@class="tableContainer"]/div/table/tbody/tr/td[6]') %>%
html_text2() %>%
readr::parse_integer()
draws <- draws[1:20]
Losses
# Losses
losses <- page %>%
html_nodes(xpath='//*[@class="tableContainer"]/div/table/tbody/tr/td[7]') %>%
html_text2() %>%
readr::parse_integer()
losses <- losses[1:20]
Goals For
# Goals For
goals_for <- page %>%
html_nodes(xpath='//*[@class="tableContainer"]/div/table/tbody/tr/td[8]') %>%
html_text2() %>%
readr::parse_integer()
goals_for <- goals_for[1:20]
Goals Against
# Goals Against
goals_against <- page %>%
html_nodes(xpath='//*[@class="tableContainer"]/div/table/tbody/tr/td[9]') %>%
html_text2() %>%
readr::parse_integer()
goals_against <- goals_against[1:20]
Goals Difference
# Goals Difference
goals_diff <- page %>%
html_nodes(xpath='//*[@class="tableContainer"]/div/table/tbody/tr/td[10]') %>%
html_text2() %>%
readr::parse_integer()
goals_diff <- goals_diff[1:20]
Points
# Points
points <- page %>%
html_nodes("[class='points']") %>%
html_text2() %>%
readr::parse_integer()
points <- points[2:21]
Once the parts of the table are extracted, the dataframe can be put together.
### Create Dataframe based on raw data:
epl_table <- data.frame(
Rank = seq(1, 20),
Club = name,
Played = played,
Wins = wins,
Draws = draws,
Losses = losses,
Goals_For = goals_for,
Goals_Against = goals_against,
Goals_Difference = goals_diff,
Points = points
)
## Preview table:
epl_table
## Rank Club Played Wins Draws Losses Goals_For
## 1 1 Manchester City 31 23 5 3 72
## 2 2 Liverpool 31 22 7 2 79
## 3 3 Chelsea 30 18 8 4 64
## 4 4 Tottenham Hotspur 31 18 3 10 56
## 5 5 Arsenal 30 17 3 10 45
## 6 6 West Ham United 32 15 6 11 51
## 7 7 Manchester United 31 14 9 8 49
## 8 8 Wolverhampton Wanderers 32 15 4 13 33
## 9 9 Leicester City 29 11 7 11 45
## 10 10 Crystal Palace 31 8 13 10 43
## 11 11 Brighton and Hove Albion 31 8 13 10 28
## 12 12 Aston Villa 31 11 3 17 42
## 13 13 Brentford 32 10 6 16 39
## 14 14 Southampton 31 8 12 11 37
## 15 15 Newcastle United 31 8 10 13 34
## 16 16 Leeds United 32 8 9 15 38
## 17 17 Everton 30 8 4 18 33
## 18 18 Burnley 30 4 12 14 25
## 19 19 Watford 31 6 4 21 29
## 20 20 Norwich City 31 5 6 20 20
## Goals_Against Goals_Difference Points
## 1 20 52 74
## 2 22 57 73
## 3 23 41 62
## 4 37 19 57
## 5 36 9 54
## 6 42 9 51
## 7 42 7 51
## 8 28 5 49
## 9 48 -3 40
## 10 40 3 37
## 11 37 -9 37
## 12 46 -4 36
## 13 48 -9 36
## 14 52 -15 36
## 15 54 -20 34
## 16 68 -30 33
## 17 52 -19 28
## 18 44 -19 24
## 19 60 -31 22
## 20 63 -43 21
As an optional thing, you can save the dataframe as an Excel file or as a .csv file.
Sys.Date()
## [1] "2022-04-13"
### Save data Dataframe as Excel File:
library(writexl)
write_xlsx(epl_table, paste("EPL_Table", Sys.Date(), ".xlsx", sep = ""))
## Save dataframe as .csv File Option:
write.csv(epl_table, paste("EPL_Table", Sys.Date(), ".csv", sep = ""), row.names = FALSE)