In this web scraping project in R, I scrape the most recent La Liga (Spanish football) standings table.
Setup
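The packages loaded for web scraping and dealing with text are dplyr, tidyr, rvest and stringr; readr is also required, because the code calls readr::parse_integer(). The writexl package is for saving data frames into Excel files, while .csv files are written with base R's write.csv().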
From rvest, the read_html() function is used to download and parse the HTML page from the La Liga website.
## La Liga Spanish Soccer Table Web Scraping
# Reference: https://stackoverflow.com/questions/45450981/rvest-scrape-2-classes-in-1-tag
# Load libraries:
library(dplyr)
library(tidyr)
library(rvest)
library(stringr)
library(writexl)
## Download and parse the La Liga standings page; every extraction below
## queries this parsed document.
page <- read_html(
  x = "https://www.laliga.com/en-GB/laliga-santander/standing"
)
To extract the individual components of the La Liga standings table, I use XPath expressions together with html_nodes(). Extracting the ranks, club names and each of the other columns returns 60 items, but I only need the first 20 (one per team). This is why you see lines such as rank <- rank[1:20].
Ranks
# There are 20 La Liga teams, so only the first 20 matches of each XPath
# query are kept.
# Rank: select the rank cells, read their text and parse it as integers.
rank <- html_nodes(
  page,
  xpath = '//*[@class="styled__StandingTableBody-e89col-5 cDiDQb"]/div/div[1]/div/div[1]/div[1]'
)
rank <- readr::parse_integer(html_text2(rank))
rank <- rank[1:20]
Club Names
# Club names: select the name paragraph of each row and keep its text.
club_names <- html_nodes(
  page,
  xpath = '//*[@class="styled__StandingTableBody-e89col-5 cDiDQb"]/div/div[1]/div/div[1]/div[2]/div[2]/p'
)
club_names <- html_text2(club_names)
# Keep only the first 20 matches (one per team), then print them.
club_names <- club_names[1:20]
club_names
## [1] "Real Madrid" "FC Barcelona"
## [3] "Sevilla FC" "Atlético de Madrid"
## [5] "Real Betis" "Real Sociedad"
## [7] "Villarreal CF" "Athletic Club"
## [9] "Valencia CF" "CA Osasuna"
## [11] "RCD Espanyol de Barcelona" "RC Celta"
## [13] "Rayo Vallecano" "Getafe CF"
## [15] "Elche CF" "Granada CF"
## [17] "RCD Mallorca" "Cádiz CF"
## [19] "Levante UD" "Deportivo Alavés"
Club Abbreviation
# Club abbreviations (three-letter codes such as "RMA"): select the
# abbreviation paragraph of each row and keep its text.
club_tickers <- html_nodes(
  page,
  xpath = '//*[@class="styled__StandingTableBody-e89col-5 cDiDQb"]/div/div[1]/div/div[1]/div[2]/div[1]/p'
)
club_tickers <- html_text2(club_tickers)
# Keep only the first 20 matches (one per team), then print them.
club_tickers <- club_tickers[1:20]
club_tickers
## [1] "RMA" "BAR" "SEV" "ATM" "BET" "RSO" "VIL" "ATH" "VAL" "OSA" "ESP" "CEL"
## [13] "RAY" "GET" "ELC" "GRA" "MLL" "CAD" "LEV" "ALA"
Points
# Points: select the points cells, read their text and parse it as integers.
points <- html_nodes(
  page,
  xpath = '//*[@class="styled__StandingTableBody-e89col-5 cDiDQb"]/div/div[1]/div/div[1]/div[3]'
)
points <- readr::parse_integer(html_text2(points))
# Keep only the first 20 matches (one per team), then print them.
points <- points[1:20]
points
## [1] 72 60 60 57 56 54 46 45 42 41 39 36 34 32 32 29 29 28 22 22
Played Matches
# Matches played: select the cells, read their text and parse it as integers.
played <- html_nodes(
  page,
  xpath = '//*[@class="styled__StandingTableBody-e89col-5 cDiDQb"]/div/div[1]/div/div[1]/div[4]'
)
played <- readr::parse_integer(html_text2(played))
# Keep only the first 20 matches of the query (one per team).
played <- played[1:20]
Wins
# Wins: select the cells, read their text and parse it as integers.
wins <- html_nodes(
  page,
  xpath = '//*[@class="styled__StandingTableBody-e89col-5 cDiDQb"]/div/div[1]/div/div[1]/div[5]'
)
wins <- readr::parse_integer(html_text2(wins))
# Keep only the first 20 matches of the query (one per team).
wins <- wins[1:20]
Draws
# Draws: select the cells, read their text and parse it as integers.
draws <- html_nodes(
  page,
  xpath = '//*[@class="styled__StandingTableBody-e89col-5 cDiDQb"]/div/div[1]/div/div[1]/div[6]'
)
draws <- readr::parse_integer(html_text2(draws))
# Keep only the first 20 matches of the query (one per team).
draws <- draws[1:20]
Losses
# Losses: select the cells, read their text and parse it as integers.
losses <- html_nodes(
  page,
  xpath = '//*[@class="styled__StandingTableBody-e89col-5 cDiDQb"]/div/div[1]/div/div[1]/div[7]'
)
losses <- readr::parse_integer(html_text2(losses))
# Keep only the first 20 matches of the query (one per team).
losses <- losses[1:20]
Goals For
# Goals for: select the cells, read their text and parse it as integers.
goals_for <- html_nodes(
  page,
  xpath = '//*[@class="styled__StandingTableBody-e89col-5 cDiDQb"]/div/div[1]/div/div[1]/div[8]'
)
goals_for <- readr::parse_integer(html_text2(goals_for))
# Keep only the first 20 matches of the query (one per team).
goals_for <- goals_for[1:20]
Goals Against
# Goals against: select the cells, read their text and parse it as integers.
goals_against <- html_nodes(
  page,
  xpath = '//*[@class="styled__StandingTableBody-e89col-5 cDiDQb"]/div/div[1]/div/div[1]/div[9]'
)
goals_against <- readr::parse_integer(html_text2(goals_against))
# Keep only the first 20 matches of the query (one per team).
goals_against <- goals_against[1:20]
Goal Difference
# Goal difference: select the cells, read their text and parse it as integers.
goal_diff <- html_nodes(
  page,
  xpath = '//*[@class="styled__StandingTableBody-e89col-5 cDiDQb"]/div/div[1]/div/div[1]/div[10]'
)
goal_diff <- readr::parse_integer(html_text2(goal_diff))
# Keep only the first 20 matches of the query (one per team).
goal_diff <- goal_diff[1:20]
### Create Dataframe based on raw data:
# Combine the scraped vectors into one data frame, one row per club.
# stringsAsFactors = FALSE keeps Club and Club_Abbrev as character columns
# on R versions older than 4.0 as well.
laliga_table <- data.frame(
  Rank = rank,
  Club = club_names,
  Club_Abbrev = club_tickers,
  Points = points,
  Played = played,
  Wins = wins,
  Draws = draws,
  Losses = losses,
  Goals_For = goals_for,
  Goals_Against = goals_against,
  Goal_Diff = goal_diff,
  stringsAsFactors = FALSE
)

# Each column was cut to its first 20 elements, which pads with NA when a
# query matched fewer than 20 nodes; integer parsing also yields NA for text
# it cannot parse. Warn instead of silently carrying an incomplete table on.
if (anyNA(laliga_table)) {
  warning(
    "La Liga table contains missing values; ",
    "the page layout or class names may have changed.",
    call. = FALSE
  )
}

## Preview table:
head(laliga_table, 10)
## Rank Club Club_Abbrev Points Played Wins Draws Losses
## 1 1 Real Madrid RMA 72 31 22 6 3
## 2 2 FC Barcelona BAR 60 30 17 9 4
## 3 3 Sevilla FC SEV 60 31 16 12 3
## 4 4 Atlético de Madrid ATM 57 31 17 6 8
## 5 5 Real Betis BET 56 31 17 5 9
## 6 6 Real Sociedad RSO 54 31 15 9 7
## 7 7 Villarreal CF VIL 46 31 12 10 9
## 8 8 Athletic Club ATH 45 31 11 12 8
## 9 9 Valencia CF VAL 42 31 10 12 9
## 10 10 CA Osasuna OSA 41 31 11 8 12
## Goals_For Goals_Against Goal_Diff
## 1 63 26 37
## 2 60 31 29
## 3 44 22 22
## 4 57 38 19
## 5 56 37 19
## 6 32 30 2
## 7 49 30 19
## 8 36 30 6
## 9 43 44 -1
## 10 31 40 -9
# Print today's date; the same Sys.Date() value is used in the names of the
# files written at the end of the script.
Sys.Date()
## [1] "2022-04-13"
### Save Raw data Dataframe as Excel File:
# Build the date-stamped file stem once (e.g. "LaLiga_Table2022-04-13") so
# that both output files always carry the same date.
output_stem <- paste0("LaLiga_Table", Sys.Date())
# writexl is called through its namespace, so no second library() call is needed.
writexl::write_xlsx(laliga_table, paste0(output_stem, ".xlsx"))
## Save dataframe as .csv File Option:
# row.names = FALSE leaves the row-number column out of the .csv file.
write.csv(laliga_table, paste0(output_stem, ".csv"), row.names = FALSE)