In this web scraping project in R, I scrape the most recent La Liga (Spanish football) league table.

Setup

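The packages needed for web scraping and dealing with text are dplyr, tidyr, rvest and stringr. The readr package must also be installed, because the code calls readr::parse_integer() to convert the scraped text into integers. The writexl package is used for saving data frames as Excel files; the .csv file is written with base R's write.csv().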

The read_html() function, available through rvest, is used to download and parse the HTML of the standings page on the La Liga website.

## La Liga Spanish Soccer Table Web Scraping
# Reference: https://stackoverflow.com/questions/45450981/rvest-scrape-2-classes-in-1-tag
  
# Load libraries:
  
library(dplyr)
library(tidyr)
library(rvest)
library(stringr)
library(writexl)


## Fetch and parse the HTML of the La Liga standings page.

page <- "https://www.laliga.com/en-GB/laliga-santander/standing" %>%
  read_html()

 

Extract Table Components

To extract the components of the La Liga standings table, I use XPath expressions together with html_nodes(). Each XPath query (for ranks, club names and the rest) returns 60 items, but only the first 20 are needed, one for each of the 20 La Liga teams. This is why each extraction is followed by something like rank <- rank[1:20].

Ranks

# There are 20 La Liga teams, so after each extraction only the first 20
# matched elements are kept.
# Rank: text of the nodes matched by the div[1] XPath, parsed as integers.
rank <- html_nodes(page, xpath = '//*[@class="styled__StandingTableBody-e89col-5 cDiDQb"]/div/div[1]/div/div[1]/div[1]') %>%
  html_text2() %>%
  readr::parse_integer()
rank <- rank[1:20]

 

Club Names

# Club names: text of the <p> nodes matched under div[2]/div[2], kept as
# character strings; only the first 20 matches are retained.
club_names <- html_nodes(page, xpath = '//*[@class="styled__StandingTableBody-e89col-5 cDiDQb"]/div/div[1]/div/div[1]/div[2]/div[2]/p') %>%
  html_text2()
club_names <- club_names[1:20]

club_names
##  [1] "Real Madrid"               "FC Barcelona"             
##  [3] "Sevilla FC"                "Atlético de Madrid"       
##  [5] "Real Betis"                "Real Sociedad"            
##  [7] "Villarreal CF"             "Athletic Club"            
##  [9] "Valencia CF"               "CA Osasuna"               
## [11] "RCD Espanyol de Barcelona" "RC Celta"                 
## [13] "Rayo Vallecano"            "Getafe CF"                
## [15] "Elche CF"                  "Granada CF"               
## [17] "RCD Mallorca"              "Cádiz CF"                 
## [19] "Levante UD"                "Deportivo Alavés"

 

Club Abbreviation

# Club abbreviations: text of the <p> nodes matched under div[2]/div[1]
# (e.g. "RMA", "BAR"); only the first 20 matches are retained.
club_tickers <- html_nodes(page, xpath = '//*[@class="styled__StandingTableBody-e89col-5 cDiDQb"]/div/div[1]/div/div[1]/div[2]/div[1]/p') %>%
  html_text2()
club_tickers <- club_tickers[1:20]

club_tickers
##  [1] "RMA" "BAR" "SEV" "ATM" "BET" "RSO" "VIL" "ATH" "VAL" "OSA" "ESP" "CEL"
## [13] "RAY" "GET" "ELC" "GRA" "MLL" "CAD" "LEV" "ALA"

 

Points

# Points: text of the nodes matched by the div[3] XPath, parsed as integers;
# only the first 20 matches are retained.
points <- html_nodes(page, xpath = '//*[@class="styled__StandingTableBody-e89col-5 cDiDQb"]/div/div[1]/div/div[1]/div[3]') %>%
  html_text2() %>%
  readr::parse_integer()
points <- points[1:20]

points
##  [1] 72 60 60 57 56 54 46 45 42 41 39 36 34 32 32 29 29 28 22 22

 

Played Matches

# Matches played: text of the nodes matched by the div[4] XPath, parsed as
# integers; only the first 20 matches are retained.
played <- html_nodes(page, xpath = '//*[@class="styled__StandingTableBody-e89col-5 cDiDQb"]/div/div[1]/div/div[1]/div[4]') %>%
  html_text2() %>%
  readr::parse_integer()
played <- played[1:20]

 

Wins

# Wins: text of the nodes matched by the div[5] XPath, parsed as integers;
# only the first 20 matches are retained.
wins <- html_nodes(page, xpath = '//*[@class="styled__StandingTableBody-e89col-5 cDiDQb"]/div/div[1]/div/div[1]/div[5]') %>%
  html_text2() %>%
  readr::parse_integer()
wins <- wins[1:20]

 

Draws

# Draws: text of the nodes matched by the div[6] XPath, parsed as integers;
# only the first 20 matches are retained.
draws <- html_nodes(page, xpath = '//*[@class="styled__StandingTableBody-e89col-5 cDiDQb"]/div/div[1]/div/div[1]/div[6]') %>%
  html_text2() %>%
  readr::parse_integer()
draws <- draws[1:20]

 

Losses

# Losses: text of the nodes matched by the div[7] XPath, parsed as integers;
# only the first 20 matches are retained.
losses <- html_nodes(page, xpath = '//*[@class="styled__StandingTableBody-e89col-5 cDiDQb"]/div/div[1]/div/div[1]/div[7]') %>%
  html_text2() %>%
  readr::parse_integer()
losses <- losses[1:20]

 

Goals For

# Goals for: text of the nodes matched by the div[8] XPath, parsed as
# integers; only the first 20 matches are retained.
goals_for <- html_nodes(page, xpath = '//*[@class="styled__StandingTableBody-e89col-5 cDiDQb"]/div/div[1]/div/div[1]/div[8]') %>%
  html_text2() %>%
  readr::parse_integer()
goals_for <- goals_for[1:20]

 

Goals Against

# Goals against: text of the nodes matched by the div[9] XPath, parsed as
# integers; only the first 20 matches are retained.
goals_against <- html_nodes(page, xpath = '//*[@class="styled__StandingTableBody-e89col-5 cDiDQb"]/div/div[1]/div/div[1]/div[9]') %>%
  html_text2() %>%
  readr::parse_integer()
goals_against <- goals_against[1:20]

 

Goal Difference

# Goal difference: text of the nodes matched by the div[10] XPath, parsed as
# integers (values may be negative); only the first 20 matches are retained.
goal_diff <- html_nodes(page, xpath = '//*[@class="styled__StandingTableBody-e89col-5 cDiDQb"]/div/div[1]/div/div[1]/div[10]') %>%
  html_text2() %>%
  readr::parse_integer()
goal_diff <- goal_diff[1:20]

 

Create Dataframe For La Liga Football Table

### Assemble the scraped vectors into one data frame, one row per club:

# stringsAsFactors = FALSE is set explicitly so that Club and Club_Abbrev stay
# character columns on every R version (the default was TRUE before R 4.0.0,
# which would silently turn them into factors).
laliga_table <- data.frame(
  Rank = rank,
  Club = club_names,
  Club_Abbrev = club_tickers,
  Points = points,
  Played = played,
  Wins = wins,
  Draws = draws,
  Losses = losses,
  Goals_For = goals_for,
  Goals_Against = goals_against,
  Goal_Diff = goal_diff,
  stringsAsFactors = FALSE
)

## Preview the first 10 rows of the table:

head(laliga_table, 10)
##    Rank               Club Club_Abbrev Points Played Wins Draws Losses
## 1     1        Real Madrid         RMA     72     31   22     6      3
## 2     2       FC Barcelona         BAR     60     30   17     9      4
## 3     3         Sevilla FC         SEV     60     31   16    12      3
## 4     4 Atlético de Madrid         ATM     57     31   17     6      8
## 5     5         Real Betis         BET     56     31   17     5      9
## 6     6      Real Sociedad         RSO     54     31   15     9      7
## 7     7      Villarreal CF         VIL     46     31   12    10      9
## 8     8      Athletic Club         ATH     45     31   11    12      8
## 9     9        Valencia CF         VAL     42     31   10    12      9
## 10   10         CA Osasuna         OSA     41     31   11     8     12
##    Goals_For Goals_Against Goal_Diff
## 1         63            26        37
## 2         60            31        29
## 3         44            22        22
## 4         57            38        19
## 5         56            37        19
## 6         32            30         2
## 7         49            30        19
## 8         36            30         6
## 9         43            44        -1
## 10        31            40        -9

 

# Show the current date, i.e. the day on which this scrape was run.
Sys.Date()
## [1] "2022-04-13"

 

Optional - Save Dataframe as Excel File or .csv File

### Optionally write the table to disk; the current date is embedded in each
### file name (e.g. "LaLiga_Table2022-04-13.xlsx").

library(writexl)

## Excel workbook, written with writexl:

write_xlsx(laliga_table, path = paste0("LaLiga_Table", Sys.Date(), ".xlsx"))

## .csv file, written with base R and without the row-name column:

write.csv(
  laliga_table,
  file = paste0("LaLiga_Table", Sys.Date(), ".csv"),
  row.names = FALSE
)