ATP Men’s Tennis Rankings Webscrape

In this web-scraping project in R, I extract the latest men’s singles tennis rankings from the ATP website.

Setup

# Reference: https://stackoverflow.com/questions/45450981/rvest-scrape-2-classes-in-1-tag
  
# Load libraries:
  
library(dplyr)
library(tidyr)
library(rvest)
library(stringr)

Load Page

# ---- ATP men's tennis ----
# Text scraped from this page comes back padded with "\r" characters; every
# extraction step below strips them with gsub() and trimws().

# Download the men's singles rankings page (top 100 only).
atp_link <- read_html("https://www.atptour.com/en/rankings/singles")

# Keep the <tbody> element(s); all later selectors search inside `page`.
page <- html_elements(atp_link, "tbody")

Extract Table Components

To extract the table components with rvest, I use html_nodes() (superseded by the equivalent html_elements() since rvest 1.0.0) to select the data by class name. The class names can be found by right-clicking on the page and choosing Inspect to view the page's HTML.

Ranks

# Ranks: pull the text of every rank cell. The values are still strings padded
# with "\r" here; they are cleaned and converted to numbers in the next step.
men_ranks <- html_text2(
  html_elements(page, "[class='rank-cell border-left-4 border-right-dash-1']")
)

# Show the raw scraped values.
men_ranks

##   [1] "\r 1\r"   "\r 2\r"   "\r 3\r"   "\r 4\r"   "\r 5\r"   "\r 6\r"  
##   [7] "\r 7\r"   "\r 8\r"   "\r 9\r"   "\r 10\r"  "\r 11\r"  "\r 12\r" 
##  [13] "\r 13\r"  "\r 14\r"  "\r 15\r"  "\r 16\r"  "\r 17\r"  "\r 18\r" 
##  [19] "\r 19\r"  "\r 20\r"  "\r 21\r"  "\r 22\r"  "\r 23\r"  "\r 24\r" 
##  [25] "\r 25\r"  "\r 26\r"  "\r 27\r"  "\r 28\r"  "\r 29\r"  "\r 30\r" 
##  [31] "\r 31\r"  "\r 32\r"  "\r 33\r"  "\r 34\r"  "\r 35\r"  "\r 36\r" 
##  [37] "\r 37\r"  "\r 38\r"  "\r 39\r"  "\r 40\r"  "\r 41\r"  "\r 42\r" 
##  [43] "\r 43\r"  "\r 44\r"  "\r 45\r"  "\r 46\r"  "\r 47\r"  "\r 48\r" 
##  [49] "\r 49\r"  "\r 50\r"  "\r 51\r"  "\r 52\r"  "\r 53\r"  "\r 54\r" 
##  [55] "\r 55\r"  "\r 56\r"  "\r 57\r"  "\r 58\r"  "\r 59\r"  "\r 60\r" 
##  [61] "\r 61\r"  "\r 62\r"  "\r 63\r"  "\r 64\r"  "\r 65\r"  "\r 66\r" 
##  [67] "\r 67\r"  "\r 68\r"  "\r 69\r"  "\r 70\r"  "\r 71\r"  "\r 72\r" 
##  [73] "\r 73\r"  "\r 74\r"  "\r 75\r"  "\r 76\r"  "\r 77\r"  "\r 78\r" 
##  [79] "\r 79\r"  "\r 80\r"  "\r 81\r"  "\r 82\r"  "\r 83\r"  "\r 84\r" 
##  [85] "\r 85\r"  "\r 86\r"  "\r 87\r"  "\r 88\r"  "\r 89\r"  "\r 90\r" 
##  [91] "\r 91\r"  "\r 92\r"  "\r 93\r"  "\r 94\r"  "\r 95\r"  "\r 96\r" 
##  [97] "\r 97\r"  "\r 98\r"  "\r 99\r"  "\r 100\r"

As you can see, the scraped items are surrounded by \r characters. I use gsub() to remove the \r characters, trimws() to strip the leftover whitespace, and as.numeric() to convert the strings into numbers.

# Replace each "\r" with a space, trim the padding on both sides, then convert
# the cleaned rank strings to numbers.
men_ranks <- as.numeric(trimws(gsub("[\r]", " ", men_ranks), which = "both"))

Countries

# Countries (abbreviated): read from the `alt` attribute of the <img> element
# inside each country item.
men_country <- html_attr(
  html_element(html_elements(page, "[class='country-item']"), "img"),
  "alt"
)

Players

# Players: scrape the text of each player cell.
atp_players <- html_text2(
  html_elements(
    page,
    "[class='player-cell border-left-dash-1 border-right-dash-1']"
  )
)

# Replace "\r" with spaces and trim the padding around each name.
atp_players <- trimws(gsub("[\r]", " ", atp_players), which = "both")

Player Ages

# Player age: scrape the text of each age cell.
atp_age <- html_text2(
  html_elements(page, "[class='age-cell border-left-dash-1 border-right-4']")
)

# Replace "\r" with spaces, trim the padding, and convert to numeric.
atp_age <- as.numeric(trimws(gsub("[\r]", " ", atp_age), which = "both"))

ATP Ranking Points

# ATP ranking points: scrape the text of each points cell.
atp_pts <- html_text2(
  html_elements(page, "[class='points-cell border-right-dash-1']")
)

# Replace "\r" with spaces and trim the padding.
atp_pts <- trimws(gsub("[\r]", " ", atp_pts), which = "both")

# Drop the comma thousands separators so the strings convert to numbers.
atp_pts <- as.numeric(gsub(",", "", atp_pts))

Tournaments Played

# Tournaments played: scrape the text of each tournament-count cell.
atp_played <- html_text2(
  html_elements(
    page,
    "[class='tourn-cell border-left-dash-1 border-right-dash-1']"
  )
)

# Replace "\r" with spaces, trim the padding, and convert to numeric.
atp_played <- as.numeric(trimws(gsub("[\r]", " ", atp_played), which = "both"))

Points Dropping

# Points dropping: scrape the text of each points-dropping cell.
atp_drop_pts <- page %>%
  html_elements("[class='pts-cell border-left-dash-1 border-right-dash-1']") %>%
  html_text2()

# Replace "\r" with spaces and trim the padding on both sides.
atp_drop_pts <- trimws(gsub("[\r]", " ", atp_drop_pts), which = "both")

# Strip comma thousands separators (as is done for the ranking points) so a
# value such as "1,000" converts to 1000 instead of being coerced to NA.
atp_drop_pts <- as.numeric(gsub(",", "", atp_drop_pts, fixed = TRUE))

## Warning: NAs introduced by coercion

Next Best

# Next best: scrape the text of each next-best cell.
atp_next_best <- html_text2(
  html_elements(page, "[class='next-cell border-left-dash-1 border-right-4']")
)

# Replace "\r" with spaces, trim the padding, and convert to numeric.
atp_next_best <- as.numeric(
  trimws(gsub("[\r]", " ", atp_next_best), which = "both")
)

Create Dataframe

Once the parts of the table have been extracted, we can combine them into a data frame.

### Assemble the men's rankings table from the scraped columns.

# Named list of columns, in the order they should appear in the table.
ranking_columns <- list(
  Rank = men_ranks,
  Player = atp_players,
  Country = men_country,
  Points = atp_pts,
  Tournaments_Played = atp_played,
  Pts_Drop = atp_drop_pts,
  Next_Best = atp_next_best
)

# Equivalent to calling data.frame() with each column as a named argument.
atp_ranks_table <- do.call(data.frame, ranking_columns)

# Show the top 15 players.
head(atp_ranks_table, n = 15)

##    Rank                Player Country Points Tournaments_Played Pts_Drop
## 1     1        Novak Djokovic     SRB   8420                 13       90
## 2     2       Daniil Medvedev     RUS   8410                 23      180
## 3     3      Alexander Zverev     GER   7195                 23       90
## 4     4          Rafael Nadal     ESP   7115                 12      180
## 5     5    Stefanos Tsitsipas     GRE   5980                 26       NA
## 6     6     Matteo Berrettini     ITA   4945                 22        0
## 7     7           Casper Ruud     NOR   4380                 27      360
## 8     8         Andrey Rublev     RUS   4375                 27      600
## 9     9 Felix Auger-Aliassime     CAN   3625                 26        0
## 10   10        Cameron Norrie     GBR   3440                 27        0
## 11   11        Carlos Alcaraz     ESP   3320                 19        0
## 12   12         Jannik Sinner     ITA   3054                 33       45
## 13   13          Taylor Fritz     USA   2875                 30       45
## 14   14        Hubert Hurkacz     POL   2873                 28       45
## 15   15      Denis Shapovalov     CAN   2693                 26        0
##    Next_Best
## 1          0
## 2          0
## 3          0
## 4          0
## 5         45
## 6          0
## 7         90
## 8         45
## 9          0
## 10         0
## 11         0
## 12        20
## 13        35
## 14        10
## 15         0

Optional: Save Dataframe As .csv File

# Show today's date; the same value is used below to stamp the output file name.
Sys.Date()

## [1] "2022-04-13"

## Save the top-100 men's rankings table to a date-stamped .csv file in the
## current working directory, e.g. "ATP_Top100_2022-04-13.csv".
## The ".csv" extension is appended explicitly so the file is recognised as
## CSV; row names are omitted because they only repeat the row index.

write.csv(
  atp_ranks_table,
  file = paste0("ATP_Top100_", Sys.Date(), ".csv"),
  row.names = FALSE
)