In this web scraping project in R, I extract the latest men’s tennis rankings from the ATP website.
Setup
# Reference: https://stackoverflow.com/questions/45450981/rvest-scrape-2-classes-in-1-tag
# Load libraries:
library(dplyr)
library(tidyr)
library(rvest)
library(stringr)
Load Page
# ---- ATP Men's Tennis -------------------------------------------------------
# Text scraped from the page contains \r characters; each extraction below
# strips them with gsub() and trimws().
# -----------------------------------------------------------------------------
# Parse the men's singles rankings page (top 100 only).
atp_link <- read_html("https://www.atptour.com/en/rankings/singles")
# Narrow to the <tbody> element(s); the column extractions search inside these.
page <- html_elements(atp_link, "tbody")
To extract table components in R with rvest, I use html_nodes() to select elements by their class names. The class names can be found by right-clicking on the page and choosing Inspect to view the page's HTML.
Ranks
# Ranks: collect the raw text of every rank cell. Removing \r / whitespace and
# converting the strings to numeric happens in a later step.
men_ranks <- html_text2(
  html_nodes(page, "[class='rank-cell border-left-4 border-right-dash-1']")
)
# Print the raw values to inspect them.
men_ranks
## [1] "\r 1\r" "\r 2\r" "\r 3\r" "\r 4\r" "\r 5\r" "\r 6\r"
## [7] "\r 7\r" "\r 8\r" "\r 9\r" "\r 10\r" "\r 11\r" "\r 12\r"
## [13] "\r 13\r" "\r 14\r" "\r 15\r" "\r 16\r" "\r 17\r" "\r 18\r"
## [19] "\r 19\r" "\r 20\r" "\r 21\r" "\r 22\r" "\r 23\r" "\r 24\r"
## [25] "\r 25\r" "\r 26\r" "\r 27\r" "\r 28\r" "\r 29\r" "\r 30\r"
## [31] "\r 31\r" "\r 32\r" "\r 33\r" "\r 34\r" "\r 35\r" "\r 36\r"
## [37] "\r 37\r" "\r 38\r" "\r 39\r" "\r 40\r" "\r 41\r" "\r 42\r"
## [43] "\r 43\r" "\r 44\r" "\r 45\r" "\r 46\r" "\r 47\r" "\r 48\r"
## [49] "\r 49\r" "\r 50\r" "\r 51\r" "\r 52\r" "\r 53\r" "\r 54\r"
## [55] "\r 55\r" "\r 56\r" "\r 57\r" "\r 58\r" "\r 59\r" "\r 60\r"
## [61] "\r 61\r" "\r 62\r" "\r 63\r" "\r 64\r" "\r 65\r" "\r 66\r"
## [67] "\r 67\r" "\r 68\r" "\r 69\r" "\r 70\r" "\r 71\r" "\r 72\r"
## [73] "\r 73\r" "\r 74\r" "\r 75\r" "\r 76\r" "\r 77\r" "\r 78\r"
## [79] "\r 79\r" "\r 80\r" "\r 81\r" "\r 82\r" "\r 83\r" "\r 84\r"
## [85] "\r 85\r" "\r 86\r" "\r 87\r" "\r 88\r" "\r 89\r" "\r 90\r"
## [91] "\r 91\r" "\r 92\r" "\r 93\r" "\r 94\r" "\r 95\r" "\r 96\r"
## [97] "\r 97\r" "\r 98\r" "\r 99\r" "\r 100\r"
As you can see, there are a lot of \r characters surrounding the scraped items. I use gsub() and trimws() to remove the \r characters and the surrounding whitespace, and then convert the strings into numbers.
# Swap each carriage return for a space, trim both ends, then convert the
# remaining digit strings to numeric.
men_ranks <- trimws(gsub("[\r]", " ", men_ranks), which = "both")
men_ranks <- as.numeric(men_ranks)
Countries
# Countries (abbreviated): read the alt attribute of the <img> element found
# inside each country cell.
men_country <- html_attr(
  html_element(html_nodes(page, "[class='country-item']"), "img"),
  "alt"
)
Players
# Players: raw text of every player cell, with carriage returns replaced by
# spaces and both ends trimmed.
atp_players <- html_text2(
  html_nodes(page, "[class='player-cell border-left-dash-1 border-right-dash-1']")
)
atp_players <- trimws(gsub("[\r]", " ", atp_players), which = "both")
Player Ages
# Player age: raw text of every age cell, cleaned of \r and surrounding
# whitespace, then converted to numeric.
atp_age <- html_text2(
  html_nodes(page, "[class='age-cell border-left-dash-1 border-right-4']")
)
atp_age <- trimws(gsub("[\r]", " ", atp_age), which = "both")
atp_age <- as.numeric(atp_age)
ATP Ranking Points
# ATP points: remove \r and surrounding whitespace, drop the commas, and
# convert to numeric.
atp_pts <- html_text2(
  html_nodes(page, "[class='points-cell border-right-dash-1']")
)
atp_pts <- trimws(gsub("[\r]", " ", atp_pts), which = "both")
# as.numeric() cannot parse strings that contain commas, so remove them first.
atp_pts <- as.numeric(gsub(",", "", atp_pts))
Tournaments Played
# Tournaments played: raw text of every tournament-count cell, cleaned of \r
# and surrounding whitespace, then converted to numeric.
atp_played <- html_text2(
  html_nodes(page, "[class='tourn-cell border-left-dash-1 border-right-dash-1']")
)
atp_played <- trimws(gsub("[\r]", " ", atp_played), which = "both")
atp_played <- as.numeric(atp_played)
Points Dropping
# Points dropping: raw text of every points-dropping cell.
atp_drop_pts <- page %>%
  html_nodes("[class='pts-cell border-left-dash-1 border-right-dash-1']") %>%
  html_text2()
# Replace carriage returns with spaces and trim both ends.
atp_drop_pts <- gsub("[\r]", " ", atp_drop_pts) %>% trimws(which = "both")
# Remove commas before converting, exactly as is done for the ranking points.
# as.numeric() turns a string such as "1,000" into NA with a coercion warning;
# presumably that is what produced the NA seen for large values in this column.
atp_drop_pts <- gsub(",", "", atp_drop_pts, fixed = TRUE)
atp_drop_pts <- as.numeric(atp_drop_pts)
## Warning: NAs introduced by coercion
Next Best
# Next best: raw text of every next-best cell, cleaned of \r and surrounding
# whitespace, then converted to numeric.
atp_next_best <- html_text2(
  html_nodes(page, "[class='next-cell border-left-dash-1 border-right-4']")
)
atp_next_best <- trimws(gsub("[\r]", " ", atp_next_best), which = "both")
atp_next_best <- as.numeric(atp_next_best)
Once the parts of the table have been extracted we can form the dataframe.
# Assemble the scraped column vectors into the men's rankings data frame.
# NOTE(review): atp_age is scraped earlier but is not one of the columns here;
# confirm whether an Age column is wanted.
atp_ranks_table <- data.frame(
  Rank = men_ranks, Player = atp_players, Country = men_country,
  Points = atp_pts, Tournaments_Played = atp_played,
  Pts_Drop = atp_drop_pts, Next_Best = atp_next_best
)
# Preview the first 15 rows of the table.
head(atp_ranks_table, n = 15)
## Rank Player Country Points Tournaments_Played Pts_Drop
## 1 1 Novak Djokovic SRB 8420 13 90
## 2 2 Daniil Medvedev RUS 8410 23 180
## 3 3 Alexander Zverev GER 7195 23 90
## 4 4 Rafael Nadal ESP 7115 12 180
## 5 5 Stefanos Tsitsipas GRE 5980 26 NA
## 6 6 Matteo Berrettini ITA 4945 22 0
## 7 7 Casper Ruud NOR 4380 27 360
## 8 8 Andrey Rublev RUS 4375 27 600
## 9 9 Felix Auger-Aliassime CAN 3625 26 0
## 10 10 Cameron Norrie GBR 3440 27 0
## 11 11 Carlos Alcaraz ESP 3320 19 0
## 12 12 Jannik Sinner ITA 3054 33 45
## 13 13 Taylor Fritz USA 2875 30 45
## 14 14 Hubert Hurkacz POL 2873 28 45
## 15 15 Denis Shapovalov CAN 2693 26 0
## Next_Best
## 1 0
## 2 0
## 3 0
## 4 0
## 5 45
## 6 0
## 7 90
## 8 45
## 9 0
## 10 0
## 11 0
## 12 20
## 13 35
## 14 10
## 15 0
Sys.Date()
## [1] "2022-04-13"
# Save the rankings table as a date-stamped CSV file in the working directory.
# The file name now carries the ".csv" extension; previously it was written
# without one (e.g. "ATP_Top100_2022-04-13").
write.csv(
  atp_ranks_table,
  file = paste0("ATP_Top100_", Sys.Date(), ".csv"),
  row.names = FALSE
)