-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy path01_scrape_ATP.Rmd
111 lines (83 loc) · 3.07 KB
/
01_scrape_ATP.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
---
title: "01_scrape ATP tennis ranking"
author: "Duc-Quang Nguyen"
date: "14 June 2016"
output: html_document
---
# Scrape
## ATP world tour
* [ATP ranking singles](http://www.atpworldtour.com/en/rankings/singles)
* [all former n°1](http://www.atpworldtour.com/en/rankings/former-no-1s)
## rvest
* [rvest tutorial: scraping the web using R](http://stat4701.github.io/edav/2015/04/02/rvest_tutorial/)
```{r setup, include=FALSE}
library(rvest)
library(magrittr)
library(dplyr)
library(zoo)
```
## Get the url of all n°1 players
**Data**
* This [page](http://www.atpworldtour.com/en/rankings/former-no-1s) lists all former n°1 players
* All n° 1 players in the table are of class .player-cell
* In Chrome, use SelectorGadget to check that the '.player-cell' selector picks out exactly the n°1 players
```{r get n1s}
# Parse the page that lists every former world n°1.
url <- read_html("http://www.atpworldtour.com/en/rankings/former-no-1s")

# Every n°1 sits in a node of class ".player-cell"; the anchor inside it
# carries the (site-relative) link to the player's page.
players <- url %>%
  html_nodes(".player-cell")
raw.links <- players %>% html_nodes("a")

# Read the href attribute directly rather than running regexes over the
# serialized <a ...> markup, which breaks as soon as the attribute order or
# the trailing attributes of the anchor change.
links <- html_attr(raw.links, "href")

# Player names: the cell text with the CR/LF pairs and the runs of four tabs
# removed. The patterns are kept exactly as they were because the resulting
# names end up as column/row labels in the CSV files written further down.
names <- gsub("\\\r\\\n", "", players %>% html_text())
names <- gsub("\\\t\\\t\\\t\\\t", "", names)

# One link per name is required; otherwise `names(links) <- names` below
# would either fail or silently pad with NA.
stopifnot(length(links) == length(names))

# For every player page, the historical ranking page is "rankings-history"
# instead of "overview". Keep the overview URLs too: they are used later to
# look up each player's date of birth.
ov.url <- paste0("http://www.atpworldtour.com", links)
links <- paste0(
  "http://www.atpworldtour.com",
  gsub("overview", "rankings-history", links)
)
names(links) <- names
# Nationality of each player: every ".country-item" node references a flag
# image under ".../flags/<code>.svg" (or .png); pull out <code>, upper-case
# it, label it with the player names and save the result.
nat <- url %>%
  html_nodes(".country-item") %>%
  gsub(pattern = ".*flags\\/(.*)\\.(svg|png).*", replacement = "\\1", x = .) %>%
  toupper() %>%
  setNames(names)
write.csv(nat, file = "input/alln1_nationlity.csv")
```
## Download historical ranking of n°1s
```{r get historical rankings}
# Download the ranking history of every former n°1.
# `links` holds one "rankings-history" URL per player (named by player); the
# result `atp` is a list with one zoo series per player: the singles rank,
# indexed by the ranking date formatted as "%m/%d/%Y".
# (A leftover debugging assignment, `link <- links[25]`, used to sit here; it
# had no effect on the result and was removed.)
atp <- lapply(links, function(link) {
  # progress trace
  cat("\n", link)

  tables <- read_html(link) %>%
    html_nodes(".mega-table") %>%
    html_table()
  if (length(tables) == 0) {
    stop("No '.mega-table' ranking table found at ", link, call. = FALSE)
  }

  # only the singles ranking is kept
  rtable <- tables[[1]] %>% select(-Doubles)

  # dates are parsed as YYYY.MM.DD and stored back as MM/DD/YYYY strings
  rank_date <- as.Date(rtable$Date, format = "%Y.%m.%d")
  rtable$Date <- format(rank_date, "%m/%d/%Y")

  # strip the "T" some ranks carry (presumably marks a tied rank - confirm)
  rtable$Singles <- as.numeric(gsub("T", "", rtable$Singles))
  # a rank of 0 is not a real rank: treat it as missing
  rtable$Singles[rtable$Singles %in% 0] <- NA

  zoo(rtable$Singles, rtable$Date)
})
# Collect every ranking date that appears for at least one player.
# lapply() is used instead of sapply(): if all series happened to have the
# same length, sapply() would simplify to a matrix and unique() would then
# drop duplicated rows instead of duplicated dates.
dates <- unique(unlist(lapply(atp, function(dff) index(dff)), use.names = FALSE))
# The zoo indices are "%m/%d/%Y" strings; go through Date to sort them
# chronologically, then back to the same string format.
dates <- as.Date(dates, format = "%m/%d/%Y")
dates <- format(dates[order(dates)], "%m/%d/%Y")

## save a vanilla data.frame
# One row per date, one column per player; cells stay NA where a player has
# no ranking on that date.
n1.ranking <- matrix(nrow = length(dates), ncol = length(atp))
colnames(n1.ranking) <- names(atp)
rownames(n1.ranking) <- dates
for (i in seq_along(atp)) {
  # the row names are exactly `dates`, so the series index can be matched
  # against it directly
  idx <- match(index(atp[[i]]), dates)
  n1.ranking[idx, i] <- coredata(atp[[i]])
}

# Sanity check: zero ranks were converted to NA when the series were built, so
# none may remain. (The former check compared which(...) with integer(0),
# which always yields logical(0) and therefore could never fail.)
stopifnot(!any(n1.ranking == 0, na.rm = TRUE))
write.csv(n1.ranking, file = "input/alln1_ATP_ranking.csv", row.names = TRUE)
# Get each player's date of birth from his overview page.
# The ".table-birthday" node is searched for a dddd.dd.dd string (presumably
# YYYY.MM.DD - confirm against the site). vapply() guarantees exactly one
# string per player: the first node containing such a date is used, and NA is
# stored when none does, so the result is always a plain character vector
# that write.csv() can handle.
dob <- vapply(ov.url, function(player_url) {
  # progress trace
  cat("\n", player_url)
  birthday_html <- read_html(player_url) %>%
    html_nodes(".table-birthday") %>%
    as.character()
  date_pattern <- ".*(\\d{4}\\.\\d{2}\\.\\d{2}).*"
  has_date <- grepl(date_pattern, birthday_html)
  if (!any(has_date)) {
    return(NA_character_)
  }
  gsub(date_pattern, "\\1", birthday_html[has_date][1])
}, character(1))
names(dob) <- names
write.csv(dob, file = "input/alln1_dob.csv")
```