《Automated Data Collection with R》第一、二章的代码非常繁琐,我花了一天的时间对代码进行了优化。
library(rvest)
library(RCurl)
# Create the output folder. The warning is suppressed so the script can be
# re-run when the folder already exists.
out.dir <- "Bills_111"
dir.create(out.dir, showWarnings = FALSE)
# Iterate over all 4059 pieces of legislation
for (i in seq_len(4059)) {
  # Build the unique URL for this piece of legislation. paste0() is used
  # because stringr (the home of str_c()) is never attached by this script.
  url <- paste0(
    "http://thomas.loc.gov/cgi-bin/bdquery/D?d111:",
    i,
    ":./list/bss/d111SN.lst:@@@P"
  )
  # Download the page
  bill.result <- getURL(url)
  # Write the page to the local hard drive as Bills_111/Bill_111_S<i>.html
  write(bill.result, file.path(out.dir, paste0("Bill_111_S", i, ".html")))
  # Print progress of download
  cat(i, "\n")
}
# Simplified section
# Parse one locally saved bill page and extract its sponsor and cosponsors.
#
# i: bill number; the page is read from "Bills_111/Bill_111_S<i>.html".
# Returns an unnamed list of two elements: the sponsor text and a character
# vector of cosponsor texts, each with every occurrence of "Sen" removed.
myfun <- function(i) {
  # paste0() instead of str_c(): stringr is never attached by this script.
  path <- paste0("Bills_111/Bill_111_S", i, ".html")
  # read_html() replaces the deprecated html(), which current rvest releases
  # no longer provide. The page is parsed once and reused for both queries.
  page <- read_html(path)
  # The sponsor is the text of the 15th link found inside a <div>; this fixed
  # position raises an error when the page has fewer such links.
  sponsors <- page %>%
    html_nodes("div a") %>%
    html_text() %>%
    .[[15]] %>%
    gsub("Sen", "", .)
  # Cosponsors are the texts of the links inside <p> elements, minus the first.
  cosponsors <- page %>%
    html_nodes("p a") %>%
    html_text() %>%
    .[-1] %>%
    gsub("Sen", "", .)
  list(sponsors, cosponsors)
}
# Number of bills to parse (S.1 through S.4059)
n.bills <- 4059
# One entry per bill: list(sponsor, cosponsors) as returned by myfun()
sponsor.list <- lapply(seq_len(n.bills), myfun)
# Manual correction: overwrite the sponsor parsed for the first bill.
sponsor.list[[1]][1] <- " Reid, Harry"
# Sorted unique names across all bills with the last sorted entry dropped
# (presumably a non-senator string -- confirm); 109 senators remain.
all.senators <- head(sort(unique(unlist(sponsor.list))), -1L)
# Create a matrix of sponsors: one row per bill, one column per senator.
# Dimensions are derived from sponsor.list so they cannot drift from it.
sponsor.matrix <- matrix(
  NA,
  nrow = length(sponsor.list),
  ncol = length(all.senators),
  dimnames = list(paste0("S.", seq_along(sponsor.list)), all.senators)
)
for (i in seq_along(sponsor.list)) {
  # [[ ]] extracts the character values themselves rather than one-element
  # sub-lists. Cosponsor is written second, so it wins if a name is in both.
  sponsor.matrix[i, all.senators %in% sponsor.list[[i]][[1]]] <- "Sponsor"
  sponsor.matrix[i, all.senators %in% sponsor.list[[i]][[2]]] <- "Cosponsor"
}
> sponsor.matrix[30:35,31:34]
Cornyn, John Crapo, Mike DeMint, Jim Dodd, Christopher J.
S.30 NA NA NA NA
S.31 NA NA NA NA
S.32 NA NA NA NA
S.33 NA NA NA NA
S.34 "Cosponsor" "Cosponsor" "Sponsor" NA
S.35 "Cosponsor" NA NA NA