各位老师,我在用rvest爬取网页数据时,遇到翻页问题,不知道该如何设置循环来一次性抓取,希望各位老师指点:
以下是我做的几次尝试,但是页面仍然只是显示一页。
(1)
n=seq(0,860,length=44)
page=function(n)
{
url=str_c('https://yz.chsi.com.cn/sch/?start=',n)
web=read_html(url)
university=html_nodes(web,'body > div.main-wrapper > div.container > div.yxk-table > table > tbody > tr > td:nth-child(1) > a')%>%html_text()
location=html_nodes(web,'body > div.main-wrapper > div.container > div.yxk-table > table > tbody > tr > td:nth-child(2)')%>%html_text()
subo=html_nodes(web,'body > div.main-wrapper > div.container > div.yxk-table > table > tbody > tr > td:nth-child(3)')%>%html_text()
link=html_nodes(web,'body > div.main-wrapper > div.container > div.yxk-table > table > tbody > tr > td:nth-child(7) > a')%>%html_attrs()
site=rep('https://yz.chsi.com.cn/sch/?start=0',20*43)
link1=paste(site,link,sep = '')
messages=data.frame(university,location,subo,link1)
return(messages)
}
(2)
page=function(n)
{
x=seq(0,860,length=44)
s=c(x)
url=str_c('https://yz.chsi.com.cn/sch/?start=',s[[n]])
web=read_html(url)
university=html_nodes(web,'body > div.main-wrapper > div.container > div.yxk-table > table > tbody > tr > td:nth-child(1) > a')%>%html_text()
location=html_nodes(web,'body > div.main-wrapper > div.container > div.yxk-table > table > tbody > tr > td:nth-child(2)')%>%html_text()
subo=html_nodes(web,'body > div.main-wrapper > div.container > div.yxk-table > table > tbody > tr > td:nth-child(3)')%>%html_text()
link=html_nodes(web,'body > div.main-wrapper > div.container > div.yxk-table > table > tbody > tr > td:nth-child(7) > a')%>%html_attrs()
site=rep('https://yz.chsi.com.cn/sch/?start=',20*43)
link1=paste(site,link,sep = '')
messages=data.frame(university,location,subo,link1)
return(messages)
for (n in 1:length(s)) {
messages=rbind2(messages,page(n))
}
}
(3)
x=seq(0,860,length=44)
s=c(x)
i=1
for (n in 1:length(s)) {
page=function(i){
url=str_c('https://yz.chsi.com.cn/sch/?start=',s[i])
web=read_html(url)
university=html_nodes(web,'body > div.main-wrapper > div.container > div.yxk-table > table > tbody > tr > td:nth-child(1) > a')%>%html_text()
location=html_nodes(web,'body > div.main-wrapper > div.container > div.yxk-table > table > tbody > tr > td:nth-child(2)')%>%html_text()
link=html_nodes(web,'body > div.main-wrapper > div.container > div.yxk-table > table > tbody > tr > td:nth-child(7) > a')%>%html_attrs()
site=rep('https://yz.chsi.com.cn/sch/',20)
link1=paste(site,link,sep = '')
messages=data.frame(university,location,link1)
messagess=rbind2(messages,page(i))
}
}


雷达卡


京公网安备 11010802022788号







