我抓取了一页，并摸清了“下一页”链接的获取规律（详见代码）。我想把这段抓取代码循环执行，比如最多循环 100 次，或者在页面中找不到“下一页”链接时自动停止，最终得到一个变量 page_next（每行一个链接网址）。
library(xml2)
library(rvest)
library(dplyr)

# Crawl the "next page" links of a Tmall search result, starting from `url`.
# The crawl stops when a page has no "a.ui-page-next" link, when a page
# cannot be read, when a URL repeats, or after `max_pages` pages.
# Result: `page_next`, a character vector with one next-page URL per element.

# Start page.
url <- "https://list.tmall.com/search_product.htm?spm=a220m.1000858.1000724.9.jNt2XM&cat=50892008&brand=30652&q=%C3%C0%B5%C4&sort=s&style=l&from=sn_1_cat-qp&industryCatId=50892008&tmhkmain=0#J_Filter"

# Prefix that every "next page" href is appended to (same prefix the original
# script used). NOTE(review): assumes the href is a relative fragment that is
# valid after this prefix -- confirm against a live page.
base_url <- "https://list.tmall.com/search_product.htm?spm=a220m.1000858.1000724.9.jNt2XM"

# Upper bound on the number of pages to follow.
max_pages <- 100L

# Read one result page and return the href(s) of its "next page" link.
# Returns character(0) when the page has no such link.
get_next_href <- function(page_url) {
  read_html(page_url, encoding = "GBK") %>%
    html_nodes("a.ui-page-next") %>%
    html_attr("href") %>%
    as.character() %>%
    iconv(from = "utf-8", to = "gbk")
}

# Preallocate; trimmed to the number of links actually found after the loop.
page_next <- character(max_pages)
n_found <- 0L
current_url <- url

for (i in seq_len(max_pages)) {
  href <- tryCatch(
    get_next_href(current_url),
    error = function(e) {
      warning(
        "Failed to read ", current_url, ": ", conditionMessage(e),
        call. = FALSE
      )
      character(0)
    }
  )

  # iconv() yields NA for strings it cannot convert; treat those as missing.
  href <- href[!is.na(href)]
  if (length(href) == 0L) {
    break # no "next page" link: last page reached (or the read failed)
  }

  # A page may contain more than one matching node; follow the first only.
  candidate <- paste0(base_url, href[1])

  # Guard against a pagination cycle.
  if (candidate %in% page_next[seq_len(n_found)]) {
    break
  }

  n_found <- n_found + 1L
  page_next[n_found] <- candidate
  current_url <- candidate
}

page_next <- page_next[seq_len(n_found)]


雷达卡



京公网安备 11010802022788号







