欢迎大家讨论。
本帖隐藏的内容
########################################################
# Contents:
# 1. Read the target URL
# 2. Build request headers
# 3. Fetch the web page
# 4. Extract the food-deal page content
# 5. Build a urllist
# 6. Loop over the urls and parse each page
# 7. Write results to a file
#########################################################
# 0. Initialization
# NOTE(review): machine-specific path — consider making this configurable.
# The original also ran rm(list = ls()); wiping the global workspace from a
# script is an anti-pattern and has been removed.
setwd("E:/R/RCode/RCurl")

# 1. Target URL
library(RCurl)
url <- "http://hefei.lashou.com/cate/meishi"
# Fail fast if the site is unreachable (the original discarded this result).
if (!url.exists(url)) {
  stop("URL is not reachable: ", url, call. = FALSE)
}

# 2. Request headers (mimic a desktop browser so the site serves full HTML)
myheader <- c(
  "User-Agent"      = "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ",
  "Accept"          = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" = "en-us",
  "Connection"      = "keep-alive",
  "Accept-Charset"  = "GB2312,utf-8;q=0.7,*;q=0.7"
)

# 3. Fetch the page.
# BUG FIX: getURL()'s decoding argument is `.encoding`. A bare `encoding=`
# is swallowed by `...` as a curl option and sets the Accept-Encoding
# request header instead of decoding the response body.
temp <- getURL(url, httpheader = myheader, .encoding = "UTF-8")

# 4. Extract the food-deal page content: name, description, deal price,
#    original price
library(XML)
#' Extract deal name, description, group-buy price and original price
#' from one lashou.com food-category listing page.
#'
#' @param page Raw HTML of a listing page (as returned by getURL()).
#' @return A data.frame with character columns meishi_name, meishi_text,
#'   Count_price and org_price, one row per deal.
pageContent <- function(page) {
  doc <- htmlParse(page)

  # Extract the text of every node matching an XPath.
  # vapply() (unlike the original sapply()) guarantees character(0) on
  # zero matches instead of an empty list, so data.frame() below fails
  # loudly and uniformly rather than building a ragged frame.
  grab <- function(xpath) {
    vapply(getNodeSet(doc, xpath), xmlValue, character(1))
  }

  meishi_name <- grab('//div[@class="goods "]//a[@class="goods-name"]//text()')
  meishi_text <- grab('//div[@class="goods "]//a[@class="goods-text"]//text()')
  Count_price <- grab('//div[@class="goods "]//span[@class="price"]')
  org_price   <- grab('//div[@class="goods "]//span[@class="money"]//del//text()')

  # stringsAsFactors = FALSE keeps scraped text as character on R < 4.0.
  # Explicit final expression replaces the original's trailing
  # assignment, which returned the frame invisibly.
  data.frame(meishi_name, meishi_text, Count_price, org_price,
             stringsAsFactors = FALSE)
}
# 5. Build the URL list (pagination: page1 .. page20)
page_num <- seq_len(20)
urllist <- paste0("http://hefei.lashou.com/cate/meishi/page", page_num)

# 6. Fetch and parse every page.
# BUG FIX: the original assigned pageContent() straight to Page_Results
# inside the loop, overwriting the previous iteration — only the last
# page ever reached the output file. Collect per-page frames in a
# preallocated list and bind them once at the end (never rbind() in a
# loop).
page_results_list <- vector("list", length(urllist))
for (i in seq_along(urllist)) {
  # `.encoding` (not `encoding`) is getURL()'s response-decoding argument.
  page_html <- getURL(urllist[i], httpheader = myheader, .encoding = "UTF-8")
  page_results_list[[i]] <- pageContent(page_html)
  Sys.sleep(2)  # throttle requests; be polite to the server
}
Page_Results <- do.call(rbind, page_results_list)

# 7. Write the combined results to a file
write.table(Page_Results, "meishi.txt")


雷达卡





京公网安备 11010802022788号







