我用RCurl抓取拉手网的电影网页上信息,是借用论坛上一位仁兄的帖子,我增加了一个变量抓取,出现了一下问题:
错误于data.frame(goods_name, goods_text, price, org_price, snumber) :
arguments imply differing number of rows: 5, 3
大家有解决方法吗?
具体程序如下:
library(bitops)
library(RCurl)
library(XML)
start_url = "http://shanghai.lashou.com/cate/dianying"
cust_header =c("User-Agent"="Mozilla/5.0 (Windows NT 6.1; WOW64; rv:26.0) Gecko/20100101 Firefox/26.0","Accept"="text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8","Accept-Language"="en-us","Connection"="keep-alive"
pagesource <- getURL(start_url,httpheader=cust_header,.encoding="utf-8"
parseTotalPage <- function(pagesource)
{
doc <- htmlParse(pagesource)
as.numeric(sapply(getNodeSet(doc, '//div[@class="page"]/a[last()-1]/text()'), xmlValue))
}
parseContent <- function(pagesource)
{
doc <- htmlParse(pagesource)
goods_name <- sapply(getNodeSet(doc, '//div[contains(@class,"goods"]//a[@class="goods-name"]//text()'), xmlValue)
goods_text <- sapply(getNodeSet(doc, '//div[contains(@class,"goods"]//a[@class="goods-text"]//text()'), xmlValue)
price <- sapply(getNodeSet(doc, '//div[contains(@class,"goods"]//span[@class="price"]/text()'), xmlValue)
org_price <- sapply(getNodeSet(doc, '//div[contains(@class,"goods"]//span[@class="money"]/del/text()'), xmlValue)
snumber <- sapply(getNodeSet(doc, '//div[contains(@class,"goods"]//span[@class="number"]/i/text()'), xmlValue)
result <- data.frame(goods_name, goods_text, price, org_price, snumber)
}
total_page <- parseTotalPage(pagesource)
pageresults <- parseContent(pagesource)
page = 1total_page-1)
url_list = ""
url_list= paste0("http://shanghai.lashou.com/cate/dianying/page",page +1)
for (url in url_list)
{
pagesource <- getURL(url,httpheader=cust_header,.encoding="utf-8"
pageresult <- parseContent(pagesource)
pageresults <- rbind(pageresults,pageresult)
}
write.table(pageresults,"d://lashoumove.txt"