# Download the product images referenced by a single product page.
#
# Steps: read the page source line by line, keep the lines that carry a
# .jpg image URL, extract the URL from each of them, and save every image
# into the current working directory under its original file name.
#
# NOTE: the previous `rm(list = ls())` was dropped on purpose; nothing below
# depends on an empty workspace, and wiping the caller's objects is a
# destructive side effect.

library(RCurl) # not called directly below; kept because it was loaded before
library(downloader)

url <- "http://www.bathandbodyworks.com/product/index.jsp?productId=23418996&cp=12586965.12587140.4191845"

# Fetch the page source; each element of `web` holds one line of the page.
web <- readLines(url, encoding = "UTF-8")

# A line ending in "jpg'," announces that the *following* line is a
# candidate; keep only those following lines that mention "jpg" themselves.
next_idx <- grep("jpg',+$", web) + 1L
next_idx <- next_idx[next_idx <= length(web)] # a match on the last line has no successor
name <- grep("jpg", web[next_idx], fixed = TRUE, value = TRUE)

# Extract the image address with a regular expression:
# http://<host>/<dir>/<dir>/<file>. The trailing "+" makes the match cover
# the whole file name, so no fixed-width padding or quote stripping is needed.
pattern <- "http://[-A-Za-z0-9_.%]+/[-A-Za-z0-9_.%]+/[-A-Za-z0-9_.%]+/[-A-Za-z0-9_.%]+"

# regexpr() reports the first match per line; regmatches() silently drops
# the lines that contain no match at all.
downurl <- regmatches(name, regexpr(pattern, name))

# The last path component is used as the local file name.
filenames <- basename(downurl)

if (length(downurl) == 0L) {
  message("No image URLs found in ", url)
}

# seq_along() keeps the loop from running when nothing was found.
for (i in seq_along(downurl)) {
  # Pass the URL as-is: routing it through sprintf() would treat any "%"
  # escape in the address as a format specifier.
  download(downurl[i], filenames[i], mode = "wb")
}


雷达卡






京公网安备 11010802022788号







