阅读权限 255 威望 0 级论坛币 14858 个 通用积分 67.6972 学术水平 13 点 热心指数 14 点 信用等级 9 点 经验 51098 点 帖子 96 精华 0 在线时间 638 小时 注册时间 2009-4-20 最后登录 2024-3-15
给个可用的版本：用了多线程，但页面需要加载 JS，效率不高，解析时 CPU 占用比较高，线程数要根据自己电脑的配置来控制。改用 PhantomJS 也快不了多少，有兴趣可以尝试一下。
# -*- coding:UTF-8 -*-
from threading import Thread
from queue import Queue
from datetime import datetime
from sqlalchemy import create_engine,String,Integer,DATE
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import pandas as pd
import time
# Every worker thread writes into this one SQLite file.
engine = create_engine('sqlite:///PM25.db')

# Cities whose history pages are scraped (the names are used verbatim in the URL).
cities = [
    "北京", "上海", "天津", "重庆", "杭州", "哈尔滨",
    "长春", "沈阳", "石家庄", "太原", "西安", "济南",
    "乌鲁木齐", "拉萨", "西宁", "兰州", "银川", "郑州",
    "南京", "武汉", "合肥", "福州", "南昌", "长沙",
    "贵阳", "成都", "广州", "昆明", "南宁", "深圳",
]

# Column labels given to the scraped table cells, in page order.
columns = [
    '日期',
    'AQI',
    '质量等级',
    'PM2.5',
    'PM10',
    'SO2',
    'CO',
    'NO2',
    'O3_8h',
]

# SQL type per column: everything is an integer except the two text columns.
dtypes = {label: Integer for label in columns}
dtypes["日期"] = String(12)
dtypes["质量等级"] = String(10)
def prepare_month(start, end, fmt='%Y%m'):
    """Return every calendar month from *start* to *end*, formatted with *fmt*.

    Both endpoints are inclusive at month granularity: the months containing
    ``start`` and ``end`` are always part of the result, whatever day of the
    month they fall on.

    Args:
        start: First date of the range (anything pandas can parse,
            e.g. ``'2013-12-1'``).
        end: Last date of the range.
        fmt: ``strftime`` pattern applied to each month; defaults to
            ``'%Y%m'``.

    Returns:
        A list of strings, one per month, in chronological order. Empty when
        ``start`` lies after ``end``.
    """
    # Monthly periods rather than date_range(freq='M'): the latter produces
    # month-END timestamps, so an ``end`` such as '2017-11-1' silently lost
    # its own month.
    periods = pd.period_range(start=start, end=end, freq='M')
    return [period.strftime(fmt) for period in periods]
def get_data(tasks):
    """Worker: scrape daily air-quality tables and append them to the database.

    Starts its own Chrome instance, then pulls ``(city, month)`` pairs from
    *tasks* until the queue is drained. For each pair it loads the
    aqistudy.cn daily-history page, collects the text of every ``<td>`` in
    every ``<tr>`` after the first one, and appends the rows (plus a ``city``
    column) to the ``jiance`` table through the module-level ``engine``.

    Args:
        tasks: A ``queue.Queue`` of ``(city, month)`` tuples shared by all
            worker threads.

    Raises:
        Whatever Selenium, pandas or the database layer raise; the browser is
        shut down before the exception propagates.
    """
    from queue import Empty  # local import: the file header only imports Queue

    # For chrome
    chrome_options = webdriver.ChromeOptions()
    # Presumably turns image loading off (mirrors loadImages=False in the
    # PhantomJS variant below).
    chrome_options.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2})
    # NOTE(review): ``chrome_options=`` and ``find_elements_by_css_selector``
    # are the Selenium 3 spellings; Selenium 4 expects ``options=`` and
    # ``find_elements(By.CSS_SELECTOR, ...)``. Confirm the installed version.
    dr = webdriver.Chrome(chrome_options=chrome_options)
    # # For phantomjs
    # dcap = dict(DesiredCapabilities.PHANTOMJS)
    # dcap['phantomjs.page.settings.loadImages']=False
    # dr=webdriver.PhantomJS(desired_capabilities=dcap)
    try:
        while True:
            # get_nowait() instead of "while not empty(): get()": with several
            # workers the queue can be emptied between the check and the get,
            # which would block this thread forever and hang join().
            try:
                city, month = tasks.get_nowait()
            except Empty:
                break
            dr.get('https://www.aqistudy.cn/historydata/daydata.php?city={0}&month={1}'.format(city, month))
            # The table is filled in by JavaScript; give it time to render.
            time.sleep(2)
            # Skip the first <tr> and read the cell texts of the rest.
            rows = [
                [td.text for td in tr.find_elements_by_css_selector('td')]
                for tr in dr.find_elements_by_css_selector('tr')[1:]
            ]
            df = pd.DataFrame(rows, columns=columns)
            df['city'] = city
            df.to_sql("jiance", engine, if_exists='append', index=False, dtype=dtypes)
    finally:
        # Always release the browser process, even when a page fails.
        dr.quit()
if __name__ == '__main__':
    # Queue one (city, month) task for every page that has to be scraped.
    tasks = Queue()
    months = prepare_month(start='2013-12-1', end='2017-11-1')
    for city in cities:
        for month in months:
            tasks.put((city, month))

    # Each worker starts its own browser, so keep the thread count in line
    # with what the machine can handle.
    threads = []
    for _ in range(5):
        thread = Thread(target=get_data, args=(tasks,))
        threads.append(thread)
        thread.start()

    # Wait for every worker to finish before the script exits.
    for thread in threads:
        thread.join()
总评分: 经验 + 60
论坛币 + 5
查看全部评分