【原创】【申精】如题,可以对网站上的内容进行爬取,大神请路过,小白可以参考!上手就能用!这是爬取nature上的文章标题,关键词:***和***,2021-2024年的,你可以根据自己的要求进行适当修改,供坛友参考!
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import matplotlib.pyplot as plt
# --- Search configuration ----------------------------------------------------
# nature.com search query; keywords are redacted as *** in the original post —
# substitute your own terms and date range as needed.
date_range = '&date_range=2021-2024'
base_url = 'https://www.nature.com/search?q=***&***&order=relevance'
first_page = '&page=1'
url = base_url + date_range + first_page
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
}

# Fetch page 1 once, to discover the total page count and result count.
response = requests.get(url=url, headers=headers)
response.raise_for_status()  # fail fast instead of silently parsing an error page
html = response.text
soup = BeautifulSoup(html, 'html.parser')

# The second-to-last <li> of the pagination widget links to the last page.
pages = soup.find(class_="c-pagination").find_all("li")
j = len(pages) - 2
# Pull the first run of digits out of the link text.  The original used a fixed
# slice b[6:7], which only worked while the last page number had one digit.
b = int(re.search(r'\d+', pages[j].a.text).group()) + 1

# Total result count shown in the page header, e.g. "123 results".
# The original sliced k[0:3], which is wrong for any count that is not exactly
# three characters; keep the leading digits (commas included) instead.
# NOTE(review): assumes the count is the first number in that text — confirm
# against the live markup if nature.com changes its layout.
k = soup.find(class_="u-display-flex").span.next_sibling.text
_count_match = re.search(r'\d[\d,]*', k)
k = _count_match.group() if _count_match else k.strip()
def get_list(page):
    """Fetch one search-result page and return its article ``<li>`` entries.

    page: 1-based page number appended to the query URL.
    Returns the list of ``<li>`` Tag objects found inside the article list row.
    """
    url = base_url + date_range + '&page=' + str(page)
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
    }
    response = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Renamed from ``list``: never shadow the builtin of the same name.
    entries = soup.find(class_="app-article-list-row").find_all("li")
    return entries
def get_articles(list):
    """Append a (date, title) row to the global ``df`` for each linked article.

    list: iterable of ``<li>`` Tag objects from one search-result page.
    Returns the (mutated) global DataFrame.

    NOTE(review): the parameter shadows the ``list`` builtin; the name is kept
    unchanged for interface compatibility with existing callers.
    """
    for entry in list:
        # Skip <li> items that carry no article link (e.g. ads / separators).
        if not entry.a:
            continue
        df.loc[df.shape[0]] = [entry.time['datetime'], entry.a.text]
    return df
# --- Main driver -------------------------------------------------------------
# Accumulate (date, title) rows across all result pages, then export to CSV.
columns = ['riqi', 'name']
df = pd.DataFrame(columns=columns)
for i in range(1, b):  # b = last page number + 1, computed above
    print('正在爬取第 -{}- 页'.format(i))
    # Renamed from ``list``: never shadow the builtin of the same name.
    page_entries = get_list(i)
    get_articles(page_entries)
# utf_8_sig writes a BOM so Excel opens the Chinese text with correct encoding.
df.to_csv(r'/Users/**/Desktop/article2.csv', index=False, encoding='utf_8_sig')
print('共爬取 -{}- 条文章'.format(k))


雷达卡




京公网安备 11010802022788号







