在网上找了一段多线程的示例代码,试着改了一下,添加了模拟登入。好吧,问题来了:因为没学过类继承的概念,
定义了登入的空方法 login 后,不知道该怎么调用了。请教一下,添加 login 后的代码具体要如何修改。
(header 和 post 参数可能跟我给的 url 不匹配,因为我的目标 url 不是下面的那个;希望用 requests 的 Session 方法模拟登入,红色处应该是要修改的地方)
import requests
from bs4 import BeautifulSoup
from threading import Thread,Lock
from Queue import Queue
import time
class Fetcher:
    """Download URLs on a pool of daemon threads through one logged-in session.

    URLs are handed in with push(); each worker thread takes one URL from
    the request queue, fetches it with the shared requests session created
    by login(), and places a ``(url, body)`` tuple on the answer queue,
    from which pop() retrieves it.
    """

    def __init__(self, threads):
        """Log in once, then start ``threads`` daemon worker threads.

        threads -- number of worker threads to start.
        """
        self.lock = Lock()      # guards self.running
        self.q_req = Queue()    # URLs waiting to be fetched
        self.q_ans = Queue()    # (url, body) results waiting to be popped
        self.threads = threads
        # All state the workers touch is set up before any worker starts.
        self.running = 0
        self.login_session = self.login()
        for _ in range(threads):
            worker = Thread(target=self.threadget)
            worker.daemon = True
            worker.start()

    def login(self):
        """Sign in and return the requests session used for the sign-in.

        Fetches the sign-in page, reads the hidden ``_token`` field from it
        and posts it back together with the credentials.  The session is
        also stored on ``self.login_session`` so the workers can use it.

        Raises RuntimeError when the sign-in page has no ``_token`` input.
        """
        url = 'http://www.verycd.com/signin'
        UA = "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:51.0) Gecko/20100101 Firefox/51.0"
        header = {
            "User-Agent": UA,
            "Referer": "http://www.verycd.com/signin",
        }
        session = requests.Session()
        r = session.get(url, headers=header)
        login_soup = BeautifulSoup(r.content, "html.parser")
        token_input = login_soup.find('input', {'name': '_token'})
        if token_input is None:
            raise RuntimeError("no '_token' input found on %s" % url)
        token = token_input['value']
        print(token)
        postData = {
            'username': 'xxxxxxx',
            'password': 'xxxxxxx',
            '_token': token,
        }
        session.post(url, data=postData, headers=header)
        self.login_session = session
        return session

    def __del__(self):
        """Wait for both queues to be fully processed before destruction."""
        time.sleep(0.5)
        self.q_req.join()
        # Returns only once every result has been taken with pop().
        self.q_ans.join()

    def taskleft(self):
        """Return the number of tasks queued, in flight, or not yet popped.

        NOTE: a worker takes a URL off q_req before it increments
        self.running, so this count can be momentarily one too low.
        """
        return self.q_req.qsize() + self.q_ans.qsize() + self.running

    def push(self, req):
        """Queue the URL ``req`` for download."""
        self.q_req.put(req)

    def pop(self):
        """Block until a result is available and return it as (url, body)."""
        result = self.q_ans.get()
        # Mark the result as consumed; without this q_ans.join() in
        # __del__ could never return.
        self.q_ans.task_done()
        return result

    def threadget(self):
        """Worker loop: fetch queued URLs forever (runs in a daemon thread)."""
        while True:
            req = self.q_req.get()
            with self.lock:     # keep the counter update atomic
                self.running += 1
            try:
                ans = self.login_session.get(req).content
            except Exception as what:
                # Best effort: report the error and hand back an empty body
                # so the consumer still gets one result per pushed URL.
                ans = ''
                print(what)
            self.q_ans.put((req, ans))
            with self.lock:
                self.running -= 1
            self.q_req.task_done()
            time.sleep(1)       # don't spam the server
if __name__ == "__main__":
    # Demo: fetch ten topic pages on ten worker threads and print each
    # URL together with the length of the body that came back.
    links = ['http://www.verycd.com/topics/%d/' % i for i in range(5420, 5430)]
    f = Fetcher(threads=10)
    for url in links:
        f.push(url)
    # Every pushed URL produces exactly one result (an empty body on
    # error), so pop once per link instead of polling taskleft(), which
    # can briefly under-count while a worker is picking up a task.
    for _ in links:
        url, content = f.pop()
        print('%s %d' % (url, len(content)))