【多线程 待解决】爬取糗事百科

网友投稿 254 2022-11-23

【多线程 待解决】爬取糗事百科

# -*- coding: utf-8 -*-
"""Crawl the Qiushibaike "8hr" listing pages with two worker threads.

Thread ``One`` handles the odd page numbers 1..35 and thread ``Two`` the even
page numbers 2..34.  Each worker downloads its pages one after another,
extracts the joke texts with a regular expression and prints them.

NOTE(review): this script was recovered from a web page whose extraction
stripped the HTML tags out of the regular expression and removed the URL
literal of the second worker.  ``JOKE_PATTERN`` and the URL used by ``Two``
are reconstructions -- confirm them against the live page markup.
"""
import re
import ssl
import threading
import time
import urllib.error
import urllib.request

# Listing URL prefix; the page number and a trailing "/" are appended.
BASE_URL = "https://qiushibaike.com/8hr/page/"

# Seconds to wait for a response before giving up on a page.
REQUEST_TIMEOUT = 10

# NOTE(review): assumed markup -- the tags of the original pattern were lost
# when the post was extracted.  Compiled once instead of once per page.
JOKE_PATTERN = re.compile(
    '<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S
)

# The header value must not repeat the header name (the original value began
# with the literal text "User-Agent:").
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/56.0.2924.87 Safari/537.36'
}

# SECURITY: certificate verification is disabled, as in the original script,
# and this relies on a private ``ssl`` helper.  Keep only for local debugging.
_SSL_CONTEXT = ssl._create_unverified_context()

# ``urlopen(url, context=...)`` builds a throw-away opener and ignores the
# installed one, so the custom User-Agent was never sent.  Putting the SSL
# context into the installed opener keeps both settings in effect.
opener = urllib.request.build_opener(
    urllib.request.HTTPSHandler(context=_SSL_CONTEXT)
)
# ``addheaders`` must be a list of (name, value) tuples, not a list of dicts.
opener.addheaders = list(headers.items())
urllib.request.install_opener(opener)

# Keeps the two-line record of one joke together when both threads print.
_PRINT_LOCK = threading.Lock()


class _PageCrawler(threading.Thread):
    """Worker thread that fetches the pages in ``pages`` and prints the jokes."""

    # Page numbers this worker is responsible for; overridden by subclasses.
    pages = range(0)

    def __init__(self):
        super().__init__()

    @staticmethod
    def build_url(page):
        """Return the listing URL for page number ``page``."""
        return BASE_URL + str(page) + "/"

    @staticmethod
    def extract_jokes(page_html):
        """Return the list of joke texts found in ``page_html``."""
        return JOKE_PATTERN.findall(page_html)

    @staticmethod
    def fetch(url):
        """Download ``url`` and return its body decoded as UTF-8.

        Undecodable bytes are dropped.  Raises ``OSError`` (which includes
        ``urllib.error.URLError``/``HTTPError`` and timeouts) on failure.
        """
        with urllib.request.urlopen(url, timeout=REQUEST_TIMEOUT) as response:
            return response.read().decode("utf-8", "ignore")

    @staticmethod
    def report(page, jokes):
        """Print every joke of ``page`` preceded by its position header."""
        for index, joke in enumerate(jokes):
            try:
                with _PRINT_LOCK:
                    print("第" + str(page) + "页第" + str(index) + "个段子的内容是:")
                    print(joke)
            except (UnicodeError, OSError) as exc:
                # e.g. a console encoding that cannot represent the text
                print("exception:" + str(exc))

    def run(self):
        """Fetch and print each assigned page; a failed page is skipped."""
        for page in self.pages:
            try:
                page_html = self.fetch(self.build_url(page))
            except OSError as exc:
                # Previously an HTTP error here terminated the whole thread.
                with _PRINT_LOCK:
                    print("exception:" + str(exc))
            else:
                self.report(page, self.extract_jokes(page_html))
            # NOTE(review): the flattened source does not show how the
            # original sleep was indented; pausing once per page is assumed.
            time.sleep(1)


class One(_PageCrawler):
    """Worker for the odd-numbered pages 1, 3, ..., 35."""

    pages = range(1, 36, 2)


class Two(_PageCrawler):
    """Worker for the even-numbered pages 2, 4, ..., 34."""

    # NOTE(review): the URL literal of this worker was lost in extraction;
    # it is assumed to be the same listing URL that ``One`` uses.
    pages = range(2, 36, 2)


if __name__ == "__main__":
    one = One()
    one.start()
    two = Two()
    two.start()

# Reported failure (from the original post): both threads terminated inside
# ``urllib.request.urlopen`` with
#     urllib.error.HTTPError: HTTP Error 504: Fiddler - Receive Failure
# and the process still finished with exit code 0.
# NOTE(review): the message text suggests a Fiddler proxy produced the
# response -- presumably a local debugging proxy was intercepting traffic;
# confirm by disabling it.  ``run`` now reports such errors per page instead
# of letting them kill the thread.

版权声明:本文内容由网络用户投稿,版权归原作者所有,本站不拥有其著作权,亦不承担相应法律责任。如果您发现本站中有涉嫌抄袭或描述失实的内容,请联系我们jiasou666@gmail.com 处理,核实后本网站将在24小时内删除侵权内容。

上一篇:Java之JNDI注入的实现
下一篇:USB读数装置及上位机的设计
相关文章

 发表评论

暂时没有评论,来抢沙发吧~