【多线程 待解决】爬取糗事百科
代码:
# -*- coding: utf-8 -*-
'''
import urllib.request
import re
import ssl
import urllib.error
headers = ("User-Agent","Mozilla/5.0(Windows NT 6.1;WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
for i in range(1, 3):
url = "+ str(i)+"/"
# print(url)
context = ssl._create_unverified_context()
pageData = urllib.request.urlopen(url,context=context).read().decode("utf-8","ignore")
# pageData = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
pat = '
.*?(.*?).*?
'
datalist = re.compile(pat, re.S).findall(pageData)
for j in range(0, len(datalist)):
print("第" + str(i) + "页第" + str(j) + "个段子的内容是:")
print(datalist[j])
import threading
class A(threading.Thread):
def __init__(self):
threading.Thread.__init__(self) #初始化线程
def run(self):
for i in range(0,10):
print("我是线程A")
class B(threading.Thread):
def __init__(self):
threading.Thread.__init__(self) #初始化线程
def run(self):
for i in range(0,10):
print("我是线程B")
thread1 = A()
thread1.start()
thread2 = B()
thread2.start()
'''
import urllib.request
import re
import ssl
import threading
import urllib.error
headers = {'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
# headers = ("User-Agent","Mozilla/5.0(Windows NT 6.1;WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
class One(threading.Thread):
def __init__(self):
threading.Thread.__init__(self) #初始化线程
def run(self):
for i in range(1,36,2):
url="https://qiushibaike.com/8hr/page/"+str(i)+"/"
context = ssl._create_unverified_context()
pageData = urllib.request.urlopen(url, context=context).read().decode("utf-8", "ignore")
# pageData=urllib.request.urlopen(url).read().decode("utf-8", "ignore")
pat = '
.*?(.*?).*?
'
datalist = re.compile(pat, re.S).findall(pageData)
for j in range(0, len(datalist)):
try:
print("第" + str(i) + "页第" + str(j) + "个段子的内容是:")
print(datalist[j])
except Exception as e:
print("exception:"+str(e))
time.sleep(1)
class Two(threading.Thread):
def __init__(self):
threading.Thread.__init__(self) #初始化线程
def run(self):
for i in range(2, 36,2):
url = "+ str(i)+"/"
# print(url)
context = ssl._create_unverified_context()
pageData = urllib.request.urlopen(url,context=context).read().decode("utf-8","ignore")
# pageData = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
pat = '
.*?(.*?).*?
'
datalist = re.compile(pat, re.S).findall(pageData)
for j in range(0, len(datalist)):
try:
print("第" + str(i) + "页第" + str(j) + "个段子的内容是:")
print(datalist[j])
except Exception as e:
print("exception:"+str(e))
time.sleep(1)
one = One()
one.start()
two = Two()
two.start()
报错如下:
D:\python.exe F:/pycodes/webCrawl/qiuShiBaiKe.py
Exception in thread Thread-2:
Traceback (most recent call last):
File "D:\lib\threading.py", line 916, in _bootstrap_inner
self.run()
File "F:/pycodes/webCrawl/qiuShiBaiKe.py", line 81, in run
pageData = urllib.request.urlopen(url,context=context).read().decode("utf-8","ignore")
File "D:\lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "D:\lib\urllib\request.py", line 532, in open
response = meth(req, response)
File "D:\lib\urllib\request.py", line 642, in http_response
'request, response, code, msg, hdrs)
File "D:\lib\urllib\request.py", line 570, in error
return self._call_chain(*args)
File "D:\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "D:\lib\urllib\request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 504: Fiddler - Receive Failure
Exception in thread Thread-1:
Traceback (most recent call last):
File "D:\lib\threading.py", line 916, in _bootstrap_inner
self.run()
File "F:/pycodes/webCrawl/qiuShiBaiKe.py", line 61, in run
pageData = urllib.request.urlopen(url, context=context).read().decode("utf-8", "ignore")
File "D:\lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "D:\lib\urllib\request.py", line 532, in open
response = meth(req, response)
File "D:\lib\urllib\request.py", line 642, in http_response
'request, response, code, msg, hdrs)
File "D:\lib\urllib\request.py", line 570, in error
return self._call_chain(*args)
File "D:\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "D:\lib\urllib\request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 504: Fiddler - Receive Failure
Process finished with exit code 0
版权声明:本文内容由网络用户投稿,版权归原作者所有,本站不拥有其著作权,亦不承担相应法律责任。如果您发现本站中有涉嫌抄袭或描述失实的内容,请联系我们jiasou666@gmail.com 处理,核实后本网站将在24小时内删除侵权内容。
暂时没有评论,来抢沙发吧~