1 import requests
2 import re
3 # from bs4 import BeautifulSoup
4 from urllib import request
5 # import threading
6 import gevent
7 from gevent import monkey
8
# Monkey-patch the stdlib (sockets, ssl, time, ...) so blocking calls made by
# requests/urllib yield to other gevent greenlets instead of blocking them.
monkey.patch_all()
10
def get_html_text(url):
    """Fetch *url* and return the decoded page text, or None on failure.

    A Mozilla User-Agent header is sent so the request is not rejected as
    an obvious bot. Failures are logged to stdout rather than raised, so
    callers must handle a ``None`` return.

    Parameters:
        url: the page URL to fetch.
    Returns:
        The page source as a string, or None if the request failed.
    """
    try:
        hd = {'User-Agent': 'Mozilla/5.0'}  # disguise as a regular browser
        r = requests.get(url, timeout=10, headers=hd)
        r.raise_for_status()
        # Prefer the encoding guessed from the body over the header default,
        # so Chinese text on the page decodes correctly.
        r.encoding = r.apparent_encoding
        print(len(r.text))
        return r.text
    except requests.RequestException as result:
        # Narrowed from bare Exception: only network/HTTP errors are expected
        # here; anything else is a programming bug and should propagate.
        print('错误类型:', result)
        return None
21
22
def html_text_parser(img_list, html):
    """Extract image URLs from the page source and append them to img_list.

    Douyu stores the real image links inside embedded JSON near the end of
    the page rather than in conventional HTML tags, so a regex over the raw
    source is used instead of an HTML parser like bs4.

    Parameters:
        img_list: list the matched links are appended to (mutated in place).
        html: page source as a string.
    Returns:
        The same img_list, with all matched links appended.
    """
    img_pat = r'"rs\w+":"(.*?g)"'
    links = re.compile(img_pat, re.S).findall(html)
    print(len(links))
    print(links)
    # findall cannot yield an empty capture here (the pattern requires a
    # trailing 'g'), but keep the truthiness guard as cheap insurance.
    img_list.extend(link for link in links if link)
    return img_list
36
37
38
39
def get_douyu_img(Img_list):
    """Download every image URL in Img_list into the working directory.

    Files are named by their list index ('0', '1', ...). A link that cannot
    be fetched is skipped so one dead URL does not abort the whole batch.

    Parameters:
        Img_list: iterable of image URLs.
    """
    for i, link in enumerate(Img_list):
        try:
            # Context manager guarantees the HTTP response is closed even if
            # read() raises (the original leaked the connection).
            with request.urlopen(link) as r:
                img_content = r.read()
            with open(str(i), 'wb') as f:
                f.write(img_content)
        except Exception:
            # Narrowed from a bare except (which also swallowed
            # KeyboardInterrupt/SystemExit): skip the bad link and continue.
            continue
def main():
    """Fetch the Douyu gallery page, extract the image links, download them."""
    url = 'https://www.douyu.com/g_yz'
    html = get_html_text(url)
    if html is None:
        # Fetch failed (already logged by get_html_text); nothing to parse.
        return
    img_list = html_text_parser([], html)
    # BUG FIX: the original spawned get_html_text and html_text_parser a
    # second time here, fetching the page twice and appending duplicate
    # links into the very list get_douyu_img was concurrently iterating.
    # Each step now runs exactly once; only the download is spawned.
    gevent.joinall([
        gevent.spawn(get_douyu_img, img_list),
    ])
68
69
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()