在Python写爬虫的时候,经常会遇到爬虫与反爬虫的博弈,高强度、高频次地爬取网页信息,一般会给网站服务器带来巨大压力与性能损耗,故同一个IP不断爬取网页的信息,很快就可能被网站管理员封掉。故我们可以搭建自己的代理IP库,不停地更换自己的IP去爬取网页,不会因为同一IP而影响爬虫的进行。将爬取到的IP信息进行判断,筛选可用的代理地址存入数据库MySQL/Redis/Mongodb/Memcache,后期需要使用代理IP时,直接从私有库中获取,以逸待劳。
2.1 使用的Python模块
- Requests 获取网络请求
- BeautifulSoup处理网页文件获取需要的信息
- configparser读取配置文件信息,获取相关内容信息
- pymysql用于MySQL数据库操作
- redis用于Redis的操作
- pymongo用于Mongodb操作
- memcache用于Memcache操作
2.2 相关参考链接
Redis可参考Redis-3.2主从复制与集群搭建
Mongodb可参考Mongodb基础
Memcache可参考Memcached 安装脚本(附服务器自启动)
Python基础爬虫可参考利用Python搜索51CTO推荐博客并保存至Excel
3.1 github地址
PROXIES
3.2 代码
a.spider.py
#!/bin/env python
# -*- coding:utf-8 -*-
# _author:kaliarch
import requests
from bs4 import BeautifulSoup
import random
class GetProxyIP:
    """Scrape candidate proxy addresses from xicidaili.com and keep the usable ones.

    Fixes over the original:
    - an https proxy is now registered under the 'https' scheme (both branches
      of the original if/else produced an 'http' key, so the check was dead);
    - availability is verified with ``status_code == 200`` — a Response's
      status_code is always truthy, so the old check accepted any reply;
    - the probe-URL list is built once instead of on every loop iteration.
    """

    # URLs used to probe whether a proxy actually forwards traffic.
    CHECK_URLS = ('http://www.baidu.com', 'http://www.taobao.com',
                  'https://cloud.tencent.com/')

    def __init__(self, page=10):
        # Listing pages 1 .. page-1 are scraped (same bounds as the original).
        self._page = page
        self.url_head = 'http://www.xicidaili.com/wt/'

    def get_ip(self):
        """Collect raw proxy strings ('scheme://ip:port') from the listing pages.

        :return: res_pool — list of candidate proxy URLs
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}
        res_pool = []
        for pagenum in range(1, self._page):
            url = self.url_head + str(pagenum)
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.text, "html.parser")
            for row in soup.find_all('tr'):
                cells = row.find_all('td')
                try:
                    # Cell 1 is the IP, cell 2 the port, cell 5 the protocol.
                    res_pool.append(cells[5].text.lower() + '://' + cells[1].text + ':' + cells[2].text)
                except IndexError:
                    # Header / malformed rows have fewer cells — skip them.
                    pass
        return res_pool

    def right_proxies(self, res_pool):
        """Probe each candidate and return only the proxies that respond.

        :param res_pool: list of 'scheme://ip:port' strings from get_ip()
        :return: right_pool — list of {scheme: proxy_url} dicts for requests
        """
        right_pool = []
        for ip in res_pool:
            # BUG FIX: map the proxy under its real scheme.
            scheme = 'https' if ip.startswith('https') else 'http'
            proxies = {scheme: ip}
            try:
                response = requests.get(random.choice(self.CHECK_URLS),
                                        proxies=proxies, timeout=1)
            except Exception:
                continue  # unreachable / slow proxy — try the next one
            # BUG FIX: only accept a genuine 200 answer.
            if response.status_code == 200:
                right_pool.append(proxies)
                print('add ip %s' % proxies)
        return right_pool
if __name__ == '__main__':
    # Build the scraper (page count is configurable) and print the usable pool.
    helper = GetProxyIP(2)
    candidate_pool = helper.get_ip()
    usable_proxies = helper.right_proxies(candidate_pool)
    print(usable_proxies)
b.db.conf
[mysql]
HOST = 172.20.6.100
PORT = 3306
USER = root
PASSWD = mysqladmin
DB = pydb
TABLE = pytab
CHARSET = utf8
[redis]
HOST = 172.20.6.100
PORT = 6379
PASSWD = redisadmin
[memcache]
HOST = 172.20.6.100
PORT = 11211
[mongodb]
HOST = 172.20.6.100
PORT = 27017
DB = db1
USER = mongoadmin
PASSWD = mongopwd
c.save_mysql.py
#!/bin/env python
# -*- coding:utf-8 -*-
# _author:kaliarch
import pymysql
import configparser
import spider
class MysqlOper:
    """Persist validated proxies into MySQL, using settings from db.conf.

    Fixes over the original:
    - keyword arguments for pymysql.connect (the positional form was removed
      in pymysql 1.0);
    - the table name from db.conf is used instead of the hard-coded
      'pydb'/'pytab' in the SQL statements;
    - rows are inserted even when the table already exists (the original
      returned early and only ever inserted on the first run);
    - the connection is committed and closed (pymysql disables autocommit,
      so the original inserts were never persisted).
    """

    def __init__(self, result_list):
        """Read the [mysql] section of db.conf and keep the proxy list.

        :param result_list: list of {scheme: proxy_url} dicts to store
        """
        config = configparser.ConfigParser()
        config.read('db.conf')
        mysql_cfg = config['mysql']
        self.host = mysql_cfg['HOST']
        self.port = int(mysql_cfg['PORT'])
        self.user = mysql_cfg['USER']
        self.passwd = mysql_cfg['PASSWD']
        self.db = mysql_cfg['DB']
        self.table = mysql_cfg['TABLE']
        self.charset = mysql_cfg['CHARSET']
        self.result_list = result_list

    def mysql_save(self):
        """Create the target table if needed and insert every proxy.

        :return: None; exits with status 1 when the server is unreachable.
        """
        try:
            conn = pymysql.connect(host=self.host, user=self.user,
                                   password=self.passwd, database=self.db,
                                   port=self.port, charset=self.charset)
        except Exception as e:
            print("connect dbserver fail,Please see information:")
            print(e)
            raise SystemExit(1)
        try:
            with conn.cursor() as cursor:
                # IF NOT EXISTS replaces the manual SHOW TABLES scan.
                cursor.execute(
                    'create table if not exists {} ('
                    'id int unsigned not null primary key auto_increment, '
                    'protocol varchar(10), content varchar(50))'.format(self.table))
                for values in self.result_list:
                    for prot, cont in values.items():
                        try:
                            # Parameterized insert — values are never
                            # interpolated into the SQL string.
                            cursor.execute(
                                "insert into {} (protocol,content) values (%s,%s);".format(self.table),
                                [prot, cont])
                        except Exception as e:
                            print("insert db occer error", e)
            conn.commit()
        finally:
            conn.close()
if __name__ == "__main__":
    # Scrape three pages, validate the proxies, then persist them to MySQL.
    scraper = spider.GetProxyIP(3)
    candidates = scraper.get_ip()
    usable = scraper.right_proxies(candidates)
    MysqlOper(usable).mysql_save()
d.save_redis.py
#!/bin/env python
# -*- coding:utf-8 -*-
# _author:kaliarch
import redis
import random
import configparser
import spider
class RedisOper:
    """Persist validated proxies into Redis, using settings from db.conf.

    Fixes over the original:
    - the port from db.conf is converted to int before building the pool;
    - redis_save() queues its SETs on the pipeline it created (the original
      wrote through the plain client and then executed an empty pipeline);
    - values are stringified explicitly — redis-py 3+ rejects dict values;
    - the pointless pipeline flush in redis_gain() is removed.
    """

    def __init__(self):
        """Read the [redis] section of db.conf and open a connection pool."""
        config = configparser.ConfigParser()
        config.read('db.conf')
        self.host = config['redis']['HOST']
        self.port = int(config['redis']['PORT'])
        self.passwd = config['redis']['PASSWD']
        self.pool = redis.ConnectionPool(host=self.host, port=self.port,
                                         password=self.passwd)
        self.redis_helper = redis.Redis(connection_pool=self.pool)
        self.pipe = self.redis_helper.pipeline(transaction=True)

    def redis_save(self, result_list):
        """Store each proxy under its index key, batched in one round-trip.

        :param result_list: list of {scheme: proxy_url} dicts
        :return: None
        """
        for num, cont in enumerate(result_list):
            self.pipe.set(num, str(cont))
        self.pipe.execute()

    def redis_gain(self):
        """Fetch one stored proxy at a random index.

        NOTE(review): indexes 0..10 are probed blindly; when fewer than 11
        proxies were saved this can return None — confirm callers handle it.

        :return: the stored proxy value, or None for a missing key
        """
        num = random.randint(0, 10)
        return self.redis_helper.get(num)
if __name__ == '__main__':
    # Scrape, validate, store into Redis, then read one entry back.
    scraper = spider.GetProxyIP(2)
    usable = scraper.right_proxies(scraper.get_ip())
    store = RedisOper()
    store.redis_save(usable)
    print(store.redis_gain())
e.save_mongodb.py
#!/bin/env python
# -*- coding:utf-8 -*-
# _author:kaliarch
import configparser
import spider
from pymongo import MongoClient
class MongodbOper:
    """Persist validated proxies into MongoDB, using settings from db.conf.

    Fixes over the original:
    - credentials are passed straight to MongoClient (Database.authenticate()
      was removed in pymongo 4); auth still happens against 'admin' by default;
    - Collection.insert() (also removed in pymongo 4) is replaced by
      insert_one(), called on a copy so the caller's dicts are not mutated
      with an '_id' field.
    """

    def __init__(self):
        """Read the [mongodb] section of db.conf and connect."""
        config = configparser.ConfigParser()
        config.read('db.conf')
        self.host = config['mongodb']['HOST']
        self.port = config['mongodb']['PORT']
        self.db = config['mongodb']['DB']
        self.user = config['mongodb']['USER']
        self.pwd = config['mongodb']['PASSWD']
        self.client = MongoClient(self.host, int(self.port),
                                  username=self.user, password=self.pwd)
        self.DB = self.client[self.db]
        # All proxies live in a single collection named 'myset'.
        self.collection = self.DB.myset

    def mongodb_save(self, result_list):
        """Insert every proxy dict into the 'myset' collection.

        :param result_list: list of {scheme: proxy_url} dicts
        :return: None
        """
        for values in result_list:
            self.collection.insert_one(dict(values))

    def mongodb_gain(self):
        """Fetch one stored proxy document.

        :return: a proxy document (includes MongoDB's _id) or None when empty
        """
        return self.collection.find_one()
if __name__ == '__main__':
    # Scrape, validate, store into MongoDB, then read one document back.
    scraper = spider.GetProxyIP(2)
    usable = scraper.right_proxies(scraper.get_ip())
    store = MongodbOper()
    store.mongodb_save(usable)
    print(store.mongodb_gain())
f.save_memcache.py
#!/bin/env python
# -*- coding:utf-8 -*-
# _author:kaliarch
import memcache
import random
import configparser
import spider
class MemcacheOper:
    """Persist validated proxies into Memcached, using settings from db.conf.

    Fix over the original: memcache_gain() drew a blind random index in
    0..10, so it returned None whenever fewer than 11 proxies were stored.
    memcache_save() now records the item count under 'proxy_count' so the
    reader only probes indexes that exist.
    """

    def __init__(self):
        """Read the [memcache] section of db.conf and build the client."""
        config = configparser.ConfigParser()
        config.read('db.conf')
        self.host = config['memcache']['HOST']
        self.port = config['memcache']['PORT']
        self.mcoper = memcache.Client([self.host + ':' + self.port], debug=True)

    def memcache_save(self, result_list):
        """Store each proxy under its index key ('0', '1', ...).

        :param result_list: list of {scheme: proxy_url} dicts
        :return: None
        """
        for num, cont in enumerate(result_list):
            self.mcoper.set(str(num), cont)
        # Remember how many entries exist so memcache_gain() stays in range.
        self.mcoper.set('proxy_count', len(result_list))

    def memcache_gain(self):
        """Return one randomly chosen stored proxy, or None when nothing stored.

        :return: a {scheme: proxy_url} dict, or None
        """
        total = self.mcoper.get('proxy_count')
        if not total:
            # Legacy fallback: data written by an older saver with no count.
            return self.mcoper.get(str(random.randint(0, 10)))
        return self.mcoper.get(str(random.randint(0, total - 1)))
if __name__ == '__main__':
    # Scrape, validate, store into Memcached, then read one entry back.
    scraper = spider.GetProxyIP(2)
    usable = scraper.right_proxies(scraper.get_ip())
    store = MemcacheOper()
    store.memcache_save(usable)
    print(store.memcache_gain())
单独运行spider.py
可以查看到爬取并筛选出的可用ip池
运行其他保存文件,可以进入对应数据库查看存储的信息。
MySQL
Redis
Mongodb
Memcache
至此我们就利用Python构建了一个属于自己的私有代理库,在进行爬取的时候可方便地从数据库中获取使用。