前言:
验证码是大多数爬虫都需要克服的难题。pyppeteer是目前主流的浏览器自动化工具之一,它的优势在于不易被网站识别为自动化程序;ddddocr则是目前主流的验证码识别第三方模块。接下来就通过Python自动化工具pyppeteer和ddddocr等第三方模块完成极验滑块验证。
一:使用自动化工具打开网站
目标网站:行为验证4.0-适应型验证码-滑动验证,点选验证,图片验证-极验GeeTest(https://www.geetest.com/adaptive-captcha-demo)
# 浏览器 启动参数 start_parm = { # 关闭无头浏览器 "headless": False, "args": [ '--disable-infobars', # 关闭自动化提示框 '--no-sandbox', # 关闭沙盒模式 '--start-maximized', # 窗口最大化模式 ], } browser = await launch(**start_parm) page = await browser.newPage() # 设置网页 视图大小 await page.setViewport(viewport={'width': 1920, 'height': 1080}) await page.goto('https://www.geetest.com/adaptive-captcha-demo')
二:控制鼠标定位到指定元素
await page.waitForXPath('//div[@class="type-config"]') # 等待元素加载 botton1 = await page.xpath('//div[@class="tab-item tab-item-1"]') # 滑块拼图验证按钮 await botton1[0].click() await page.click('#captcha', options={ 'button': 'left', 'clickCount': 2, 'delay': 300, # 延迟点击(ms) }) botton2 = await page.xpath('//*[@aria-label="点击按钮开始验证"]') # 开始验证按钮 await botton2[0].click()
三:提取滑块拼图照片url
elements_1 = await page.xpath( '//*[@id="captcha"]/div[2]/div[1]/div[4]/div[1]/div[2]/div/div/div[1]/div[1]/div[1]/@style') # 滑块图片链接 elements_2 = await page.xpath( '//*[@id="captcha"]/div[2]/div[1]/div[4]/div[1]/div[2]/div/div/div[1]/div[2]/@style') # 背景图片链接 for element in elements_1: sc = await page.evaluate('(element) => element.textContent', element) sc_url = sc.split('"')[1].split('"')[0] # 提取滑块图片链接 with open('slice.png', 'wb')as f1: f1.write(requests.get(sc_url).content) for element in elements_2: bg = await page.evaluate('(element) => element.textContent', element) bg_url = bg.split('"')[1].split('"')[0] # 提取背景图片链接 with open('bg.png', 'wb') as f2: f2.write(requests.get(bg_url).content)
四:获取拼图坐标偏移量
async def get_xy():
    """Return the horizontal position of the puzzle gap.

    Reads ``slice.png`` (the slider piece) and ``bg.png`` (the background)
    from the current directory and hands both to ddddocr's ``slide_match``.

    Returns:
        The first element of the ``target`` entry of the match result, or
        ``False`` when the images cannot be read or the match fails.
    """
    det = ddddocr.DdddOcr(det=False, ocr=False)
    try:
        with open('slice.png', 'rb') as f:
            target_bytes = f.read()
        with open('bg.png', 'rb') as f:
            background_bytes = f.read()
        res = det.slide_match(target_bytes, background_bytes)
        print(res)
        return res['target'][0]
    except Exception as exc:
        # Best effort: the caller treats False as "refresh and retry".
        # `Exception` (not a bare except) lets Ctrl+C and SystemExit through.
        print(f'slide_match failed: {exc!r}')
        return False
五:操作鼠标移动滑块
其中x、y的基准值(代码中的1116和641)与屏幕分辨率和浏览器窗口位置有关,需要根据自己的电脑进行微调。
if target: # print(target) botton3 = await page.xpath( '//*[@id="captcha"]/div[2]/div[1]/div[4]/div[1]/div[2]/div/div/div[2]/div/div[3]') await botton3[0].hover() # 鼠标悬停元素上 await page.mouse.down() # 鼠标落下 await page.waitFor(500) x = 1116 + target y = 641 await page.mouse.move(x, y, {'steps': 2}) # 鼠标移动 await page.waitFor(500) await page.mouse.up() # 鼠标松开 time.sleep(2) elements_3 = await page.xpath('//*[@id="captcha"]/div[2]/div[1]/div[3]/div[2]/div/div[2]/text()') msg = '' for element in elements_3: msg = await page.evaluate('(element) => element.textContent', element) if msg == '验证通过': break else: print(msg) else: # 获取坐标失败时刷新验证 botton4 = await page.xpath('//*[@aria-label="刷新验证"]') await botton4[0].click()
验证通过,欧耶~
六:完整代码
# coding:utf-8
"""Solve the GeeTest slide captcha demo with pyppeteer and ddddocr."""
import asyncio

import ddddocr
import requests
from pyppeteer import launch
from pyppeteer.launcher import DEFAULT_ARGS

# Drop the automation switch from pyppeteer's default Chromium arguments.
# The membership test avoids a ValueError when the switch is not in the list.
if "--enable-automation" in DEFAULT_ARGS:
    DEFAULT_ARGS.remove("--enable-automation")


def _save_image(url, path):
    """Download *url* and write the response body to *path*."""
    # The timeout keeps a stalled request from hanging the script forever;
    # downloading before opening avoids leaving a truncated file on failure.
    response = requests.get(url, timeout=10)
    with open(path, 'wb') as image_file:
        image_file.write(response.content)


async def _fetch_captcha_image(page, style_xpath, path):
    """Save the image referenced by the style attribute(s) at *style_xpath*."""
    for style_node in await page.xpath(style_xpath):
        style_text = await page.evaluate('(element) => element.textContent', style_node)
        # The image URL is taken as the first double-quoted run in the style text.
        _save_image(style_text.split('"')[1], path)


async def main():
    """Open the GeeTest demo page and retry the slide captcha until it passes."""
    # Browser launch parameters.
    start_parm = {
        # Headless mode off: show the browser window.
        "headless": False,
        "args": [
            '--disable-infobars',  # hide the automation info bar
            '--no-sandbox',  # turn the sandbox off
            '--start-maximized',  # open the window maximized
        ],
    }
    browser = await launch(**start_parm)
    page = await browser.newPage()
    # Set the size of the page viewport.
    await page.setViewport(viewport={'width': 1920, 'height': 1080})
    await page.goto('https://www.geetest.com/adaptive-captcha-demo')
    # asyncio.sleep() pauses without blocking the event loop that pyppeteer
    # uses to talk to the browser (time.sleep() would block it).
    await asyncio.sleep(2)
    await page.waitForXPath('//div[@class="type-config"]')  # wait for the element to load

    # Pick the slide-puzzle captcha tab.
    slide_tabs = await page.xpath('//div[@class="tab-item tab-item-1"]')
    await slide_tabs[0].click()
    await page.click('#captcha', options={
        'button': 'left',
        'clickCount': 2,
        'delay': 300,  # click delay in milliseconds
    })
    # Press the button that starts the verification.
    start_buttons = await page.xpath('//*[@aria-label="点击按钮开始验证"]')
    await start_buttons[0].click()

    while True:
        await asyncio.sleep(5)
        # Slider piece image.
        await _fetch_captcha_image(
            page,
            '//*[@id="captcha"]/div[2]/div[1]/div[4]/div[1]/div[2]/div/div/div[1]/div[1]/div[1]/@style',
            'slice.png')
        # Background image.
        await _fetch_captcha_image(
            page,
            '//*[@id="captcha"]/div[2]/div[1]/div[4]/div[1]/div[2]/div/div/div[1]/div[2]/@style',
            'bg.png')

        target = await get_xy()  # x offset of the puzzle gap
        if not target:
            # The offset could not be determined: refresh the captcha and retry.
            refresh_buttons = await page.xpath('//*[@aria-label="刷新验证"]')
            await refresh_buttons[0].click()
            continue

        slider_buttons = await page.xpath(
            '//*[@id="captcha"]/div[2]/div[1]/div[4]/div[1]/div[2]/div/div/div[2]/div/div[3]')
        await slider_buttons[0].hover()  # move the pointer onto the slider button
        await page.mouse.down()  # press the mouse button
        await page.waitFor(500)
        # 1116 and 641 are machine-specific base coordinates; adjust them
        # for your own screen.
        x = 1116 + target
        y = 641
        await page.mouse.move(x, y, {'steps': 2})  # drag the slider
        await page.waitFor(500)
        await page.mouse.up()  # release the mouse button
        await asyncio.sleep(2)

        elements_3 = await page.xpath('//*[@id="captcha"]/div[2]/div[1]/div[3]/div[2]/div/div[2]/text()')
        msg = ''
        for element in elements_3:
            msg = await page.evaluate('(element) => element.textContent', element)
        if msg == '验证通过':
            break
        print(msg)

    input('---验证通过---')
    await browser.close()


async def get_xy():
    """Return the horizontal position of the puzzle gap.

    Reads ``slice.png`` (the slider piece) and ``bg.png`` (the background)
    from the current directory and hands both to ddddocr's ``slide_match``.

    Returns:
        The first element of the ``target`` entry of the match result, or
        ``False`` when the images cannot be read or the match fails.
    """
    det = ddddocr.DdddOcr(det=False, ocr=False)
    try:
        with open('slice.png', 'rb') as f:
            target_bytes = f.read()
        with open('bg.png', 'rb') as f:
            background_bytes = f.read()
        res = det.slide_match(target_bytes, background_bytes)
        print(res)
        return res['target'][0]
    except Exception as exc:
        # Best effort: main() treats False as "refresh and retry".
        # `Exception` (not a bare except) lets Ctrl+C and SystemExit through.
        print(f'slide_match failed: {exc!r}')
        return False


if __name__ == '__main__':
    # Run the coroutine exactly once; a bare `main()` call would only create
    # a coroutine object that is never awaited.
    asyncio.get_event_loop().run_until_complete(main())
来源地址:https://blog.csdn.net/weixin_61736939/article/details/130048614