Files
MediaCrawler/media_platform/xhs/login.py
Relakkes b8093a2c0f refactor:优化部分代码
feat: 增加IP代理账号池
2023-06-27 23:38:30 +08:00

169 lines
7.3 KiB
Python

import sys
import asyncio
import logging
import aioredis
from tenacity import (
retry,
stop_after_attempt,
wait_fixed,
retry_if_result
)
from playwright.async_api import Page
from playwright.async_api import BrowserContext
import config
from tools import utils
from base.base_crawler import AbstractLogin
class XHSLogin(AbstractLogin):
def __init__(self,
login_type: str,
browser_context: BrowserContext,
context_page: Page,
login_phone: str = None,
cookie_str: str = None
):
self.login_type = login_type
self.browser_context = browser_context
self.context_page = context_page
self.login_phone = login_phone
self.cookie_str = cookie_str
@retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
async def check_login_state(self, no_logged_in_session: str) -> bool:
"""Check if the current login status is successful and return True otherwise return False"""
current_cookie = await self.browser_context.cookies()
_, cookie_dict = utils.convert_cookies(current_cookie)
current_web_session = cookie_dict.get("web_session")
if current_web_session != no_logged_in_session:
return True
return False
async def begin(self):
if self.login_type == "qrcode":
await self.login_by_qrcode()
elif self.login_type == "phone":
await self.login_by_mobile()
elif self.login_type == "cookies":
await self.login_by_cookies()
else:
raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookies ...")
async def login_by_mobile(self):
logging.info("Begin login xiaohongshu by mobile ...")
await asyncio.sleep(1)
try:
# 小红书进入首页后,有可能不会自动弹出登录框,需要手动点击登录按钮
login_button_ele = await self.context_page.wait_for_selector(
selector="xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button",
timeout=5000
)
await login_button_ele.click()
# 弹窗的登录对话框也有两种形态,一种是直接可以看到手机号和验证码的
# 另一种是需要点击切换到手机登录的
element = await self.context_page.wait_for_selector(
selector='xpath=//div[@class="login-container"]//div[@class="other-method"]/div[1]',
timeout=5000
)
await element.click()
except Exception as e:
logging.info("have not found mobile button icon and keep going ...")
await asyncio.sleep(1)
login_container_ele = await self.context_page.wait_for_selector("div.login-container")
input_ele = await login_container_ele.query_selector("label.phone > input")
await input_ele.fill(self.login_phone)
await asyncio.sleep(0.5)
send_btn_ele = await login_container_ele.query_selector("label.auth-code > span")
await send_btn_ele.click() # 点击发送验证码
sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")
redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
no_logged_in_session = ""
while max_get_sms_code_time > 0:
logging.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
await asyncio.sleep(1)
sms_code_key = f"xhs_{self.login_phone}"
sms_code_value = await redis_obj.get(sms_code_key)
if not sms_code_value:
max_get_sms_code_time -= 1
continue
current_cookie = await self.browser_context.cookies()
_, cookie_dict = utils.convert_cookies(current_cookie)
no_logged_in_session = cookie_dict.get("web_session")
await sms_code_input_ele.fill(value=sms_code_value) # 输入短信验证码
await asyncio.sleep(0.5)
agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']")
await agree_privacy_ele.click() # 点击同意隐私协议
await asyncio.sleep(0.5)
await submit_btn_ele.click() # 点击登录
# todo ... 应该还需要检查验证码的正确性有可能输入的验证码不正确
break
login_flag: bool = await self.check_login_state(no_logged_in_session)
if not login_flag:
logging.info("login failed please confirm ...")
sys.exit()
wait_redirect_seconds = 5
logging.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
await asyncio.sleep(wait_redirect_seconds)
async def login_by_qrcode(self):
"""login xiaohongshu website and keep webdriver login state"""
logging.info("Begin login xiaohongshu by qrcode ...")
await asyncio.sleep(10)
# login_selector = "div.login-container > div.left > div.qrcode > img"
qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
# find login qrcode
base64_qrcode_img = await utils.find_login_qrcode(
self.context_page,
selector=qrcode_img_selector
)
if not base64_qrcode_img:
logging.info("login failed , have not found qrcode please check ....")
# if this website does not automatically popup login dialog box, we will manual click login button
await asyncio.sleep(0.5)
login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
await login_button_ele.click()
base64_qrcode_img = await utils.find_login_qrcode(
self.context_page,
selector=qrcode_img_selector
)
if not base64_qrcode_img:
sys.exit()
# get not logged session
current_cookie = await self.browser_context.cookies()
_, cookie_dict = utils.convert_cookies(current_cookie)
no_logged_in_session = cookie_dict.get("web_session")
# show login qrcode
utils.show_qrcode(base64_qrcode_img)
logging.info(f"waiting for scan code login, remaining time is 20s")
login_flag: bool = await self.check_login_state(no_logged_in_session)
if not login_flag:
logging.info("login failed please confirm ...")
sys.exit()
wait_redirect_seconds = 5
logging.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
await asyncio.sleep(wait_redirect_seconds)
async def login_by_cookies(self):
logging.info("Begin login xiaohongshu by cookie ...")
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
await self.browser_context.add_cookies([{
'name': key,
'value': value,
'domain': ".xiaohongshu.com",
'path': "/"
}])