# coding: utf-8
# YYeTsBot - fansub.py
# 2019/8/15 18:30
__author__ = 'Benny <benny.think@gmail.com>'
import os
import logging
import requests
import pickle
import sys
import json
import hashlib
from bs4 import BeautifulSoup
from config import (YYETS_SEARCH_URL, GET_USER, BASE_URL, SHARE_WEB,
                    SHARE_URL, WORKERS, SHARE_API, USERNAME, PASSWORD,
                    AJAX_LOGIN, REDIS, FIX_SEARCH)
import redis
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(filename)s [%(levelname)s]: %(message)s')
session = requests.Session()
ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
session.headers.update({"User-Agent": ua})
class BaseFansub:
"""
all the subclass should implement three kinds of methods:
1. online search, contains preview for bot and complete result
2. offline search (set pass if not applicable)
3. login and check (set pass if not applicable)
4. search_result this is critical for bot to draw markup
"""
label = None
cookie_file = None
def __init__(self):
self.data = None
self.url = None
self.redis = redis.StrictRedis(host=REDIS, decode_responses=True)
@property
def id(self):
# implement how to get the unique id for this resource
return None
def __get_search_html__(self, kw: str) -> str:
# return html text of search page
pass
    def online_search_preview(self, search_text: str) -> dict:
        # try to retrieve critical information from the html
        # this result must be returned to the bot for manual selection
        # {"url1": "name1", "url2": "name2"}
        pass
def online_search_result(self, resource_url: str) -> dict:
"""
This will happen when user click one of the button, only by then we can know the resource link
From the information above, try to get a detail dict structure.
This method should check cache first if applicable
This method should set self.link and self.data
This method should call __execute_online_search
:param resource_url:
:return: {"all": rss_result, "share": share_link, "cnname": cnname}
"""
pass
def __execute_online_search_result__(self) -> dict:
"""
Do the real search job, without any cache mechanism
:return: {"all": rss_result, "share": share_link, "cnname": cnname}
"""
pass
    def offline_search_preview(self, search_text: str) -> dict:
        # this result must be returned to the bot for manual selection
        # same as the online version
        pass
def offline_search_result(self, resource_url) -> dict:
"""
Same as online_search_result
:param resource_url:
:return:
"""
pass
def __execute_offline_search_result(self) -> dict:
"""
Do the search job, without any cache mechanism
:return: {"all": rss_result, "share": share_link, "cnname": cnname}
"""
pass
def __login_check(self):
pass
def __manual_login(self):
pass
def __save_cookies__(self, requests_cookiejar):
with open(self.cookie_file, 'wb') as f:
pickle.dump(requests_cookiejar, f)
def __load_cookies__(self):
with open(self.cookie_file, 'rb') as f:
return pickle.load(f)
def __get_from_cache__(self, url: str, method_name: str) -> dict:
logging.info("[%s] Reading data from cache %s", self.label, url)
data = self.redis.get(url)
if data:
logging.info("Cache hit")
return json.loads(data)
else:
logging.info("Cache miss")
result_method = getattr(self, method_name)
self.__save_to_cache__(url, result_method())
return self.__get_from_cache__(url, method_name)
def __save_to_cache__(self, url: str, value: dict, ex=3600 * 12) -> None:
data = json.dumps(value, ensure_ascii=False)
self.redis.set(url, data, ex=ex)
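    # Illustrative note (added for clarity, not original documentation): on a cache
    # miss, __get_from_cache__ invokes the named __execute_*_search_result__ method,
    # stores its JSON-serialized result in Redis (12-hour TTL by default via
    # __save_to_cache__), then re-reads the cache so hits and misses return the
    # exact same structure.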
class YYeTs(BaseFansub):
label = "yyets"
cookie_file = os.path.join("data", "cookies.dump")
@property
def id(self):
# implement how to get the unique id for this resource
rid = self.url.split('/')[-1]
return rid
def __get_search_html__(self, kw: str) -> str:
        # no need to log in for search
logging.info("[%s] Searching for %s", self.label, kw)
r = session.get(YYETS_SEARCH_URL.format(kw=kw))
r.close()
return r.text
def online_search_preview(self, search_text: str) -> dict:
html_text = self.__get_search_html__(search_text)
logging.info('[%s] Parsing html...', self.label)
soup = BeautifulSoup(html_text, 'lxml')
link_list = soup.find_all("div", class_="clearfix search-item")
dict_result = {}
for block in link_list:
name = block.find_all('a')[-1].text
url = BASE_URL + block.find_all('a')[-1].attrs['href']
dict_result[url] = name
dict_result["source"] = self.label
return dict_result
def online_search_result(self, resource_url: str) -> dict:
self.url = resource_url
self.data = self.__get_from_cache__(self.url, self.__execute_online_search_result__.__name__)
return self.data
def __execute_online_search_result__(self) -> dict:
logging.info("[%s] Loading detail page %s", self.label, self.url)
share_link, api_res = self.__get_share_page()
cnname = api_res["data"]["info"]["cnname"]
self.data = {"all": api_res, "share": share_link, "cnname": cnname}
return self.data
def offline_search_preview(self, search_text: str) -> dict:
# from cloudflare workers
        # no redis cache for now, because the data on Cloudflare may be updated
logging.info("[%s] Loading offline data from cloudflare KV storage...", self.label)
index = WORKERS.format(id="index")
data: dict = requests.get(index).json()
results = {}
for name, rid in data.items():
if search_text in name:
fake_url = f"http://www.rrys2020.com/resource/{rid}"
results[fake_url] = name.replace("\n", " ")
logging.info("[%s] Offline search complete", self.label)
return results
def offline_search_result(self, resource_url) -> dict:
self.url = resource_url
query_url = WORKERS.format(id=self.id)
        # for consistency, return the same structure as the online result
self.data = {"all": None, "share": query_url, "cnname": None}
return self.data
def __login_check(self):
logging.debug("[%s] Checking login status...", self.label)
if not os.path.exists(self.cookie_file):
logging.warning("[%s] Cookie file not found", self.label)
self.__manual_login()
r = session.get(GET_USER, cookies=self.__load_cookies__())
        if r.json()['status'] != 1:
self.__manual_login()
def __manual_login(self):
data = {"account": USERNAME, "password": PASSWORD, "remember": 1}
        logging.info("[%s] Logging in as %s", self.label, data["account"])
r = requests.post(AJAX_LOGIN, data=data)
resp = r.json()
if resp.get('status') == 1:
logging.debug("Login success! %s", r.cookies)
self.__save_cookies__(r.cookies)
else:
logging.error("Login failed! %s", resp)
sys.exit(1)
r.close()
def __get_share_page(self):
self.__login_check()
rid = self.id
res = session.post(SHARE_URL, data={"rid": rid}, cookies=self.__load_cookies__()).json()
share_code = res['data'].split('/')[-1]
share_url = SHARE_WEB.format(code=share_code)
logging.info("[%s] Share url is %s", self.label, share_url)
# get api response
api_response = session.get(SHARE_API.format(code=share_code)).json()
return share_url, api_response
class Zimuxia(BaseFansub):
label = "zimuxia"
@property
def id(self):
# implement how to get the unique id for this resource
rid = self.url.split('/')[-1]
return rid
def __get_search_html__(self, kw: str) -> str:
        # no need to log in for search
logging.info("[%s] Searching for %s", self.label, kw)
r = session.get(FIX_SEARCH.format(kw=kw))
r.close()
return r.text
def online_search_preview(self, search_text: str) -> dict:
html_text = self.__get_search_html__(search_text)
logging.info('[%s] Parsing html...', self.label)
soup = BeautifulSoup(html_text, 'lxml')
link_list = soup.find_all("h2", class_="post-title")
dict_result = {}
for link in link_list:
            # Warning: we can't simply return the url here.
            # Telegram bot button callback data must be less than 64 bytes,
            # so we sha1-hash the url and store the mapping in redis.
url = link.a['href']
url_hash = hashlib.sha1(url.encode('u8')).hexdigest()
name = link.a.text
dict_result[url_hash] = name
self.redis.set(url_hash, url)
dict_result["source"] = self.label
return dict_result
def online_search_result(self, url_hash: str) -> dict:
        self.url = self.redis.get(url_hash)
self.data = self.__get_from_cache__(self.url, self.__execute_online_search_result__.__name__)
return self.data
def __execute_online_search_result__(self) -> dict:
logging.info("[%s] Loading detail page %s", self.label, self.url)
cnname, html_text = self.obtain_all_response()
self.data = {"all": html_text, "share": self.url, "cnname": cnname}
return self.data
    def obtain_all_response(self) -> (str, dict):
r = session.get(self.url)
soup = BeautifulSoup(r.text, 'lxml')
cnname = soup.title.text.split("|")[0]
return cnname, dict(html=r.text)
def offline_search_preview(self, search_text: str) -> dict:
raise NotImplementedError("Give me some time...")
def offline_search_result(self, resource_url) -> dict:
raise NotImplementedError("Give me some time...")
class FansubEntrance(BaseFansub):
pass
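

# A minimal usage sketch (added for illustration, not part of the original module).
# It assumes the config endpoints are reachable, valid credentials are configured,
# and a Redis instance is running at REDIS; the search term is just a placeholder.
if __name__ == "__main__":
    yyets = YYeTs()
    preview = yyets.online_search_preview("逃避可耻但有用")
    print(preview)  # {"<resource url>": "<name>", ..., "source": "yyets"}
    for url in preview:
        if url == "source":
            continue
        # resolves the detail page, hitting the Redis cache when possible
        print(yyets.online_search_result(url))
        break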