subtitle update

This commit is contained in:
Benny
2024-12-05 18:15:34 +01:00
parent 494a6b95df
commit 4897e7f4be
13 changed files with 179 additions and 176 deletions

1
.gitignore vendored
View File

@@ -146,3 +146,4 @@ logs/*
/yyetsweb/templates/dump/yyets_mongo.gz
/yyetsweb/templates/dump/yyets_mysql.zip
/yyetsweb/templates/dump/yyets_sqlite.zip
/yyetsweb/subtitle_data/attachment/201001/17/758231_1263706947i2nW.rar

Submodule YYeTsFE updated: ce8767aaa1...a049969352

View File

@@ -1,7 +0,0 @@
version: '3.1'
# link this file as docker-compose.override.yml if you want to use replica set
services:
mongo:
command: --quiet --replSet rs0

View File

@@ -25,6 +25,8 @@ services:
- MEILI_HTTP_PAYLOAD_SIZE_LIMIT=1073741824 #1GiB
volumes:
- meilisearch_data:/meili_data
ports:
- "127.0.0.1:7700:7700"
mysql:
image: ubuntu/mysql:8.0-22.04_beta
@@ -35,27 +37,11 @@ services:
driver: none
command: "--skip-log-bin --default-authentication-plugin=mysql_native_password"
socat:
image: bennythink/socat
restart: unless-stopped
volumes:
- /var/run/docker.sock:/var/run/docker.sock
entrypoint: [ "socat", "tcp-listen:2375,fork,reuseaddr","unix-connect:/var/run/docker.sock" ]
logging:
driver: none
mailhog:
image: cd2team/mailhog
restart: unless-stopped
logging:
driver: none
bot:
image: bennythink/yyetsbot
depends_on:
- redis
- mongo
- socat
restart: always
env_file:
- env/yyets.env
@@ -70,6 +56,8 @@ services:
- redis
- mysql
working_dir: /YYeTsBot/yyetsweb/
volumes:
- ./subtitle_data:/YYeTsBot/yyetsweb/subtitle_data
command: [ "python3","server.py","-h=0.0.0.0" ]
ports:
- "127.0.0.1:8888:8888"

19
scripts/migrate_sub.py Normal file
View File

@@ -0,0 +1,19 @@
#!/usr/bin/env python3
# coding: utf-8
# YYeTsBot - migrate_sub.py
# One-off migration: copy every row of the MySQL `subtitle` table into the
# MongoDB `zimuzu.subtitle` collection.
import pymongo
import pymysql
from pymysql.cursors import DictCursor

con = pymysql.connect(host="mysql", user="root", password="root", database="yyets", charset="utf8")
cur = con.cursor(cursor=DictCursor)
mongo_client = pymongo.MongoClient(host="mongo")
col = mongo_client["zimuzu"]["subtitle"]
try:
    cur.execute("select * from subtitle")
    # ~56134 rows: one bulk insert instead of one network round trip per row.
    rows = cur.fetchall()
    if rows:
        col.insert_many(rows)
finally:
    # Close handles even if the copy fails partway through.
    cur.close()
    con.close()
    mongo_client.close()

View File

@@ -65,9 +65,7 @@ class BaseSync:
class Zhuixinfan(BaseSync):
def run(self):
zhuixinfan = "http://www.fanxinzhui.com/rr/{}"
start = (self.sync.find_one({"name": "zhuixinfan"}) or {}).get(
"resource_id", os.getenv("ZHUIXINFAN_START", 20)
)
start = (self.sync.find_one({"name": "zhuixinfan"}) or {}).get("resource_id", os.getenv("ZHUIXINFAN_START", 20))
end = os.getenv("ZHUIXINFAN_END", 2500)
for i in range(start, end):
url = zhuixinfan.format(i)
@@ -123,13 +121,9 @@ class Zhuixinfan(BaseSync):
for item in links:
content = item["href"]
if "ed2k" in content:
resource["files"].append(
{"way": "1", "way_cn": "电驴", "address": content, "passwd": ""}
)
resource["files"].append({"way": "1", "way_cn": "电驴", "address": content, "passwd": ""})
elif "magnet" in content:
resource["files"].append(
{"way": "2", "way_cn": "磁力", "address": content, "passwd": ""}
)
resource["files"].append({"way": "2", "way_cn": "磁力", "address": content, "passwd": ""})
elif "pan.baidu" in content:
baidu_password = res.span.a.nextSibling.nextSibling.text
resource["files"].append(
@@ -141,9 +135,7 @@ class Zhuixinfan(BaseSync):
}
)
elif "weiyun" in content:
resource["files"].append(
{"way": "14", "way_cn": "微云", "address": content, "passwd": ""}
)
resource["files"].append({"way": "14", "way_cn": "微云", "address": content, "passwd": ""})
else:
logging.debug("Unknown link: %s", content)
@@ -167,9 +159,7 @@ class Zhuixinfan(BaseSync):
)
else:
last_id = 90000
last = self.yyets.find_one(
{"data.info.id": {"$gte": last_id}}, sort=[("data.info.id", -1)]
)
last = self.yyets.find_one({"data.info.id": {"$gte": last_id}}, sort=[("data.info.id", -1)])
if last:
last_id = last["data"]["info"]["id"] + 1
logging.info("Inserting data.info.id: %s", last_id)
@@ -213,19 +203,13 @@ class YYSub(BaseSync):
structure["data"]["info"]["enname"] = data["enname"]
structure["data"]["info"]["aliasname"] = data["aliasname"]
structure["data"]["info"]["channel"] = data["channel"]
structure["data"]["info"]["channel_cn"] = (
data["channel_cn"] or channel_cn
)
structure["data"]["info"]["channel_cn"] = data["channel_cn"] or channel_cn
structure["data"]["info"]["area"] = data["area"]
structure["data"]["list"] = []
structure["data"]["info"][
"source"
] = f"https://www.yysub.net/resource/{i}"
structure["data"]["info"]["source"] = f"https://www.yysub.net/resource/{i}"
self.insert_data(structure.copy())
self.sync.update_one(
{"name": "yysub"}, {"$set": {"resource_id": end}}, upsert=True
)
self.sync.update_one({"name": "yysub"}, {"$set": {"resource_id": end}}, upsert=True)
logging.info("YYsub Finished")
def insert_data(self, data):

View File

@@ -113,6 +113,7 @@ class SearchEngine(Mongo):
self.yyets_index = self.search_client.index("yyets")
self.comment_index = self.search_client.index("comment")
self.douban_index = self.search_client.index("douban")
self.subtitle_index = self.search_client.index("subtitle")
super().__init__()
def __del__(self):
@@ -152,6 +153,17 @@ class SearchEngine(Mongo):
]
)
def __get_subtitle(self):
return self.db["subtitle"].aggregate(
[
{
"$addFields": {
"_id": {"$toString": "$_id"},
}
},
]
)
def add_yyets(self):
logging.info("Adding yyets data to search engine")
data = list(self.__get_yyets())
@@ -167,6 +179,11 @@ class SearchEngine(Mongo):
data = list(self.__get_douban())
self.douban_index.add_documents(data, primary_key="_id")
def add_subtitle(self):
logging.info("Adding subtitle data to search engine")
data = list(self.__get_subtitle())
self.subtitle_index.add_documents(data, primary_key="_id")
def search_yyets(self, keyword: "str"):
return self.yyets_index.search(keyword, {"matchingStrategy": "all"})["hits"]
@@ -176,11 +193,15 @@ class SearchEngine(Mongo):
def search_douban(self, keyword: "str"):
return self.douban_index.search(keyword, {"matchingStrategy": "all"})["hits"]
def search_subtitle(self, keyword: "str"):
return self.subtitle_index.search(keyword, {"matchingStrategy": "all"})["hits"]
def run_import(self):
t0 = time.time()
self.add_yyets()
self.add_comment()
self.add_douban()
self.add_subtitle()
logging.info(f"Import data to search engine in {time.time() - t0:.2f}s")
def __monitor(self, col, fun):

View File

@@ -33,9 +33,7 @@ class Douban(Mongo):
douban_col = self.db["douban"]
yyets_col = self.db["yyets"]
data = douban_col.find_one(
{"resourceId": resource_id}, {"_id": False, "raw": False}
)
data = douban_col.find_one({"resourceId": resource_id}, {"_id": False, "raw": False})
if data:
logging.info("Existing data for %s", resource_id)
return data
@@ -58,12 +56,8 @@ class Douban(Mongo):
douban_item = soup.find_all("div", class_="content")
fwd_link = unquote(douban_item[0].a["href"])
douban_id = re.findall(
r"https://movie\.douban\.com/subject/(\d*)/.*", fwd_link
)[0]
final_data = self.get_craw_data(
cname, douban_id, resource_id, search_html, session
)
douban_id = re.findall(r"https://movie\.douban\.com/subject/(\d*)/.*", fwd_link)[0]
final_data = self.get_craw_data(cname, douban_id, resource_id, search_html, session)
douban_col.insert_one(final_data.copy())
final_data.pop("raw")
return final_data
@@ -76,9 +70,7 @@ class Douban(Mongo):
soup = BeautifulSoup(detail_html, "html.parser")
directors = [i.text for i in (soup.find_all("a", rel="v:directedBy"))]
release_date = (
poster_image_link
) = rating = year_text = intro = writers = episode_count = episode_duration = ""
release_date = poster_image_link = rating = year_text = intro = writers = episode_count = episode_duration = ""
with contextlib.suppress(IndexError):
episode_duration = soup.find_all("span", property="v:runtime")[0].text
for i in soup.find_all("span", class_="pl"):
@@ -92,21 +84,15 @@ class Douban(Mongo):
genre = [i.text for i in soup.find_all("span", property="v:genre")]
with contextlib.suppress(IndexError):
release_date = soup.find_all("span", property="v:initialReleaseDate")[
0
].text
release_date = soup.find_all("span", property="v:initialReleaseDate")[0].text
with contextlib.suppress(IndexError):
poster_image_link = soup.find_all("div", id="mainpic")[0].a.img["src"]
with contextlib.suppress(IndexError):
rating = soup.find_all("strong", class_="ll rating_num")[0].text
with contextlib.suppress(IndexError):
year_text = re.sub(
r"[()]", "", soup.find_all("span", class_="year")[0].text
)
year_text = re.sub(r"[()]", "", soup.find_all("span", class_="year")[0].text)
with contextlib.suppress(IndexError):
intro = re.sub(
r"\s", "", soup.find_all("span", property="v:summary")[0].text
)
intro = re.sub(r"\s", "", soup.find_all("span", property="v:summary")[0].text)
final_data = {
"name": cname,
@@ -139,9 +125,7 @@ class DoubanReport(Mongo):
def get_error(self) -> dict:
return dict(data=list(self.db["douban_error"].find(projection={"_id": False})))
def report_error(
self, captcha: str, captcha_id: int, content: str, resource_id: int
) -> dict:
def report_error(self, captcha: str, captcha_id: int, content: str, resource_id: int) -> dict:
returned = {"status_code": 0, "message": ""}
verify_result = Captcha().verify_code(captcha, captcha_id)
if not verify_result["status"]:

View File

@@ -1,13 +1,11 @@
#!/usr/bin/env python3
# coding: utf-8
import contextlib
import json
import logging
import os
import random
import pymongo
import requests
import zhconv
from tqdm import tqdm
@@ -43,101 +41,112 @@ class Resource(SearchEngine):
return data
def search_resource(self, keyword: str, search_type: "str") -> dict:
# search_type: default,subtitle,douban,comment
# TODO
if os.getenv("MEILISEARCH"):
return self.meili_search(keyword, search_type)
else:
return self.mongodb_search(keyword)
return self.mongodb_search(keyword, search_type)
def meili_search(self, keyword: "str", search_type: "str") -> dict:
returned = {"data": [], "comment": [], "extra": []}
resource_data, subtitle_data, comment_data = [], [], []
if search_type == "resource":
resource_data = self.search_yyets(keyword)
if search_type == "comment":
comment_data = hide_phone(self.search_comment(keyword))
if search_type == "subtitle":
subtitle_data = self.search_subtitle(keyword)
if search_type == "default":
yyets = self.search_yyets(keyword)
comment = hide_phone(self.search_comment(keyword))
returned["data"] = yyets
returned["comment"] = comment
return returned
elif search_type == "douban":
douban = self.search_douban(keyword)
returned["data"] = douban
return returned
elif search_type == "fansub":
# TODO disable fansub for now
# fansub = self.search_extra(keyword)
# returned["extra"] = fansub
return returned
else:
return returned
resource_data = self.search_yyets(keyword)
subtitle_data = self.search_yyets(keyword)
comment_data = hide_phone(self.search_comment(keyword))
return {
"resource": resource_data,
"subtitle": subtitle_data,
"comment": comment_data,
}
def mongodb_search(self, keyword: str) -> dict:
# convert any text to zh-hans - only for traditional search with MongoDB
def mongodb_search(self, keyword: str, search_type: str) -> dict:
# search_type: default,resource,subtitle,comment default is everything
keyword = zhconv.convert(keyword, "zh-hans")
resource_data, subtitle_data, comment_data = [], [], []
zimuzu_data = []
returned = {"data": [], "extra": [], "comment": []}
def search_resource():
data = self.db["yyets"].find(
{
"$or": [
{"data.info.cnname": {"$regex": f".*{keyword}.*", "$options": "i"}},
{"data.info.enname": {"$regex": f".*{keyword}.*", "$options": "i"}},
{
"data.info.aliasname": {
"$regex": f".*{keyword}.*",
"$options": "i",
}
},
]
},
{"_id": False, "data.info": True},
)
projection = {"_id": False, "data.info": True}
for item in data:
item["data"]["info"]["origin"] = "yyets"
resource_data.append(item["data"]["info"])
resource_data = self.db["yyets"].find(
{
"$or": [
{"data.info.cnname": {"$regex": f".*{keyword}.*", "$options": "i"}},
{"data.info.enname": {"$regex": f".*{keyword}.*", "$options": "i"}},
{
"data.info.aliasname": {
"$regex": f".*{keyword}.*",
"$options": "i",
def search_subtitle():
subdata = self.db["subtitle"].find(
{
"$or": [
{"cnname": {"$regex": f".*{keyword}.*", "$options": "i"}},
{"enname": {"$regex": f".*{keyword}.*", "$options": "i"}},
]
},
{"_id": False},
)
subtitle_data.extend(list(subdata))
def search_comment():
comments = CommentSearch().get_comment(1, 2**10, keyword)
hide_phone(comments.get("data", []))
for c in comments.get("data", []):
comment_rid = c["resource_id"]
res = self.db["yyets"].find_one({"data.info.id": comment_rid}, projection={"data.info": True})
if res:
comment_data.append(
{
"username": c["username"],
"date": c["date"],
"comment": c["content"],
"commentID": c["id"],
"resourceID": comment_rid,
"resourceName": res["data"]["info"]["cnname"],
"origin": "comment",
"hasAvatar": c["hasAvatar"],
"hash": c.get("hash"),
}
},
]
},
projection,
)
)
for item in resource_data:
item["data"]["info"]["origin"] = "yyets"
zimuzu_data.append(item["data"]["info"])
if search_type == "resource":
search_resource()
# get comment
c_search = []
comments = CommentSearch().get_comment(1, 2**10, keyword)
hide_phone(comments.get("data", []))
for c in comments.get("data", []):
comment_rid = c["resource_id"]
res = self.db["yyets"].find_one({"data.info.id": comment_rid}, projection={"data.info": True})
if res:
c_search.append(
{
"username": c["username"],
"date": c["date"],
"comment": c["content"],
"commentID": c["id"],
"resourceID": comment_rid,
"resourceName": res["data"]["info"]["cnname"],
"origin": "comment",
"hasAvatar": c["hasAvatar"],
"hash": c.get("hash"),
}
)
# zimuzu -> comment -> extra
if zimuzu_data:
returned["data"] = zimuzu_data
elif not c_search:
# only returned when no data found
returned["extra"] = self.search_extra(keyword)
# comment data will always be returned
returned["comment"] = c_search
return returned
if search_type == "comment":
search_comment()
def search_extra(self, keyword: "str") -> list:
order = os.getenv("ORDER", "YYeTsOffline,ZimuxiaOnline,NewzmzOnline,ZhuixinfanOnline").split(",")
order.pop(0)
extra = []
with contextlib.suppress(requests.exceptions.RequestException):
for name in order:
extra = self.fansub_search(name, keyword)
if extra:
break
return extra
if search_type == "subtitle":
search_subtitle()
if search_type == "default":
search_resource()
search_comment()
search_subtitle()
return {
"resource": resource_data,
"subtitle": subtitle_data,
"comment": comment_data,
}
def patch_resource(self, new_data: dict):
rid = new_data["resource_id"]

View File

@@ -157,7 +157,10 @@ class User(Mongo, Redis):
valid_data["email"] = {"verified": False, "address": user_email}
# send email confirm
subject = "[人人影视下载分享站] 请验证你的邮箱"
text = f"请输入如下验证码完成你的邮箱认证。验证码有效期为24小时。<br>" f"如果您未有此请求,请忽略此邮件。<br><br>验证码: {verify_code}"
text = (
f"请输入如下验证码完成你的邮箱认证。验证码有效期为24小时。<br>"
f"如果您未有此请求,请忽略此邮件。<br><br>验证码: {verify_code}"
)
context = {"username": username, "text": text}
send_mail(user_email, subject, context)
# 发送成功才设置缓存

View File

@@ -1,19 +1,37 @@
#!/usr/bin/env python3
# coding: utf-8
import os
import platform
from http import HTTPStatus
from pathlib import Path
from tornado import gen
from tornado.concurrent import run_on_executor
from handlers import cf
from handlers.base import BaseHandler
filename = Path(__file__).name.split(".")[0]
class SubtitleDownloadHandler(BaseHandler):
    """Serve a subtitle file from the local subtitle_data directory.

    POST body: JSON with a "file" key holding a path relative to
    subtitle_data. Responds with the raw file bytes, or 404 + empty body
    when the file is missing or the path escapes subtitle_data.
    """

    @run_on_executor()
    def find_and_download(self):
        # `file` is untrusted client input; default to "" so a missing key
        # falls through to the 404 branch instead of raising TypeError.
        file = self.json.get("file") or ""
        self.set_header("x-filename", Path(file).name)
        root = Path(__file__).parent.parent.joinpath("subtitle_data").resolve()
        p = root.joinpath(file).resolve()
        self.set_header("Content-Type", "application/bin")
        try:
            # Reject path traversal (e.g. "../../etc/passwd"): the resolved
            # target must stay inside subtitle_data, else ValueError.
            p.relative_to(root)
            return p.read_bytes()
        except (OSError, ValueError):
            # OSError covers FileNotFoundError and IsADirectoryError.
            self.set_status(HTTPStatus.NOT_FOUND)
            return b""

    @gen.coroutine
    def post(self):
        resp = yield self.find_and_download()
        self.write(resp)
class ResourceHandler(BaseHandler):
filename = filename
@@ -22,9 +40,6 @@ class ResourceHandler(BaseHandler):
query = self.get_query_argument("id", None)
resource_id = int(query) if query.isdigit() else 0
username = self.get_current_user()
if str(resource_id) in os.getenv("HIDDEN_RESOURCE", "").split(","):
self.set_status(HTTPStatus.NOT_FOUND)
return {"status": 0, "info": "资源已隐藏"}
data = self.instance.get_resource_data(resource_id, username)
if not data:
self.ban()
@@ -33,24 +48,8 @@ class ResourceHandler(BaseHandler):
return data
def make_some_fun(self):
if platform.uname().system == "Darwin":
return
referer = self.request.headers.get("referer")
ip = self.get_real_ip()
if not referer and self.request.headers.get("origin") != "tauri://localhost":
cf.ban_new_ip(ip)
if os.getenv("GIFT"):
self.set_header("Content-Type", "text/html")
self.set_header("Content-Encoding", "gzip")
with open("templates/gift.gz", "rb") as f:
return f.read()
@run_on_executor()
def search_resource(self):
if gift := self.make_some_fun():
return gift
kw = self.get_query_argument("keyword").lower()
search_type = self.get_query_argument("type", "default")
self.set_header("search-engine", "Meilisearch" if os.getenv("MEILISEARCH") else "MongoDB")

View File

@@ -60,6 +60,7 @@ from handlers.resources import (
NameHandler,
ResourceHandler,
ResourceLatestHandler,
SubtitleDownloadHandler,
TopHandler,
)
from handlers.user import LikeHandler, UserAvatarHandler, UserEmailHandler, UserHandler
@@ -76,6 +77,7 @@ class RunServer:
handlers = [
(r"/", IndexHandler),
(r"/api/resource", ResourceHandler),
(r"/api/download", SubtitleDownloadHandler),
(r"/api/resource/latest", ResourceLatestHandler),
(r"/api/top", TopHandler),
(r"/api/like", LikeHandler),

Binary file not shown.