mirror of
https://github.com/tgbot-collection/YYeTsBot.git
synced 2025-11-25 03:15:05 +08:00
douban enhance and tools
more douban data, douban episode time enhance, douban fix command and douban craw
This commit is contained in:
@@ -5,7 +5,6 @@ logs/*
|
||||
YYeTsFE/node_modules/*
|
||||
.github/*
|
||||
assets/*
|
||||
tools/*
|
||||
scripts/*
|
||||
conf/*
|
||||
tests/*
|
||||
43
API.md
43
API.md
@@ -567,12 +567,43 @@
|
||||
|
||||
```json
|
||||
{
|
||||
"douban_id": "26816519",
|
||||
"douban_link": "https://movie.douban.com/subject/26816519/",
|
||||
"poster_link": "https://img2.doubanio.com/view/photo/s_ratio_poster/public/p2400201631.jpg",
|
||||
"resource_id": 34812,
|
||||
"name": "逃避可耻却有用",
|
||||
"doubanId": 26816519,
|
||||
"doubanLink": "https://movie.douban.com/subject/26816519/",
|
||||
"posterLink": "https://img2.doubanio.com/view/photo/s_ratio_poster/public/p2400201631.jpg",
|
||||
"resourceId": 34812,
|
||||
"rating": "8.4",
|
||||
"actors": "新垣结衣 / 星野源 / 大谷亮平 / 藤井隆 / 真野惠里菜 / 成田凌 / 山贺琴子 / 宇梶刚士 / 富田靖子 / 古田新太 / 石田百合子 / 细田善彦 / 古馆宽治 / 叶山奖之",
|
||||
"actors": [
|
||||
"新垣结衣",
|
||||
"星野源",
|
||||
"大谷亮平",
|
||||
"藤井隆",
|
||||
"真野惠里菜",
|
||||
"成田凌",
|
||||
"山贺琴子",
|
||||
"宇梶刚士",
|
||||
"富田靖子",
|
||||
"古田新太",
|
||||
"石田百合子",
|
||||
"细田善彦",
|
||||
"古馆宽治",
|
||||
"叶山奖之"
|
||||
],
|
||||
"directors": [
|
||||
"金子文纪",
|
||||
"土井裕泰",
|
||||
"石井康晴"
|
||||
],
|
||||
"genre": [
|
||||
"喜剧"
|
||||
],
|
||||
"releaseDate": "2016-10-11(日本)",
|
||||
"episodeCount": " 11",
|
||||
"episodeDuration": " 45分钟",
|
||||
"writers": [
|
||||
"野木亚纪子",
|
||||
"海野纲弥"
|
||||
],
|
||||
"year": "2016",
|
||||
"introduction": "森山实栗(新垣结衣饰)自研究生毕业之后就一直仕途不顺,最近更是惨遭解雇,处于“无业游民”的状态之下,日子过得十分凄惨。经由父亲的介绍,无处可去的实栗来到了名为津崎平匡(星野源饰)的单身男子家中,为其料理家事,就这样,二十五岁的实栗成为了一名家政妇。实栗心地善良手脚勤快,在她的安排和劳作下,平匡家中的一切被打理的井井有条,实栗因此获得了平匡的信赖,亦找到了生活的重心,重新振作了起来。然而好景不长,实栗的父母决定搬离此地,这也就意味着实栗必须“离职”。实在无法接受此事的实栗决定和平匡“契约结婚”,在外装做夫妻,在内依旧是雇主和职员。就这样,这对“孤男寡女”开始了他们的同居生活。"
|
||||
}
|
||||
@@ -581,4 +612,4 @@
|
||||
## 获取海报
|
||||
|
||||
* GET `api/douban?resource_id=34812&type=image`
|
||||
会返回相应格式(jpeg、webp、png等)的图片,与上次数据中 `poster_link`所看到的内容相同
|
||||
会返回相应格式(jpeg、webp、png等)的图片,与上次数据中 `posterLink`所看到的内容相同
|
||||
@@ -12,4 +12,5 @@ passlib==1.7.4
|
||||
fakeredis==1.5.0
|
||||
pytz==2021.1
|
||||
filetype==1.0.7
|
||||
requests[socks]
|
||||
requests[socks]
|
||||
tqdm
|
||||
39
tools/BagAndDrag/douban_sync.py
Normal file
39
tools/BagAndDrag/douban_sync.py
Normal file
@@ -0,0 +1,39 @@
|
||||
#!/usr/local/bin/python3
|
||||
# coding: utf-8
|
||||
|
||||
# YYeTsBot - douban_sync.py
|
||||
# 7/11/21 10:17
|
||||
#
|
||||
|
||||
__author__ = "Benny <benny.think@gmail.com>"
|
||||
|
||||
import contextlib
|
||||
import random
|
||||
import sys
|
||||
import pathlib
|
||||
import time
|
||||
|
||||
import requests
|
||||
from tqdm import tqdm
|
||||
import logging
|
||||
|
||||
# Bulk-sync script: walk every resource in the "yyets" collection and ask
# DoubanMongoResource.find_douban() to crawl and store its douban data.
logging.basicConfig(level=logging.INFO)

# Make the web application's modules (yyetsweb/Mongo.py) importable from here.
lib_path = pathlib.Path(__file__).parent.parent.parent.joinpath("yyetsweb").resolve().as_posix()
sys.path.append(lib_path)
from Mongo import DoubanMongoResource

douban = DoubanMongoResource()

# Read all resource ids up front instead of iterating a live cursor: the loop
# sleeps 5-20 seconds per record, so a server-side cursor would sit idle for
# long stretches between batches and could be reaped by MongoDB. Having a list
# also gives tqdm a total, so the progress bar shows a percentage and an ETA.
resource_ids = [
    doc["data"]["info"]["id"]
    for doc in douban.db["yyets"].find({}, {"_id": False, "data.info.id": True})
]

for resource_id in tqdm(resource_ids):
    # Records that are already stored need no network request, so skip them
    # without the throttling sleep; this makes an interrupted run cheap to resume.
    if douban.db["douban"].find_one({"resourceId": resource_id}, {"_id": True}):
        logging.info("Skipping %s, already synced", resource_id)
        continue

    # Best effort: one bad record must not abort the whole run, but the
    # failure is logged with its traceback instead of being silently dropped.
    try:
        d = douban.find_douban(resource_id)
        logging.info("Processed %s, length %d", resource_id, len(d))
    except Exception:
        logging.exception("Failed to sync douban data for %s", resource_id)

    # Random pause between crawls to avoid hammering douban.
    time.sleep(random.randint(5, 20))

logging.info("ALL FINISH!")
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
__author__ = "Benny <benny.think@gmail.com>"
|
||||
|
||||
import contextlib
|
||||
import sys
|
||||
import pathlib
|
||||
import pymongo
|
||||
@@ -494,7 +495,7 @@ class DoubanMongoResource(DoubanResource, Mongo):
|
||||
|
||||
def get_douban_image(self, rid: int) -> bytes:
|
||||
db_data = self.get_douban_data(rid)
|
||||
return db_data["poster_data"]
|
||||
return db_data["posterData"]
|
||||
|
||||
def find_douban(self, resource_id: int):
|
||||
session = requests.Session()
|
||||
@@ -503,7 +504,7 @@ class DoubanMongoResource(DoubanResource, Mongo):
|
||||
|
||||
douban_col = self.db["douban"]
|
||||
yyets_col = self.db["yyets"]
|
||||
data = douban_col.find_one({"resource_id": resource_id}, {"_id": False})
|
||||
data = douban_col.find_one({"resourceId": resource_id}, {"_id": False, "raw": False})
|
||||
if data:
|
||||
logging.info("Existing data for %s", resource_id)
|
||||
return data
|
||||
@@ -514,44 +515,67 @@ class DoubanMongoResource(DoubanResource, Mongo):
|
||||
return {}
|
||||
cname = names["data"]["info"]["cnname"]
|
||||
logging.info("cnname for douban is %s", cname)
|
||||
# enname = names["data"]["info"]["enname"]
|
||||
# aliasname = names["data"]["info"]["aliasname"].split("/")
|
||||
|
||||
search_html = session.get(DOUBAN_SEARCH.format(cname)).text
|
||||
logging.info("Analysis search html...%s", search_html)
|
||||
logging.info("Analysis search html...length %s", len(search_html))
|
||||
soup = BeautifulSoup(search_html, 'html.parser')
|
||||
douban_item = soup.find_all("div", class_="content")
|
||||
|
||||
fwd_link = unquote(douban_item[0].a["href"])
|
||||
douban_id = re.findall(r"https://movie.douban.com/subject/(\d*)/&query=", fwd_link)[0]
|
||||
detail_link = DOUBAN_DETAIL.format(douban_id)
|
||||
final_data = self.get_craw_data(cname, douban_id, resource_id, search_html, session)
|
||||
douban_col.insert_one(final_data.copy())
|
||||
final_data.pop("raw")
|
||||
return final_data
|
||||
|
||||
@staticmethod
|
||||
def get_craw_data(cname, douban_id, resource_id, search_html, session):
|
||||
detail_link = DOUBAN_DETAIL.format(douban_id)
|
||||
detail_html = session.get(detail_link).text
|
||||
logging.info("Analysis detail html...%s", detail_link)
|
||||
soup = BeautifulSoup(detail_html, 'html.parser')
|
||||
|
||||
poster = soup.find_all("div", id="mainpic")
|
||||
poster_image_link = poster[0].a.img["src"]
|
||||
directors = [i.text for i in (soup.find_all("a", rel="v:directedBy"))]
|
||||
writers = episode_count = episode_duration = ""
|
||||
with contextlib.suppress(IndexError):
|
||||
episode_duration = soup.find_all("span", property="v:runtime")[0].text
|
||||
for i in soup.find_all("span", class_="pl"):
|
||||
if i.text == "编剧":
|
||||
writers = re.sub(r"\s", "", list(i.next_siblings)[1].text).split("/")
|
||||
if i.text == "集数:":
|
||||
episode_count = str(i.nextSibling)
|
||||
if i.text == "单集片长:" and not episode_duration:
|
||||
episode_duration = str(i.nextSibling)
|
||||
actors = [i.text for i in soup.find_all("a", rel="v:starring")]
|
||||
genre = [i.text for i in soup.find_all("span", property="v:genre")]
|
||||
release_date = soup.find_all("span", property="v:initialReleaseDate")[0].text
|
||||
poster_image_link = soup.find_all("div", id="mainpic")[0].a.img["src"]
|
||||
rating = soup.find_all("strong", class_="ll rating_num")[0].text
|
||||
year_text = re.sub(r"[()]", "", soup.find_all("span", class_="year")[0].text)
|
||||
intro = re.sub(r"\s", "", soup.find_all("span", property="v:summary")[0].text)
|
||||
|
||||
rating_obj = soup.find_all("strong", class_="ll rating_num")
|
||||
rating = rating_obj[0].text
|
||||
|
||||
actors_obj = soup.find_all("span", class_="attrs")
|
||||
actors = actors_obj[-1].text
|
||||
year = soup.find_all("span", class_="year")[0].text
|
||||
year_text = re.sub(r"[()]", "", year)
|
||||
intro = soup.find_all("span", property="v:summary")[0].text
|
||||
intro = re.sub(r"\s", "", intro)
|
||||
final_data = {
|
||||
"douban_id": douban_id,
|
||||
"douban_link": detail_link,
|
||||
"poster_link": poster_image_link,
|
||||
"poster_data": session.get(poster_image_link).content,
|
||||
"resource_id": resource_id,
|
||||
"name": cname,
|
||||
"raw": {
|
||||
"search_url": DOUBAN_SEARCH.format(cname),
|
||||
"detail_url": detail_link,
|
||||
"search_html": search_html,
|
||||
"detail_html": detail_html
|
||||
},
|
||||
"doubanId": int(douban_id),
|
||||
"doubanLink": detail_link,
|
||||
"posterLink": poster_image_link,
|
||||
"posterData": session.get(poster_image_link).content,
|
||||
"resourceId": resource_id,
|
||||
"rating": rating,
|
||||
"actors": actors,
|
||||
"directors": directors,
|
||||
"genre": genre,
|
||||
"releaseDate": release_date,
|
||||
"episodeCount": episode_count,
|
||||
"episodeDuration": episode_duration,
|
||||
"writers": writers,
|
||||
"year": year_text,
|
||||
"introduction": intro
|
||||
}
|
||||
douban_col.insert_one(final_data.copy())
|
||||
return final_data
|
||||
|
||||
@@ -14,7 +14,3 @@ with open("douban_detail.html") as f:
|
||||
detail_html = f.read()
|
||||
soup = BeautifulSoup(detail_html, 'html.parser')
|
||||
|
||||
intro = soup.find_all("span", property="v:summary")[0].text
|
||||
i = re.sub(r"\s", "", intro)
|
||||
|
||||
print(i)
|
||||
|
||||
2
yyetsweb/craw_data/douban_detail.html
vendored
2
yyetsweb/craw_data/douban_detail.html
vendored
@@ -516,7 +516,7 @@
|
||||
<span ><span class='pl'>导演</span>: <span class='attrs'><a href="/celebrity/1275989/" rel="v:directedBy">金子文纪</a> / <a href="/celebrity/1001640/" rel="v:directedBy">土井裕泰</a> / <a href="/celebrity/1320373/" rel="v:directedBy">石井康晴</a></span></span><br/>
|
||||
<span ><span class='pl'>编剧</span>: <span class='attrs'><a href="/celebrity/1335592/">野木亚纪子</a> / <a href="/celebrity/1363329/">海野纲弥</a></span></span><br/>
|
||||
<span class="actor"><span class='pl'>主演</span>: <span class='attrs'><a href="/celebrity/1018562/" rel="v:starring">新垣结衣</a> / <a href="/celebrity/1316625/" rel="v:starring">星野源</a> / <a href="/celebrity/1318236/" rel="v:starring">大谷亮平</a> / <a href="/celebrity/1033503/" rel="v:starring">藤井隆</a> / <a href="/celebrity/1275342/" rel="v:starring">真野惠里菜</a> / <a href="/celebrity/1345900/" rel="v:starring">成田凌</a> / <a href="/celebrity/1361659/" rel="v:starring">山贺琴子</a> / <a href="/celebrity/1024153/" rel="v:starring">宇梶刚士</a> / <a href="/celebrity/1033967/" rel="v:starring">富田靖子</a> / <a href="/celebrity/1037957/" rel="v:starring">古田新太</a> / <a href="/celebrity/1028681/" rel="v:starring">石田百合子</a> / <a href="/celebrity/1275143/" rel="v:starring">细田善彦</a> / <a href="/celebrity/1327503/" rel="v:starring">古馆宽治</a> / <a href="/celebrity/1323959/" rel="v:starring">叶山奖之</a></span></span><br/>
|
||||
<span class="pl">类型:</span> <span property="v:genre">喜剧</span><br/>
|
||||
<span class="pl">类型:</span> <span property="v:genre">剧情</span> / <span property="v:genre">动作</span> / <span property="v:genre">惊悚</span> / <span property="v:genre">犯罪</span><br/>
|
||||
<span class="pl">官方网站:</span> <a href="http://www.tbs.co.jp/NIGEHAJI_tbs/" rel="nofollow" target="_blank">www.tbs.co.jp/NIGEHAJI_tbs/</a><br/>
|
||||
<span class="pl">制片国家/地区:</span> 日本<br/>
|
||||
<span class="pl">语言:</span> 日语<br/>
|
||||
|
||||
@@ -599,7 +599,7 @@ class DoubanHandler(BaseHandler):
|
||||
def douban_data(self):
|
||||
rid = self.get_query_argument("resource_id")
|
||||
data = self.instance.get_douban_data(int(rid))
|
||||
data.pop("poster_data")
|
||||
data.pop("posterData")
|
||||
return data
|
||||
|
||||
def get_image(self) -> bytes:
|
||||
|
||||
40
yyetsweb/migration/douban_fix.py
Normal file
40
yyetsweb/migration/douban_fix.py
Normal file
@@ -0,0 +1,40 @@
|
||||
#!/usr/local/bin/python3
|
||||
# coding: utf-8
|
||||
|
||||
# YYeTsBot - douban_fix.py
|
||||
# 7/11/21 09:37
|
||||
#
|
||||
|
||||
__author__ = "Benny <benny.think@gmail.com>"
|
||||
|
||||
import argparse
|
||||
|
||||
import requests
|
||||
import sys
|
||||
import pathlib
|
||||
|
||||
lib_path = pathlib.Path(__file__).parent.parent.resolve().as_posix()
|
||||
sys.path.append(lib_path)
|
||||
|
||||
from Mongo import DoubanMongoResource
|
||||
|
||||
# Manual repair command: re-crawl the douban page identified by `douban_id`
# and store the result as the douban record for `resource_id`.
parser = argparse.ArgumentParser(description='豆瓣数据修复')
parser.add_argument('resource_id', metavar='r', type=int, help='resource id')
parser.add_argument('douban_id', metavar='d', type=int, help='douban id')
args = parser.parse_args()
resource_id = args.resource_id
douban_id = args.douban_id

douban = DoubanMongoResource()
session = requests.Session()
ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
session.headers.update({"User-Agent": ua})

yyets_data = douban.db["yyets"].find_one({"data.info.id": resource_id})
if yyets_data is None:
    # Fail with a readable message (written to stderr, exit status 1) instead
    # of a TypeError from subscripting None below.
    sys.exit(f"resource {resource_id} not found in yyets collection")

# No search step is performed for a manual fix, so there is no search page to keep.
search_html = ""
cname = yyets_data["data"]["info"]["cnname"]

final_data = douban.get_craw_data(cname, douban_id, resource_id, search_html, session)
# upsert=True: also store the record when no douban document exists yet for
# this resource; a plain replace would match nothing and leave it unfixed.
douban.db["douban"].replace_one({"resourceId": resource_id}, final_data, upsert=True)
print("fix complete")
sys.exit(0)
|
||||
Reference in New Issue
Block a user