douban enhance and tools

more douban data, douban episode time enhance, douban fix command and douban crawl
This commit is contained in:
BennyThink
2021-07-11 09:33:20 +08:00
parent f7b72aa769
commit ca088d8678
9 changed files with 167 additions and 37 deletions

View File

@@ -5,7 +5,6 @@ logs/*
YYeTsFE/node_modules/*
.github/*
assets/*
tools/*
scripts/*
conf/*
tests/*

43
API.md
View File

@@ -567,12 +567,43 @@
```json
{
"douban_id": "26816519",
"douban_link": "https://movie.douban.com/subject/26816519/",
"poster_link": "https://img2.doubanio.com/view/photo/s_ratio_poster/public/p2400201631.jpg",
"resource_id": 34812,
"name": "逃避可耻却有用",
"doubanId": 26816519,
"doubanLink": "https://movie.douban.com/subject/26816519/",
"posterLink": "https://img2.doubanio.com/view/photo/s_ratio_poster/public/p2400201631.jpg",
"resourceId": 34812,
"rating": "8.4",
"actors": "新垣结衣 / 星野源 / 大谷亮平 / 藤井隆 / 真野惠里菜 / 成田凌 / 山贺琴子 / 宇梶刚士 / 富田靖子 / 古田新太 / 石田百合子 / 细田善彦 / 古馆宽治 / 叶山奖之",
"actors": [
"新垣结衣",
"星野源",
"大谷亮平",
"藤井隆",
"真野惠里菜",
"成田凌",
"山贺琴子",
"宇梶刚士",
"富田靖子",
"古田新太",
"石田百合子",
"细田善彦",
"古馆宽治",
"叶山奖之"
],
"directors": [
"金子文纪",
"土井裕泰",
"石井康晴"
],
"genre": [
"喜剧"
],
"releaseDate": "2016-10-11(日本)",
"episodeCount": " 11",
"episodeDuration": " 45分钟",
"writers": [
"野木亚纪子",
"海野纲弥"
],
"year": "2016",
"introduction": "森山实栗(新垣结衣饰)自研究生毕业之后就一直仕途不顺,最近更是惨遭解雇,处于“无业游民”的状态之下,日子过得十分凄惨。经由父亲的介绍,无处可去的实栗来到了名为津崎平匡(星野源饰)的单身男子家中,为其料理家事,就这样,二十五岁的实栗成为了一名家政妇。实栗心地善良手脚勤快,在她的安排和劳作下,平匡家中的一切被打理的井井有条,实栗因此获得了平匡的信赖,亦找到了生活的重心,重新振作了起来。然而好景不长,实栗的父母决定搬离此地,这也就意味着实栗必须“离职”。实在无法接受此事的实栗决定和平匡“契约结婚”,在外装做夫妻,在内依旧是雇主和职员。就这样,这对“孤男寡女”开始了他们的同居生活。"
}
@@ -581,4 +612,4 @@
## 获取海报
* GET `api/douban?resource_id=34812&type=image`
会返回相应格式jpeg、webp、png等的图片与上次数据中 `poster_link`所看到的内容相同
会返回相应格式(jpeg、webp、png 等)的图片,与上述数据中 `posterLink` 所看到的内容相同

View File

@@ -12,4 +12,5 @@ passlib==1.7.4
fakeredis==1.5.0
pytz==2021.1
filetype==1.0.7
requests[socks]
requests[socks]
tqdm

View File

@@ -0,0 +1,39 @@
#!/usr/local/bin/python3
# coding: utf-8
# YYeTsBot - douban.py
# 7/11/21 10:17
#
# One-shot batch crawler: iterate every resource already stored in the
# "yyets" MongoDB collection and populate its douban metadata through
# DoubanMongoResource.find_douban, pausing between items to avoid bans.
__author__ = "Benny <benny.think@gmail.com>"
import contextlib
import random
import sys
import pathlib
import time
import requests
from tqdm import tqdm
import logging
logging.basicConfig(level=logging.INFO)
# Make the sibling "yyetsweb" package importable; this append must run
# BEFORE the `from Mongo import ...` line below — do not reorder.
lib_path = pathlib.Path(__file__).parent.parent.parent.joinpath("yyetsweb").resolve().as_posix()
sys.path.append(lib_path)
from Mongo import DoubanMongoResource
douban = DoubanMongoResource()
session = requests.Session()
# Desktop Chrome user agent so douban serves the regular HTML pages.
ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
session.headers.update({"User-Agent": ua})
yyets_data = douban.db["yyets"].find()
for data in tqdm(yyets_data):
    resource_id = data["data"]["info"]["id"]
    # Best-effort: any scrape/parse failure for one resource is swallowed
    # so the rest of the batch keeps going.
    with contextlib.suppress(Exception):
        d = douban.find_douban(resource_id)
        logging.info("Processed %s, length %d", resource_id, len(d))
    # Random politeness delay. NOTE(review): this sleeps even when
    # find_douban returned cached data — confirm that is intended.
    time.sleep(random.randint(5, 20))
logging.info("ALL FINISH!")

View File

@@ -7,6 +7,7 @@
__author__ = "Benny <benny.think@gmail.com>"
import contextlib
import sys
import pathlib
import pymongo
@@ -494,7 +495,7 @@ class DoubanMongoResource(DoubanResource, Mongo):
def get_douban_image(self, rid: int) -> bytes:
db_data = self.get_douban_data(rid)
return db_data["poster_data"]
return db_data["posterData"]
def find_douban(self, resource_id: int):
session = requests.Session()
@@ -503,7 +504,7 @@ class DoubanMongoResource(DoubanResource, Mongo):
douban_col = self.db["douban"]
yyets_col = self.db["yyets"]
data = douban_col.find_one({"resource_id": resource_id}, {"_id": False})
data = douban_col.find_one({"resourceId": resource_id}, {"_id": False, "raw": False})
if data:
logging.info("Existing data for %s", resource_id)
return data
@@ -514,44 +515,67 @@ class DoubanMongoResource(DoubanResource, Mongo):
return {}
cname = names["data"]["info"]["cnname"]
logging.info("cnname for douban is %s", cname)
# enname = names["data"]["info"]["enname"]
# aliasname = names["data"]["info"]["aliasname"].split("/")
search_html = session.get(DOUBAN_SEARCH.format(cname)).text
logging.info("Analysis search html...%s", search_html)
logging.info("Analysis search html...length %s", len(search_html))
soup = BeautifulSoup(search_html, 'html.parser')
douban_item = soup.find_all("div", class_="content")
fwd_link = unquote(douban_item[0].a["href"])
douban_id = re.findall(r"https://movie.douban.com/subject/(\d*)/&query=", fwd_link)[0]
detail_link = DOUBAN_DETAIL.format(douban_id)
final_data = self.get_craw_data(cname, douban_id, resource_id, search_html, session)
douban_col.insert_one(final_data.copy())
final_data.pop("raw")
return final_data
@staticmethod
def get_craw_data(cname, douban_id, resource_id, search_html, session):
detail_link = DOUBAN_DETAIL.format(douban_id)
detail_html = session.get(detail_link).text
logging.info("Analysis detail html...%s", detail_link)
soup = BeautifulSoup(detail_html, 'html.parser')
poster = soup.find_all("div", id="mainpic")
poster_image_link = poster[0].a.img["src"]
directors = [i.text for i in (soup.find_all("a", rel="v:directedBy"))]
writers = episode_count = episode_duration = ""
with contextlib.suppress(IndexError):
episode_duration = soup.find_all("span", property="v:runtime")[0].text
for i in soup.find_all("span", class_="pl"):
if i.text == "编剧":
writers = re.sub(r"\s", "", list(i.next_siblings)[1].text).split("/")
if i.text == "集数:":
episode_count = str(i.nextSibling)
if i.text == "单集片长:" and not episode_duration:
episode_duration = str(i.nextSibling)
actors = [i.text for i in soup.find_all("a", rel="v:starring")]
genre = [i.text for i in soup.find_all("span", property="v:genre")]
release_date = soup.find_all("span", property="v:initialReleaseDate")[0].text
poster_image_link = soup.find_all("div", id="mainpic")[0].a.img["src"]
rating = soup.find_all("strong", class_="ll rating_num")[0].text
year_text = re.sub(r"[()]", "", soup.find_all("span", class_="year")[0].text)
intro = re.sub(r"\s", "", soup.find_all("span", property="v:summary")[0].text)
rating_obj = soup.find_all("strong", class_="ll rating_num")
rating = rating_obj[0].text
actors_obj = soup.find_all("span", class_="attrs")
actors = actors_obj[-1].text
year = soup.find_all("span", class_="year")[0].text
year_text = re.sub(r"[()]", "", year)
intro = soup.find_all("span", property="v:summary")[0].text
intro = re.sub(r"\s", "", intro)
final_data = {
"douban_id": douban_id,
"douban_link": detail_link,
"poster_link": poster_image_link,
"poster_data": session.get(poster_image_link).content,
"resource_id": resource_id,
"name": cname,
"raw": {
"search_url": DOUBAN_SEARCH.format(cname),
"detail_url": detail_link,
"search_html": search_html,
"detail_html": detail_html
},
"doubanId": int(douban_id),
"doubanLink": detail_link,
"posterLink": poster_image_link,
"posterData": session.get(poster_image_link).content,
"resourceId": resource_id,
"rating": rating,
"actors": actors,
"directors": directors,
"genre": genre,
"releaseDate": release_date,
"episodeCount": episode_count,
"episodeDuration": episode_duration,
"writers": writers,
"year": year_text,
"introduction": intro
}
douban_col.insert_one(final_data.copy())
return final_data

View File

@@ -14,7 +14,3 @@ with open("douban_detail.html") as f:
detail_html = f.read()
soup = BeautifulSoup(detail_html, 'html.parser')
intro = soup.find_all("span", property="v:summary")[0].text
i = re.sub(r"\s", "", intro)
print(i)

View File

@@ -516,7 +516,7 @@
<span ><span class='pl'>导演</span>: <span class='attrs'><a href="/celebrity/1275989/" rel="v:directedBy">金子文纪</a> / <a href="/celebrity/1001640/" rel="v:directedBy">土井裕泰</a> / <a href="/celebrity/1320373/" rel="v:directedBy">石井康晴</a></span></span><br/>
<span ><span class='pl'>编剧</span>: <span class='attrs'><a href="/celebrity/1335592/">野木亚纪子</a> / <a href="/celebrity/1363329/">海野纲弥</a></span></span><br/>
<span class="actor"><span class='pl'>主演</span>: <span class='attrs'><a href="/celebrity/1018562/" rel="v:starring">新垣结衣</a> / <a href="/celebrity/1316625/" rel="v:starring">星野源</a> / <a href="/celebrity/1318236/" rel="v:starring">大谷亮平</a> / <a href="/celebrity/1033503/" rel="v:starring">藤井隆</a> / <a href="/celebrity/1275342/" rel="v:starring">真野惠里菜</a> / <a href="/celebrity/1345900/" rel="v:starring">成田凌</a> / <a href="/celebrity/1361659/" rel="v:starring">山贺琴子</a> / <a href="/celebrity/1024153/" rel="v:starring">宇梶刚士</a> / <a href="/celebrity/1033967/" rel="v:starring">富田靖子</a> / <a href="/celebrity/1037957/" rel="v:starring">古田新太</a> / <a href="/celebrity/1028681/" rel="v:starring">石田百合子</a> / <a href="/celebrity/1275143/" rel="v:starring">细田善彦</a> / <a href="/celebrity/1327503/" rel="v:starring">古馆宽治</a> / <a href="/celebrity/1323959/" rel="v:starring">叶山奖之</a></span></span><br/>
<span class="pl">类型:</span> <span property="v:genre"></span><br/>
<span class="pl">类型:</span> <span property="v:genre"></span> / <span property="v:genre">动作</span> / <span property="v:genre">惊悚</span> / <span property="v:genre">犯罪</span><br/>
<span class="pl">官方网站:</span> <a href="http://www.tbs.co.jp/NIGEHAJI_tbs/" rel="nofollow" target="_blank">www.tbs.co.jp/NIGEHAJI_tbs/</a><br/>
<span class="pl">制片国家/地区:</span> 日本<br/>
<span class="pl">语言:</span> 日语<br/>

View File

@@ -599,7 +599,7 @@ class DoubanHandler(BaseHandler):
def douban_data(self):
rid = self.get_query_argument("resource_id")
data = self.instance.get_douban_data(int(rid))
data.pop("poster_data")
data.pop("posterData")
return data
def get_image(self) -> bytes:

View File

@@ -0,0 +1,40 @@
#!/usr/local/bin/python3
# coding: utf-8
# YYeTsBot - douban_fix.py
# 7/11/21 09:37
#
# CLI repair tool: re-crawl douban metadata for a single resource using an
# explicitly supplied douban id (bypassing the search step) and replace the
# stored record in the "douban" collection.
__author__ = "Benny <benny.think@gmail.com>"
import argparse
import requests
import sys
import pathlib
# Make the project root importable; this append must run BEFORE the
# `from Mongo import ...` line below — do not reorder.
lib_path = pathlib.Path(__file__).parent.parent.resolve().as_posix()
sys.path.append(lib_path)
from Mongo import DoubanMongoResource
parser = argparse.ArgumentParser(description='豆瓣数据修复')
parser.add_argument('resource_id', metavar='r', type=int, help='resource id')
parser.add_argument('douban_id', metavar='d', type=int, help='douban id')
args = parser.parse_args()
resource_id = args.resource_id
douban_id = args.douban_id
douban = DoubanMongoResource()
session = requests.Session()
# Desktop Chrome user agent so douban serves the regular HTML pages.
ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
session.headers.update({"User-Agent": ua})
# NOTE(review): find_one returns None for an unknown resource id, which
# would raise TypeError on the subscription below — confirm intended.
yyets_data = douban.db["yyets"].find_one({"data.info.id": resource_id})
# Empty search html: douban_id is supplied, so get_craw_data needs no search.
search_html = ""
cname = yyets_data["data"]["info"]["cnname"]
final_data = douban.get_craw_data(cname, douban_id, resource_id, search_html, session)
# NOTE(review): find_one_and_replace without upsert=True silently does
# nothing when no record exists for this resourceId — confirm upsert
# is deliberately not wanted.
douban.db["douban"].find_one_and_replace({"resourceId": resource_id}, final_data)
print("fix complete")
sys.exit(0)