sync douban as a job

This commit is contained in:
BennyThink
2021-07-11 15:50:52 +08:00
parent 990e1b12ea
commit 3873f85fda
3 changed files with 55 additions and 47 deletions

View File

@@ -1,40 +0,0 @@
#!/usr/local/bin/python3
# coding: utf-8
# YYeTsBot - douban.py
# 7/11/21 10:17
#
__author__ = "Benny <benny.think@gmail.com>"
import contextlib
import random
import sys
import pathlib
import time
import requests
from tqdm import tqdm
import logging
# Root logger reports INFO-level records and above.
logging.basicConfig(level=logging.INFO)

# Put the "yyetsweb" directory (located two levels above this file's own
# directory) on the import path; presumably that is where the Mongo module
# imported right after this lives -- confirm against the repository layout.
_base_dir = pathlib.Path(__file__).parent.parent.parent
lib_path = (_base_dir / "yyetsweb").resolve().as_posix()
sys.path.append(lib_path)
from Mongo import DoubanMongoResource
# A single DoubanMongoResource is used both to read the resource list and to
# run the per-resource lookup.
douban = DoubanMongoResource()

# NOTE(review): this session is configured with a browser User-Agent but is
# never used below; presumably find_douban() issues its own HTTP requests --
# confirm before removing.
session = requests.Session()
ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
session.headers.update({"User-Agent": ua})

# Collect the resource id of every document in the "yyets" collection.
yyets_data = douban.db["yyets"].find()
rids = [i["data"]["info"]["id"] for i in yyets_data]
logging.info("resource id complete")

for rid in tqdm(rids):
    # Best-effort: a failure on one resource must not abort the whole run,
    # but it is logged rather than silently discarded.
    try:
        d = douban.find_douban(rid)
        logging.info("Processed %s, length %d", rid, len(d))
    except Exception as exc:
        logging.warning("Failed to process %s: %r", rid, exc)
    # Random 1-5 second pause after every attempt, successful or not
    # (presumably to throttle requests to Douban).
    time.sleep(random.randint(1, 5))
logging.info("ALL FINISH!")

View File

@@ -0,0 +1,45 @@
#!/usr/local/bin/python3
# coding: utf-8
# YYeTsBot - douban.py
# 7/11/21 10:17
#
__author__ = "Benny <benny.think@gmail.com>"
import contextlib
import random
import sys
import pathlib
import time
import logging
import requests
from tqdm import tqdm
# Root logger reports INFO-level records and above.
logging.basicConfig(level=logging.INFO)

# Put the directory one level above this file's own directory on the import
# path; presumably that is where the Mongo module imported right after this
# lives -- confirm against the repository layout.
_base_dir = pathlib.Path(__file__).parent.parent
lib_path = _base_dir.resolve().as_posix()
sys.path.append(lib_path)
from Mongo import DoubanMongoResource
def sync_douban():
    """Run ``find_douban`` for each YYeTs resource id missing from ``douban``.

    Resource ids are read from the ``yyets`` collection
    (``data.info.id``), ids already present in the ``douban`` collection
    (``resourceId``) are excluded, and ``DoubanMongoResource.find_douban``
    is called once for every remaining id.

    The run is best-effort: a failure on one resource is logged as a warning
    and the loop continues with the next id. A random 1-5 second pause
    follows every attempt, so the call blocks for the duration of the sync.
    """
    douban = DoubanMongoResource()

    # Build both id sets directly; only ids without a douban document are
    # left to process.
    all_ids = {doc["data"]["info"]["id"] for doc in douban.db["yyets"].find()}
    synced_ids = {doc["resourceId"] for doc in douban.db["douban"].find()}
    rids = list(all_ids - synced_ids)
    logging.info("resource id complete %d", len(rids))

    for rid in tqdm(rids):
        # Keep going on errors, but record them instead of swallowing them
        # silently so a broken sync is visible in the logs.
        try:
            d = douban.find_douban(rid)
            logging.info("Processed %s, length %d", rid, len(d))
        except Exception as exc:
            logging.warning("Failed to process %s: %r", rid, exc)
        # Pause after every attempt, successful or not (presumably to
        # throttle requests to Douban).
        time.sleep(random.randint(1, 5))
    logging.info("ALL FINISH!")

View File

@@ -7,21 +7,23 @@
__author__ = "Benny <benny.think@gmail.com>"
import os
import logging
import os
import platform
import pytz
from apscheduler.schedulers.background import BackgroundScheduler
from tornado import httpserver, ioloop, options, web
from tornado.log import enable_pretty_logging
from tornado import web, httpserver, ioloop, options
from handler import (AnnouncementHandler, BlacklistHandler, CaptchaHandler,
CommentChildHandler, CommentHandler, CommentNewestHandler,
DBDumpHandler, DoubanHandler, GrafanaIndexHandler,
GrafanaQueryHandler, GrafanaSearchHandler, IndexHandler,
MetricsHandler, NameHandler, NotFoundHandler,
ResourceHandler, TopHandler, UserHandler, UserLikeHandler)
from migration.douban_sync import sync_douban
from Mongo import OtherMongoResource
from handler import IndexHandler, UserHandler, ResourceHandler, TopHandler, UserLikeHandler, NameHandler, \
CommentHandler, AnnouncementHandler, CaptchaHandler, MetricsHandler, GrafanaIndexHandler, GrafanaSearchHandler, \
GrafanaQueryHandler, BlacklistHandler, NotFoundHandler, DBDumpHandler, CommentChildHandler, DoubanHandler, \
CommentNewestHandler
enable_pretty_logging()
@@ -83,6 +85,7 @@ if __name__ == "__main__":
# Scheduled jobs are evaluated in Asia/Shanghai local time.
timez = pytz.timezone('Asia/Shanghai')
scheduler = BackgroundScheduler(timezone=timez)
# Both jobs use the same cron fields (day=1, hour=0, minute=0); per the
# APScheduler cron trigger that is 00:00 on the 1st day of each month.
scheduler.add_job(OtherMongoResource().reset_top, 'cron', hour=0, minute=0, day=1)
scheduler.add_job(sync_douban, 'cron', hour=0, minute=0, day=1)
scheduler.start()
# Command-line options: "p" is the port (default 8888) and "h" is the listen
# address (default 127.0.0.1).
options.define("p", default=8888, help="running port", type=int)
options.define("h", default='127.0.0.1', help="listen address", type=str)