Files
pansou/plugin/discourse/discourse.go
www.xueximeng.com 0aad3b9ca0 新增插件discourse
2025-10-22 17:26:04 +08:00

594 lines
16 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package discourse
import (
"fmt"
"io"
"net/http"
"net/url"
"pansou/model"
"pansou/plugin"
"pansou/util/json"
"regexp"
"strings"
"time"
cloudscraper "github.com/Advik-B/cloudscraper/lib"
)
// Precompiled regular expressions used to pull net-disk share links out of text.
// They are compiled once at package initialization and reused by
// extractNetDiskLinksFromBlurb and parseNetDiskLink.
var (
// Share-link patterns, one per supported net-disk provider.
quarkRegex = regexp.MustCompile(`https://pan\.quark\.cn/s/[0-9a-zA-Z]+`)
// Capture group 1 is the optional "?pwd=" value embedded in the URL.
baiduRegex = regexp.MustCompile(`https://pan\.baidu\.com/s/[0-9a-zA-Z_\-]+(?:\?pwd=([0-9a-zA-Z]+))?`)
aliyunRegex = regexp.MustCompile(`https://(?:www\.)?aliyundrive\.com/s/[0-9a-zA-Z]+`)
// Capture group 1 is the optional "?pwd=" value embedded in the URL.
xunleiRegex = regexp.MustCompile(`https://pan\.xunlei\.com/s/[0-9a-zA-Z_\-]+(?:\?pwd=([0-9a-zA-Z]+))?`)
tianyiRegex = regexp.MustCompile(`https://cloud\.189\.cn/t/[0-9a-zA-Z]+`)
ucRegex = regexp.MustCompile(`https://drive\.uc\.cn/s/[0-9a-zA-Z]+`)
pan115Regex = regexp.MustCompile(`https://115\.com/s/[0-9a-zA-Z]+`)
// Baidu extraction code written in the surrounding text: one of the labels
// listed in the pattern (the Chinese words for "extraction code" and
// "password", or "pwd"), a colon, optional whitespace, then exactly four
// alphanumeric characters (capture group 1).
// NOTE(review): as written the character class holds only the ASCII colon;
// confirm whether the full-width colon should be accepted as well.
baiduPwdRegex = regexp.MustCompile(`(?:提取码|密码|pwd)[:]\s*([0-9a-zA-Z]{4})`)
)
// Constant definitions.
const (
// pluginName is the registered plugin name and the prefix of every UniqueID.
pluginName = "discourse"
// searchURLTemplate takes the URL-escaped keyword and the page number. Once
// formatted, the fixed query suffix is the percent-encoded text
// " #resource:cloud-asset in:title".
searchURLTemplate = "https://linux.do/search.json?q=%s%%20%%23resource%%3Acloud-asset%%20in%%3Atitle&page=%d"
// detailURLTemplate takes the numeric topic ID.
detailURLTemplate = "https://linux.do/t/%d.json?track_visit=true&forceLoad=true"
defaultPriority = 2
defaultTimeout = 30 * time.Second
// Multi-page fetch settings.
defaultMaxPages = 2 // pages fetched when the caller does not pass max_pages
maxAllowedPages = 10 // upper bound applied to max_pages
pageRequestDelay = 500 * time.Millisecond // pause between consecutive page requests
)
// DiscourseAsyncPlugin is the asynchronous search plugin for a Discourse forum.
// It embeds *plugin.BaseAsyncPlugin and sends its HTTP requests through a
// cloudscraper client.
type DiscourseAsyncPlugin struct {
*plugin.BaseAsyncPlugin
// scraper stays nil when cloudscraper.New fails in NewDiscourseAsyncPlugin;
// searchImpl and GetTopicDetail return an error in that case.
scraper *cloudscraper.Scraper
}
// SearchResponse is the decoded body of the search API response.
type SearchResponse struct {
// Posts are the matching posts; each one is joined to its topic through
// Post.TopicID in convertToSearchResults.
Posts []Post `json:"posts"`
// Topics are looked up by ID to supply the title and tags of a result.
Topics []Topic `json:"topics"`
// GroupedSearchResult carries search metadata such as the "more results" flag.
GroupedSearchResult GroupedSearchResult `json:"grouped_search_result"`
}
// Post is a single post entry in the search response.
type Post struct {
// ID is combined with pluginName to form the result's UniqueID.
ID int `json:"id"`
Name string `json:"name"`
Username string `json:"username"`
// CreatedAt is parsed as an RFC 3339 timestamp in convertToSearchResults.
CreatedAt string `json:"created_at"`
LikeCount int `json:"like_count"`
// Blurb is the text that net-disk links are extracted from.
Blurb string `json:"blurb"`
// TopicID is matched against Topic.ID to find the post's topic.
TopicID int `json:"topic_id"`
}
// Topic is a single topic entry in the search response.
type Topic struct {
ID int `json:"id"`
// Title becomes the Title of every search result built from this topic's posts.
Title string `json:"title"`
FancyTitle string `json:"fancy_title"`
// Tags are copied onto the search result.
Tags []string `json:"tags"`
PostsCount int `json:"posts_count"`
CreatedAt string `json:"created_at"`
CategoryID int `json:"category_id"`
}
// GroupedSearchResult is the search metadata section of the search response.
type GroupedSearchResult struct {
Term string `json:"term"`
PostIDs []int `json:"post_ids"`
// MoreResults is decoded from "more_full_page_results"; searchImpl stops
// requesting further pages once it is false.
MoreResults bool `json:"more_full_page_results"`
}
// DetailResponse is the decoded body of the topic detail API response.
type DetailResponse struct {
// PostStream holds the topic's posts; GetTopicDetail reads only the first one.
PostStream PostStream `json:"post_stream"`
ID int `json:"id"`
Title string `json:"title"`
Tags []string `json:"tags"`
}
// PostStream is the list of posts contained in a topic detail response.
type PostStream struct {
Posts []DetailPost `json:"posts"`
}
// DetailPost is a single post inside a topic detail response.
type DetailPost struct {
ID int `json:"id"`
Username string `json:"username"`
CreatedAt string `json:"created_at"`
Cooked string `json:"cooked"`
TopicID int `json:"topic_id"`
// LinkCounts lists the links recorded for the post; GetTopicDetail scans
// them for net-disk URLs.
LinkCounts []LinkCount `json:"link_counts"`
}
// LinkCount is one link statistics entry attached to a post.
type LinkCount struct {
// URL is the link target that parseNetDiskLink classifies.
URL string `json:"url"`
// Internal entries are skipped by GetTopicDetail.
Internal bool `json:"internal"`
Reflection bool `json:"reflection"`
Clicks int `json:"clicks"`
}
// Compile-time check that *DiscourseAsyncPlugin implements plugin.AsyncSearchPlugin.
var _ plugin.AsyncSearchPlugin = (*DiscourseAsyncPlugin)(nil)
// init runs during package initialization and passes a freshly constructed
// plugin instance to plugin.RegisterGlobalPlugin.
func init() {
plugin.RegisterGlobalPlugin(NewDiscourseAsyncPlugin())
}
// NewDiscourseAsyncPlugin builds a Discourse async plugin instance.
//
// A plugin is always returned, even if the cloudscraper client cannot be
// created: the failure is printed and the plugin is handed back with a nil
// scraper, so it can still be registered.
func NewDiscourseAsyncPlugin() *DiscourseAsyncPlugin {
	scraper, err := cloudscraper.New()
	if err != nil {
		// Report the problem without blocking plugin registration.
		fmt.Printf("[%s] Failed to create cloudscraper: %v\n", pluginName, err)
		scraper = nil
	}

	return &DiscourseAsyncPlugin{
		BaseAsyncPlugin: plugin.NewBaseAsyncPlugin(pluginName, defaultPriority),
		scraper:         scraper,
	}
}
// Search runs a search and returns only the result list (compatibility
// wrapper around SearchWithResult).
func (p *DiscourseAsyncPlugin) Search(keyword string, ext map[string]interface{}) ([]model.SearchResult, error) {
	outcome, err := p.SearchWithResult(keyword, ext)
	if err == nil {
		return outcome.Results, nil
	}
	return nil, err
}
// SearchWithResult runs the search and returns a model.PluginSearchResult
// (per the original comment, a result that carries the IsFinal flag).
// It hands the keyword, searchImpl, p.MainCacheKey and ext to
// AsyncSearchWithResult and returns whatever that call returns.
func (p *DiscourseAsyncPlugin) SearchWithResult(keyword string, ext map[string]interface{}) (model.PluginSearchResult, error) {
// Use the asynchronous search capability of BaseAsyncPlugin.
return p.AsyncSearchWithResult(keyword, p.searchImpl, p.MainCacheKey, ext)
}
// extInt looks up key in ext and converts a numeric value to int.
// It accepts int and int64 as well as float64, which is what the standard
// encoding/json package produces when a JSON number is decoded into an
// interface{} value. The second result is false when the key is missing or
// the value is not one of those types.
func extInt(ext map[string]interface{}, key string) (int, bool) {
	switch v := ext[key].(type) {
	case int:
		return v, true
	case int64:
		return int(v), true
	case float64:
		return int(v), true
	}
	return 0, false
}

// searchImpl implements the actual search: it requests one or more pages of
// the search endpoint through the cloudscraper client and converts the
// responses into de-duplicated search results.
//
// Parameters:
//   - client: not used; every request goes through p.scraper.
//   - keyword: the raw search term, query-escaped before it is put in the URL.
//   - ext: optional settings. "max_pages" is the number of pages to fetch
//     (default defaultMaxPages, clamped to 1..maxAllowedPages). "page" is the
//     first page to request (default 1; values below 1 are ignored). Both are
//     read through extInt, so int, int64 and float64 values are accepted.
//
// Error handling: if a request, status check, read or JSON decode fails before
// any result has been collected, the error is returned. Once at least one
// result exists, the failure is only printed as a warning and the results
// gathered so far are returned.
func (p *DiscourseAsyncPlugin) searchImpl(client *http.Client, keyword string, ext map[string]interface{}) ([]model.SearchResult, error) {
	// Nothing can be fetched without the cloudscraper client.
	if p.scraper == nil {
		return nil, fmt.Errorf("cloudscraper not initialized")
	}

	// Number of pages to fetch, clamped to the allowed range.
	maxPages := defaultMaxPages
	if v, ok := extInt(ext, "max_pages"); ok {
		maxPages = v
	}
	if maxPages > maxAllowedPages {
		maxPages = maxAllowedPages
	}
	if maxPages < 1 {
		maxPages = 1
	}

	// First page to request. Non-positive values fall back to the default.
	startPage := 1
	if v, ok := extInt(ext, "page"); ok && v >= 1 {
		startPage = v
	}

	encodedKeyword := url.QueryEscape(keyword)

	var allResults []model.SearchResult
	// De-duplicate on the UniqueID itself rather than parsing the numeric ID
	// back out of it, which could silently collapse results on a parse failure.
	seen := make(map[string]struct{})
	fetchedPages := 0 // pages that were fetched and decoded successfully

	for currentPage := startPage; currentPage < startPage+maxPages; currentPage++ {
		// Pause between pages so requests are not sent too quickly.
		if currentPage > startPage {
			time.Sleep(pageRequestDelay)
		}

		searchURL := fmt.Sprintf(searchURLTemplate, encodedKeyword, currentPage)

		resp, err := p.scraper.Get(searchURL)
		if err != nil {
			if len(allResults) > 0 {
				fmt.Printf("[%s] Warning: failed to fetch page %d: %v\n", p.Name(), currentPage, err)
				break
			}
			return nil, fmt.Errorf("[%s] search request failed on page %d: %w", p.Name(), currentPage, err)
		}

		if resp.StatusCode != http.StatusOK {
			resp.Body.Close()
			if len(allResults) > 0 {
				fmt.Printf("[%s] Warning: unexpected status code %d on page %d\n", p.Name(), resp.StatusCode, currentPage)
				break
			}
			return nil, fmt.Errorf("[%s] unexpected status code: %d on page %d", p.Name(), resp.StatusCode, currentPage)
		}

		// Close explicitly instead of deferring: a defer inside the loop would
		// keep every body open until the function returns.
		body, err := io.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			if len(allResults) > 0 {
				fmt.Printf("[%s] Warning: failed to read page %d: %v\n", p.Name(), currentPage, err)
				break
			}
			return nil, fmt.Errorf("[%s] read response failed on page %d: %w", p.Name(), currentPage, err)
		}

		var searchResp SearchResponse
		if err := json.Unmarshal(body, &searchResp); err != nil {
			if len(allResults) > 0 {
				fmt.Printf("[%s] Warning: failed to parse page %d: %v\n", p.Name(), currentPage, err)
				break
			}
			return nil, fmt.Errorf("[%s] parse json failed on page %d: %w", p.Name(), currentPage, err)
		}
		fetchedPages++

		// No posts on this page: there is nothing further to fetch.
		if len(searchResp.Posts) == 0 {
			break
		}

		// Append the results that have not been seen on an earlier page.
		added := 0
		for _, result := range p.convertToSearchResults(searchResp) {
			if _, dup := seen[result.UniqueID]; dup {
				continue
			}
			seen[result.UniqueID] = struct{}{}
			allResults = append(allResults, result)
			added++
		}

		// Stop when the API reports that there are no more results.
		if !searchResp.GroupedSearchResult.MoreResults {
			break
		}
		// Stop when this page contributed nothing new.
		if added == 0 {
			break
		}
	}

	// When multi-page fetching is enabled, log the total that was collected.
	if maxPages > 1 && len(allResults) > 0 {
		fmt.Printf("[%s] Fetched %d unique results from %d pages for keyword: %s\n",
			p.Name(), len(allResults), fetchedPages, keyword)
	}

	return allResults, nil
}
// max returns the larger of two ints; when they are equal that value is returned.
// NOTE: this declaration shadows the built-in max (Go 1.21+) inside the package.
func max(a, b int) int {
	larger := b
	if a > b {
		larger = a
	}
	return larger
}
// convertToSearchResults turns a search response into a list of SearchResults.
//
// A post produces a result only if at least one net-disk link can be extracted
// from its blurb. Title and tags come from the topic whose ID matches the
// post's TopicID; if the response has no such topic, a placeholder title and
// an empty tag list are used. The result is nil when no post yields a link.
func (p *DiscourseAsyncPlugin) convertToSearchResults(resp SearchResponse) []model.SearchResult {
	// Index the topics by ID for direct lookup.
	topicsByID := make(map[int]Topic, len(resp.Topics))
	for i := range resp.Topics {
		topicsByID[resp.Topics[i].ID] = resp.Topics[i]
	}

	var converted []model.SearchResult
	for _, post := range resp.Posts {
		// Posts without any extractable link are dropped.
		links := p.extractNetDiskLinksFromBlurb(post.Blurb)
		if len(links) == 0 {
			continue
		}

		// Placeholder values, replaced if the topic is present.
		title, tags := "未知标题", []string{}
		if topic, ok := topicsByID[post.TopicID]; ok {
			title, tags = topic.Title, topic.Tags
		}

		// The parse error is deliberately ignored: a timestamp that is not
		// RFC 3339 leaves Datetime as the zero time.
		postedAt, _ := time.Parse(time.RFC3339, post.CreatedAt)

		converted = append(converted, model.SearchResult{
			UniqueID: fmt.Sprintf("%s-%d", pluginName, post.ID),
			Title:    title,
			Content:  p.cleanContent(post.Blurb),
			Links:    links,
			Tags:     tags,
			Channel:  "", // must stay empty for plugin search results
			Datetime: postedAt,
		})
	}
	return converted
}
// extractNetDiskLinksFromBlurb collects every net-disk share link found in blurb.
//
// Links are grouped by provider in a fixed order (quark, baidu, aliyun,
// xunlei, tianyi, uc, 115) and, within a provider, appear in the order they
// occur in the text. When a pattern captures a "?pwd=" value, it becomes the
// link's password. A Baidu link without one falls back to the first
// extraction code that baiduPwdRegex finds anywhere in the blurb.
// The result is nil when nothing matches.
func (p *DiscourseAsyncPlugin) extractNetDiskLinksFromBlurb(blurb string) []model.Link {
	providers := []struct {
		kind    string
		pattern *regexp.Regexp
	}{
		{"quark", quarkRegex},
		{"baidu", baiduRegex},
		{"aliyun", aliyunRegex},
		{"xunlei", xunleiRegex},
		{"tianyi", tianyiRegex},
		{"uc", ucRegex},
		{"115", pan115Regex},
	}

	var found []model.Link
	for _, prov := range providers {
		for _, m := range prov.pattern.FindAllStringSubmatch(blurb, -1) {
			// m[0] is the full URL; m[1], present only for patterns with a
			// capture group, is the password embedded in the URL.
			link := model.Link{Type: prov.kind, URL: m[0]}
			switch {
			case len(m) > 1 && m[1] != "":
				link.Password = m[1]
			case prov.kind == "baidu":
				// No password in the URL: look for one in the text.
				if pwd := baiduPwdRegex.FindStringSubmatch(blurb); len(pwd) > 1 {
					link.Password = pwd[1]
				}
			}
			found = append(found, link)
		}
	}
	return found
}
// Helpers for cleanContent, built once instead of on every call.
var (
	// contentTagPattern matches anything shaped like an HTML tag.
	contentTagPattern = regexp.MustCompile(`<[^>]+>`)
	// contentSpacePattern matches a run of whitespace.
	contentSpacePattern = regexp.MustCompile(`\s+`)
	// contentEntityReplacer decodes the handled entities in one pass, so the
	// output of one replacement is never decoded a second time (for example
	// "&amp;lt;" becomes "&lt;", not "<").
	contentEntityReplacer = strings.NewReplacer(
		"&amp;", "&",
		"&lt;", "<",
		"&gt;", ">",
		"&quot;", "\"",
		"&#39;", "'",
	)
)

// contentMaxBytes is the maximum number of content bytes kept by cleanContent
// before the "..." marker is appended.
const contentMaxBytes = 200

// cleanContent produces a short plain-text summary from content.
//
// It strips HTML tags, decodes the entities &amp; &lt; &gt; &quot; and &#39;,
// collapses each run of whitespace into one space and trims the ends. If the
// result is longer than contentMaxBytes bytes, it is cut to at most that many
// bytes and "..." is appended. The cut always falls on a UTF-8 character
// boundary, so multi-byte text is never split in the middle of a character.
func (p *DiscourseAsyncPlugin) cleanContent(content string) string {
	content = contentTagPattern.ReplaceAllString(content, "")
	content = contentEntityReplacer.Replace(content)
	content = contentSpacePattern.ReplaceAllString(content, " ")
	content = strings.TrimSpace(content)

	if len(content) > contentMaxBytes {
		cut := contentMaxBytes
		// A byte of the form 10xxxxxx is a UTF-8 continuation byte; step back
		// until the cut sits on the first byte of a character.
		for cut > 0 && content[cut]&0xC0 == 0x80 {
			cut--
		}
		content = content[:cut] + "..."
	}
	return content
}
// GetTopicDetail fetches a topic's detail JSON and returns the net-disk links
// recorded in the link_counts of its first post (optional helper for
// obtaining complete links).
//
// It returns an error when the cloudscraper client is missing, the request
// fails, the status code is not 200, the body cannot be read or decoded, or
// the topic contains no posts. Internal links and links that are not
// recognized as net-disk URLs are skipped, so the returned slice may be nil.
func (p *DiscourseAsyncPlugin) GetTopicDetail(topicID int) ([]model.Link, error) {
	if p.scraper == nil {
		return nil, fmt.Errorf("cloudscraper not initialized")
	}

	resp, err := p.scraper.Get(fmt.Sprintf(detailURLTemplate, topicID))
	if err != nil {
		return nil, fmt.Errorf("detail request failed: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
	}

	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("read response failed: %w", err)
	}

	var detail DetailResponse
	if err := json.Unmarshal(raw, &detail); err != nil {
		return nil, fmt.Errorf("parse json failed: %w", err)
	}

	posts := detail.PostStream.Posts
	if len(posts) == 0 {
		return nil, fmt.Errorf("no posts found")
	}

	// Only the first post of the topic is inspected.
	var links []model.Link
	for _, entry := range posts[0].LinkCounts {
		if entry.Internal {
			continue
		}
		if link := p.parseNetDiskLink(entry.URL); link != nil {
			links = append(links, *link)
		}
	}
	return links, nil
}
// parseNetDiskLink classifies linkURL as one of the supported net-disk
// providers and returns it as a model.Link, or nil when no pattern matches.
//
// Providers are tried in a fixed order (quark, baidu, aliyun, xunlei, tianyi,
// uc, 115) and the first match wins. The returned URL is always the full
// input string. For Baidu and Xunlei links, a "?pwd=" value captured by the
// pattern is stored as the password.
func (p *DiscourseAsyncPlugin) parseNetDiskLink(linkURL string) *model.Link {
	var kind, password string

	switch {
	case quarkRegex.MatchString(linkURL):
		kind = "quark"
	case baiduRegex.MatchString(linkURL):
		kind = "baidu"
		// Group 1 is empty when the URL carries no pwd parameter.
		if m := baiduRegex.FindStringSubmatch(linkURL); len(m) > 1 {
			password = m[1]
		}
	case aliyunRegex.MatchString(linkURL):
		kind = "aliyun"
	case xunleiRegex.MatchString(linkURL):
		kind = "xunlei"
		// Group 1 is empty when the URL carries no pwd parameter.
		if m := xunleiRegex.FindStringSubmatch(linkURL); len(m) > 1 {
			password = m[1]
		}
	case tianyiRegex.MatchString(linkURL):
		kind = "tianyi"
	case ucRegex.MatchString(linkURL):
		kind = "uc"
	case pan115Regex.MatchString(linkURL):
		kind = "115"
	default:
		// Not a net-disk link.
		return nil
	}

	return &model.Link{
		Type:     kind,
		URL:      linkURL,
		Password: password,
	}
}