新增插件leijing

2025-11-25 03:14:59 +08:00 · 2025-08-23 10:37:40 +08:00
parent 8e074e5fdb
commit d5a9752e7f
4 changed files with 627 additions and 1 deletions
--- a/main.go
+++ b/main.go
@@ -54,6 +54,7 @@ import (
 	_ "pansou/plugin/cldi"
 	_ "pansou/plugin/xiaozhang"
 	_ "pansou/plugin/libvio"
+	_ "pansou/plugin/leijing"
 )

 // 全局缓存写入管理器
--- a/plugin/leijing/html结构分析.md
+++ b/plugin/leijing/html结构分析.md
@@ -0,0 +1,162 @@
+# leijing插件HTML结构分析
+
+## 网站信息
+- 网站名称：雷鲸小站-天翼云盘交流站
+- 主域名：https://leijing.xyz
+- 网站类型：天翼云盘资源分享论坛
+- 特点：
+  - **专注天翼云盘**（只有天翼云盘链接）
+  - 部分帖子需要回复才能看到链接（这些会被自动忽略）
+  - 有些搜索结果直接在摘要中包含链接
+
+## 1. 搜索页面结构
+
+### 搜索URL格式
+```
+https://leijing.xyz/search?keyword={关键词}
+```
+
+### 搜索结果容器
+- 主容器：`<div class="topicModule">`
+- 结果列表：`<div class="topicList">`
+- 单个结果项：`<div class="topicItem">`
+
+### 搜索结果项结构
+```html
+<div class="topicItem">
+    <div class="avatarBox">
+        <!-- 用户头像 -->
+    </div>
+    
+    <div class="content clearfix">
+        <ul class="info">
+            <li>
+                <span class="module">话题</span>
+                <span class="tag">剧集</span>
+                <a class="userName" href="...">用户名</a>
+                <span class="postTime">发表时间：2025-07-27 12:15:41</span>
+                <span class="lastReplyTime">最新回复：2025-08-09 18:48:25</span>
+            </li>
+        </ul>
+        <h2 class="title highlight clearfix">
+            <a href="thread?topicId=42230">凡人修仙传 (2025) 杨洋/金晨 4K 普码+高码 首更 04 集</a>
+        </h2>
+        <div class="detail">
+            <h2 class="summary highlight">
+                凡人修仙传 (2025) 杨洋/金晨 首更 04 集 
+                普码 -https://cloud.189.cn/t/YZRfuuAnaeQz 
+                4K60 帧 -https://cloud.189.cn/t/aiuYru7zIfqq 
+                4KHQ 高码 - https://cloud.189.cn/t/RZBjQ3Y77ZNb
+            </h2>
+        </div>
+    </div>
+    
+    <div class="statistic clearfix">
+        <div class="viewTotal">
+            <i class="cms-view icon"></i>
+            7442
+        </div>
+        <div class="commentTotal">
+            <i class="cms-commentCount icon"></i>
+            3
+        </div>
+    </div>
+</div>
+```
+
+### 字段提取要点
+- **标题**：`.title a` 的文本内容
+- **详情页链接**：`.title a` 的 `href` 属性（格式：`thread?topicId={id}`）
+- **摘要**：`.summary` 的文本内容（可能包含天翼云盘链接）
+- **分类标签**：`.tag` 的文本内容
+- **发布时间**：`.postTime` 的文本内容
+- **查看数**：`.viewTotal` 的文本内容
+- **评论数**：`.commentTotal` 的文本内容
+
+### 天翼云盘链接提取
+从摘要文本中使用正则表达式提取：
+```
+https://cloud\.189\.cn/t/[a-zA-Z0-9]+
+```
+
+## 2. 详情页面结构
+
+### 详情页URL格式
+```
+https://leijing.xyz/thread?topicId={id}
+```
+
+### 页面结构
+```html
+<div class="topicContentModule">
+    <div class="left">
+        <div class="topic-wrap">
+            <div class="topicBox">
+                <div class="title">
+                    凡人修仙传 (2025) 杨洋/金晨 4K 普码+高码 首更 04 集
+                </div>
+                <div class="topicInfo clearfix">
+                    <div class="postTime">2025-07-27 12:15:41</div>
+                    <div class="viewTotal">7443次阅读</div>
+                    <div class="comment">3个评论</div>
+                </div>
+                <div topicId="42230" class="topicContent">
+                    <div style="text-align: center;">
+                        <strong>凡人修仙传 (2025) 杨洋/金晨 首更 04 集</strong>
+                        <br><strong>普码</strong>
+                        <br><strong><a href="https://cloud.189.cn/t/YZRfuuAnaeQz">https://cloud.189.cn/t/YZRfuuAnaeQz</a></strong>
+                        <br><strong>4K60 帧</strong>
+                        <br><strong><a href="https://cloud.189.cn/t/aiuYru7zIfqq">https://cloud.189.cn/t/aiuYru7zIfqq</a></strong>
+                        <!-- 更多内容 -->
+                    </div>
+                </div>
+            </div>
+        </div>
+    </div>
+</div>
+```
+
+### 字段提取要点
+- **标题**：`.topicBox .title` 的文本内容
+- **内容**：`.topicContent` 的HTML内容
+- **发布时间**：`.topicInfo .postTime` 的文本内容
+- **查看数**：`.topicInfo .viewTotal` 的文本内容
+
+### 天翼云盘链接提取
+从详情页内容中提取：
+1. 查找所有 `<a>` 标签
+2. 过滤出包含 `cloud.189.cn` 的链接
+3. 提取 `href` 属性
+
+## 3. 特殊处理事项
+
+### 回复可见内容
+- 有些帖子内容需要回复才能看到
+- 这类帖子通常不包含可提取的链接
+- 如果提取不到链接，直接忽略该结果
+
+### 链接格式统一
+天翼云盘链接格式：
+- 正常格式：`https://cloud.189.cn/t/{shareCode}`
+- 部分可能有访问码：`https://cloud.189.cn/t/{shareCode}?pwd={password}`
+
+### 搜索策略
+1. 先从搜索结果的摘要中提取链接（速度快）
+2. 如果摘要中有链接，直接使用
+3. 如果摘要中没有链接，访问详情页提取
+4. 如果详情页也没有链接（需要回复），忽略该结果
+
+## 4. 实现建议
+
+### 优化策略
+1. **优先使用摘要链接**：很多搜索结果的摘要中已包含完整链接
+2. **批量处理**：对需要访问详情页的结果进行并发处理
+3. **缓存机制**：缓存详情页结果，避免重复访问
+
+### 错误处理
+1. 处理需要回复才能看到的内容（返回空结果）
+2. 处理链接提取失败的情况
+3. 处理网站访问异常
+
+### 链接类型
+所有链接统一标记为 `tianyi`（天翼云盘）
--- a/plugin/leijing/leijing.go
+++ b/plugin/leijing/leijing.go
@@ -0,0 +1,463 @@
+package leijing
+
+import (
+	"compress/gzip"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"net/url"
+	"regexp"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/PuerkitoBio/goquery"
+	"pansou/model"
+	"pansou/plugin"
+)
+
+const (
+	BaseURL        = "https://leijing.xyz" // site root; also passed as the Referer for search and detail requests
+	SearchPath     = "/search" // search endpoint path, appended to BaseURL
+	UserAgent      = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36" // browser User-Agent sent with every request
+	MaxConcurrency = 20 // maximum number of detail pages fetched concurrently
+	MaxPages       = 1  // maximum number of search pages (only the first page is searched for now)
+)
+
+// LeijingPlugin is the search plugin for the Leijing site (雷鲸小站).
+type LeijingPlugin struct {
+	*plugin.BaseAsyncPlugin
+	debugMode    bool // when true, progress details are written with log.Printf
+	detailCache  sync.Map // cached detail-page results, keyed by detail URL, values are []model.Link
+	cacheTTL     time.Duration // how long a detailCache entry is kept before it is deleted
+}
+
+// NewLeijingPlugin builds a LeijingPlugin with debug logging switched off and
+// a 30-minute detail-cache TTL. The embedded base plugin is created with the
+// name "leijing" and the value 2 (presumably the plugin priority; confirm
+// against plugin.NewBaseAsyncPlugin).
+func NewLeijingPlugin() *LeijingPlugin {
+	return &LeijingPlugin{
+		BaseAsyncPlugin: plugin.NewBaseAsyncPlugin("leijing", 2),
+		debugMode:       false, // debug logging is off by default
+		cacheTTL:        30 * time.Minute,
+	}
+}
+
+// Name 返回插件名称
+func (p *LeijingPlugin) Name() string {
+	return "leijing"
+}
+
+// DisplayName 返回插件显示名称
+func (p *LeijingPlugin) DisplayName() string {
+	return "雷鲸小站"
+}
+
+// Description 返回插件描述
+func (p *LeijingPlugin) Description() string {
+	return "雷鲸小站 - 天翼云盘资源分享站"
+}
+
+// Search 执行搜索并返回结果（兼容性方法）
+func (p *LeijingPlugin) Search(keyword string, ext map[string]interface{}) ([]model.SearchResult, error) {
+	result, err := p.SearchWithResult(keyword, ext)
+	if err != nil {
+		return nil, err
+	}
+	return result.Results, nil
+}
+
+// SearchWithResult delegates to the embedded AsyncSearchWithResult, supplying
+// searchImpl as the search function and the plugin's MainCacheKey.
+func (p *LeijingPlugin) SearchWithResult(keyword string, ext map[string]interface{}) (model.PluginSearchResult, error) {
+	searchFn, cacheKey := p.searchImpl, p.MainCacheKey
+	return p.AsyncSearchWithResult(keyword, searchFn, cacheKey, ext)
+}
+
+// setRequestHeaders populates req with the browser-like headers used for
+// every request to the site. referer is sent as the Referer header unless it
+// is empty.
+//
+// Setting Accept-Encoding explicitly turns off net/http's transparent
+// decompression, so only encodings that getResponseReader can decode may be
+// advertised. That is gzip alone: advertising "deflate" as well would let the
+// server answer with a body that reaches the HTML parser still compressed.
+func (p *LeijingPlugin) setRequestHeaders(req *http.Request, referer string) {
+	req.Header.Set("User-Agent", UserAgent)
+	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
+	req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
+	req.Header.Set("Accept-Encoding", "gzip")
+	req.Header.Set("Cache-Control", "no-cache")
+	req.Header.Set("Pragma", "no-cache")
+	if referer != "" {
+		req.Header.Set("Referer", referer)
+	}
+}
+
+// doRequest issues a GET for url through client after applying the standard
+// request headers (with referer). It returns the raw response; the caller is
+// responsible for checking the status code and closing the body. When debug
+// mode is on, the request, any transport error and the status code are logged.
+func (p *LeijingPlugin) doRequest(client *http.Client, url string, referer string) (*http.Response, error) {
+	verbose := p.debugMode
+
+	req, err := http.NewRequest(http.MethodGet, url, nil)
+	if err != nil {
+		return nil, err
+	}
+	p.setRequestHeaders(req, referer)
+
+	if verbose {
+		log.Printf("[Leijing] 发送请求: %s", url)
+	}
+
+	resp, doErr := client.Do(req)
+	if doErr != nil {
+		if verbose {
+			log.Printf("[Leijing] 请求失败: %v", doErr)
+		}
+		return nil, doErr
+	}
+
+	if verbose {
+		log.Printf("[Leijing] 响应状态: %d", resp.StatusCode)
+	}
+	return resp, nil
+}
+
+// searchImpl performs the actual search: it requests the site's search page
+// for keyword, parses the result list, fetches detail pages for results whose
+// summary contained no link, and finally drops every result that still has no
+// link. It returns an error when the request fails, the status is not 200,
+// the body cannot be decoded, or the HTML cannot be parsed.
+func (p *LeijingPlugin) searchImpl(client *http.Client, keyword string, ext map[string]interface{}) ([]model.SearchResult, error) {
+	verbose := p.debugMode
+	searchURL := BaseURL + SearchPath + "?keyword=" + url.QueryEscape(keyword)
+
+	if verbose {
+		log.Printf("[Leijing] 开始搜索: %s", keyword)
+		log.Printf("[Leijing] 搜索URL: %s", searchURL)
+	}
+
+	// Request the search page.
+	resp, err := p.doRequest(client, searchURL, BaseURL)
+	if err != nil {
+		return nil, fmt.Errorf("发送搜索请求失败: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if status := resp.StatusCode; status != http.StatusOK {
+		return nil, fmt.Errorf("搜索响应状态码异常: %d", status)
+	}
+
+	// The body may be gzip-compressed; obtain a reader that decodes it.
+	body, err := p.getResponseReader(resp)
+	if err != nil {
+		return nil, err
+	}
+
+	doc, err := goquery.NewDocumentFromReader(body)
+	if err != nil {
+		return nil, fmt.Errorf("解析HTML失败: %w", err)
+	}
+
+	// Collect the result items found on the search page.
+	found := p.extractSearchResults(doc, keyword)
+	if verbose {
+		log.Printf("[Leijing] 找到 %d 个搜索结果", len(found))
+	}
+
+	// Visit detail pages for results without a link in the summary, then
+	// discard whatever still has no link.
+	found = p.enrichWithDetailLinks(client, found, keyword)
+	valid := p.filterValidResults(found)
+	if verbose {
+		log.Printf("[Leijing] 过滤后剩余 %d 个有效结果", len(valid))
+	}
+
+	return valid, nil
+}
+
+// getResponseReader 获取响应读取器（处理gzip压缩）
+func (p *LeijingPlugin) getResponseReader(resp *http.Response) (io.Reader, error) {
+	var reader io.Reader = resp.Body
+	
+	// 检查Content-Encoding
+	contentEncoding := resp.Header.Get("Content-Encoding")
+	if p.debugMode {
+		log.Printf("[Leijing] Content-Encoding: %s", contentEncoding)
+	}
+	
+	// 如果是gzip压缩，手动解压
+	if contentEncoding == "gzip" {
+		gzReader, err := gzip.NewReader(resp.Body)
+		if err != nil {
+			return nil, fmt.Errorf("创建gzip reader失败: %w", err)
+		}
+		reader = gzReader
+	}
+	
+	return reader, nil
+}
+
+// extractSearchResults converts every ".topicItem" element of the search page
+// into a model.SearchResult. Items without a title or without a detail link
+// are skipped.
+//
+// Links found in the item's summary text are stored in Links. When the summary
+// holds no link, the detail-page URL is stored in Tags[0] instead so that
+// enrichWithDetailLinks can fetch it later.
+//
+// keyword is accepted for symmetry with the other helpers and is not used.
+func (p *LeijingPlugin) extractSearchResults(doc *goquery.Document, keyword string) []model.SearchResult {
+	var results []model.SearchResult
+
+	// Compiled once per call rather than once per result item.
+	topicIDPattern := regexp.MustCompile(`topicId=(\d+)`)
+
+	doc.Find(".topicItem").Each(func(i int, s *goquery.Selection) {
+		// Title text and the (normally relative) link to the detail page.
+		titleElem := s.Find(".title a")
+		title := strings.TrimSpace(titleElem.Text())
+		detailPath, _ := titleElem.Attr("href") // a missing href yields "", rejected below
+
+		if title == "" || detailPath == "" {
+			return
+		}
+
+		// Build the absolute detail URL. An href that is already absolute is
+		// used as-is; prefixing BaseURL would produce a malformed URL.
+		detailURL := detailPath
+		if !strings.HasPrefix(detailPath, "http://") && !strings.HasPrefix(detailPath, "https://") {
+			detailURL = BaseURL + "/" + strings.TrimPrefix(detailPath, "/")
+		}
+
+		// The summary may already contain the share links.
+		summary := strings.TrimSpace(s.Find(".summary").Text())
+
+		postTime := strings.TrimSpace(s.Find(".postTime").Text())
+		postTime = strings.TrimPrefix(postTime, "发表时间：")
+
+		// Resource ID from the detail path (thread?topicId=42230 -> 42230);
+		// fall back to a timestamp so the ID is still unique.
+		resourceID := ""
+		if idMatch := topicIDPattern.FindStringSubmatch(detailPath); len(idMatch) > 1 {
+			resourceID = idMatch[1]
+		} else {
+			resourceID = fmt.Sprintf("%d", time.Now().UnixNano())
+		}
+
+		if p.debugMode {
+			log.Printf("[Leijing] 提取结果 %d: %s, URL: %s", i+1, title, detailURL)
+		}
+
+		links := p.extractTianyiLinks(summary)
+
+		if p.debugMode {
+			log.Printf("[Leijing] 从摘要中提取到 %d 个链接", len(links))
+		}
+
+		// Publication time; the current time is used when the field is
+		// missing or unparsable.
+		// NOTE(review): time.Parse yields UTC, while the site presumably
+		// shows China local time — confirm whether an offset is needed.
+		publishTime := time.Now()
+		if postTime != "" {
+			if parsedTime, err := time.Parse("2006-01-02 15:04:05", postTime); err == nil {
+				publishTime = parsedTime
+			}
+		}
+
+		id := fmt.Sprintf("%s-%s", p.Name(), resourceID)
+		result := model.SearchResult{
+			Title:     title,
+			Content:   summary,
+			Channel:   "",
+			MessageID: id,
+			UniqueID:  id,
+			Datetime:  publishTime,
+			Links:     links,
+		}
+
+		// No link in the summary: remember the detail URL for the later
+		// detail-page lookup.
+		if len(links) == 0 {
+			result.Tags = []string{detailURL}
+		}
+
+		results = append(results, result)
+	})
+
+	return results
+}
+
+// tianyiLinkPattern matches Tianyi cloud share links of the form
+// https://cloud.189.cn/t/{shareCode}. It is compiled once at package
+// initialisation instead of on every extractTianyiLinks call.
+var tianyiLinkPattern = regexp.MustCompile(`https://cloud\.189\.cn/t/[a-zA-Z0-9]+`)
+
+// extractTianyiLinks returns the distinct Tianyi cloud share links found in
+// text, in order of first appearance, each typed "tianyi". It returns nil
+// when text contains no such link.
+func (p *LeijingPlugin) extractTianyiLinks(text string) []model.Link {
+	var links []model.Link
+
+	seen := make(map[string]struct{})
+	for _, match := range tianyiLinkPattern.FindAllString(text, -1) {
+		if _, dup := seen[match]; dup {
+			continue
+		}
+		seen[match] = struct{}{}
+		links = append(links, model.Link{
+			URL:  match,
+			Type: "tianyi",
+		})
+	}
+
+	return links
+}
+
+// enrichWithDetailLinks fills in Links for every result that has none by
+// fetching the detail page whose URL is stored in Tags[0]. Results that
+// already have links, or that carry no detail URL, are left untouched. Tags
+// is cleared on every result that was looked up. The slice is modified in
+// place and also returned.
+//
+// Fetches run concurrently, at most MaxConcurrency at a time, and successive
+// fetches are started 50ms apart. The stagger is based on launch order rather
+// than on the result's position in the slice, so a single missing link near
+// the end of the list is not delayed for nothing, and the wait happens before
+// a concurrency slot is taken so idle goroutines do not occupy slots.
+//
+// keyword is accepted for symmetry with the other helpers and is not used.
+func (p *LeijingPlugin) enrichWithDetailLinks(client *http.Client, results []model.SearchResult, keyword string) []model.SearchResult {
+	if p.debugMode {
+		log.Printf("[Leijing] 开始获取详情页链接")
+	}
+
+	var wg sync.WaitGroup
+	var mu sync.Mutex
+	semaphore := make(chan struct{}, MaxConcurrency)
+	launched := 0 // number of detail fetches started so far
+
+	for i := range results {
+		// Skip results that already have links or have no detail URL.
+		if len(results[i].Links) > 0 || len(results[i].Tags) == 0 {
+			continue
+		}
+
+		delay := time.Duration(launched*50) * time.Millisecond
+		launched++
+
+		wg.Add(1)
+		go func(idx int, delay time.Duration) {
+			defer wg.Done()
+
+			// Small stagger so the site is not hit by a burst of requests.
+			time.Sleep(delay)
+
+			semaphore <- struct{}{}
+			defer func() { <-semaphore }()
+
+			detailURL := results[idx].Tags[0]
+			links := p.fetchDetailPageLinks(client, detailURL)
+
+			mu.Lock()
+			if len(links) > 0 {
+				results[idx].Links = links
+			}
+			// The detail URL was only a temporary carrier; drop it.
+			results[idx].Tags = nil
+			mu.Unlock()
+
+			if p.debugMode {
+				log.Printf("[Leijing] 详情页 %d/%d 获取到 %d 个链接", idx+1, len(results), len(links))
+			}
+		}(i, delay)
+	}
+
+	wg.Wait()
+
+	return results
+}
+
+// fetchDetailPageLinks returns the share links found on the detail page at
+// detailURL, or nil when the page cannot be fetched, decoded or parsed, or
+// holds no link.
+//
+// Non-empty results are cached in detailCache under detailURL and removed
+// again after cacheTTL. Expiry is scheduled with time.AfterFunc, so no
+// goroutine stays parked for the whole TTL of each cached entry.
+func (p *LeijingPlugin) fetchDetailPageLinks(client *http.Client, detailURL string) []model.Link {
+	// Serve from the cache when possible.
+	if cached, ok := p.detailCache.Load(detailURL); ok {
+		if links, ok := cached.([]model.Link); ok {
+			if p.debugMode {
+				log.Printf("[Leijing] 使用缓存的详情页结果: %s", detailURL)
+			}
+			return links
+		}
+	}
+
+	resp, err := p.doRequest(client, detailURL, BaseURL)
+	if err != nil {
+		if p.debugMode {
+			log.Printf("[Leijing] 获取详情页失败: %v", err)
+		}
+		return nil
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		if p.debugMode {
+			log.Printf("[Leijing] 详情页响应状态码异常: %d", resp.StatusCode)
+		}
+		return nil
+	}
+
+	// The body may be gzip-compressed.
+	reader, err := p.getResponseReader(resp)
+	if err != nil {
+		if p.debugMode {
+			log.Printf("[Leijing] 处理详情页响应体失败: %v", err)
+		}
+		return nil
+	}
+
+	doc, err := goquery.NewDocumentFromReader(reader)
+	if err != nil {
+		if p.debugMode {
+			log.Printf("[Leijing] 解析详情页HTML失败: %v", err)
+		}
+		return nil
+	}
+
+	links := p.extractDetailPageLinks(doc)
+
+	// Only successful lookups are cached, so pages without links are
+	// retried on the next search.
+	if len(links) > 0 {
+		p.detailCache.Store(detailURL, links)
+		time.AfterFunc(p.cacheTTL, func() {
+			p.detailCache.Delete(detailURL)
+		})
+	}
+
+	return links
+}
+
+// extractDetailPageLinks collects the Tianyi cloud links of a detail page.
+// It first takes the href of every anchor inside ".topicContent" whose href
+// contains "cloud.189.cn", dropping duplicates. Only when no such anchor
+// exists does it fall back to scanning the text of ".topicContent".
+func (p *LeijingPlugin) extractDetailPageLinks(doc *goquery.Document) []model.Link {
+	var found []model.Link
+	seen := make(map[string]bool) // hrefs already collected
+
+	anchors := doc.Find(".topicContent a[href*='cloud.189.cn']")
+	anchors.Each(func(_ int, anchor *goquery.Selection) {
+		href, ok := anchor.Attr("href")
+		if !ok || href == "" || seen[href] {
+			return
+		}
+		seen[href] = true
+
+		found = append(found, model.Link{URL: href, Type: "tianyi"})
+
+		if p.debugMode {
+			log.Printf("[Leijing] 提取到天翼云盘链接: %s", href)
+		}
+	})
+
+	if len(found) > 0 {
+		return found
+	}
+
+	// No anchor matched: look for bare links in the content text instead.
+	return p.extractTianyiLinks(doc.Find(".topicContent").Text())
+}
+
+// filterValidResults keeps only the results that carry at least one link,
+// preserving their order. Dropped results are logged in debug mode. The
+// returned slice is nil when nothing qualifies.
+func (p *LeijingPlugin) filterValidResults(results []model.SearchResult) []model.SearchResult {
+	var kept []model.SearchResult
+
+	for _, candidate := range results {
+		if len(candidate.Links) == 0 {
+			if p.debugMode {
+				log.Printf("[Leijing] 忽略无链接结果: %s", candidate.Title)
+			}
+			continue
+		}
+		kept = append(kept, candidate)
+	}
+
+	return kept
+}
+
+// init registers a LeijingPlugin instance with the global plugin registry
+// when the package is imported.
+func init() {
+	instance := NewLeijingPlugin()
+	plugin.RegisterGlobalPlugin(instance)
+}
--- a/plugin/libvio/libvio.go
+++ b/plugin/libvio/libvio.go
@@ -41,7 +41,7 @@ func NewLibvioPlugin() *LibvioPlugin {
 	debugMode := false // 开启调试模式
 	
 	p := &LibvioPlugin{
-		BaseAsyncPlugin: plugin.NewBaseAsyncPluginWithFilter("libvio", 4, true ), // 优先级4	
+		BaseAsyncPlugin: plugin.NewBaseAsyncPluginWithFilter("libvio", 1, true ),	
 		debugMode:       debugMode,
 		cacheTTL:        30 * time.Minute,
 	}