Mirror of https://github.com/fish2018/pansou.git (synced 2025-11-25 03:14:59 +08:00)
Add plugin: ahhhhfs (新增插件ahhhhfs)
@@ -38,7 +38,7 @@ susu,thepiratebay,wanou,xuexizhinan,panyq,zhizhen,labi,muou,ouge,shandian,
 duoduo,huban,cyg,erxiao,miaoso,fox4k,pianku,clmao,wuji,cldi,xiaozhang,
 libvio,leijing,xb6v,xys,ddys,hdmoli,yuhuage,u3c3,javdb,clxiong,jutoushe,
 sdso,xiaoji,xdyh,haisou,bixin,djgou,nyaa,xinjuc,aikanzy,qupanshe,xdpan,
-discourse,yunsou
+discourse,yunsou,ahhhhfs
 </pre>
 </details>
BIN  cache/shard_13/b84edcde43dc50163b0bdca4d4877599  vendored  Normal file
Binary file not shown.

1  cache/shard_13/b84edcde43dc50163b0bdca4d4877599.meta  vendored  Normal file
@@ -0,0 +1 @@
+{"key":"86dfe1d575e2529c0fcc682e5ce6ac18","expiry":"2025-10-27T18:57:36.308013158+08:00","last_used":"2025-10-27T17:58:00.252958+08:00","size":271,"last_modified":"2025-10-27T17:58:00.252958+08:00"}
BIN  cache/shard_15/9f8f814bd1d2c580f5926791ee1daaaf  vendored  Normal file
Binary file not shown.

1  cache/shard_15/9f8f814bd1d2c580f5926791ee1daaaf.meta  vendored  Normal file
@@ -0,0 +1 @@
+{"key":"8a1ceb1a686b9eb4fb9a83b93166ab45","expiry":"2025-10-27T18:57:38.623512363+08:00","last_used":"2025-10-27T17:58:00.254162+08:00","size":3535,"last_modified":"2025-10-27T17:58:00.254162+08:00"}
BIN  cache/shard_4/3efad7600545961197b7d17e7a39eb92  vendored  Normal file
Binary file not shown.

1  cache/shard_4/3efad7600545961197b7d17e7a39eb92.meta  vendored  Normal file
@@ -0,0 +1 @@
+{"key":"b1bc7f0835474af246e9f96c1b8befff","expiry":"2025-10-27T18:57:15.116751458+08:00","last_used":"2025-10-27T17:58:00.249897+08:00","size":271,"last_modified":"2025-10-27T17:58:00.249897+08:00"}
BIN  cache/shard_9/468298d39de2c822a2c4f2b97ebe2305  vendored  Normal file
Binary file not shown.

1  cache/shard_9/468298d39de2c822a2c4f2b97ebe2305.meta  vendored  Normal file
@@ -0,0 +1 @@
+{"key":"b19c2520964407945e4e9e3576b97577","expiry":"2025-10-27T18:57:15.118728017+08:00","last_used":"2025-10-27T17:58:00.251944+08:00","size":4310,"last_modified":"2025-10-27T17:58:00.251944+08:00"}
1  main.go
@@ -78,6 +78,7 @@ import (
 	_ "pansou/plugin/xdpan"
 	_ "pansou/plugin/discourse"
 	_ "pansou/plugin/yunsou"
+	_ "pansou/plugin/ahhhhfs"
 )

 // Global cache write manager
537  plugin/ahhhhfs/ahhhhfs.go  Normal file
@@ -0,0 +1,537 @@
package ahhhhfs

import (
	"context"
	"fmt"
	"net/http"
	"net/url"
	"pansou/model"
	"pansou/plugin"
	"regexp"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/PuerkitoBio/goquery"
)

// Precompiled regular expressions
var (
	// Extracts the article ID from a detail-page URL
	articleIDRegex = regexp.MustCompile(`/(\d+)/?$`)

	// Patterns for common net-disk share links
	quarkLinkRegex  = regexp.MustCompile(`https?://pan\.quark\.cn/s/[0-9a-zA-Z]+`)
	baiduLinkRegex  = regexp.MustCompile(`https?://pan\.baidu\.com/s/[0-9a-zA-Z_\-]+`)
	aliyunLinkRegex = regexp.MustCompile(`https?://(www\.)?(aliyundrive\.com|alipan\.com)/s/[0-9a-zA-Z]+`)
	ucLinkRegex     = regexp.MustCompile(`https?://drive\.uc\.cn/s/[0-9a-zA-Z]+`)
	xunleiLinkRegex = regexp.MustCompile(`https?://pan\.xunlei\.com/s/[0-9a-zA-Z_\-]+`)
	tianyiLinkRegex = regexp.MustCompile(`https?://cloud\.189\.cn/(t|web)/[0-9a-zA-Z]+`)
	link115Regex    = regexp.MustCompile(`https?://115\.com/s/[0-9a-zA-Z]+`)
	link123Regex    = regexp.MustCompile(`https?://123pan\.com/s/[0-9a-zA-Z]+`)
	pikpakLinkRegex = regexp.MustCompile(`https?://mypikpak\.com/s/[0-9a-zA-Z]+`)

	// Patterns for extraction codes (share passwords)
	pwdPatterns = []*regexp.Regexp{
		regexp.MustCompile(`提取码[::]\s*([0-9a-zA-Z]+)`),
		regexp.MustCompile(`密码[::]\s*([0-9a-zA-Z]+)`),
		regexp.MustCompile(`pwd[=::]\s*([0-9a-zA-Z]+)`),
		regexp.MustCompile(`code[=::]\s*([0-9a-zA-Z]+)`),
	}

	// Cache state
	detailCache     sync.Map // caches parsed detail-page results
	lastCleanupTime = time.Now()
	cacheTTL        = 1 * time.Hour
)

const (
	// Plugin name
	pluginName = "ahhhhfs"

	// Priority
	defaultPriority = 2

	// Timeouts
	DefaultTimeout = 10 * time.Second
	DetailTimeout  = 8 * time.Second

	// Concurrency limit
	MaxConcurrency = 15

	// HTTP connection-pool configuration
	MaxIdleConns        = 100
	MaxIdleConnsPerHost = 30
	MaxConnsPerHost     = 50
	IdleConnTimeout     = 90 * time.Second
)

// Performance counters
var (
	searchRequests     int64 = 0
	detailPageRequests int64 = 0
	cacheHits          int64 = 0
	cacheMisses        int64 = 0
)

// AhhhhfsAsyncPlugin is the asynchronous plugin for ahhhhfs
type AhhhhfsAsyncPlugin struct {
	*plugin.BaseAsyncPlugin
	optimizedClient *http.Client
}

// Register the plugin in init
func init() {
	plugin.RegisterGlobalPlugin(NewAhhhhfsPlugin())

	// Start the cache-cleanup goroutine
	go startCacheCleaner()
}

// startCacheCleaner starts a goroutine that periodically clears the cache
func startCacheCleaner() {
	ticker := time.NewTicker(30 * time.Minute)
	defer ticker.Stop()

	for range ticker.C {
		// Drop every cached entry. Deleting keys through Range is safe for
		// concurrent readers, unlike reassigning the package-level sync.Map.
		detailCache.Range(func(key, _ interface{}) bool {
			detailCache.Delete(key)
			return true
		})
		lastCleanupTime = time.Now()
	}
}

// createOptimizedHTTPClient builds an HTTP client tuned for this plugin
func createOptimizedHTTPClient() *http.Client {
	transport := &http.Transport{
		MaxIdleConns:        MaxIdleConns,
		MaxIdleConnsPerHost: MaxIdleConnsPerHost,
		MaxConnsPerHost:     MaxConnsPerHost,
		IdleConnTimeout:     IdleConnTimeout,
		DisableKeepAlives:   false,
	}

	return &http.Client{
		Transport: transport,
		Timeout:   DefaultTimeout,
	}
}

// NewAhhhhfsPlugin creates a new ahhhhfs async plugin
func NewAhhhhfsPlugin() *AhhhhfsAsyncPlugin {
	return &AhhhhfsAsyncPlugin{
		BaseAsyncPlugin: plugin.NewBaseAsyncPlugin(pluginName, defaultPriority),
		optimizedClient: createOptimizedHTTPClient(),
	}
}

// Search runs a search and returns the results (compatibility method)
func (p *AhhhhfsAsyncPlugin) Search(keyword string, ext map[string]interface{}) ([]model.SearchResult, error) {
	result, err := p.SearchWithResult(keyword, ext)
	if err != nil {
		return nil, err
	}
	return result.Results, nil
}

// SearchWithResult runs a search and returns results carrying the IsFinal flag
func (p *AhhhhfsAsyncPlugin) SearchWithResult(keyword string, ext map[string]interface{}) (model.PluginSearchResult, error) {
	return p.AsyncSearchWithResult(keyword, p.searchImpl, p.MainCacheKey, ext)
}

// searchImpl implements the actual search logic
func (p *AhhhhfsAsyncPlugin) searchImpl(client *http.Client, keyword string, ext map[string]interface{}) ([]model.SearchResult, error) {
	// Performance accounting
	start := time.Now()
	atomic.AddInt64(&searchRequests, 1)
	defer func() {
		fmt.Printf("[%s] search took: %v\n", p.Name(), time.Since(start))
	}()

	// Prefer the optimized client
	if p.optimizedClient != nil {
		client = p.optimizedClient
	}

	// 1. Build the search URL
	searchURL := fmt.Sprintf("https://www.ahhhhfs.com/search/%s", url.QueryEscape(keyword))

	// 2. Create a context with timeout
	ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout)
	defer cancel()

	// 3. Create the request
	req, err := http.NewRequestWithContext(ctx, "GET", searchURL, nil)
	if err != nil {
		return nil, fmt.Errorf("[%s] failed to create request: %w", p.Name(), err)
	}

	// 4. Set full request headers (to sidestep anti-scraping checks)
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
	req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
	req.Header.Set("Connection", "keep-alive")
	req.Header.Set("Upgrade-Insecure-Requests", "1")
	req.Header.Set("Cache-Control", "max-age=0")
	req.Header.Set("Referer", "https://www.ahhhhfs.com/")

	// 5. Send the request (with retries)
	resp, err := p.doRequestWithRetry(req, client)
	if err != nil {
		return nil, fmt.Errorf("[%s] search request failed: %w", p.Name(), err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		return nil, fmt.Errorf("[%s] search request returned status code: %d", p.Name(), resp.StatusCode)
	}

	// 6. Parse the search results page
	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("[%s] failed to parse search page: %w", p.Name(), err)
	}

	// 7. Extract the search results
	var results []model.SearchResult
	var wg sync.WaitGroup
	var mu sync.Mutex
	semaphore := make(chan struct{}, MaxConcurrency)

	doc.Find("article.post-item.item-list").Each(func(i int, s *goquery.Selection) {
		// Parse the basic fields
		titleElem := s.Find(".entry-title a")
		title := strings.TrimSpace(titleElem.Text())
		if title == "" {
			title = strings.TrimSpace(titleElem.AttrOr("title", ""))
		}

		detailURL, exists := titleElem.Attr("href")
		if !exists || detailURL == "" || title == "" {
			return
		}

		// Extract the article ID
		articleID := p.extractArticleID(detailURL)
		if articleID == "" {
			return
		}

		// Extract the category tags
		var tags []string
		s.Find(".entry-cat-dot a").Each(func(j int, tag *goquery.Selection) {
			tagText := strings.TrimSpace(tag.Text())
			if tagText != "" {
				tags = append(tags, tagText)
			}
		})

		// Extract the description
		content := strings.TrimSpace(s.Find(".entry-desc").Text())

		// Extract the timestamp
		datetime := ""
		timeElem := s.Find(".entry-meta .meta-date time")
		if dt, exists := timeElem.Attr("datetime"); exists {
			datetime = dt
		} else {
			datetime = strings.TrimSpace(timeElem.Text())
		}

		// Parse the timestamp
		publishTime := p.parseDateTime(datetime)

		// Fetch the detail page's net-disk links asynchronously
		wg.Add(1)
		semaphore <- struct{}{} // acquire the semaphore

		go func(title, detailURL, articleID, content string, tags []string, publishTime time.Time) {
			defer wg.Done()
			defer func() { <-semaphore }() // release the semaphore

			// Fetch the net-disk links
			links := p.fetchDetailLinks(client, detailURL, articleID)

			if len(links) > 0 {
				result := model.SearchResult{
					UniqueID: fmt.Sprintf("%s-%s", p.Name(), articleID),
					Title:    title,
					Content:  content,
					Links:    links,
					Tags:     tags,
					Channel:  "", // plugin results must leave Channel empty
					Datetime: publishTime,
				}

				mu.Lock()
				results = append(results, result)
				mu.Unlock()
			}
		}(title, detailURL, articleID, content, tags, publishTime)
	})

	// Wait for all detail-page requests to finish
	wg.Wait()

	fmt.Printf("[%s] search results: %d items\n", p.Name(), len(results))

	// Keyword filtering
	return plugin.FilterResultsByKeyword(results, keyword), nil
}

// extractArticleID extracts the article ID from a URL
func (p *AhhhhfsAsyncPlugin) extractArticleID(detailURL string) string {
	matches := articleIDRegex.FindStringSubmatch(detailURL)
	if len(matches) >= 2 {
		return matches[1]
	}
	return ""
}

// parseDateTime parses a timestamp string
func (p *AhhhhfsAsyncPlugin) parseDateTime(datetime string) time.Time {
	datetime = strings.TrimSpace(datetime)

	// Try ISO format first
	if t, err := time.Parse(time.RFC3339, datetime); err == nil {
		return t
	}

	// Try common date layouts
	layouts := []string{
		"2006-01-02",
		"2006-01-02 15:04:05",
		"2006-01-02T15:04:05",
		"2006-01-02T15:04:05Z07:00",
	}

	for _, layout := range layouts {
		if t, err := time.Parse(layout, datetime); err == nil {
			return t
		}
	}

	// Handle relative times such as "1 周前" (1 week ago) or "2 天前" (2 days ago)
	now := time.Now()

	if strings.Contains(datetime, "小时前") || strings.Contains(datetime, "hours ago") {
		// Rough approximation: treat as today
		return now
	}

	if strings.Contains(datetime, "天前") || strings.Contains(datetime, "days ago") {
		// Rough approximation: treat as within the last week
		return now.AddDate(0, 0, -7)
	}

	if strings.Contains(datetime, "周前") || strings.Contains(datetime, "weeks ago") {
		// Rough approximation: treat as a month ago
		return now.AddDate(0, -1, 0)
	}

	// Fall back to the current time
	return now
}

// fetchDetailLinks fetches the net-disk links from a detail page
func (p *AhhhhfsAsyncPlugin) fetchDetailLinks(client *http.Client, detailURL, articleID string) []model.Link {
	atomic.AddInt64(&detailPageRequests, 1)

	// Check the cache
	if cached, ok := detailCache.Load(articleID); ok {
		atomic.AddInt64(&cacheHits, 1)
		return cached.([]model.Link)
	}

	atomic.AddInt64(&cacheMisses, 1)

	// Create a context with timeout
	ctx, cancel := context.WithTimeout(context.Background(), DetailTimeout)
	defer cancel()

	// Create the request
	req, err := http.NewRequestWithContext(ctx, "GET", detailURL, nil)
	if err != nil {
		fmt.Printf("[%s] failed to create detail-page request: %v\n", p.Name(), err)
		return nil
	}

	// Set request headers
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
	req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
	req.Header.Set("Referer", "https://www.ahhhhfs.com/")

	// Send the request
	resp, err := client.Do(req)
	if err != nil {
		fmt.Printf("[%s] detail-page request failed: %v\n", p.Name(), err)
		return nil
	}
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		fmt.Printf("[%s] detail page returned status code: %d\n", p.Name(), resp.StatusCode)
		return nil
	}

	// Parse the detail page
	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		fmt.Printf("[%s] failed to parse detail page: %v\n", p.Name(), err)
		return nil
	}

	// Extract the net-disk links
	links := p.extractNetDiskLinks(doc)

	// Cache the result
	if len(links) > 0 {
		detailCache.Store(articleID, links)
	}

	return links
}

// extractNetDiskLinks extracts net-disk links from a detail page
func (p *AhhhhfsAsyncPlugin) extractNetDiskLinks(doc *goquery.Document) []model.Link {
	var links []model.Link
	linkMap := make(map[string]model.Link) // for de-duplication

	// Scan every link inside the article body
	doc.Find(".post-content a").Each(func(i int, s *goquery.Selection) {
		href, exists := s.Attr("href")
		if !exists || href == "" {
			return
		}

		// Check whether it is a net-disk link
		cloudType := p.determineCloudType(href)
		if cloudType == "others" {
			return
		}

		// Extract the extraction code
		password := p.extractPassword(s, href)

		// Add to the results (deduplicated)
		if _, exists := linkMap[href]; !exists {
			link := model.Link{
				Type:     cloudType,
				URL:      href,
				Password: password,
			}
			linkMap[href] = link
			links = append(links, link)
		}
	})

	return links
}

// determineCloudType classifies a link by net-disk provider
func (p *AhhhhfsAsyncPlugin) determineCloudType(url string) string {
	switch {
	case strings.Contains(url, "pan.quark.cn"):
		return "quark"
	case strings.Contains(url, "drive.uc.cn"):
		return "uc"
	case strings.Contains(url, "pan.baidu.com"):
		return "baidu"
	case strings.Contains(url, "aliyundrive.com") || strings.Contains(url, "alipan.com"):
		return "aliyun"
	case strings.Contains(url, "pan.xunlei.com"):
		return "xunlei"
	case strings.Contains(url, "cloud.189.cn"):
		return "tianyi"
	case strings.Contains(url, "115.com"):
		return "115"
	case strings.Contains(url, "123pan.com"):
		return "123"
	case strings.Contains(url, "mypikpak.com"):
		return "pikpak"
	default:
		return "others"
	}
}

// extractPassword extracts the extraction code for a link
func (p *AhhhhfsAsyncPlugin) extractPassword(linkElem *goquery.Selection, url string) string {
	// 1. From the link's title attribute
	if title, exists := linkElem.Attr("title"); exists {
		for _, pattern := range pwdPatterns {
			if matches := pattern.FindStringSubmatch(title); len(matches) >= 2 {
				return matches[1]
			}
		}
	}

	// 2. From the link text
	linkText := linkElem.Text()
	for _, pattern := range pwdPatterns {
		if matches := pattern.FindStringSubmatch(linkText); len(matches) >= 2 {
			return matches[1]
		}
	}

	// 3. From the sibling or parent text that follows the link
	parent := linkElem.Parent()
	parentText := parent.Text()

	// Locate the link text within the parent's text
	linkIndex := strings.Index(parentText, linkText)
	if linkIndex >= 0 {
		// Take the text that follows the link
		afterText := parentText[linkIndex+len(linkText):]
		for _, pattern := range pwdPatterns {
			if matches := pattern.FindStringSubmatch(afterText); len(matches) >= 2 {
				return matches[1]
			}
		}
	}

	// 4. From the URL parameters
	if strings.Contains(url, "pwd=") {
		parts := strings.Split(url, "pwd=")
		if len(parts) >= 2 {
			pwd := parts[1]
			// Keep only the password itself (strip trailing parameters)
			if idx := strings.IndexAny(pwd, "&?#"); idx >= 0 {
				pwd = pwd[:idx]
			}
			return pwd
		}
	}

	return ""
}

// doRequestWithRetry performs an HTTP request with retries
func (p *AhhhhfsAsyncPlugin) doRequestWithRetry(req *http.Request, client *http.Client) (*http.Response, error) {
	maxRetries := 3
	var lastErr error

	for i := 0; i < maxRetries; i++ {
		if i > 0 {
			// Exponential backoff between retries
			backoff := time.Duration(1<<uint(i-1)) * 200 * time.Millisecond
			time.Sleep(backoff)
		}

		// Clone the request to avoid concurrency issues
		reqClone := req.Clone(req.Context())

		resp, err := client.Do(reqClone)
		if err == nil && resp.StatusCode == 200 {
			return resp, nil
		}

		if resp != nil {
			resp.Body.Close()
		}
		// Record why this attempt failed; a nil err with a non-200 status
		// still counts as a failure and must not be lost.
		if err != nil {
			lastErr = err
		} else {
			lastErr = fmt.Errorf("unexpected status code: %d", resp.StatusCode)
		}
	}

	return nil, fmt.Errorf("still failing after %d retries: %w", maxRetries, lastErr)
}

214  plugin/ahhhhfs/html结构分析.md  Normal file
@@ -0,0 +1,214 @@
# ahhhhfs (A姐分享) HTML Structure Analysis

## Site Information
- **Site name**: ahhhhfs (A姐分享)
- **Domain**: www.ahhhhfs.com
- **Type**: resource-sharing site (built on WordPress)
- **Focus**: shares learning resources, software, tutorials, and similar material

## Search Page Structure

### 1. Search URL Pattern
```
https://www.ahhhhfs.com/search/{keyword}
or
https://www.ahhhhfs.com/?s={keyword}

Examples:
https://www.ahhhhfs.com/search/小红书
https://www.ahhhhfs.com/?s=小红书

Parameter notes:
- keyword: plain Chinese text and URL-encoded text both work
```
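
For illustration, a minimal Go sketch of issuing such a search request; the browser-style User-Agent is an assumption carried over from common anti-scraping practice, not something the site documents:

```go
package main

import (
	"fmt"
	"net/http"
	"net/url"
)

func main() {
	keyword := "小红书"
	// QueryEscape handles both plain Chinese and pre-encoded input.
	searchURL := fmt.Sprintf("https://www.ahhhhfs.com/search/%s", url.QueryEscape(keyword))

	req, err := http.NewRequest("GET", searchURL, nil)
	if err != nil {
		panic(err)
	}
	// A browser-like User-Agent helps avoid trivial anti-scraping checks (assumption).
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64)")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		fmt.Println("request failed:", err)
		return
	}
	defer resp.Body.Close()
	fmt.Println("status:", resp.StatusCode)
}
```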

### 2. Search Results Container
- **Parent container**: `.row` (the result-list container)
- **Result item**: `<article class="post-item item-list">` (one per search result)

### 3. Structure of a Single Search Result

#### Title area (.entry-title)
```html
<h2 class="entry-title">
  <a target="_blank" href="https://www.ahhhhfs.com/76567/"
     title="AI小红书虚拟电商全链路实战课:从选品到变现的AI爆款打法">
    AI小红书虚拟电商全链路实战课:从选品到变现的AI爆款打法
  </a>
</h2>

Fields to extract:
- Title: the a tag's text content, or its title attribute
- Detail-page link: the a tag's href attribute
```

#### Category tags (.entry-cat-dot)
```html
<div class="entry-cat-dot">
  <a href="https://www.ahhhhfs.com/recourse/%e7%9f%ad%e8%a7%86%e9%a2%91/">短视频</a>
  <a href="https://www.ahhhhfs.com/recourse/">资源</a>
</div>

Fields to extract:
- Categories: the text content of every a tag
```

#### Description area (.entry-desc)
```html
<div class="entry-desc">
  AI小红书虚拟电商全链路实战课程概览 《AI小红书虚拟电商5.0实战课》是一门聚焦AI与小红书生态融合的系统课程,围绕AI赋能选品、创作、运营与变现四大环节展开...
</div>

Fields to extract:
- Description: the div's text content
```

#### Metadata bar (.entry-meta)
```html
<div class="entry-meta">
  <span class="meta-date">
    <i class="far fa-clock me-1"></i>
    <time class="pub-date" datetime="2025-10-18T13:43:10+08:00">1 周前</time>
  </span>
  <span class="meta-likes d-none d-md-inline-block"><i class="far fa-heart me-1"></i>0</span>
  <span class="meta-fav d-none d-md-inline-block"><i class="far fa-star me-1"></i>1</span>
</div>

Fields to extract:
- Publish time: the time tag's datetime attribute, or its text content
```

## Detail Page Structure

### 1. Detail URL Pattern
```
https://www.ahhhhfs.com/{articleID}/

Example:
https://www.ahhhhfs.com/76567/
```

### 2. Download Link Location
Download links sit inside the article body, `.post-content`, usually near the end of the post.

#### Download link format examples
```html
<p>
  学习地址:
  <a title="..."
     href="https://pan.quark.cn/s/c16a5ae18ea0"
     target="_blank"
     rel="nofollow noopener noreferrer">夸克</a>
</p>

or

<p>
  下载地址:
  <a href="https://pan.baidu.com/s/xxxxx"
     target="_blank"
     rel="nofollow noopener noreferrer">百度网盘</a>
  提取码: xxxx
</p>

or several net-disk links together:
<p>
  阿里云盘:<a href="...">链接</a><br>
  夸克网盘:<a href="...">链接</a><br>
  百度网盘:<a href="...">链接</a> 提取码: xxxx
</p>

Fields to extract:
- Net-disk links: the href of any a tag in .post-content whose URL matches a net-disk domain
- Extraction code/password: text near the link, flagged by keywords such as "提取码" (extraction code), "密码" (password), or "pwd"
```

## CSS Selector Summary

| Data item | CSS selector | Extraction |
|-----------|--------------|------------|
| Result list | `article.post-item.item-list` | iterate over all result items |
| Title | `.entry-title a` | text content or title attribute |
| Detail-page link | `.entry-title a` | href attribute |
| Category tags | `.entry-cat-dot a` | text content of every a tag |
| Description | `.entry-desc` | text content |
| Publish time | `.entry-meta .meta-date time` | datetime attribute or text content |
| Article body | `.post-content` | HTML content |
| Net-disk links | `.post-content a[href*="pan"]`, or match net-disk domains | href attribute |
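
A minimal goquery sketch wiring a few of these selectors together; the inline HTML is a trimmed stand-in for a fetched search page:

```go
package main

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

func main() {
	// In the real plugin this HTML comes from the search-page response body.
	html := `<article class="post-item item-list">
	  <h2 class="entry-title"><a href="https://www.ahhhhfs.com/76567/" title="demo">demo</a></h2>
	  <div class="entry-cat-dot"><a href="#">短视频</a></div>
	  <div class="entry-desc">description text</div>
	</article>`

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		panic(err)
	}

	// Iterate over result items and pull out the summarized fields.
	doc.Find("article.post-item.item-list").Each(func(i int, s *goquery.Selection) {
		title := strings.TrimSpace(s.Find(".entry-title a").Text())
		href, _ := s.Find(".entry-title a").Attr("href")
		desc := strings.TrimSpace(s.Find(".entry-desc").Text())
		fmt.Printf("title=%q href=%q desc=%q\n", title, href, desc)
	})
}
```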

## Implementation Notes

### 1. Supported Net-Disk Types
- Quark: `pan.quark.cn`
- Aliyun Drive: `aliyundrive.com`, `alipan.com`
- Baidu Netdisk: `pan.baidu.com`
- UC Drive: `drive.uc.cn`
- Xunlei: `pan.xunlei.com`
- Tianyi Cloud: `cloud.189.cn`
- 115: `115.com`
- 123pan: `123pan.com`

### 2. Extraction-Code Recognition
The extraction code can appear in several places:
- In the text after the link: `提取码: xxxx` or `密码: xxxx`
- In the link's title attribute
- On the next line, separated by a `<br>` tag
- Inside parentheses: `(提取码: xxxx)`

Common keywords (a matching sketch follows this list):
- 提取码
- 密码
- pwd
- code
- 取码
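
A small Go sketch of matching these keywords with regular expressions, mirroring the `pwdPatterns` idea used by the plugin; the `extractCode` helper is illustrative:

```go
package main

import (
	"fmt"
	"regexp"
)

// One pattern per keyword family; fullwidth and halfwidth colons both accepted.
var pwdPatterns = []*regexp.Regexp{
	regexp.MustCompile(`提取码[::]\s*([0-9a-zA-Z]+)`),
	regexp.MustCompile(`密码[::]\s*([0-9a-zA-Z]+)`),
	regexp.MustCompile(`pwd[=::]\s*([0-9a-zA-Z]+)`),
	regexp.MustCompile(`code[=::]\s*([0-9a-zA-Z]+)`),
}

// extractCode returns the first captured code, or "" when nothing matches.
func extractCode(text string) string {
	for _, p := range pwdPatterns {
		if m := p.FindStringSubmatch(text); len(m) >= 2 {
			return m[1]
		}
	}
	return ""
}

func main() {
	fmt.Println(extractCode("百度网盘链接 提取码: ab12")) // prints "ab12"
	fmt.Println(extractCode("no code here"))            // prints ""
}
```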

### 3. Link Extraction Strategy
1. Fetch the article list from the search results page
2. Visit each article's detail page
3. Scan the detail page's `.post-content` for links matching net-disk domains
4. Extract each link together with its extraction code
5. Skip articles that carry no net-disk links

### 4. Time Format Handling
- Relative times: "1 周前" (1 week ago), "2 天前" (2 days ago) must be mapped to concrete dates
- Absolute times: "2025-10-18" can be used as-is
- datetime attribute: "2025-10-18T13:43:10+08:00", standard ISO format
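
A sketch of the corresponding parsing order (ISO first, then plain dates, then rough approximations for relative phrases); `parsePublishTime` is an illustrative helper, not the plugin's exact function:

```go
package main

import (
	"fmt"
	"strings"
	"time"
)

func parsePublishTime(s string) time.Time {
	s = strings.TrimSpace(s)

	// 1. The datetime attribute is standard RFC 3339 / ISO 8601.
	if t, err := time.Parse(time.RFC3339, s); err == nil {
		return t
	}
	// 2. Plain dates such as "2025-10-18".
	if t, err := time.Parse("2006-01-02", s); err == nil {
		return t
	}
	// 3. Relative phrases: only a rough approximation is possible.
	now := time.Now()
	switch {
	case strings.Contains(s, "天前"): // "N days ago"
		return now.AddDate(0, 0, -7)
	case strings.Contains(s, "周前"): // "N weeks ago"
		return now.AddDate(0, -1, 0)
	default:
		return now
	}
}

func main() {
	fmt.Println(parsePublishTime("2025-10-18T13:43:10+08:00"))
	fmt.Println(parsePublishTime("1 周前"))
}
```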

### 5. De-duplication Key
- Use the article ID, extracted from the detail URL (e.g. `/76567/`), as the unique identifier

## Caveats

1. **Results may be empty**: when no posts match the keyword, the page shows a "no matching content" notice
2. **Pagination**: results can span multiple pages, but scraping the first page is usually enough
3. **Link position is not fixed**: links may sit at the start, middle, or end of a post, so the whole `.post-content` must be scanned
4. **Ad clutter**: pages carry ads, so selectors must target the actual content area precisely
5. **Detail pages are required**: the search results page carries no download links; they appear only on detail pages
6. **Request rate**: every result needs a detail-page visit, so throttle requests to avoid being blocked (see the sketch below)
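
A minimal sketch of bounding detail-page concurrency with a buffered-channel semaphore, as the plugin does; the cap of 15 mirrors its `MaxConcurrency` constant:

```go
package main

import (
	"fmt"
	"sync"
)

const maxConcurrency = 15 // cap on simultaneous detail-page fetches

func main() {
	urls := []string{"https://www.ahhhhfs.com/76567/" /* , ... */}

	var wg sync.WaitGroup
	semaphore := make(chan struct{}, maxConcurrency)

	for _, u := range urls {
		wg.Add(1)
		semaphore <- struct{}{} // acquire a slot; blocks once the cap is reached
		go func(u string) {
			defer wg.Done()
			defer func() { <-semaphore }() // release the slot
			fmt.Println("fetching", u)     // a real detail-page fetch would go here
		}(u)
	}
	wg.Wait()
}
```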

## Example Data Flow

```
1. Search request: https://www.ahhhhfs.com/search/小红书
   ↓
2. Parse the search results page and extract the article list
   - Title: "AI小红书虚拟电商全链路实战课:从选品到变现的AI爆款打法"
   - Detail URL: https://www.ahhhhfs.com/76567/
   - Categories: ["短视频", "资源"]
   - Published: 2025-10-18
   ↓
3. Visit the detail page: https://www.ahhhhfs.com/76567/
   ↓
4. Parse the detail page's .post-content and extract net-disk links
   - Quark: https://pan.quark.cn/s/c16a5ae18ea0
   - Extraction code: (if any)
   ↓
5. Build the final result
   - UniqueID: ahhhhfs-76567
   - Title: "AI小红书虚拟电商全链路实战课:从选品到变现的AI爆款打法"
   - Content: the article description
   - Links: [{Type: "quark", URL: "...", Password: ""}]
   - Tags: ["短视频", "资源"]
   - Datetime: 2025-10-18T13:43:10+08:00
```