package pansearch

import (
	"context"
	"fmt"
	"io"
	"net"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"pansou/model"
	"pansou/plugin"
	"pansou/util/json"
)

// Precompiled regular expressions
var (
	// Extracts the buildId from the HTML page
	buildIdRegex = regexp.MustCompile(`"buildId":"([^"]+)"`)
	// Extracts the JSON payload of the __NEXT_DATA__ script tag
	nextDataRegex = regexp.MustCompile(`(?s)<script id="__NEXT_DATA__" type="application/json">(.*?)</script>`)

	// Search-result cache
	searchResultCache  = sync.Map{}
	lastCacheCleanTime = time.Now()
	cacheTTL           = 1 * time.Hour
)

// Register the plugin in init
func init() {
	// Create the plugin instance with the global timeout and register it
	plugin.RegisterGlobalPlugin(NewPanSearchPlugin())

	// Start the cache-cleaning goroutine
	go startCacheCleaner()
}

// startCacheCleaner periodically clears the search-result cache.
func startCacheCleaner() {
	// Clear the cache once per hour
	ticker := time.NewTicker(1 * time.Hour)
	defer ticker.Stop()

	for range ticker.C {
		// Drop all cached entries
		searchResultCache = sync.Map{}
		lastCacheCleanTime = time.Now()
	}
}

// cachedResponse is a cached search response.
type cachedResponse struct {
	results   []model.SearchResult
	timestamp time.Time
}

const (
	// Base URL of the website
	WebsiteURL = "https://www.pansearch.me/search"

	// API base URL template - the buildId must be substituted in
	BaseURLTemplate = "https://www.pansearch.me/_next/data/%s/search.json"

	// Default parameters
	DefaultTimeout = 6 * time.Second // reduced default timeout
	PageSize       = 10
	MaxResults     = 1000
	MaxConcurrent  = 200 // increased maximum concurrency
	MaxRetries     = 2
	MaxAPIPages    = 100 // maximum number of API pages

	// HTTP client configuration
	MaxIdleConns          = 500 // increased maximum idle connections
	MaxIdleConnsPerHost   = 200 // increased maximum idle connections per host
	MaxConnsPerHost       = 400 // increased maximum connections per host
	IdleConnTimeout       = 120 * time.Second
	TLSHandshakeTimeout   = 10 * time.Second
	ExpectContinueTimeout = 1 * time.Second
	WriteBufferSize       = 16 * 1024
	ReadBufferSize        = 16 * 1024

	// buildId cache lifetime in minutes - kept short so updates are picked up promptly
	BuildIdCacheDuration = 30
)

// Cached buildId and its expiry bookkeeping
var (
	buildIdCache     string
	buildIdCacheTime time.Time
	buildIdMutex     sync.RWMutex
)

// PanSearchPlugin is the pansearch plugin.
type PanSearchPlugin struct {
	client        *http.Client
	timeout       time.Duration
	maxResults    int
	maxConcurrent int
	retries       int
	workerPool    *WorkerPool // worker pool for paged requests
}

// WorkerPool is a simple bounded worker pool.
type WorkerPool struct {
	tasks   chan Task
	results chan TaskResult
	errors  chan error
	wg      sync.WaitGroup
	closed  atomic.Bool // atomic flag marking whether the pool has been closed
	mu      sync.Mutex  // mutex protecting submissions and shutdown
}

// Task is a unit of work.
type Task struct {
	keyword string
	offset  int
	baseURL string
}

// TaskResult is the result of a task.
type TaskResult struct {
	offset  int
	results []PanSearchItem
}

// NewWorkerPool creates a new worker pool.
func NewWorkerPool(size int) *WorkerPool {
	return &WorkerPool{
		tasks:   make(chan Task, size*3),       // enlarged task channel
		results: make(chan TaskResult, size*3), // enlarged result channel
		errors:  make(chan error, size*3),      // enlarged error channel
	}
}

// Start launches the workers.
func (wp *WorkerPool) Start(ctx context.Context, handler func(ctx context.Context, task Task) (TaskResult, error)) {
	for i := 0; i < cap(wp.tasks); i++ {
		wp.wg.Add(1)
		go func() {
			defer wp.wg.Done()
			for {
				select {
				case task, ok := <-wp.tasks:
					if !ok {
						return
					}
					result, err := handler(ctx, task)
					if err != nil {
						select {
						case wp.errors <- err:
							// error delivered
						default:
							// channel may be closed; drop the error
							fmt.Printf("unable to send error: %v\n", err)
						}
					} else {
						select {
						case wp.results <- result:
							// result delivered
						default:
							// channel may be closed; drop the result
							fmt.Printf("unable to send result\n")
						}
					}
				case <-ctx.Done():
					return
				}
			}
		}()
	}
}

// Submit submits a task to the pool.
func (wp *WorkerPool) Submit(task Task) bool {
	wp.mu.Lock()
	defer wp.mu.Unlock()

	// Refuse submissions after the pool has been closed
	if wp.closed.Load() {
		return false
	}

	select {
	case wp.tasks <- task:
		return true
	default:
		// Task channel is full; report failure
		return false
	}
}

// Close shuts the pool down.
func (wp *WorkerPool) Close() {
	wp.mu.Lock()
	if !wp.closed.Load() {
		wp.closed.Store(true)
		close(wp.tasks)
	}
	wp.mu.Unlock()

	wp.wg.Wait()

	// Close the result and error channels defensively
	wp.mu.Lock()
	defer wp.mu.Unlock()
	select {
	case _, ok := <-wp.results:
		if ok {
			close(wp.results)
		}
	default:
		close(wp.results)
	}
	select {
	case _, ok := <-wp.errors:
		if ok {
			close(wp.errors)
		}
	default:
		close(wp.errors)
	}
}
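// The sketch below is illustrative only and is not part of the plugin's call
// path: it shows how the worker pool above is meant to be driven -- Start the
// workers, Submit tasks, Close the pool, then drain results and errors. The
// handler stub, keyword and offsets are made-up example values.
func exampleWorkerPoolUsage() {
	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
	defer cancel()

	wp := NewWorkerPool(4)
	wp.Start(ctx, func(ctx context.Context, task Task) (TaskResult, error) {
		// A real handler would call fetchPage here; this stub only echoes the offset.
		return TaskResult{offset: task.offset}, nil
	})

	submitted := 0
	for offset := PageSize; offset <= 3*PageSize; offset += PageSize {
		if wp.Submit(Task{keyword: "demo", offset: offset}) {
			submitted++
		}
	}

	// Close waits for the workers, so run it concurrently and drain below,
	// mirroring how Search drives the pool with `go p.workerPool.Close()`.
	go wp.Close()

	for i := 0; i < submitted; i++ {
		select {
		case r := <-wp.results:
			fmt.Println("offset done:", r.offset)
		case err := <-wp.errors:
			fmt.Println("task failed:", err)
		case <-ctx.Done():
			return
		}
	}
}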
// NewPanSearchPlugin creates a new pansearch plugin.
func NewPanSearchPlugin() *PanSearchPlugin {
	timeout := DefaultTimeout

	// Custom Transport tuned for a larger connection pool
	transport := &http.Transport{
		Proxy: http.ProxyFromEnvironment,
		DialContext: (&net.Dialer{
			Timeout:   30 * time.Second,
			KeepAlive: 60 * time.Second,
			DualStack: true,
		}).DialContext,
		ForceAttemptHTTP2:     true,
		MaxIdleConns:          MaxIdleConns,
		MaxIdleConnsPerHost:   MaxIdleConnsPerHost,
		MaxConnsPerHost:       MaxConnsPerHost,
		IdleConnTimeout:       IdleConnTimeout,
		TLSHandshakeTimeout:   TLSHandshakeTimeout,
		ExpectContinueTimeout: ExpectContinueTimeout,
		WriteBufferSize:       WriteBufferSize,
		ReadBufferSize:        ReadBufferSize,
		DisableKeepAlives:     false,
	}

	maxConcurrent := MaxConcurrent

	p := &PanSearchPlugin{
		client: &http.Client{
			Transport: transport,
			Timeout:   timeout,
		},
		timeout:       timeout,
		maxResults:    MaxResults,
		maxConcurrent: maxConcurrent,
		retries:       MaxRetries,
		workerPool:    NewWorkerPool(maxConcurrent), // initialize the worker pool
	}

	// Warm up the buildId cache at startup
	go func() {
		_, err := p.getBuildId()
		if err != nil {
			fmt.Printf("warm-up fetch of buildId failed: %v\n", err)
		}
	}()

	// Start the background buildId updater
	go p.startBuildIdUpdater()

	return p
}

// startBuildIdUpdater periodically refreshes the buildId in the background.
func (p *PanSearchPlugin) startBuildIdUpdater() {
	// Refresh the buildId every 10 minutes
	ticker := time.NewTicker(10 * time.Minute)
	defer ticker.Stop()

	for range ticker.C {
		p.updateBuildId()
	}
}

// updateBuildId refreshes the cached buildId.
func (p *PanSearchPlugin) updateBuildId() {
	// Context with the plugin timeout
	ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
	defer cancel()

	// Request the search page
	req, err := http.NewRequestWithContext(ctx, "GET", WebsiteURL, nil)
	if err != nil {
		fmt.Printf("failed to create request: %v\n", err)
		return
	}

	// Set a full set of browser-like request headers
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
	req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
	req.Header.Set("Connection", "keep-alive")
	req.Header.Set("Upgrade-Insecure-Requests", "1")
	req.Header.Set("Cache-Control", "max-age=0")

	resp, err := p.client.Do(req)
	if err != nil {
		fmt.Printf("request failed: %v\n", err)
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		fmt.Printf("non-200 status code while fetching buildId: %d\n", resp.StatusCode)
		return
	}

	// Read the response body efficiently
	var bodyBuilder strings.Builder
	_, err = io.Copy(&bodyBuilder, resp.Body)
	if err != nil {
		fmt.Printf("failed to read response: %v\n", err)
		return
	}
	body := bodyBuilder.String()

	// Try to extract the buildId
	newBuildId := extractBuildId(body)
	if newBuildId == "" {
		fmt.Println("could not extract buildId from the response")
		return
	}

	// Update the cache
	buildIdMutex.Lock()
	defer buildIdMutex.Unlock()

	// Only update when the new buildId is non-empty and differs from the cached one
	if newBuildId != "" && newBuildId != buildIdCache {
		buildIdCache = newBuildId
		buildIdCacheTime = time.Now()
		fmt.Printf("buildId updated: %s\n", newBuildId)
	}
}

// extractBuildId extracts the buildId from the HTML body.
func extractBuildId(body string) string {
	// First try the precompiled buildId regex
	matches := buildIdRegex.FindStringSubmatch(body)
	if len(matches) >= 2 {
		return matches[1]
	}

	// Fall back to the __NEXT_DATA__ script payload
	scriptMatches := nextDataRegex.FindStringSubmatch(body)
	if len(scriptMatches) >= 2 {
		var nextData map[string]interface{}
		if err := json.Unmarshal([]byte(scriptMatches[1]), &nextData); err == nil {
			if buildId, ok := nextData["buildId"].(string); ok && buildId != "" {
				return buildId
			}
		}
	}

	return ""
}
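// Illustrative sketch of the input extractBuildId expects: a Next.js page that
// embeds a __NEXT_DATA__ JSON blob. The snippet and the buildId value are
// made up for the example.
func exampleExtractBuildId() {
	body := `<html><body>` +
		`<script id="__NEXT_DATA__" type="application/json">` +
		`{"props":{},"page":"/search","buildId":"abc123DEF"}` +
		`</script></body></html>`
	fmt.Println(extractBuildId(body)) // prints: abc123DEF
}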
// Name returns the plugin name.
func (p *PanSearchPlugin) Name() string {
	return "pansearch"
}

// Priority returns the plugin priority.
func (p *PanSearchPlugin) Priority() int {
	return 2 // relatively high priority
}

// getBuildId returns the buildId, preferring the cache.
func (p *PanSearchPlugin) getBuildId() (string, error) {
	// Check whether the cache is still valid
	buildIdMutex.RLock()
	if buildIdCache != "" && time.Since(buildIdCacheTime) < BuildIdCacheDuration*time.Minute {
		defer buildIdMutex.RUnlock()
		return buildIdCache, nil
	}
	buildIdMutex.RUnlock()

	// Cache is stale; fetch a fresh buildId
	buildIdMutex.Lock()
	defer buildIdMutex.Unlock()

	// Double-check after acquiring the write lock
	if buildIdCache != "" && time.Since(buildIdCacheTime) < BuildIdCacheDuration*time.Minute {
		return buildIdCache, nil
	}

	// Context with the plugin timeout
	ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
	defer cancel()

	// Request the search page
	req, err := http.NewRequestWithContext(ctx, "GET", WebsiteURL, nil)
	if err != nil {
		// If the request cannot be created but an old buildId exists, degrade gracefully
		if buildIdCache != "" {
			fmt.Printf("failed to create request, falling back to old buildId: %v\n", err)
			return buildIdCache, nil
		}
		return "", fmt.Errorf("failed to create request: %w", err)
	}

	// Set a full set of browser-like request headers
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
	req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
	req.Header.Set("Connection", "keep-alive")
	req.Header.Set("Upgrade-Insecure-Requests", "1")
	req.Header.Set("Cache-Control", "max-age=0")

	// Send the request with a retry mechanism
	var resp *http.Response
	var respErr error
	for retry := 0; retry <= p.retries; retry++ {
		if retry > 0 {
			// Exponential backoff between retries
			backoffTime := time.Duration(1< neededPages { end = neededPages }

		// Submit one batch of tasks
		for j := i; j < end; j++ {
			offset := PageSize + j*PageSize
			if offset < p.maxResults {
				task := Task{
					keyword: keyword,
					offset:  offset,
					baseURL: baseURL,
				}
				// Try to submit the task; bail out if the pool refuses it
				if !p.workerPool.Submit(task) {
					fmt.Printf("unable to submit task, worker pool may be closed\n")
					goto CollectResults
				}
				submittedTasks++
			}
		}

		// Only pause between batches when more batches are still to come
		if batchSize < neededPages && end < neededPages {
			select {
			case <-time.After(50 * time.Millisecond):
				// keep going
			case <-ctx.Done():
				// Context cancelled; stop submitting tasks
				goto CollectResults
			}
		}
	}

CollectResults:
	// Close the task channel
	go p.workerPool.Close()

	// Collect results
	resultCount := 0
	errorCount := 0
	var lastError error

	// Collect results and errors without blocking indefinitely
	for resultCount+errorCount < submittedTasks {
		select {
		case result, ok := <-p.workerPool.results:
			if !ok {
				// Result channel closed
				goto ProcessResults
			}
			allResults = append(allResults, result.results...)
			resultCount++
		case err, ok := <-p.workerPool.errors:
			if !ok {
				// Error channel closed
				goto ProcessResults
			}
			errorCount++
			lastError = err
		case <-ctx.Done():
			// Context timed out; return whatever has been collected
			results := p.convertResults(allResults, keyword)
			// Cache the partial results even on timeout
			searchResultCache.Store(cacheKey, cachedResponse{
				results:   results,
				timestamp: time.Now(),
			})
			return results, fmt.Errorf("search timed out: %w", ctx.Err())
		}
	}

ProcessResults:
	// If every follow-up request failed and nothing beyond the first page was obtained, return an error
	if submittedTasks > 0 && errorCount == submittedTasks && len(allResults) == len(firstPageResults) {
		results := p.convertResults(allResults, keyword)
		// Cache the partial results even on error
		searchResultCache.Store(cacheKey, cachedResponse{
			results:   results,
			timestamp: time.Now(),
		})
		return results, fmt.Errorf("all follow-up page requests failed: %v", lastError)
	}

	// 4. Deduplicate and format the results
	uniqueResults := p.deduplicateItems(allResults)
	results := p.convertResults(uniqueResults, keyword)

	// Cache the results
	searchResultCache.Store(cacheKey, cachedResponse{
		results:   results,
		timestamp: time.Now(),
	})

	return results, nil
}
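// Illustrative sketch of how the per-page request URL used by fetchFirstPage
// and fetchPage is assembled from the buildId and the search offset. The
// buildId and keyword are made-up example values.
func examplePageURL() {
	baseURL := fmt.Sprintf(BaseURLTemplate, "abc123DEF")
	reqURL := fmt.Sprintf("%s?keyword=%s&offset=%d", baseURL, url.QueryEscape("三体"), 2*PageSize)
	fmt.Println(reqURL)
	// https://www.pansearch.me/_next/data/abc123DEF/search.json?keyword=%E4%B8%89%E4%BD%93&offset=20
}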
// fetchFirstPage fetches the first page of results and the total hit count.
func (p *PanSearchPlugin) fetchFirstPage(keyword string, baseURL string) ([]PanSearchItem, int, error) {
	// Build the request URL
	reqURL := fmt.Sprintf("%s?keyword=%s&offset=0", baseURL, url.QueryEscape(keyword))

	// Context with the plugin timeout
	ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
	defer cancel()

	// Build the request
	req, err := http.NewRequestWithContext(ctx, "GET", reqURL, nil)
	if err != nil {
		return nil, 0, fmt.Errorf("failed to create request: %w", err)
	}

	// Set a full set of browser-like request headers
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
	req.Header.Set("Referer", "https://www.pansearch.me/")
	req.Header.Set("Accept", "application/json, text/plain, */*")
	req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
	req.Header.Set("Connection", "keep-alive")
	req.Header.Set("Cache-Control", "no-cache")
	req.Header.Set("Pragma", "no-cache")

	// Send the request
	resp, err := p.client.Do(req)
	if err != nil {
		return nil, 0, fmt.Errorf("request failed: %w", err)
	}
	defer resp.Body.Close()

	// Check the status code
	if resp.StatusCode == 404 {
		return nil, 0, fmt.Errorf("404 Not Found, the buildId may have expired")
	}
	if resp.StatusCode != 200 {
		return nil, 0, fmt.Errorf("non-200 status code: %d", resp.StatusCode)
	}

	// Read the response body
	respBody, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, 0, fmt.Errorf("failed to read response: %w", err)
	}

	// Parse the response
	var apiResp PanSearchResponse
	if err := json.Unmarshal(respBody, &apiResp); err != nil {
		return nil, 0, fmt.Errorf("failed to parse response: %w", err)
	}

	// Extract the total and the items
	total := apiResp.PageProps.Data.Total
	items := apiResp.PageProps.Data.Data

	return items, total, nil
}

// fetchPage fetches the page at the given offset.
func (p *PanSearchPlugin) fetchPage(keyword string, offset int, baseURL string) ([]PanSearchItem, error) {
	// Build the request URL
	reqURL := fmt.Sprintf("%s?keyword=%s&offset=%d", baseURL, url.QueryEscape(keyword), offset)

	// Context with the plugin timeout
	ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
	defer cancel()

	// Build the request
	req, err := http.NewRequestWithContext(ctx, "GET", reqURL, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}

	// Set a full set of browser-like request headers
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
	req.Header.Set("Referer", "https://www.pansearch.me/")
	req.Header.Set("Accept", "application/json, text/plain, */*")
	req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
	req.Header.Set("Connection", "keep-alive")
	req.Header.Set("Cache-Control", "no-cache")
	req.Header.Set("Pragma", "no-cache")

	// Send the request
	resp, err := p.client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("request failed: %w", err)
	}
	defer resp.Body.Close()

	// Check the status code
	if resp.StatusCode == 404 {
		return nil, fmt.Errorf("404 Not Found, the buildId may have expired")
	}
	if resp.StatusCode != 200 {
		return nil, fmt.Errorf("non-200 status code: %d", resp.StatusCode)
	}

	// Read the response body
	respBody, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response: %w", err)
	}

	// Parse the response
	var apiResp PanSearchResponse
	if err := json.Unmarshal(respBody, &apiResp); err != nil {
		return nil, fmt.Errorf("failed to parse response: %w", err)
	}

	return apiResp.PageProps.Data.Data, nil
}
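// Illustrative sketch of the post-processing helpers defined below:
// deduplicateItems keys on the numeric resource ID, so the same resource
// returned by two overlapping pages collapses into one entry, and
// extractLinkAndPassword pulls the share URL and password out of the item's
// HTML content. All IDs, contents and URLs are made-up example values.
func examplePostProcessing(p *PanSearchPlugin) {
	items := []PanSearchItem{
		{ID: 101, Content: "名称:Example Resource A", Pan: "aliyundrive"},
		{ID: 102, Content: "名称:Example Resource B", Pan: "baidu"},
		{ID: 101, Content: "名称:Example Resource A", Pan: "aliyundrive"}, // duplicate ID from an overlapping page
	}
	fmt.Println(len(p.deduplicateItems(items))) // 2

	content := `名称:Example Resource A <a href="https://pan.quark.cn/s/abc123?pwd=1234">link</a>`
	info := extractLinkAndPassword(content)
	fmt.Println(info.URL, info.Password) // https://pan.quark.cn/s/abc123?pwd=1234 1234
}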
// deduplicateItems removes duplicate items.
func (p *PanSearchPlugin) deduplicateItems(items []PanSearchItem) []PanSearchItem {
	// Deduplicate via a map keyed by the resource ID
	uniqueMap := make(map[int]PanSearchItem)
	for _, item := range items {
		uniqueMap[item.ID] = item
	}

	// Convert the map back into a slice
	result := make([]PanSearchItem, 0, len(uniqueMap))
	for _, item := range uniqueMap {
		result = append(result, item)
	}

	return result
}

// convertResults converts API items into the standard SearchResult format.
func (p *PanSearchPlugin) convertResults(items []PanSearchItem, keyword string) []model.SearchResult {
	results := make([]model.SearchResult, 0, len(items))

	for _, item := range items {
		// Extract the link and password
		linkInfo := extractLinkAndPassword(item.Content)

		// Map the pan type onto the types the system supports
		linkType := item.Pan
		// Map aliyundrive to aliyun
		if linkType == "aliyundrive" {
			linkType = "aliyun"
		}

		// Build the link
		link := model.Link{
			URL:      linkInfo.URL,
			Type:     linkType,
			Password: linkInfo.Password,
		}

		// Build a unique ID
		uniqueID := fmt.Sprintf("pansearch-%d", item.ID)

		// Parse the timestamp
		var datetime time.Time
		if item.Time != "" {
			// Expected format: 2025-07-07T13:54:43+08:00
			parsedTime, err := time.Parse(time.RFC3339, item.Time)
			if err == nil {
				datetime = parsedTime
			}
		}

		// Fall back to the zero value if parsing failed
		if datetime.IsZero() {
			datetime = time.Time{}
		}

		// Build the search result
		result := model.SearchResult{
			UniqueID: uniqueID,
			Title:    extractTitle(item.Content, keyword),
			Content:  item.Content,
			Datetime: datetime,
			Links:    []model.Link{link},
		}

		results = append(results, result)
	}

	return results
}

// LinkInfo holds an extracted link and its password.
type LinkInfo struct {
	URL      string
	Password string
}

// extractLinkAndPassword extracts the link and password from the item content.
func extractLinkAndPassword(content string) LinkInfo {
	// Parses the HTML content and pulls the link and password out of the <a> tag.
	// Simple string scanning; a regular expression or an HTML parser could be used instead.
	linkInfo := LinkInfo{}

	// Extract the link
	linkStartIndex := strings.Index(content, "href=\"")
	if linkStartIndex != -1 {
		linkStartIndex += 6 // length of `href="`
		linkEndIndex := strings.Index(content[linkStartIndex:], "\"")
		if linkEndIndex != -1 {
			linkInfo.URL = content[linkStartIndex : linkStartIndex+linkEndIndex]
		}
	}

	// Extract the password
	pwdIndex := strings.Index(content, "?pwd=")
	if pwdIndex != -1 {
		pwdStartIndex := pwdIndex + 5 // length of "?pwd="
		pwdEndIndex := strings.Index(content[pwdStartIndex:], "\"")
		if pwdEndIndex != -1 {
			linkInfo.Password = content[pwdStartIndex : pwdStartIndex+pwdEndIndex]
		} else {
			// May be a Baidu-style link ending
			pwdEndIndex = strings.Index(content[pwdStartIndex:], "#")
			if pwdEndIndex != -1 {
				linkInfo.Password = content[pwdStartIndex : pwdStartIndex+pwdEndIndex]
			} else {
				// Take everything to the end
				linkInfo.Password = content[pwdStartIndex:]
			}
		}
	}

	return linkInfo
}

// extractTitle extracts the title from the item content.
func extractTitle(content string, keyword string) string {
	// The title usually follows the "名称:" label in the source content
	titlePrefix := "名称:"
	titleStartIndex := strings.Index(content, titlePrefix)
	if titleStartIndex == -1 {
		return keyword // fall back to the search keyword
	}

	titleStartIndex += len(titlePrefix)
	titleEndIndex := strings.Index(content[titleStartIndex:], "\n")
	if titleEndIndex == -1 {
		return cleanHTML(content[titleStartIndex:])
	}

	return cleanHTML(content[titleStartIndex : titleStartIndex+titleEndIndex])
}

// cleanHTML strips HTML tags from a fragment.
func cleanHTML(html string) string {
	// Simple string-based cleanup; an HTML parser could be used instead.

	// Replace common HTML tags
	replacements := map[string]string{
		"<b>":   "",
		"</b>":  "",
		"<em>":  "",
		"<br>":  "\n",
		"</em>": "",
		"<br/>": "\n",
	}

	result := html
	for tag, replacement := range replacements {
		result = strings.Replace(result, tag, replacement, -1)
	}

	// Strip any remaining HTML tags
	for {
		startIndex := strings.Index(result, "<")
		if startIndex == -1 {
			break
		}

		endIndex := strings.Index(result[startIndex:], ">")
		if endIndex == -1 {
			break
		}

		result = result[:startIndex] + result[startIndex+endIndex+1:]
	}

	return strings.TrimSpace(result)
}

// min returns the smaller of two ints.
func min(a, b int) int {
	if a < b {
		return a
	}
	return b
}

// PanSearchResponse is the API response structure.
type PanSearchResponse struct {
	PageProps struct {
		Data struct {
			Total int             `json:"total"`
			Data  []PanSearchItem `json:"data"`
			Time  int             `json:"time"`
		} `json:"data"`
		Limit    int  `json:"limit"`
		IsMobile bool `json:"isMobile"`
	} `json:"pageProps"`
	NSSP bool `json:"__N_SSP"`
}

// PanSearchItem is a single item in the API response.
type PanSearchItem struct {
	ID      int    `json:"id"`
	Content string `json:"content"`
	Pan     string `json:"pan"`
	Image   string `json:"image"`
	Time    string `json:"time"`
}