pansou/util/regex_util.go
www.xueximeng.com e565502e83 Optimize TG search
2025-11-08 15:04:40 +08:00

package util
import (
netUrl "net/url"
"regexp"
"strings"
)
// Generic net-disk link pattern - tightened for more precise matching
var AllPanLinksPattern = regexp.MustCompile(`(?i)(?:(?:magnet:\?xt=urn:btih:[a-zA-Z0-9]+)|(?:ed2k://\|file\|[^|]+\|\d+\|[A-Fa-f0-9]+\|/?)|(?:https?://(?:(?:[\w.-]+\.)?(?:pan\.(?:baidu|quark)\.cn|(?:www\.)?(?:alipan|aliyundrive)\.com|drive\.uc\.cn|cloud\.189\.cn|caiyun\.139\.com|(?:www\.)?123(?:684|685|912|pan|592)\.(?:com|cn)|115\.com|115cdn\.com|anxia\.com|pan\.xunlei\.com|mypikpak\.com))(?:/[^\s'"<>()]*)?))`)
// Per-provider link patterns, defined separately so each type can be extracted more precisely
// The Baidu Pan pattern only matches the link itself, without any trailing text
var BaiduPanPattern = regexp.MustCompile(`https?://pan\.baidu\.com/s/[a-zA-Z0-9_-]+(?:\?pwd=[a-zA-Z0-9]{4})?`)
var QuarkPanPattern = regexp.MustCompile(`https?://pan\.quark\.cn/s/[a-zA-Z0-9]+`)
var XunleiPanPattern = regexp.MustCompile(`https?://pan\.xunlei\.com/s/[a-zA-Z0-9]+(?:\?pwd=[a-zA-Z0-9]{4})?(?:#)?`)
// Tianyi Cloud link pattern - precise match, supports URL-encoded access codes
var TianyiPanPattern = regexp.MustCompile(`https?://cloud\.189\.cn/t/[a-zA-Z0-9]+(?:%[0-9A-Fa-f]{2})*(?:([^)]*))?`)
// UC Drive link pattern
var UCPanPattern = regexp.MustCompile(`https?://drive\.uc\.cn/s/[a-zA-Z0-9]+(?:\?public=\d)?`)
// 123 Pan link pattern (multiple domains)
var Pan123Pattern = regexp.MustCompile(`https?://(?:www\.)?123(?:684|865|685|912|pan|592)\.(?:com|cn)/s/[a-zA-Z0-9_-]+(?:\?(?:%E6%8F%90%E5%8F%96%E7%A0%81|提取码)[::][a-zA-Z0-9]+)?`)
// 115 Pan link pattern
var Pan115Pattern = regexp.MustCompile(`https?://(?:115\.com|115cdn\.com|anxia\.com)/s/[a-zA-Z0-9]+(?:\?password=[a-zA-Z0-9]{4})?(?:#)?`)
// Aliyun Drive link pattern
var AliyunPanPattern = regexp.MustCompile(`https?://(?:www\.)?(?:alipan|aliyundrive)\.com/s/[a-zA-Z0-9]+`)
// Extraction-code patterns - broadened to catch more password formats
var PasswordPattern = regexp.MustCompile(`(?i)(?:(?:提取|访问|提取密|密)码|pwd)[::]\s*([a-zA-Z0-9]{4})(?:[^a-zA-Z0-9]|$)`)
var UrlPasswordPattern = regexp.MustCompile(`(?i)[?&]pwd=([a-zA-Z0-9]{4})(?:[^a-zA-Z0-9]|$)`)
// Baidu-specific password pattern - makes sure only a 4-character code is extracted
var BaiduPasswordPattern = regexp.MustCompile(`(?i)(?:链接:.*?提取码:|密码:|提取码:|pwd=|pwd:|pwd)([a-zA-Z0-9]{4})(?:[^a-zA-Z0-9]|$)`)
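// Illustrative sketch (not part of the original file): typical use of the patterns above.
// The share ID and code are made up; only identifiers already defined in this package are used.
func examplePatternUsage() (link, password string) {
text := "链接: https://pan.baidu.com/s/1abcDEF 提取码: ab12"
link = BaiduPanPattern.FindString(text) // "https://pan.baidu.com/s/1abcDEF"
if m := PasswordPattern.FindStringSubmatch(text); len(m) > 1 {
password = m[1] // "ab12"
}
return link, password
}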
// GetLinkType returns the net-disk type of a link
func GetLinkType(url string) string {
url = strings.ToLower(url)
// Handle a possible "链接:" / "链接:" prefix in front of the URL
if strings.Contains(url, "链接:") || strings.Contains(url, "链接:") {
url = strings.Split(url, "链接")[1]
url = strings.TrimPrefix(url, ":")
url = strings.TrimPrefix(url, ":")
url = strings.TrimSpace(url)
}
// Identify ed2k links by keyword
if strings.Contains(url, "ed2k:") {
return "ed2k"
}
if strings.HasPrefix(url, "magnet:") {
return "magnet"
}
if strings.Contains(url, "pan.baidu.com") {
return "baidu"
}
if strings.Contains(url, "pan.quark.cn") {
return "quark"
}
if strings.Contains(url, "alipan.com") || strings.Contains(url, "aliyundrive.com") {
return "aliyun"
}
if strings.Contains(url, "cloud.189.cn") {
return "tianyi"
}
if strings.Contains(url, "drive.uc.cn") {
return "uc"
}
if strings.Contains(url, "caiyun.139.com") {
return "mobile"
}
if strings.Contains(url, "115.com") || strings.Contains(url, "115cdn.com") || strings.Contains(url, "anxia.com") {
return "115"
}
if strings.Contains(url, "mypikpak.com") {
return "pikpak"
}
if strings.Contains(url, "pan.xunlei.com") {
return "xunlei"
}
// 123 Pan uses multiple domains
if strings.Contains(url, "123684.com") || strings.Contains(url, "123685.com") || strings.Contains(url, "123865.com") ||
strings.Contains(url, "123912.com") || strings.Contains(url, "123pan.com") ||
strings.Contains(url, "123pan.cn") || strings.Contains(url, "123592.com") {
return "123"
}
return "others"
}
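// Illustrative sketch (not part of the original file): expected classifications for a few
// made-up links, following the rules in GetLinkType above.
func exampleGetLinkType() []string {
samples := []string{
"https://pan.quark.cn/s/0123456789ab",  // "quark"
"链接:https://cloud.189.cn/t/AbCdEf",    // "tianyi"
"magnet:?xt=urn:btih:0123456789abcdef", // "magnet"
"https://example.com/file",             // "others"
}
types := make([]string, 0, len(samples))
for _, s := range samples {
types = append(types, GetLinkType(s))
}
return types
}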
// CleanBaiduPanURL cleans a Baidu Pan URL so that the link is well formed
func CleanBaiduPanURL(url string) string {
// If the URL contains "https://pan.baidu.com/s/", extract the proper link part
if strings.Contains(url, "https://pan.baidu.com/s/") {
// Find where the link starts
startIdx := strings.Index(url, "https://pan.baidu.com/s/")
if startIdx >= 0 {
// Extract from the start position
url = url[startIdx:]
// Look for possible end markers
endMarkers := []string{" ", "\n", "\t", ",", "。", ";", ";", "、", ",", "?pwd="}
minEndIdx := len(url)
for _, marker := range endMarkers {
idx := strings.Index(url, marker)
if idx > 0 && idx < minEndIdx {
minEndIdx = idx
}
}
// If an end marker was found, cut the URL there
if minEndIdx < len(url) {
url = url[:minEndIdx]
}
// Special handling of the pwd parameter: keep only the 4-character password
if strings.Contains(url, "?pwd=") {
pwdIdx := strings.Index(url, "?pwd=")
if pwdIdx >= 0 && len(url) > pwdIdx+5 { // "?pwd=" is 5 characters
// Keep only the 4 characters that follow ?pwd=
pwdEndIdx := pwdIdx + 9 // "?pwd=xxxx" is 9 characters in total
if pwdEndIdx <= len(url) {
return url[:pwdEndIdx]
}
// Fewer than 4 characters remain, return whatever is available
return url
}
}
}
}
return url
}
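// Illustrative sketch (not part of the original file): CleanBaiduPanURL keeps only the link
// itself; note that "?pwd=" is treated as an end marker above, so the password suffix is
// dropped from the link and can still be recovered from the original text by ExtractPassword.
// The values below are made up.
func exampleCleanBaiduPanURL() string {
raw := "资源:https://pan.baidu.com/s/1abcDEF?pwd=ab12 更多资源"
return CleanBaiduPanURL(raw) // "https://pan.baidu.com/s/1abcDEF"
}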
// CleanTianyiPanURL cleans a Tianyi Cloud URL so that the link is well formed
func CleanTianyiPanURL(url string) string {
// If the URL contains "https://cloud.189.cn/t/", extract the proper link part
if strings.Contains(url, "https://cloud.189.cn/t/") {
// Find where the link starts
startIdx := strings.Index(url, "https://cloud.189.cn/t/")
if startIdx >= 0 {
// Extract from the start position
url = url[startIdx:]
// Look for possible end markers
endMarkers := []string{" ", "\n", "\t", ",", "。", ";", ";", "、", ",", "实时", "天翼", "更多"}
minEndIdx := len(url)
for _, marker := range endMarkers {
idx := strings.Index(url, marker)
if idx > 0 && idx < minEndIdx {
minEndIdx = idx
}
}
// If an end marker was found, cut the URL there
if minEndIdx < len(url) {
url = url[:minEndIdx]
}
// Normalize the URL: decode URL-encoded parts into Chinese for de-duplication
if decoded, err := netUrl.QueryUnescape(url); err == nil {
url = decoded
}
}
}
return url
}
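// Illustrative sketch (not part of the original file): CleanTianyiPanURL trims trailing text
// and URL-decodes the result, so an encoded access-code suffix becomes readable Chinese.
// The share code is made up.
func exampleCleanTianyiPanURL() string {
raw := "https://cloud.189.cn/t/AbCdEf%EF%BC%88%E8%AE%BF%E9%97%AE%E7%A0%81%EF%BC%9Aab12%EF%BC%89 天翼云盘"
return CleanTianyiPanURL(raw) // "https://cloud.189.cn/t/AbCdEf(访问码:ab12)"
}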
// CleanUCPanURL cleans a UC Drive URL so that the link is well formed
func CleanUCPanURL(url string) string {
// If the URL contains "https://drive.uc.cn/s/", extract the proper link part
if strings.Contains(url, "https://drive.uc.cn/s/") {
// Find where the link starts
startIdx := strings.Index(url, "https://drive.uc.cn/s/")
if startIdx >= 0 {
// Extract from the start position
url = url[startIdx:]
// Look for possible end markers (including net-disk names that often follow the link)
endMarkers := []string{" ", "\n", "\t", ",", "。", ";", ";", "、", ",", "网盘", "123", "夸克", "阿里", "百度"}
minEndIdx := len(url)
for _, marker := range endMarkers {
idx := strings.Index(url, marker)
if idx > 0 && idx < minEndIdx {
minEndIdx = idx
}
}
// If an end marker was found, cut the URL there
if minEndIdx < len(url) {
return url[:minEndIdx]
}
// Handle the public parameter
if strings.Contains(url, "?public=") {
publicIdx := strings.Index(url, "?public=")
if publicIdx > 0 {
// Keep only a "?public=1"-style parameter, without any trailing text
if publicIdx+9 <= len(url) { // "?public=1" is 9 characters in total
return url[:publicIdx+9]
}
return url[:publicIdx+8] // parameter incomplete, keep at least "?public="
}
}
}
}
return url
}
// Clean123PanURL cleans a 123 Pan URL so that the link is well formed
func Clean123PanURL(url string) string {
// Check whether this is a 123 Pan link
domains := []string{"123684.com", "123685.com", "123865.com", "123912.com", "123pan.com", "123pan.cn", "123592.com"}
isDomain123 := false
for _, domain := range domains {
if strings.Contains(url, domain+"/s/") {
isDomain123 = true
break
}
}
if isDomain123 {
// Check whether the link has a protocol prefix
hasProtocol := strings.HasPrefix(url, "http://") || strings.HasPrefix(url, "https://")
// Find where the link starts
startIdx := -1
for _, domain := range domains {
if idx := strings.Index(url, domain+"/s/"); idx >= 0 {
startIdx = idx
break
}
}
if startIdx >= 0 {
// If the link has no protocol, add one
if !hasProtocol {
// Extract the link part
linkPart := url[startIdx:]
// Prepend the protocol
url = "https://" + linkPart
} else if startIdx > 0 {
// The link has a protocol but may be preceded by extra text; rebuild the full URL
protocolIdx := strings.Index(url, "://")
if protocolIdx >= 0 {
protocol := url[:protocolIdx+3]
url = protocol + url[startIdx:]
}
}
// Keep the extraction-code parameter but strip emoji and other unrelated text.
// Look for possible end markers (emoji, tag markers, etc.).
// Note: "提取码" is no longer treated as an end marker because it is part of the URL.
endMarkers := []string{" ", "\n", "\t", ",", "。", ";", ";", "、", ",", "📁", "🔍", "标签", "🏷", "📎", "🔗", "📌", "📋", "📂", "🗂️", "🔖", "📚", "📒", "📔", "📕", "📓", "📗", "📘", "📙", "📄", "📃", "📑", "🧾", "📊", "📈", "📉", "🗒️", "🗓️", "📆", "📅", "🗑️", "🔒", "🔓", "🔏", "🔐", "🔑", "🗝️"}
minEndIdx := len(url)
for _, marker := range endMarkers {
idx := strings.Index(url, marker)
if idx > 0 && idx < minEndIdx {
minEndIdx = idx
}
}
// If an end marker was found, cut the URL there
if minEndIdx < len(url) {
return url[:minEndIdx]
}
// Normalize a URL-encoded extraction code to its plain form
if strings.Contains(url, "%E6%8F%90%E5%8F%96%E7%A0%81") {
url = strings.Replace(url, "%E6%8F%90%E5%8F%96%E7%A0%81", "提取码", 1)
}
}
}
return url
}
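// Illustrative sketch (not part of the original file): Clean123PanURL adds a missing protocol
// and cuts trailing decorations such as emoji tags, while keeping an embedded extraction code.
// The share ID is made up; the other Clean*PanURL helpers behave along the same lines.
func exampleClean123PanURL() string {
raw := "123684.com/s/abc-def提取码:ab12📁标签"
return Clean123PanURL(raw) // "https://123684.com/s/abc-def提取码:ab12"
}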
// Clean115PanURL cleans a 115 Pan URL so that the link is well formed
func Clean115PanURL(url string) string {
// Check whether this is a 115 Pan link
if strings.Contains(url, "115.com/s/") || strings.Contains(url, "115cdn.com/s/") || strings.Contains(url, "anxia.com/s/") {
// Find where the link starts
startIdx := -1
if idx := strings.Index(url, "115.com/s/"); idx >= 0 {
startIdx = idx
} else if idx := strings.Index(url, "115cdn.com/s/"); idx >= 0 {
startIdx = idx
} else if idx := strings.Index(url, "anxia.com/s/"); idx >= 0 {
startIdx = idx
}
if startIdx >= 0 {
// Check whether the link has a protocol prefix
hasProtocol := strings.HasPrefix(url, "http://") || strings.HasPrefix(url, "https://")
// If the link has no protocol, add one
if !hasProtocol {
// Extract the link part
linkPart := url[startIdx:]
// Prepend the protocol
url = "https://" + linkPart
} else if startIdx > 0 {
// The link has a protocol but may be preceded by extra text; rebuild the full URL
protocolIdx := strings.Index(url, "://")
if protocolIdx >= 0 {
protocol := url[:protocolIdx+3]
url = protocol + url[startIdx:]
}
}
// If the link has a password parameter, keep only "?password=xxxx" (a 4-character password)
if strings.Contains(url, "?password=") {
pwdIdx := strings.Index(url, "?password=")
if pwdIdx > 0 && pwdIdx+14 <= len(url) { // "?password=xxxx" is 14 characters in total
// Cut right after the 4-character password
url = url[:pwdIdx+14]
return url
}
}
// If the link contains "#", cut at the "#"
hashIdx := strings.Index(url, "#")
if hashIdx > 0 {
url = url[:hashIdx]
return url
}
}
}
return url
}
// CleanAliyunPanURL cleans an Aliyun Drive URL so that the link is well formed
func CleanAliyunPanURL(url string) string {
// If the URL contains an Aliyun Drive domain, extract the proper link part
if strings.Contains(url, "alipan.com/s/") || strings.Contains(url, "aliyundrive.com/s/") {
// Find where the link starts and which domain form is used
startIdx := -1
if idx := strings.Index(url, "www.alipan.com/s/"); idx >= 0 {
startIdx = idx
} else if idx := strings.Index(url, "alipan.com/s/"); idx >= 0 {
startIdx = idx
} else if idx := strings.Index(url, "www.aliyundrive.com/s/"); idx >= 0 {
startIdx = idx
} else if idx := strings.Index(url, "aliyundrive.com/s/"); idx >= 0 {
startIdx = idx
}
if startIdx >= 0 {
// Check whether the link has a protocol prefix
hasProtocol := strings.HasPrefix(url, "http://") || strings.HasPrefix(url, "https://")
// If the link has no protocol, add one
if !hasProtocol {
// Extract the link part
linkPart := url[startIdx:]
// Prepend the protocol
url = "https://" + linkPart
} else if startIdx > 0 {
// The link has a protocol but may be preceded by extra text; rebuild the full URL
protocolIdx := strings.Index(url, "://")
if protocolIdx >= 0 {
protocol := url[:protocolIdx+3]
url = protocol + url[startIdx:]
}
}
// Look for possible end markers (emoji, tag markers, etc.)
endMarkers := []string{" ", "\n", "\t", ",", "。", ";", ";", "、", ",", "📁", "🔍", "标签", "🏷", "📎", "🔗", "📌", "📋", "📂", "🗂️", "🔖", "📚", "📒", "📔", "📕", "📓", "📗", "📘", "📙", "📄", "📃", "📑", "🧾", "📊", "📈", "📉", "🗒️", "🗓️", "📆", "📅", "🗑️", "🔒", "🔓", "🔏", "🔐", "🔑", "🗝️"}
minEndIdx := len(url)
for _, marker := range endMarkers {
idx := strings.Index(url, marker)
if idx > 0 && idx < minEndIdx {
minEndIdx = idx
}
}
// If an end marker was found, cut the URL there
if minEndIdx < len(url) {
return url[:minEndIdx]
}
}
}
return url
}
// normalizeAliyunPanURL normalizes an Aliyun Drive URL so that the link is well formed
func normalizeAliyunPanURL(url string, password string) string {
// Clean the URL to get the proper link part
url = CleanAliyunPanURL(url)
// Aliyun Drive links normally do not carry a password parameter in the URL,
// so we simply make sure a clean link is returned
return url
}
// ExtractPassword extracts the password for a link
func ExtractPassword(content, url string) string {
// Special handling for the access code embedded in Tianyi Cloud URLs
if strings.Contains(url, "cloud.189.cn") {
// Tianyi access codes appear as "(访问码:xxxx)" or in URL-encoded form
tianyiPasswordPattern := regexp.MustCompile(`(?:访问码:|访问码:|%EF%BC%88%E8%AE%BF%E9%97%AE%E7%A0%81%EF%BC%9A)([a-zA-Z0-9]+)`)
tianyiMatches := tianyiPasswordPattern.FindStringSubmatch(url)
if len(tianyiMatches) > 1 {
return tianyiMatches[1]
}
}
// Special handling for the pwd parameter in Xunlei Pan URLs
if strings.Contains(url, "pan.xunlei.com") && strings.Contains(url, "?pwd=") {
pwdPattern := regexp.MustCompile(`\?pwd=([a-zA-Z0-9]{4})`)
pwdMatches := pwdPattern.FindStringSubmatch(url)
if len(pwdMatches) > 1 {
return pwdMatches[1]
}
}
// First, try to extract the password from the URL itself
matches := UrlPasswordPattern.FindStringSubmatch(url)
if len(matches) > 1 {
return matches[1]
}
// Special handling for the password in 115 Pan URLs
if (strings.Contains(url, "115.com") ||
strings.Contains(url, "115cdn.com") ||
strings.Contains(url, "anxia.com")) &&
strings.Contains(url, "password=") {
// Try to extract the password from the URL
passwordPattern := regexp.MustCompile(`password=([a-zA-Z0-9]{4})`)
passwordMatches := passwordPattern.FindStringSubmatch(url)
if len(passwordMatches) > 1 {
return passwordMatches[1]
}
}
// Special handling for the extraction code in 123 Pan URLs
if (strings.Contains(url, "123684.com") ||
strings.Contains(url, "123685.com") ||
strings.Contains(url, "123865.com") ||
strings.Contains(url, "123912.com") ||
strings.Contains(url, "123pan.com") ||
strings.Contains(url, "123pan.cn") ||
strings.Contains(url, "123592.com")) &&
(strings.Contains(url, "提取码") || strings.Contains(url, "%E6%8F%90%E5%8F%96%E7%A0%81")) {
// Try to extract the code from the URL, handling both plain-text and URL-encoded forms
extractCodePattern := regexp.MustCompile(`(?:提取码|%E6%8F%90%E5%8F%96%E7%A0%81)[::]([a-zA-Z0-9]+)`)
codeMatches := extractCodePattern.FindStringSubmatch(url)
if len(codeMatches) > 1 {
return codeMatches[1]
}
}
// Check the extraction-code parameter in 123 Pan URLs
if (strings.Contains(url, "123684.com") ||
strings.Contains(url, "123685.com") ||
strings.Contains(url, "123865.com") ||
strings.Contains(url, "123912.com") ||
strings.Contains(url, "123pan.com") ||
strings.Contains(url, "123pan.cn") ||
strings.Contains(url, "123592.com")) &&
strings.Contains(url, "提取码") {
// Try to extract the code from the URL
parts := strings.Split(url, "提取码")
if len(parts) > 1 {
// The code usually follows a colon
codeStart := strings.IndexAny(parts[1], "::")
if codeStart >= 0 && codeStart+1 < len(parts[1]) {
// Take the text after the colon (half- or fullwidth) and trim spaces
code := strings.TrimSpace(strings.TrimLeft(parts[1][codeStart:], "::"))
// If other characters follow the code (emoji, tags, etc.), keep only the code itself.
// Use a generous set of possible end markers.
endIdx := strings.IndexAny(code, " \t\n\r;,🏷📁🔍📎🔗📌📋📂🗂️🔖📚📒📔📕📓📗📘📙📄📃📑🧾📊📈📉🗒️🗓️📆📅🗑️🔒🔓🔏🔐🔑🗝️")
if endIdx > 0 {
code = code[:endIdx]
}
// Trim remaining spaces and unrelated characters
code = strings.TrimSpace(code)
// Make sure the code is valid (usually 4 alphanumeric characters)
if len(code) > 0 && len(code) <= 6 && isValidPassword(code) {
return code
}
}
}
}
}
// Check whether the content mentions "提取码"
if strings.Contains(content, "提取码") {
// Try to extract the code from the content; only the text after each "提取码" can contain it
parts := strings.Split(content, "提取码")
for _, part := range parts[1:] {
if len(part) > 0 {
// The code usually follows a colon
codeStart := strings.IndexAny(part, "::")
if codeStart >= 0 && codeStart+1 < len(part) {
// Take the text after the colon (half- or fullwidth) and trim spaces
code := strings.TrimSpace(strings.TrimLeft(part[codeStart:], "::"))
// If other characters follow the code, keep only the code itself
endIdx := strings.IndexAny(code, " \t\n\r;,🏷📁🔍📎🔗📌📋📂🗂️🔖📚📒📔📕📓📗📘📙📄📃📑🧾📊📈📉🗒️🗓️📆📅🗑️🔒🔓🔏🔐🔑🗝️")
if endIdx > 0 {
code = code[:endIdx]
} else {
// No obvious end marker: assume the code is 4-6 characters long
if len(code) > 6 {
// Check whether the first 4-6 characters form a valid code
for i := 4; i <= 6 && i <= len(code); i++ {
if isValidPassword(code[:i]) {
code = code[:i]
break
}
}
// If no valid code was found, fall back to the first 4 characters
if len(code) > 6 {
code = code[:4]
}
}
}
// Trim remaining spaces and unrelated characters
code = strings.TrimSpace(code)
// Return the code if it is non-empty and valid
if code != "" && isValidPassword(code) {
return code
}
}
}
}
}
// Then try to extract the password from the content.
// For Baidu Pan links, look for the Baidu-specific password format
if strings.Contains(strings.ToLower(url), "pan.baidu.com") {
// Try the Baidu-specific password pattern
baiduMatches := BaiduPasswordPattern.FindStringSubmatch(content)
if len(baiduMatches) > 1 {
return baiduMatches[1]
}
}
// Generic password extraction
matches = PasswordPattern.FindStringSubmatch(content)
if len(matches) > 1 {
return matches[1]
}
return ""
}
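// Illustrative sketch (not part of the original file): ExtractPassword first looks in the URL
// itself and then falls back to the surrounding text. The links and codes are made up.
func exampleExtractPassword() (fromURL, fromText string) {
fromURL = ExtractPassword("", "https://pan.xunlei.com/s/abcDEF?pwd=x1y2") // "x1y2"
fromText = ExtractPassword("链接: https://pan.baidu.com/s/1abcDEF 提取码: ab12", "https://pan.baidu.com/s/1abcDEF") // "ab12"
return fromURL, fromText
}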
// isValidPassword reports whether an extraction code is valid (letters and digits only)
func isValidPassword(password string) bool {
for _, c := range password {
if !((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
return false
}
}
return true
}
// ExtractNetDiskLinks extracts all net-disk links from the given text
func ExtractNetDiskLinks(text string) []string {
var links []string
// Extract Baidu Pan links
baiduMatches := BaiduPanPattern.FindAllString(text, -1)
for _, match := range baiduMatches {
// Clean the Baidu Pan link before adding it
cleanURL := CleanBaiduPanURL(match)
// Make sure the link does not end with a dangling "https"
if strings.HasSuffix(cleanURL, "https") {
cleanURL = cleanURL[:len(cleanURL)-5]
}
if cleanURL != "" {
links = append(links, cleanURL)
}
}
// Extract Tianyi Cloud links
tianyiMatches := TianyiPanPattern.FindAllString(text, -1)
for _, match := range tianyiMatches {
// Clean the Tianyi Cloud link before adding it
cleanURL := CleanTianyiPanURL(match)
// Make sure the link does not end with a dangling "https"
if strings.HasSuffix(cleanURL, "https") {
cleanURL = cleanURL[:len(cleanURL)-5]
}
if cleanURL != "" {
links = append(links, cleanURL)
}
}
// Extract UC Drive links
ucMatches := UCPanPattern.FindAllString(text, -1)
for _, match := range ucMatches {
// Clean the UC Drive link before adding it
cleanURL := CleanUCPanURL(match)
// Make sure the link does not end with a dangling "https"
if strings.HasSuffix(cleanURL, "https") {
cleanURL = cleanURL[:len(cleanURL)-5]
}
if cleanURL != "" {
links = append(links, cleanURL)
}
}
// Extract 123 Pan links
pan123Matches := Pan123Pattern.FindAllString(text, -1)
for _, match := range pan123Matches {
// Clean the 123 Pan link before adding it
cleanURL := Clean123PanURL(match)
// Make sure the link does not end with a dangling "https"
if strings.HasSuffix(cleanURL, "https") {
cleanURL = cleanURL[:len(cleanURL)-5]
}
if cleanURL != "" {
// Check whether the same link already exists (compare full URLs)
isDuplicate := false
for _, existingLink := range links {
// Normalize the links for comparison (only the protocol is removed)
normalizedExisting := normalizeURLForComparison(existingLink)
normalizedNew := normalizeURLForComparison(cleanURL)
if normalizedExisting == normalizedNew {
isDuplicate = true
break
}
}
if !isDuplicate {
links = append(links, cleanURL)
}
}
}
// Extract 115 Pan links
pan115Matches := Pan115Pattern.FindAllString(text, -1)
for _, match := range pan115Matches {
// Clean the 115 Pan link before adding it
cleanURL := Clean115PanURL(match) // 115 Pan cleaning works much like the 123 Pan logic
// Make sure the link does not end with a dangling "https"
if strings.HasSuffix(cleanURL, "https") {
cleanURL = cleanURL[:len(cleanURL)-5]
}
if cleanURL != "" {
// Check whether the same link already exists (compare full URLs)
isDuplicate := false
for _, existingLink := range links {
normalizedExisting := normalizeURLForComparison(existingLink)
normalizedNew := normalizeURLForComparison(cleanURL)
if normalizedExisting == normalizedNew {
isDuplicate = true
break
}
}
if !isDuplicate {
links = append(links, cleanURL)
}
}
}
// Extract Aliyun Drive links
aliyunMatches := AliyunPanPattern.FindAllString(text, -1)
if aliyunMatches != nil {
for _, match := range aliyunMatches {
// Clean the Aliyun Drive link before adding it
cleanURL := CleanAliyunPanURL(match)
// Make sure the link does not end with a dangling "https"
if strings.HasSuffix(cleanURL, "https") {
cleanURL = cleanURL[:len(cleanURL)-5]
}
if cleanURL != "" {
// Check whether the same link already exists
isDuplicate := false
for _, existingLink := range links {
normalizedExisting := normalizeURLForComparison(existingLink)
normalizedNew := normalizeURLForComparison(cleanURL)
if normalizedExisting == normalizedNew {
isDuplicate = true
break
}
}
if !isDuplicate {
links = append(links, cleanURL)
}
}
}
}
// Extract Quark Pan links
quarkLinks := QuarkPanPattern.FindAllString(text, -1)
if quarkLinks != nil {
for _, match := range quarkLinks {
// Make sure the link does not end with a dangling "https"
cleanURL := match
if strings.HasSuffix(cleanURL, "https") {
cleanURL = cleanURL[:len(cleanURL)-5]
}
// Check whether the same link already exists
isDuplicate := false
for _, existingLink := range links {
if strings.Contains(existingLink, cleanURL) || strings.Contains(cleanURL, existingLink) {
isDuplicate = true
break
}
}
if !isDuplicate {
links = append(links, cleanURL)
}
}
}
// Extract Xunlei Pan links
xunleiLinks := XunleiPanPattern.FindAllString(text, -1)
if xunleiLinks != nil {
for _, match := range xunleiLinks {
// Make sure the link does not end with a dangling "https"
cleanURL := match
if strings.HasSuffix(cleanURL, "https") {
cleanURL = cleanURL[:len(cleanURL)-5]
}
// Check whether the same link already exists
isDuplicate := false
for _, existingLink := range links {
if strings.Contains(existingLink, cleanURL) || strings.Contains(cleanURL, existingLink) {
isDuplicate = true
break
}
}
if !isDuplicate {
links = append(links, cleanURL)
}
}
}
// Use the generic pattern to pick up any remaining links
otherLinks := AllPanLinksPattern.FindAllString(text, -1)
if otherLinks != nil {
// Filter out links that have already been added
for _, link := range otherLinks {
// Make sure the link does not end with a dangling "https"
cleanURL := link
if strings.HasSuffix(cleanURL, "https") {
cleanURL = cleanURL[:len(cleanURL)-5]
}
// Skip Baidu, Quark, Xunlei, Tianyi, UC and 123 Pan links; they were already handled above
if strings.Contains(cleanURL, "pan.baidu.com") ||
strings.Contains(cleanURL, "pan.quark.cn") ||
strings.Contains(cleanURL, "pan.xunlei.com") ||
strings.Contains(cleanURL, "cloud.189.cn") ||
strings.Contains(cleanURL, "drive.uc.cn") ||
strings.Contains(cleanURL, "123684.com") ||
strings.Contains(cleanURL, "123685.com") ||
strings.Contains(cleanURL, "123865.com") ||
strings.Contains(cleanURL, "123912.com") ||
strings.Contains(cleanURL, "123pan.com") ||
strings.Contains(cleanURL, "123pan.cn") ||
strings.Contains(cleanURL, "123592.com") {
continue
}
isDuplicate := false
for _, existingLink := range links {
normalizedExisting := normalizeURLForComparison(existingLink)
normalizedNew := normalizeURLForComparison(cleanURL)
// Compare full URLs, including the www. prefix
if normalizedExisting == normalizedNew ||
strings.Contains(normalizedExisting, normalizedNew) ||
strings.Contains(normalizedNew, normalizedExisting) {
isDuplicate = true
break
}
}
if !isDuplicate {
links = append(links, cleanURL)
}
}
}
return links
}
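// Illustrative sketch (not part of the original file): ExtractNetDiskLinks pulls every
// recognised share link out of a blob of text and de-duplicates them. The links are made up;
// with this input the Baidu link comes first because Baidu links are extracted before Quark links.
func exampleExtractNetDiskLinks() []string {
text := "百度:https://pan.baidu.com/s/1abcDEF?pwd=ab12 夸克:https://pan.quark.cn/s/0123456789ab"
return ExtractNetDiskLinks(text)
// ["https://pan.baidu.com/s/1abcDEF", "https://pan.quark.cn/s/0123456789ab"]
}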
// normalizeURLForComparison normalizes a URL for comparison:
// it removes the protocol and normalizes the extraction code while keeping the full domain
func normalizeURLForComparison(url string) string {
// Remove the protocol prefix
if idx := strings.Index(url, "://"); idx >= 0 {
url = url[idx+3:]
}
// Normalize a URL-encoded extraction code to its plain form
if strings.Contains(url, "%E6%8F%90%E5%8F%96%E7%A0%81") {
url = strings.Replace(url, "%E6%8F%90%E5%8F%96%E7%A0%81", "提取码", 1)
}
return url
}
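// Illustrative sketch (not part of the original file): two spellings of the same 123 Pan link
// (plain vs URL-encoded extraction code, https vs http) normalize to the same string.
// The share ID is made up.
func exampleNormalizeURLForComparison() bool {
a := normalizeURLForComparison("https://www.123684.com/s/abc-def?提取码:ab12")
b := normalizeURLForComparison("http://www.123684.com/s/abc-def?%E6%8F%90%E5%8F%96%E7%A0%81:ab12")
return a == b // true
}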