Files
pansou/util/regex_util.go

819 lines
27 KiB
Go
Raw Normal View History

2025-07-12 19:53:44 +08:00
package util
import (
netUrl "net/url"
2025-07-12 19:53:44 +08:00
"regexp"
"strings"
)
// 通用网盘链接匹配正则表达式 - 修改为更精确的匹配模式
var AllPanLinksPattern = regexp.MustCompile(`(?i)(?:(?:magnet:\?xt=urn:btih:[a-zA-Z0-9]+)|(?:ed2k://\|file\|[^|]+\|\d+\|[A-Fa-f0-9]+\|/?)|(?:https?://(?:(?:[\w.-]+\.)?(?:pan\.(?:baidu|quark)\.cn|(?:www\.)?(?:alipan|aliyundrive)\.com|drive\.uc\.cn|cloud\.189\.cn|caiyun\.139\.com|(?:www\.)?123(?:684|685|912|pan|592)\.(?:com|cn)|115\.com|115cdn\.com|anxia\.com|pan\.xunlei\.com|mypikpak\.com))(?:/[^\s'"<>()]*)?))`)
// 单独定义各种网盘的链接匹配模式,以便更精确地提取
// 修改百度网盘链接正则表达式,确保只匹配到链接本身,不包含后面的文本
var BaiduPanPattern = regexp.MustCompile(`https?://pan\.baidu\.com/s/[a-zA-Z0-9_-]+(?:\?pwd=[a-zA-Z0-9]{4})?`)
var QuarkPanPattern = regexp.MustCompile(`https?://pan\.quark\.cn/s/[a-zA-Z0-9]+`)
2025-11-08 15:04:40 +08:00
var XunleiPanPattern = regexp.MustCompile(`https?://pan\.xunlei\.com/s/[a-zA-Z0-9]+(?:\?pwd=[a-zA-Z0-9]{4})?(?:#)?`)
// 添加天翼云盘链接正则表达式 - 精确匹配支持URL编码的访问码
var TianyiPanPattern = regexp.MustCompile(`https?://cloud\.189\.cn/t/[a-zA-Z0-9]+(?:%[0-9A-Fa-f]{2})*(?:[^]*)?`)
2025-07-12 19:53:44 +08:00
// 添加UC网盘链接正则表达式
var UCPanPattern = regexp.MustCompile(`https?://drive\.uc\.cn/s/[a-zA-Z0-9]+(?:\?public=\d)?`)
// 添加123网盘链接正则表达式
2025-11-08 15:04:40 +08:00
var Pan123Pattern = regexp.MustCompile(`https?://(?:www\.)?123(?:684|865|685|912|pan|592)\.(?:com|cn)/s/[a-zA-Z0-9_-]+(?:\?(?:%E6%8F%90%E5%8F%96%E7%A0%81|提取码)[:][a-zA-Z0-9]+)?`)
2025-07-12 19:53:44 +08:00
// 添加115网盘链接正则表达式
var Pan115Pattern = regexp.MustCompile(`https?://(?:115\.com|115cdn\.com|anxia\.com)/s/[a-zA-Z0-9]+(?:\?password=[a-zA-Z0-9]{4})?(?:#)?`)
// 添加阿里云盘链接正则表达式
var AliyunPanPattern = regexp.MustCompile(`https?://(?:www\.)?(?:alipan|aliyundrive)\.com/s/[a-zA-Z0-9]+`)
// 提取码匹配正则表达式 - 增强提取密码的能力
2025-11-08 15:04:40 +08:00
var PasswordPattern = regexp.MustCompile(`(?i)(?:(?:提取|访问|提取密|密)码|pwd)[:]\s*([a-zA-Z0-9]{4})(?:[^a-zA-Z0-9]|$)`)
var UrlPasswordPattern = regexp.MustCompile(`(?i)[?&]pwd=([a-zA-Z0-9]{4})(?:[^a-zA-Z0-9]|$)`)
2025-07-12 19:53:44 +08:00
// 百度网盘密码专用正则表达式 - 确保只提取4位密码
2025-11-08 15:04:40 +08:00
var BaiduPasswordPattern = regexp.MustCompile(`(?i)(?:链接:.*?提取码:|密码:|提取码:|pwd=|pwd:|pwd)([a-zA-Z0-9]{4})(?:[^a-zA-Z0-9]|$)`)
2025-07-12 19:53:44 +08:00
// GetLinkType 获取链接类型
func GetLinkType(url string) string {
url = strings.ToLower(url)
// 处理可能带有"链接:"前缀的情况
if strings.Contains(url, "链接:") || strings.Contains(url, "链接:") {
url = strings.Split(url, "链接")[1]
if strings.HasPrefix(url, "") || strings.HasPrefix(url, ":") {
url = url[1:]
}
url = strings.TrimSpace(url)
}
// 根据关键词判断ed2k链接
if strings.Contains(url, "ed2k:") {
return "ed2k"
}
if strings.HasPrefix(url, "magnet:") {
return "magnet"
}
if strings.Contains(url, "pan.baidu.com") {
return "baidu"
}
if strings.Contains(url, "pan.quark.cn") {
return "quark"
}
if strings.Contains(url, "alipan.com") || strings.Contains(url, "aliyundrive.com") {
return "aliyun"
}
if strings.Contains(url, "cloud.189.cn") {
return "tianyi"
}
if strings.Contains(url, "drive.uc.cn") {
return "uc"
}
if strings.Contains(url, "caiyun.139.com") {
return "mobile"
}
if strings.Contains(url, "115.com") || strings.Contains(url, "115cdn.com") || strings.Contains(url, "anxia.com") {
return "115"
}
if strings.Contains(url, "mypikpak.com") {
return "pikpak"
}
if strings.Contains(url, "pan.xunlei.com") {
return "xunlei"
}
// 123网盘有多个域名
2025-11-08 15:04:40 +08:00
if strings.Contains(url, "123684.com") || strings.Contains(url, "123685.com") || strings.Contains(url, "123865.com") ||
2025-07-12 19:53:44 +08:00
strings.Contains(url, "123912.com") || strings.Contains(url, "123pan.com") ||
strings.Contains(url, "123pan.cn") || strings.Contains(url, "123592.com") {
return "123"
}
return "others"
}
// CleanBaiduPanURL 清理百度网盘URL确保链接格式正确
func CleanBaiduPanURL(url string) string {
// 如果URL包含"https://pan.baidu.com/s/",提取出正确的链接部分
if strings.Contains(url, "https://pan.baidu.com/s/") {
// 找到链接的起始位置
startIdx := strings.Index(url, "https://pan.baidu.com/s/")
if startIdx >= 0 {
// 从起始位置开始提取
url = url[startIdx:]
// 查找可能的结束标记
endMarkers := []string{" ", "\n", "\t", "", "。", "", ";", "", ",", "?pwd="}
minEndIdx := len(url)
for _, marker := range endMarkers {
idx := strings.Index(url, marker)
if idx > 0 && idx < minEndIdx {
minEndIdx = idx
}
}
// 如果找到了结束标记,截取到结束标记位置
if minEndIdx < len(url) {
2025-11-08 15:04:40 +08:00
url = url[:minEndIdx]
2025-07-12 19:53:44 +08:00
}
2025-11-08 15:04:40 +08:00
// 特殊处理pwd参数确保只保留4位密码
2025-07-12 19:53:44 +08:00
if strings.Contains(url, "?pwd=") {
pwdIdx := strings.Index(url, "?pwd=")
2025-11-08 15:04:40 +08:00
if pwdIdx >= 0 && len(url) > pwdIdx+5 { // ?pwd= 有5个字符
// 只保留?pwd=后面的4位密码
pwdEndIdx := pwdIdx + 9 // ?pwd=xxxx 总共9个字符
if pwdEndIdx <= len(url) {
return url[:pwdEndIdx]
}
// 如果剩余字符不足4位返回所有可用字符
return url
2025-07-12 19:53:44 +08:00
}
}
}
}
return url
}
// CleanTianyiPanURL 清理天翼云盘URL确保链接格式正确
func CleanTianyiPanURL(url string) string {
// 如果URL包含"https://cloud.189.cn/t/",提取出正确的链接部分
if strings.Contains(url, "https://cloud.189.cn/t/") {
// 找到链接的起始位置
startIdx := strings.Index(url, "https://cloud.189.cn/t/")
if startIdx >= 0 {
// 从起始位置开始提取
url = url[startIdx:]
// 查找可能的结束标记
endMarkers := []string{" ", "\n", "\t", "", "。", "", ";", "", ",", "实时", "天翼", "更多"}
minEndIdx := len(url)
for _, marker := range endMarkers {
idx := strings.Index(url, marker)
if idx > 0 && idx < minEndIdx {
minEndIdx = idx
}
}
// 如果找到了结束标记,截取到结束标记位置
if minEndIdx < len(url) {
url = url[:minEndIdx]
}
// 标准化URL将URL编码转换为中文用于去重
if decoded, err := netUrl.QueryUnescape(url); err == nil {
url = decoded
2025-07-12 19:53:44 +08:00
}
}
}
return url
}
// CleanUCPanURL 清理UC网盘URL确保链接格式正确
func CleanUCPanURL(url string) string {
// 如果URL包含"https://drive.uc.cn/s/",提取出正确的链接部分
if strings.Contains(url, "https://drive.uc.cn/s/") {
// 找到链接的起始位置
startIdx := strings.Index(url, "https://drive.uc.cn/s/")
if startIdx >= 0 {
// 从起始位置开始提取
url = url[startIdx:]
// 查找可能的结束标记(包括常见的网盘名称,可能出现在链接后面)
endMarkers := []string{" ", "\n", "\t", "", "。", "", ";", "", ",", "网盘", "123", "夸克", "阿里", "百度"}
minEndIdx := len(url)
for _, marker := range endMarkers {
idx := strings.Index(url, marker)
if idx > 0 && idx < minEndIdx {
minEndIdx = idx
}
}
// 如果找到了结束标记,截取到结束标记位置
if minEndIdx < len(url) {
return url[:minEndIdx]
}
// 处理public参数
if strings.Contains(url, "?public=") {
publicIdx := strings.Index(url, "?public=")
if publicIdx > 0 {
// 确保只保留?public=1这样的参数不包含后面的文本
if publicIdx+9 <= len(url) { // ?public=1 总共9个字符
return url[:publicIdx+9]
}
return url[:publicIdx+8] // 如果参数不完整,至少保留?public=
}
}
}
}
return url
}
// Clean123PanURL 清理123网盘URL确保链接格式正确
func Clean123PanURL(url string) string {
// 检查是否为123网盘链接
2025-11-08 15:04:40 +08:00
domains := []string{"123684.com", "123685.com","123865.com", "123912.com", "123pan.com", "123pan.cn", "123592.com"}
2025-07-12 19:53:44 +08:00
isDomain123 := false
for _, domain := range domains {
if strings.Contains(url, domain+"/s/") {
isDomain123 = true
break
}
}
if isDomain123 {
// 确保链接有协议头
hasProtocol := strings.HasPrefix(url, "http://") || strings.HasPrefix(url, "https://")
// 找到链接的起始位置
startIdx := -1
for _, domain := range domains {
if idx := strings.Index(url, domain+"/s/"); idx >= 0 {
startIdx = idx
break
}
}
if startIdx >= 0 {
// 如果链接没有协议头,添加协议头
if !hasProtocol {
// 提取链接部分
linkPart := url[startIdx:]
// 添加协议头
url = "https://" + linkPart
} else if startIdx > 0 {
// 如果链接有协议头但可能包含前缀文本提取完整URL
protocolIdx := strings.Index(url, "://")
if protocolIdx >= 0 {
protocol := url[:protocolIdx+3]
url = protocol + url[startIdx:]
}
}
// 保留提取码参数,但需要处理可能的表情符号和其他无关文本
// 查找可能的结束标记(表情符号、标签标识等)
// 注意:我们不再将"提取码"作为结束标记因为它是URL的一部分
endMarkers := []string{" ", "\n", "\t", "", "。", "", ";", "", ",", "📁", "🔍", "标签", "🏷", "📎", "🔗", "📌", "📋", "📂", "🗂️", "🔖", "📚", "📒", "📔", "📕", "📓", "📗", "📘", "📙", "📄", "📃", "📑", "🧾", "📊", "📈", "📉", "🗒️", "🗓️", "📆", "📅", "🗑️", "🔒", "🔓", "🔏", "🔐", "🔑", "🗝️"}
minEndIdx := len(url)
for _, marker := range endMarkers {
idx := strings.Index(url, marker)
if idx > 0 && idx < minEndIdx {
minEndIdx = idx
}
}
// 如果找到了结束标记,截取到结束标记位置
if minEndIdx < len(url) {
return url[:minEndIdx]
}
// 标准化URL编码的提取码统一使用非编码形式
if strings.Contains(url, "%E6%8F%90%E5%8F%96%E7%A0%81") {
url = strings.Replace(url, "%E6%8F%90%E5%8F%96%E7%A0%81", "提取码", 1)
}
}
}
return url
}
// Clean115PanURL 清理115网盘URL确保链接格式正确
func Clean115PanURL(url string) string {
// 检查是否为115网盘链接
if strings.Contains(url, "115.com/s/") || strings.Contains(url, "115cdn.com/s/") || strings.Contains(url, "anxia.com/s/") {
// 找到链接的起始位置
startIdx := -1
if idx := strings.Index(url, "115.com/s/"); idx >= 0 {
startIdx = idx
} else if idx := strings.Index(url, "115cdn.com/s/"); idx >= 0 {
startIdx = idx
} else if idx := strings.Index(url, "anxia.com/s/"); idx >= 0 {
startIdx = idx
}
if startIdx >= 0 {
// 确保链接有协议头
hasProtocol := strings.HasPrefix(url, "http://") || strings.HasPrefix(url, "https://")
// 如果链接没有协议头,添加协议头
if !hasProtocol {
// 提取链接部分
linkPart := url[startIdx:]
// 添加协议头
url = "https://" + linkPart
} else if startIdx > 0 {
// 如果链接有协议头但可能包含前缀文本提取完整URL
protocolIdx := strings.Index(url, "://")
if protocolIdx >= 0 {
protocol := url[:protocolIdx+3]
url = protocol + url[startIdx:]
}
}
// 如果链接包含password参数确保只保留到password=xxxx部分4位密码
if strings.Contains(url, "?password=") {
pwdIdx := strings.Index(url, "?password=")
if pwdIdx > 0 && pwdIdx+14 <= len(url) { // ?password=xxxx 总共14个字符
// 截取到密码后面4位
url = url[:pwdIdx+14]
return url
}
}
// 如果链接包含#,截取到#位置
hashIdx := strings.Index(url, "#")
if hashIdx > 0 {
url = url[:hashIdx]
return url
}
}
}
return url
}
// CleanAliyunPanURL 清理阿里云盘URL确保链接格式正确
func CleanAliyunPanURL(url string) string {
// 如果URL包含阿里云盘域名提取出正确的链接部分
if strings.Contains(url, "alipan.com/s/") || strings.Contains(url, "aliyundrive.com/s/") {
2025-07-19 15:56:03 +08:00
// 找到链接的起始位置和域名部分
2025-07-12 19:53:44 +08:00
startIdx := -1
2025-07-19 15:56:03 +08:00
if idx := strings.Index(url, "www.alipan.com/s/"); idx >= 0 {
startIdx = idx
} else if idx := strings.Index(url, "alipan.com/s/"); idx >= 0 {
startIdx = idx
} else if idx := strings.Index(url, "www.aliyundrive.com/s/"); idx >= 0 {
2025-07-12 19:53:44 +08:00
startIdx = idx
} else if idx := strings.Index(url, "aliyundrive.com/s/"); idx >= 0 {
startIdx = idx
}
if startIdx >= 0 {
// 确保链接有协议头
hasProtocol := strings.HasPrefix(url, "http://") || strings.HasPrefix(url, "https://")
// 如果链接没有协议头,添加协议头
if !hasProtocol {
// 提取链接部分
linkPart := url[startIdx:]
// 添加协议头
url = "https://" + linkPart
} else if startIdx > 0 {
// 如果链接有协议头但可能包含前缀文本提取完整URL
protocolIdx := strings.Index(url, "://")
if protocolIdx >= 0 {
protocol := url[:protocolIdx+3]
url = protocol + url[startIdx:]
}
}
// 查找可能的结束标记(表情符号、标签标识等)
endMarkers := []string{" ", "\n", "\t", "", "。", "", ";", "", ",", "📁", "🔍", "标签", "🏷", "📎", "🔗", "📌", "📋", "📂", "🗂️", "🔖", "📚", "📒", "📔", "📕", "📓", "📗", "📘", "📙", "📄", "📃", "📑", "🧾", "📊", "📈", "📉", "🗒️", "🗓️", "📆", "📅", "🗑️", "🔒", "🔓", "🔏", "🔐", "🔑", "🗝️"}
minEndIdx := len(url)
for _, marker := range endMarkers {
idx := strings.Index(url, marker)
if idx > 0 && idx < minEndIdx {
minEndIdx = idx
}
}
// 如果找到了结束标记,截取到结束标记位置
if minEndIdx < len(url) {
return url[:minEndIdx]
}
}
}
return url
}
// normalizeAliyunPanURL 标准化阿里云盘URL确保链接格式正确
func normalizeAliyunPanURL(url string, password string) string {
// 清理URL确保获取正确的链接部分
url = CleanAliyunPanURL(url)
// 阿里云盘链接通常不在URL中包含密码参数
// 但是我们确保返回的是干净的链接
return url
}
// ExtractPassword 提取链接密码
func ExtractPassword(content, url string) string {
// 特殊处理天翼云盘URL中的访问码
if strings.Contains(url, "cloud.189.cn") {
// 天翼云盘访问码格式访问码xxxx或者URL编码形式
tianyiPasswordPattern := regexp.MustCompile(`(?:(访问码:|%EF%BC%88%E8%AE%BF%E9%97%AE%E7%A0%81%EF%BC%9A)([a-zA-Z0-9]+)(?:|%EF%BC%89)`)
tianyiMatches := tianyiPasswordPattern.FindStringSubmatch(url)
if len(tianyiMatches) > 1 {
return tianyiMatches[1]
}
}
2025-11-08 15:04:40 +08:00
// 特殊处理迅雷网盘URL中的pwd参数
if strings.Contains(url, "pan.xunlei.com") && strings.Contains(url, "?pwd=") {
pwdPattern := regexp.MustCompile(`\?pwd=([a-zA-Z0-9]{4})`)
pwdMatches := pwdPattern.FindStringSubmatch(url)
if len(pwdMatches) > 1 {
return pwdMatches[1]
}
}
2025-07-12 19:53:44 +08:00
// 先从URL中提取密码
matches := UrlPasswordPattern.FindStringSubmatch(url)
if len(matches) > 1 {
return matches[1]
}
// 特殊处理115网盘URL中的密码
if (strings.Contains(url, "115.com") ||
strings.Contains(url, "115cdn.com") ||
strings.Contains(url, "anxia.com")) &&
strings.Contains(url, "password=") {
// 尝试从URL中提取密码
passwordPattern := regexp.MustCompile(`password=([a-zA-Z0-9]{4})`)
passwordMatches := passwordPattern.FindStringSubmatch(url)
if len(passwordMatches) > 1 {
return passwordMatches[1]
}
}
// 特殊处理123网盘URL中的提取码
if (strings.Contains(url, "123684.com") ||
strings.Contains(url, "123685.com") ||
2025-11-08 15:04:40 +08:00
strings.Contains(url, "123865.com") ||
2025-07-12 19:53:44 +08:00
strings.Contains(url, "123912.com") ||
strings.Contains(url, "123pan.com") ||
strings.Contains(url, "123pan.cn") ||
strings.Contains(url, "123592.com")) &&
(strings.Contains(url, "提取码") || strings.Contains(url, "%E6%8F%90%E5%8F%96%E7%A0%81")) {
// 尝试从URL中提取提取码处理普通文本和URL编码两种情况
extractCodePattern := regexp.MustCompile(`(?:提取码|%E6%8F%90%E5%8F%96%E7%A0%81)[:]([a-zA-Z0-9]+)`)
codeMatches := extractCodePattern.FindStringSubmatch(url)
if len(codeMatches) > 1 {
return codeMatches[1]
}
}
// 检查123网盘URL中的提取码参数
if (strings.Contains(url, "123684.com") ||
strings.Contains(url, "123685.com") ||
2025-11-08 15:04:40 +08:00
strings.Contains(url, "123865.com") ||
2025-07-12 19:53:44 +08:00
strings.Contains(url, "123912.com") ||
strings.Contains(url, "123pan.com") ||
strings.Contains(url, "123pan.cn") ||
strings.Contains(url, "123592.com")) &&
strings.Contains(url, "提取码") {
// 尝试从URL中提取提取码
parts := strings.Split(url, "提取码")
if len(parts) > 1 {
// 提取码通常跟在冒号后面
codeStart := strings.IndexAny(parts[1], ":")
if codeStart >= 0 && codeStart+1 < len(parts[1]) {
// 提取冒号后面的内容,去除空格
code := strings.TrimSpace(parts[1][codeStart+1:])
// 如果提取码后面有其他字符(如表情符号、标签等),只取提取码部分
// 增加更多可能的结束标记
endIdx := strings.IndexAny(code, " \t\n\r;,🏷📁🔍📎🔗📌📋📂🗂🔖📚📒📔📕📓📗📘📙📄📃📑🧾📊📈📉🗒🗓📆<EFB88F><F09F9386>🗑🔒🔓🔏🔐🔑🗝")
if endIdx > 0 {
code = code[:endIdx]
}
// 去除可能的空格和其他无关字符
code = strings.TrimSpace(code)
// 确保提取码是有效的通常是4位字母数字
if len(code) > 0 && len(code) <= 6 && isValidPassword(code) {
return code
}
}
}
}
// 检查内容中是否包含"提取码"字样
if strings.Contains(content, "提取码") {
// 尝试从内容中提取提取码
parts := strings.Split(content, "提取码")
for _, part := range parts {
if len(part) > 0 {
// 提取码通常跟在冒号后面
codeStart := strings.IndexAny(part, ":")
if codeStart >= 0 && codeStart+1 < len(part) {
// 提取冒号后面的内容,去除空格
code := strings.TrimSpace(part[codeStart+1:])
// 如果提取码后面有其他字符,只取提取码部分
endIdx := strings.IndexAny(code, " \t\n\r;,🏷📁🔍📎🔗📌📋📂🗂️🔖📚📒📔📕📓📗📘📙📄📃📑🧾📊📈📉🗒️🗓️📆📅🗑️🔒🔓🔏🔐🔑🗝️")
if endIdx > 0 {
code = code[:endIdx]
} else {
// 如果没有明显的结束标记假设提取码是4-6位字符
if len(code) > 6 {
// 检查前4-6位是否是有效的提取码
for i := 4; i <= 6 && i <= len(code); i++ {
if isValidPassword(code[:i]) {
code = code[:i]
break
}
}
// 如果没有找到有效的提取码取前4位
if len(code) > 6 {
code = code[:4]
}
}
}
// 去除可能的空格和其他无关字符
code = strings.TrimSpace(code)
// 如果提取码不为空且是有效的,返回
if code != "" && isValidPassword(code) {
return code
}
}
}
}
}
// 再从内容中提取密码
// 对于百度网盘链接,尝试查找特定格式的密码
if strings.Contains(strings.ToLower(url), "pan.baidu.com") {
// 尝试匹配百度网盘特定格式的密码
baiduMatches := BaiduPasswordPattern.FindStringSubmatch(content)
if len(baiduMatches) > 1 {
return baiduMatches[1]
}
}
// 通用密码提取
matches = PasswordPattern.FindStringSubmatch(content)
if len(matches) > 1 {
return matches[1]
}
return ""
}
// isValidPassword 检查提取码是否有效(只包含字母和数字)
func isValidPassword(password string) bool {
for _, c := range password {
if !((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
return false
}
}
return true
}
// ExtractNetDiskLinks 从文本中提取所有网盘链接
func ExtractNetDiskLinks(text string) []string {
var links []string
// 提取百度网盘链接
baiduMatches := BaiduPanPattern.FindAllString(text, -1)
for _, match := range baiduMatches {
// 清理并添加百度网盘链接
cleanURL := CleanBaiduPanURL(match)
2025-07-21 18:51:58 +08:00
// 确保链接末尾不包含https
if strings.HasSuffix(cleanURL, "https") {
cleanURL = cleanURL[:len(cleanURL)-5]
}
2025-07-12 19:53:44 +08:00
if cleanURL != "" {
links = append(links, cleanURL)
}
}
// 提取天翼云盘链接
tianyiMatches := TianyiPanPattern.FindAllString(text, -1)
for _, match := range tianyiMatches {
// 清理并添加天翼云盘链接
cleanURL := CleanTianyiPanURL(match)
2025-07-21 18:51:58 +08:00
// 确保链接末尾不包含https
if strings.HasSuffix(cleanURL, "https") {
cleanURL = cleanURL[:len(cleanURL)-5]
}
2025-07-12 19:53:44 +08:00
if cleanURL != "" {
links = append(links, cleanURL)
}
}
// 提取UC网盘链接
ucMatches := UCPanPattern.FindAllString(text, -1)
for _, match := range ucMatches {
// 清理并添加UC网盘链接
cleanURL := CleanUCPanURL(match)
2025-07-21 18:51:58 +08:00
// 确保链接末尾不包含https
if strings.HasSuffix(cleanURL, "https") {
cleanURL = cleanURL[:len(cleanURL)-5]
}
2025-07-12 19:53:44 +08:00
if cleanURL != "" {
links = append(links, cleanURL)
}
}
// 提取123网盘链接
pan123Matches := Pan123Pattern.FindAllString(text, -1)
for _, match := range pan123Matches {
// 清理并添加123网盘链接
cleanURL := Clean123PanURL(match)
2025-07-21 18:51:58 +08:00
// 确保链接末尾不包含https
if strings.HasSuffix(cleanURL, "https") {
cleanURL = cleanURL[:len(cleanURL)-5]
}
2025-07-12 19:53:44 +08:00
if cleanURL != "" {
// 检查是否已经存在相同的链接比较完整URL
isDuplicate := false
for _, existingLink := range links {
// 标准化链接以进行比较(仅移除协议)
normalizedExisting := normalizeURLForComparison(existingLink)
normalizedNew := normalizeURLForComparison(cleanURL)
if normalizedExisting == normalizedNew {
isDuplicate = true
break
}
}
if !isDuplicate {
links = append(links, cleanURL)
}
}
}
// 提取115网盘链接
pan115Matches := Pan115Pattern.FindAllString(text, -1)
for _, match := range pan115Matches {
// 清理并添加115网盘链接
cleanURL := Clean115PanURL(match) // 115网盘链接的清理逻辑与123网盘类似
2025-07-21 18:51:58 +08:00
// 确保链接末尾不包含https
if strings.HasSuffix(cleanURL, "https") {
cleanURL = cleanURL[:len(cleanURL)-5]
}
2025-07-12 19:53:44 +08:00
if cleanURL != "" {
// 检查是否已经存在相同的链接比较完整URL
isDuplicate := false
for _, existingLink := range links {
normalizedExisting := normalizeURLForComparison(existingLink)
normalizedNew := normalizeURLForComparison(cleanURL)
if normalizedExisting == normalizedNew {
isDuplicate = true
break
}
}
if !isDuplicate {
links = append(links, cleanURL)
}
}
}
// 提取阿里云盘链接
aliyunMatches := AliyunPanPattern.FindAllString(text, -1)
if aliyunMatches != nil {
for _, match := range aliyunMatches {
// 清理并添加阿里云盘链接
cleanURL := CleanAliyunPanURL(match)
2025-07-21 18:51:58 +08:00
// 确保链接末尾不包含https
if strings.HasSuffix(cleanURL, "https") {
cleanURL = cleanURL[:len(cleanURL)-5]
}
2025-07-12 19:53:44 +08:00
if cleanURL != "" {
// 检查是否已经存在相同的链接
isDuplicate := false
for _, existingLink := range links {
normalizedExisting := normalizeURLForComparison(existingLink)
normalizedNew := normalizeURLForComparison(cleanURL)
if normalizedExisting == normalizedNew {
isDuplicate = true
break
}
}
if !isDuplicate {
links = append(links, cleanURL)
}
}
}
}
// 提取夸克网盘链接
quarkLinks := QuarkPanPattern.FindAllString(text, -1)
if quarkLinks != nil {
for _, match := range quarkLinks {
2025-07-21 18:51:58 +08:00
// 确保链接末尾不包含https
cleanURL := match
if strings.HasSuffix(cleanURL, "https") {
cleanURL = cleanURL[:len(cleanURL)-5]
}
2025-07-12 19:53:44 +08:00
// 检查是否已经存在相同的链接
isDuplicate := false
for _, existingLink := range links {
2025-07-21 18:51:58 +08:00
if strings.Contains(existingLink, cleanURL) || strings.Contains(cleanURL, existingLink) {
2025-07-12 19:53:44 +08:00
isDuplicate = true
break
}
}
if !isDuplicate {
2025-07-21 18:51:58 +08:00
links = append(links, cleanURL)
2025-07-12 19:53:44 +08:00
}
}
}
// 提取迅雷网盘链接
xunleiLinks := XunleiPanPattern.FindAllString(text, -1)
if xunleiLinks != nil {
for _, match := range xunleiLinks {
2025-07-21 18:51:58 +08:00
// 确保链接末尾不包含https
cleanURL := match
if strings.HasSuffix(cleanURL, "https") {
cleanURL = cleanURL[:len(cleanURL)-5]
}
2025-07-12 19:53:44 +08:00
// 检查是否已经存在相同的链接
isDuplicate := false
for _, existingLink := range links {
2025-07-21 18:51:58 +08:00
if strings.Contains(existingLink, cleanURL) || strings.Contains(cleanURL, existingLink) {
2025-07-12 19:53:44 +08:00
isDuplicate = true
break
}
}
if !isDuplicate {
2025-07-21 18:51:58 +08:00
links = append(links, cleanURL)
2025-07-12 19:53:44 +08:00
}
}
}
// 使用通用模式提取其他可能的链接
otherLinks := AllPanLinksPattern.FindAllString(text, -1)
if otherLinks != nil {
// 过滤掉已经添加过的链接
for _, link := range otherLinks {
2025-07-21 18:51:58 +08:00
// 确保链接末尾不包含https
cleanURL := link
if strings.HasSuffix(cleanURL, "https") {
cleanURL = cleanURL[:len(cleanURL)-5]
}
2025-07-12 19:53:44 +08:00
// 跳过百度、夸克、迅雷、天翼、UC和123网盘链接因为已经单独处理过
2025-07-21 18:51:58 +08:00
if strings.Contains(cleanURL, "pan.baidu.com") ||
strings.Contains(cleanURL, "pan.quark.cn") ||
strings.Contains(cleanURL, "pan.xunlei.com") ||
strings.Contains(cleanURL, "cloud.189.cn") ||
strings.Contains(cleanURL, "drive.uc.cn") ||
strings.Contains(cleanURL, "123684.com") ||
strings.Contains(cleanURL, "123685.com") ||
2025-11-08 15:04:40 +08:00
strings.Contains(cleanURL, "123865.com") ||
2025-07-21 18:51:58 +08:00
strings.Contains(cleanURL, "123912.com") ||
strings.Contains(cleanURL, "123pan.com") ||
strings.Contains(cleanURL, "123pan.cn") ||
strings.Contains(cleanURL, "123592.com") {
2025-07-12 19:53:44 +08:00
continue
}
isDuplicate := false
for _, existingLink := range links {
normalizedExisting := normalizeURLForComparison(existingLink)
2025-07-21 18:51:58 +08:00
normalizedNew := normalizeURLForComparison(cleanURL)
2025-07-12 19:53:44 +08:00
// 使用完整URL比较包括www.前缀
if normalizedExisting == normalizedNew ||
strings.Contains(normalizedExisting, normalizedNew) ||
strings.Contains(normalizedNew, normalizedExisting) {
isDuplicate = true
break
}
}
if !isDuplicate {
2025-07-21 18:51:58 +08:00
links = append(links, cleanURL)
2025-07-12 19:53:44 +08:00
}
}
}
return links
}
// normalizeURLForComparison 标准化URL以便于比较
// 移除协议头,标准化提取码,保留完整域名用于比较
func normalizeURLForComparison(url string) string {
// 移除协议头
if idx := strings.Index(url, "://"); idx >= 0 {
url = url[idx+3:]
}
// 标准化URL编码的提取码统一使用非编码形式
if strings.Contains(url, "%E6%8F%90%E5%8F%96%E7%A0%81") {
url = strings.Replace(url, "%E6%8F%90%E5%8F%96%E7%A0%81", "提取码", 1)
}
return url
}