feat: 新增网页解析类,优化依赖及图片编码支持

This commit is contained in:
begoniezhao
2025-11-17 17:08:20 +08:00
committed by lyingbug
parent 2d66abedf0
commit 4fdbec17a7
5 changed files with 161 additions and 64 deletions

View File

@@ -71,6 +71,7 @@ class BaseParser(ABC):
max_concurrent_tasks: int = 5, # Max concurrent tasks
max_chunks: int = 1000, # Max number of returned chunks
chunking_config: Optional[ChunkingConfig] = None,
**kwargs,
):
"""Initialize parser

View File

@@ -12,7 +12,7 @@ logger = logging.getLogger(__name__)
class MarkdownImageUtil:
def __init__(self):
self.b64_pattern = re.compile(
r"!\[([^\]]*)\]\(data:image/(\w+);base64,([^\)]+)\)"
r"!\[([^\]]*)\]\(data:image/(\w+)\+?\w*;base64,([^\)]+)\)"
)
self.image_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
self.replace_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")

View File

@@ -1,19 +1,20 @@
import asyncio
import logging
import os
from typing import Any
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from trafilatura import extract
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.chain_parser import PipelineParser
from docreader.parser.markdown_parser import MarkdownParser
from docreader.utils import endecode
logger = logging.getLogger(__name__)
class WebParser(BaseParser):
class StdWebParser(BaseParser):
"""Web page parser"""
def __init__(self, title: str, **kwargs):
@@ -22,7 +23,7 @@ class WebParser(BaseParser):
super().__init__(file_name=title, **kwargs)
logger.info(f"Initialized WebParser with title: {title}")
async def scrape(self, url: str) -> Any:
async def scrape(self, url: str) -> str:
logger.info(f"Starting web page scraping for URL: {url}")
try:
async with async_playwright() as p:
@@ -40,9 +41,7 @@ class WebParser(BaseParser):
except Exception as e:
logger.error(f"Error navigating to URL: {str(e)}")
await browser.close()
return BeautifulSoup(
"", "html.parser"
) # Return empty soup on navigation error
return ""
logger.info("Retrieving page HTML content")
content = await page.content()
@@ -53,14 +52,13 @@ class WebParser(BaseParser):
# Parse HTML content with BeautifulSoup
logger.info("Parsing HTML with BeautifulSoup")
soup = BeautifulSoup(content, "html.parser")
logger.info("Successfully parsed HTML content")
return soup
return content
except Exception as e:
logger.error(f"Failed to scrape web page: {str(e)}")
# Return empty BeautifulSoup object on error
return BeautifulSoup("", "html.parser")
return ""
def parse_into_text(self, content: bytes) -> Document:
"""Parse web page
@@ -71,63 +69,36 @@ class WebParser(BaseParser):
Returns:
Parse result
"""
logger.info("Starting web page parsing")
url = endecode.decode_bytes(content)
# Call async method synchronously
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
logger.info(f"Scraping web page: {url}")
chtml = asyncio.run(self.scrape(url))
md_text = extract(
chtml,
output_format="markdown",
with_metadata=True,
include_images=True,
include_tables=True,
include_links=True,
deduplicate=True,
)
if not md_text:
logger.error("Failed to parse web page")
return Document(content=f"Error parsing web page: {url}")
return Document(content=md_text)
try:
# Run async method
# Handle content possibly being a string
if isinstance(content, bytes):
url = endecode.decode_bytes(content)
logger.info(f"Decoded URL from bytes: {url}")
else:
url = str(content)
logger.info(f"Using content as URL directly: {url}")
logger.info(f"Scraping web page: {url}")
soup = loop.run_until_complete(self.scrape(url))
class WebParser(PipelineParser):
_parser_cls = (StdWebParser, MarkdownParser)
# Extract page text
logger.info("Extracting text from web page")
text = soup.get_text("\n")
logger.info(f"Extracted {len(text)} characters of text from URL: {url}")
# Get title, usually in <title> or <h1> tag
if self.title != "":
title = self.title
logger.info(f"Using provided title: {title}")
else:
title = soup.title.string if soup.title else None
logger.info(f"Found title tag: {title}")
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
logger.setLevel(logging.DEBUG)
if not title: # If <title> tag does not exist or is empty, try <h1> tag
h1_tag = soup.find("h1")
if h1_tag:
title = h1_tag.get_text()
logger.info(f"Using h1 tag as title: {title}")
else:
title = "Untitled Web Page"
logger.info("No title found, using default")
url = "https://cloud.tencent.com/document/product/457/6759"
logger.info(f"Web page title: {title}")
text = "\n".join(
(line.strip() for line in text.splitlines() if line.strip())
)
result = title + "\n\n" + text
logger.info(
f"Web page parsing complete, total content: {len(result)} characters"
)
return Document(content=result)
except Exception as e:
logger.error(f"Error parsing web page: {str(e)}")
return Document(content=f"Error parsing web page: {str(e)}")
finally:
# Close event loop
logger.info("Closing event loop")
loop.close()
parser = WebParser(title="")
cc = parser.parse_into_text(url.encode())
with open("./tencent.md", "w") as f:
f.write(cc.content)

View File

@@ -33,5 +33,6 @@ dependencies = [
"python-docx>=1.2.0",
"requests>=2.32.5",
"textract==1.5.0",
"trafilatura>=2.0.0",
"urllib3>=2.5.0",
]

124
docreader/uv.lock generated
View File

@@ -214,6 +214,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/f6/22/91616fe707a5c5510de2cac9b046a30defe7007ba8a0c04f9c08f27df312/audioop_lts-0.2.2-cp314-cp314t-win_arm64.whl", hash = "sha256:b492c3b040153e68b9fdaff5913305aaaba5bb433d8a7f73d5cf6a64ed3cc1dd", size = 25206, upload-time = "2025-08-05T16:43:16.444Z" },
]
[[package]]
name = "babel"
version = "2.17.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/7d/6b/d52e42361e1aa00709585ecc30b3f9684b3ab62530771402248b1b1d6240/babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d", size = 9951852, upload-time = "2025-02-01T15:17:41.026Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2", size = 10182537, upload-time = "2025-02-01T15:17:37.39Z" },
]
[[package]]
name = "beautifulsoup4"
version = "4.14.2"
@@ -474,6 +483,20 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ab/c8/c9c156aa3bc7caba9b4f8a2b6abec3da6263215988f3fec0ea843f137a10/cos_python_sdk_v5-1.9.38-py3-none-any.whl", hash = "sha256:1d3dd3be2bd992b2e9c2dcd018e2596aa38eab022dbc86b4a5d14c8fc88370e6", size = 92601, upload-time = "2025-08-17T05:12:30.867Z" },
]
[[package]]
name = "courlan"
version = "1.3.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "babel" },
{ name = "tld" },
{ name = "urllib3" },
]
sdist = { url = "https://files.pythonhosted.org/packages/6f/54/6d6ceeff4bed42e7a10d6064d35ee43a810e7b3e8beb4abeae8cff4713ae/courlan-1.3.2.tar.gz", hash = "sha256:0b66f4db3a9c39a6e22dd247c72cfaa57d68ea660e94bb2c84ec7db8712af190", size = 206382, upload-time = "2024-10-29T16:40:20.994Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/8e/ca/6a667ccbe649856dcd3458bab80b016681b274399d6211187c6ab969fc50/courlan-1.3.2-py3-none-any.whl", hash = "sha256:d0dab52cf5b5b1000ee2839fbc2837e93b2514d3cb5bb61ae158a55b7a04c6be", size = 33848, upload-time = "2024-10-29T16:40:18.325Z" },
]
[[package]]
name = "crcmod"
version = "1.7"
@@ -613,6 +636,21 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/18/d5/7a04640bf559bb890455ffb28978daf7d44f667c3f04a4d422c655c1ba92/cython-3.1.6-py3-none-any.whl", hash = "sha256:91dcf7eb9b6a089ce4e9e1140e571d84c3bca834afb77ec269be7aa9d31a8157", size = 1223550, upload-time = "2025-10-23T12:38:16.732Z" },
]
[[package]]
name = "dateparser"
version = "1.2.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "python-dateutil" },
{ name = "pytz" },
{ name = "regex" },
{ name = "tzlocal" },
]
sdist = { url = "https://files.pythonhosted.org/packages/a9/30/064144f0df1749e7bb5faaa7f52b007d7c2d08ec08fed8411aba87207f68/dateparser-1.2.2.tar.gz", hash = "sha256:986316f17cb8cdc23ea8ce563027c5ef12fc725b6fb1d137c14ca08777c5ecf7", size = 329840, upload-time = "2025-06-26T09:29:23.211Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/87/22/f020c047ae1346613db9322638186468238bcfa8849b4668a22b97faad65/dateparser-1.2.2-py3-none-any.whl", hash = "sha256:5a5d7211a09013499867547023a2a0c91d5a27d15dd4dbcea676ea9fe66f2482", size = 315453, upload-time = "2025-06-26T09:29:21.412Z" },
]
[[package]]
name = "defusedxml"
version = "0.7.1"
@@ -664,6 +702,7 @@ dependencies = [
{ name = "python-docx" },
{ name = "requests" },
{ name = "textract" },
{ name = "trafilatura" },
{ name = "urllib3" },
]
@@ -697,6 +736,7 @@ requires-dist = [
{ name = "python-docx", specifier = ">=1.2.0" },
{ name = "requests", specifier = ">=2.32.5" },
{ name = "textract", specifier = "==1.5.0" },
{ name = "trafilatura", specifier = ">=2.0.0" },
{ name = "urllib3", specifier = ">=2.5.0" },
]
@@ -1100,6 +1140,22 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" },
]
[[package]]
name = "htmldate"
version = "1.9.4"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "charset-normalizer" },
{ name = "dateparser" },
{ name = "lxml" },
{ name = "python-dateutil" },
{ name = "urllib3" },
]
sdist = { url = "https://files.pythonhosted.org/packages/9d/10/ead9dabc999f353c3aa5d0dc0835b1e355215a5ecb489a7f4ef2ddad5e33/htmldate-1.9.4.tar.gz", hash = "sha256:1129063e02dd0354b74264de71e950c0c3fcee191178321418ccad2074cc8ed0", size = 44690, upload-time = "2025-11-04T17:46:44.983Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a1/bd/adfcdaaad5805c0c5156aeefd64c1e868c05e9c1cd6fd21751f168cd88c7/htmldate-1.9.4-py3-none-any.whl", hash = "sha256:1b94bcc4e08232a5b692159903acf95548b6a7492dddca5bb123d89d6325921c", size = 31558, upload-time = "2025-11-04T17:46:43.258Z" },
]
[[package]]
name = "httpcore"
version = "1.0.9"
@@ -1275,6 +1331,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/1e/e8/685f47e0d754320684db4425a0967f7d3fa70126bffd76110b7009a0090f/joblib-1.5.2-py3-none-any.whl", hash = "sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241", size = 308396, upload-time = "2025-08-27T12:15:45.188Z" },
]
[[package]]
name = "justext"
version = "3.0.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "lxml", extra = ["html-clean"] },
]
sdist = { url = "https://files.pythonhosted.org/packages/49/f3/45890c1b314f0d04e19c1c83d534e611513150939a7cf039664d9ab1e649/justext-3.0.2.tar.gz", hash = "sha256:13496a450c44c4cd5b5a75a5efcd9996066d2a189794ea99a49949685a0beb05", size = 828521, upload-time = "2025-02-25T20:21:49.934Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/f2/ac/52f4e86d1924a7fc05af3aeb34488570eccc39b4af90530dd6acecdf16b5/justext-3.0.2-py2.py3-none-any.whl", hash = "sha256:62b1c562b15c3c6265e121cc070874243a443bfd53060e869393f09d6b6cc9a7", size = 837940, upload-time = "2025-02-25T20:21:44.179Z" },
]
[[package]]
name = "langdetect"
version = "1.0.9"
@@ -1465,6 +1533,23 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/6c/77/d7f491cbc05303ac6801651aabeb262d43f319288c1ea96c66b1d2692ff3/lxml-6.0.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:27220da5be049e936c3aca06f174e8827ca6445a4353a1995584311487fc4e3e", size = 3518768, upload-time = "2025-09-22T04:04:57.097Z" },
]
[package.optional-dependencies]
html-clean = [
{ name = "lxml-html-clean" },
]
[[package]]
name = "lxml-html-clean"
version = "0.4.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "lxml" },
]
sdist = { url = "https://files.pythonhosted.org/packages/d9/cb/c9c5bb2a9c47292e236a808dd233a03531f53b626f36259dcd32b49c76da/lxml_html_clean-0.4.3.tar.gz", hash = "sha256:c9df91925b00f836c807beab127aac82575110eacff54d0a75187914f1bd9d8c", size = 21498, upload-time = "2025-10-02T20:49:24.895Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/10/4a/63a9540e3ca73709f4200564a737d63a4c8c9c4dd032bab8535f507c190a/lxml_html_clean-0.4.3-py3-none-any.whl", hash = "sha256:63fd7b0b9c3a2e4176611c2ca5d61c4c07ffca2de76c14059a81a2825833731e", size = 14177, upload-time = "2025-10-02T20:49:23.749Z" },
]
[[package]]
name = "magika"
version = "0.6.3"
@@ -3531,6 +3616,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e6/5e/56c751afab61336cf0e7aa671b134255a30f15f59cd9e04f59c598a37ff5/tifffile-2025.10.16-py3-none-any.whl", hash = "sha256:41463d979c1c262b0a5cdef2a7f95f0388a072ad82d899458b154a48609d759c", size = 231162, upload-time = "2025-10-16T22:56:07.214Z" },
]
[[package]]
name = "tld"
version = "0.13.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/df/a1/5723b07a70c1841a80afc9ac572fdf53488306848d844cd70519391b0d26/tld-0.13.1.tar.gz", hash = "sha256:75ec00936cbcf564f67361c41713363440b6c4ef0f0c1592b5b0fbe72c17a350", size = 462000, upload-time = "2025-05-21T22:18:29.341Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/dc/70/b2f38360c3fc4bc9b5e8ef429e1fde63749144ac583c2dbdf7e21e27a9ad/tld-0.13.1-py2.py3-none-any.whl", hash = "sha256:a2d35109433ac83486ddf87e3c4539ab2c5c2478230e5d9c060a18af4b03aa7c", size = 274718, upload-time = "2025-05-21T22:18:25.811Z" },
]
[[package]]
name = "tqdm"
version = "4.67.1"
@@ -3543,6 +3637,24 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
]
[[package]]
name = "trafilatura"
version = "2.0.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "certifi" },
{ name = "charset-normalizer" },
{ name = "courlan" },
{ name = "htmldate" },
{ name = "justext" },
{ name = "lxml" },
{ name = "urllib3" },
]
sdist = { url = "https://files.pythonhosted.org/packages/06/25/e3ebeefdebfdfae8c4a4396f5a6ea51fc6fa0831d63ce338e5090a8003dc/trafilatura-2.0.0.tar.gz", hash = "sha256:ceb7094a6ecc97e72fea73c7dba36714c5c5b577b6470e4520dca893706d6247", size = 253404, upload-time = "2024-12-03T15:23:24.16Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/8a/b6/097367f180b6383a3581ca1b86fcae284e52075fa941d1232df35293363c/trafilatura-2.0.0-py3-none-any.whl", hash = "sha256:77eb5d1e993747f6f20938e1de2d840020719735690c840b9a1024803a4cd51d", size = 132557, upload-time = "2024-12-03T15:23:21.41Z" },
]
[[package]]
name = "typing-extensions"
version = "4.15.0"
@@ -3573,6 +3685,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
]
[[package]]
name = "tzlocal"
version = "5.3.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "tzdata", marker = "sys_platform == 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/8b/2e/c14812d3d4d9cd1773c6be938f89e5735a1f11a9f184ac3639b93cef35d5/tzlocal-5.3.1.tar.gz", hash = "sha256:cceffc7edecefea1f595541dbd6e990cb1ea3d19bf01b2809f362a03dd7921fd", size = 30761, upload-time = "2025-03-05T21:17:41.549Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c2/14/e2a54fabd4f08cd7af1c07030603c3356b74da07f7cc056e600436edfa17/tzlocal-5.3.1-py3-none-any.whl", hash = "sha256:eb1a66c3ef5847adf7a834f1be0800581b683b5608e74f86ecbcef8ab91bb85d", size = 18026, upload-time = "2025-03-05T21:17:39.857Z" },
]
[[package]]
name = "unidic-lite"
version = "1.0.8"