57 lines
2.2 KiB
Python
57 lines
2.2 KiB
Python
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
|
||
import os
|
||
|
||
|
||
async def crawl_missav(url: str) -> dict:
    """Open *url* in Chromium and scrape its title, video sources and serial number.

    A non-headless Chromium instance is launched through Playwright. When the
    ``PROXY`` environment variable is set, its value is used as the proxy
    server for the browser.

    Args:
        url: Address of the page to load.

    Returns:
        A dict with three keys:
        ``'title'`` - the page title (empty string if the page never loaded),
        ``'url'`` - list of ``src`` attribute values of every ``<video>`` element
        that has one,
        ``'serial_number'`` - stripped text of the element that directly follows
        the first ``<span>`` containing ``番号:`` and having a non-empty next
        sibling (empty string if none is found).

    A Playwright timeout is reported on stdout and the partially filled result
    is returned; other exceptions raised while loading the page propagate to
    the caller after the browser has been closed.
    """
    result = {
        'title': '',
        'url': [],
        'serial_number': '',  # filled from the element following the "番号:" label
    }
    launch_args = {
        "headless": False,
        "args": ["--disable-blink-features=AutomationControlled"],
    }

    # Take the proxy server from the environment, if one is configured.
    env_proxy = os.getenv('PROXY')
    if env_proxy:
        launch_args["proxy"] = {"server": env_proxy}

    async with async_playwright() as p:
        browser = await p.chromium.launch(**launch_args)
        try:
            # Page creation lives inside the try so the browser is closed
            # even if opening the page itself fails.
            page = await browser.new_page()
            page.set_default_timeout(60000)

            await page.goto(url, wait_until="domcontentloaded")
            result['title'] = await page.title()

            for video in await page.query_selector_all("video"):
                src = await video.get_attribute("src")
                if src is not None:
                    result['url'].append(src)

            # Serial number lookup is best-effort: any failure here is logged
            # and must not discard the title/URLs already collected.
            try:
                for span in await page.query_selector_all('span'):
                    span_text = await span.text_content()
                    if not span_text or '番号:' not in span_text:
                        continue

                    # evaluate_handle always returns a handle object, even when
                    # nextElementSibling is null; as_element() yields None in
                    # that case so we can skip this span instead of failing.
                    sibling_handle = await span.evaluate_handle(
                        'element => element.nextElementSibling'
                    )
                    sibling = sibling_handle.as_element()
                    if sibling is None:
                        continue

                    sibling_text = await sibling.text_content()
                    if sibling_text:
                        result['serial_number'] = sibling_text.strip()
                        break  # the first match is enough
            except Exception as e:
                print(f"[INFO] 查找番号时出错: {e}")

        except PlaywrightTimeoutError:
            print("[ERROR] 页面加载超时,可能被 Cloudflare 拦截")
        finally:
            await browser.close()

    return result