MissAV-Download/function.py
2025-11-07 11:15:09 +08:00

57 lines
2.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
import os
async def _find_serial_number(page):
    """Return the serial number shown next to the "番号:" label, or ''.

    Scans every ``<span>`` on *page*; for the first one whose text contains
    the label "番号:" and whose next sibling element has non-blank text,
    returns that sibling's stripped text. Returns an empty string when no
    such span/sibling pair exists.
    """
    for span in await page.query_selector_all('span'):
        span_text = await span.text_content()
        if not span_text or '番号:' not in span_text:
            continue
        # evaluate_handle always yields a JSHandle, even for a JS `null`;
        # as_element() is what actually tells us whether a sibling exists.
        sibling_handle = await span.evaluate_handle('element => element.nextElementSibling')
        sibling = sibling_handle.as_element()
        if sibling is None:
            continue
        sibling_text = await sibling.text_content()
        serial = sibling_text.strip() if sibling_text else ''
        if serial:
            return serial  # stop at the first usable match
    return ''


async def crawl_missav(url: str) -> dict:
    """Open *url* in Chromium and scrape the title, video sources and serial number.

    The browser is launched in headed mode with the
    ``AutomationControlled`` blink feature disabled. If the ``PROXY``
    environment variable is set, its value is used as the proxy server.

    Args:
        url: Address of the page to load.

    Returns:
        A dict with three keys:
        ``'title'`` (page title, ``''`` if the page did not load),
        ``'url'`` (list of ``src`` attributes found on ``<video>`` elements),
        ``'serial_number'`` (text of the element following the "番号:" span,
        ``''`` if not found).

    A Playwright timeout is reported on stdout and the partially filled
    result is returned; other exceptions raised outside the serial-number
    lookup propagate to the caller after the browser is closed.
    """
    result = {
        'title': '',
        'url': [],
        'serial_number': '',  # serial number scraped from the page
    }
    launch_args = {
        "headless": False,
        "args": ["--disable-blink-features=AutomationControlled"],
    }
    # Optional proxy taken from the environment.
    env_proxy = os.getenv('PROXY')
    if env_proxy:
        launch_args["proxy"] = {"server": env_proxy}

    async with async_playwright() as p:
        browser = await p.chromium.launch(**launch_args)
        page = await browser.new_page()
        page.set_default_timeout(60000)
        try:
            await page.goto(url, wait_until="domcontentloaded")
            result['title'] = await page.title()

            # Append incrementally so sources collected before a later
            # failure are still returned.
            for video in await page.query_selector_all("video"):
                src = await video.get_attribute("src")
                if src is not None:
                    result['url'].append(src)

            # The serial-number lookup is best-effort: any failure here is
            # reported but must not discard the data gathered above.
            try:
                serial = await _find_serial_number(page)
                if serial:
                    result['serial_number'] = serial
            except Exception as e:
                print(f"[INFO] 查找番号时出错: {e}")
        except PlaywrightTimeoutError:
            print("[ERROR] 页面加载超时,可能被 Cloudflare 拦截")
        finally:
            await browser.close()
    return result