import os

from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError


def _build_launch_args():
    """Return the keyword arguments passed to ``chromium.launch``.

    The browser is launched non-headless with the
    ``--disable-blink-features=AutomationControlled`` flag. When the
    ``PROXY`` environment variable is set to a non-empty value it is used
    as the proxy server.
    """
    launch_args = {
        "headless": False,
        "args": ["--disable-blink-features=AutomationControlled"],
    }

    # Take the proxy from the environment, if one is configured.
    env_proxy = os.getenv('PROXY')
    if env_proxy:
        launch_args["proxy"] = {"server": env_proxy}
    return launch_args


async def _find_serial_number(page):
    """Return the stripped text of the element following the serial label.

    Scans every ``<span>`` on *page* for one whose text contains the
    serial-number label and returns the text of its next sibling element.
    Returns ``''`` when no labelled span has a sibling with text.
    """
    for span in await page.query_selector_all('span'):
        span_text = await span.text_content()
        if not span_text or '番号:' not in span_text:
            continue

        sibling_handle = await span.evaluate_handle('element => element.nextElementSibling')
        # evaluate_handle always yields a (truthy) handle, even when the JS
        # value is null; as_element() returns None unless it is a DOM element.
        sibling = sibling_handle.as_element()
        if sibling is None:
            continue

        sibling_text = await sibling.text_content()
        if sibling_text:
            # The first match wins.
            return sibling_text.strip()
    return ''


async def crawl_missav(url):
    """Open *url* in Chromium and collect its title, video URLs and serial.

    Args:
        url: Address of the page to load.

    Returns:
        A dict with the keys ``'title'`` (page title), ``'url'`` (list of
        the ``src`` attributes of all ``<video>`` elements that have one)
        and ``'serial_number'`` (text next to the serial-number label, or
        ``''`` if it was not found). If the page load times out, the fields
        collected so far are returned and an error line is printed.
    """
    result = {
        'title': '',
        'url': [],
        'serial_number': '',  # serial number shown on the page
    }

    async with async_playwright() as p:
        browser = await p.chromium.launch(**_build_launch_args())
        page = await browser.new_page()
        page.set_default_timeout(60000)

        try:
            await page.goto(url, wait_until="domcontentloaded")
            result['title'] = await page.title()

            for video in await page.query_selector_all("video"):
                src = await video.get_attribute("src")
                if src is not None:
                    result['url'].append(src)

            # The serial-number lookup is best effort: a failure here is
            # reported but must not discard the title/URLs already found.
            try:
                result['serial_number'] = await _find_serial_number(page)
            except Exception as e:
                print(f"[INFO] 查找番号时出错: {e}")

        except PlaywrightTimeoutError:
            print("[ERROR] 页面加载超时,可能被 Cloudflare 拦截")
        finally:
            # Always release the browser, including on unexpected errors.
            await browser.close()

    return result