
12.30: upload files

master
huangqizhen committed 1 week ago
commit 114c7bc2f3
  1. Dockerfile (+19 lines)
  2. bailian.py (+525 lines)
  3. hangye.py (+525 lines)
  4. zixun.py (+525 lines)

Dockerfile (+19 lines)

@@ -0,0 +1,19 @@
FROM python:3.14-slim
WORKDIR /app
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
# If no C extensions need to be compiled, installing gcc can be skipped
# Copy only requirements.txt and install the Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY ./app .
RUN useradd -m -r appuser && chown -R appuser /app
USER appuser
EXPOSE 8000
CMD ["python", "main.py"]

bailian.py (+525 lines)

@@ -0,0 +1,525 @@
# marketmatrix_today_upload_anti_bot.py
# Purpose: fetch today's news (deduplicated) + upload to the knowledge base + export to Excel + [anti-bot hardening + I/O monitoring]
import os
import re
import json
import hashlib
import time
import random
from datetime import datetime, timedelta
from urllib.parse import urljoin, urlparse
from contextlib import contextmanager
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
# ====== Configuration ======
KB_API_URL = "https://dcapi.homilychart.com/prod/deepchartapi/api/QwenKnowledge/add"
KB_TOKEN = "d20287d0bb0298c73e540da7e3e1d7e3"
KB_INDEX_ID = "30xe1fbox1"
BASE_URL = "http://marketmatrix.net"
LIST_URL = urljoin(BASE_URL, "/news.htm")
OUTPUT_DIR = "today_news"
os.makedirs(OUTPUT_DIR, exist_ok=True)
OUTPUT_EXCEL = os.path.join(OUTPUT_DIR, f"today_{datetime.now().strftime('%Y%m%d')}.xlsx")
DUPLICATE_CACHE_FILE = os.path.join(OUTPUT_DIR, "today_history.json")
# Key: persistent browser profile directory
PERSISTENT_PROFILE_DIR = os.path.join(os.getcwd(), "bailian_config")
PROFILE_CLEAN_THRESHOLD_DAYS = 1  # auto-clean profiles older than 1 day
MAX_PAGES = 30  # kept moderate to avoid triggering risk controls on deep pages
print(f"📅 系统当前日期: {datetime.now().strftime('%Y-%m-%d')}")
# ====== Global driver singleton (core of the anti-bot approach) ======
_driver_instance = None
def stealth_driver(driver):
"""注入 Stealth JS,绕过常见 Bot 检测"""
try:
# 移除 webdriver 标志
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
# 隐藏语言特征
driver.execute_script("Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']})")
# 隐藏插件列表
driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3]})")
# 隐藏硬件并发数(伪装普通设备)
driver.execute_script("Object.defineProperty(navigator, 'hardwareConcurrency', {get: () => 8})")
# 隐藏设备内存(GB)
driver.execute_script("Object.defineProperty(navigator, 'deviceMemory', {get: () => 8})")
print("🛡️ Stealth JS 注入成功")
except Exception as e:
print(f"⚠️ Stealth JS 注入失败: {e}")
def init_persistent_driver():
"""初始化持久化 Chrome 实例(仅首次调用)"""
global _driver_instance
if _driver_instance is not None:
return _driver_instance
print("🔧 初始化持久化浏览器(首次启动,需过反爬,请稍候...)")
    # Auto-clean an expired profile
    _clean_old_profile()
    # Configure Chrome
chrome_options = Options()
chrome_options.add_argument(f"--user-data-dir={PERSISTENT_PROFILE_DIR}")
chrome_options.add_argument("--profile-directory=Default")
chrome_options.add_argument("--headless=new")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-plugins-discovery")
chrome_options.add_argument("--disable-features=VizDisplayCompositor")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
# chrome_options.binary_location = "/usr/bin/chromium-browser"
    # Launch the browser
service = Service(ChromeDriverManager().install())
try:
_driver_instance = webdriver.Chrome(service=service, options=chrome_options)
_driver_instance.set_page_load_timeout(30)
_driver_instance.implicitly_wait(10)
        # Key step: hit the homepage first to "warm up" and clear the anti-bot check
        print("🌐 正在访问首页以通过反爬验证...")
        _driver_instance.get(BASE_URL)
        time.sleep(2 + random.uniform(0.5, 1.5))
        # Inject stealth JS
        stealth_driver(_driver_instance)
        # Visit the list page once more to make sure the session is stable
_driver_instance.get(LIST_URL)
time.sleep(1.5 + random.uniform(0.5, 1.0))
print("✅ 浏览器初始化完成,后续请求将复用上下文")
return _driver_instance
except Exception as e:
if _driver_instance:
_driver_instance.quit()
_driver_instance = None
raise RuntimeError(f"浏览器初始化失败: {e}")
def _clean_old_profile():
"""清理过期 Profile(防积累)"""
if not os.path.exists(PERSISTENT_PROFILE_DIR):
return
try:
profile_age = time.time() - os.path.getctime(PERSISTENT_PROFILE_DIR)
if profile_age > PROFILE_CLEAN_THRESHOLD_DAYS * 86400:
print(f"🧹 Profile 目录已超 {PROFILE_CLEAN_THRESHOLD_DAYS} 天,正在清理...")
import shutil
shutil.rmtree(PERSISTENT_PROFILE_DIR, ignore_errors=True)
os.makedirs(PERSISTENT_PROFILE_DIR, exist_ok=True)
print("✅ 已重建干净 Profile")
except Exception as e:
print(f"⚠️ Profile 清理失败(继续使用现有): {e}")
def fetch_page(url, max_retries=2):
"""带重试的页面获取(✅ 复用全局 driver)"""
global _driver_instance
for attempt in range(max_retries + 1):
try:
if _driver_instance is None:
_driver_instance = init_persistent_driver()
            # Humanized pacing: randomized delay between page loads
            _driver_instance.get(url)
            time.sleep(0.8 + random.uniform(0.3, 0.7))
            # Check whether the request was blocked
page_source = _driver_instance.page_source
if "403 Forbidden" in page_source or "challenge-platform" in page_source.lower():
raise Exception("被反爬拦截(403/Challenge)")
return page_source
except Exception as e:
print(f" ⚠️ 尝试 {attempt + 1}/{max_retries + 1} 失败: {e}")
if attempt < max_retries:
time.sleep(3 + random.uniform(1, 2))
                # Restart the driver (last resort)
if _driver_instance:
try:
_driver_instance.quit()
except:
pass
_driver_instance = None
else:
return None
# ====== I/O monitor (retained) ======
class IOMonitor:
def __init__(self):
self.records = {}
@contextmanager
def io_timer(self, io_type: str, desc: str = ""):
if io_type not in self.records:
self.records[io_type] = []
start = time.perf_counter()
try:
yield
finally:
duration = time.perf_counter() - start
self.records[io_type].append((duration, desc))
def summary(self):
print("\n" + "=" * 60)
print("📊 I/O 耗时总览(反爬优化版)")
print("=" * 60)
total_time = 0.0
for io_type, records in sorted(self.records.items()):
count = len(records)
total = sum(t for t, _ in records)
avg = total / count if count else 0
total_time += total
print(f"✅ {io_type:<15} | 调用 {count:2d} 次 | 总耗时 {total:6.2f}s | 平均 {avg:5.3f}s")
if count > 3:
slowest = sorted(records, key=lambda x: x[0], reverse=True)[:2]
for i, (t, d) in enumerate(slowest, 1):
print(f" └─ #{i} 慢: {t:5.2f}s → {d}")
print("-" * 60)
print(f"⏱️ I/O 总耗时: {total_time:6.2f}s")
print("=" * 60)
monitor = IOMonitor()
# ====== Utility functions (instrumented) ======
def load_history():
with monitor.io_timer("file_read", f"load_history: {DUPLICATE_CACHE_FILE}"):
if os.path.exists(DUPLICATE_CACHE_FILE):
try:
with open(DUPLICATE_CACHE_FILE, 'r', encoding='utf-8') as f:
return set(json.load(f))
except Exception as e:
print(f"⚠️ 历史缓存加载失败: {e}")
return set()
def save_history(history_set):
with monitor.io_timer("file_write", f"save_history: {DUPLICATE_CACHE_FILE}"):
try:
with open(DUPLICATE_CACHE_FILE, 'w', encoding='utf-8') as f:
json.dump(list(history_set), f, ensure_ascii=False, indent=2)
print(f"💾 缓存已更新:{len(history_set)} 条")
except Exception as e:
print(f"❌ 缓存保存失败: {e}")
def generate_fingerprint(title, pub_time):
raw = f"{title.strip()}|{pub_time.strip()}"
return hashlib.sha1(raw.encode('utf-8')).hexdigest()[:16]
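# --- Illustrative usage (editor's sketch, not part of the original script) ---
# The dedup key is the first 16 hex characters of SHA-1 over "title|pub_time",
# so the same headline re-crawled with the same timestamp always maps to the
# same fingerprint, and stray whitespace does not change it.
def _fingerprint_demo():
    fp1 = generate_fingerprint("Sample headline", "2024.12.30 09:00")
    fp2 = generate_fingerprint("  Sample headline  ", "2024.12.30 09:00")
    assert fp1 == fp2          # .strip() normalizes surrounding whitespace
    assert len(fp1) == 16      # truncated SHA-1 hex digest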
def is_today(pub_time_str: str) -> bool:
if not pub_time_str:
return False
try:
m = re.search(r'(\d{4})[^\d]+(\d{1,2})[^\d]+(\d{1,2})', pub_time_str)
if not m:
return False
year, month, day = int(m.group(1)), int(m.group(2)), int(m.group(3))
curr_year = datetime.now().year
if year > curr_year + 1 or year < curr_year - 5:
year = curr_year
pub_date = datetime(year, month, day).date()
return pub_date == datetime.now().date()
except Exception as e:
print(f"⚠️ 日期解析失败: '{pub_time_str}' → {e}")
return False
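# --- Illustrative usage (editor's sketch, not part of the original script) ---
# is_today() only needs a year/month/day triple somewhere in the string, so
# separators like ".", "-" or CJK date characters are all accepted; anything
# other than today's date (or an unparseable string) returns False.
def _is_today_demo():
    today = datetime.now().strftime("%Y.%m.%d")
    yesterday = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
    assert is_today(f"{today} 09:15") is True
    assert is_today(yesterday) is False
    assert is_today("") is False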
def parse_news_list(html, base_url=BASE_URL):
soup = BeautifulSoup(html, 'lxml')
items = []
for table in soup.find_all('table', width="800", border="0"):
title_a = table.select_one('font[face="微软雅黑"][style*="font-size: 15pt"] a[href]')
if not title_a:
continue
title = title_a.get_text(strip=True)
link = urljoin(base_url, title_a['href'])
parsed = urlparse(link)
if not parsed.netloc.endswith("marketmatrix.net"):
continue
if "/topnews/" not in link:
continue
meta_fonts = table.select('font[size="2"]')
meta_combined = " ".join(f.get_text(strip=True) for f in meta_fonts)
time_match = ""
m = re.search(r'(\d{4})[^\d]+(\d{1,2})[^\d]+(\d{1,2})(?:[^\d]+(\d{1,2}:\d{2}))?', meta_combined)
if m:
date_part = f"{m.group(1)}.{int(m.group(2)):02d}.{int(m.group(3)):02d}"
time_part = m.group(4) or "00:00"
time_match = f"{date_part} {time_part}"
else:
m2 = re.search(r'(\d{4})[^\d]+(\d{1,2})[^\d]+(\d{1,2})', meta_combined)
if m2:
time_match = f"{m2.group(1)}.{int(m2.group(2)):02d}.{int(m2.group(3)):02d}"
category = ""
for txt in meta_fonts:
t = txt.get_text(strip=True).replace("主题:", "")
if re.search(r'\d{4}|编辑|新闻源|要点', t):
continue
if 2 < len(t) < 30:
category = t
break
items.append({
"标题": title,
"分类": category,
"发布时间": time_match,
"原文链接": link
})
return items
def extract_article_content(html):
if not html:
return "(访问失败)"
soup = BeautifulSoup(html, 'lxml')
editor_p = None
for p in soup.find_all(['p', 'font']):
txt = p.get_text()
if "编辑:" in txt and "发布时间:" in txt:
editor_p = p
break
if editor_p:
container = editor_p
for _ in range(3):
if container.parent and container.parent.name == 'td':
container = container.parent
break
container = container.parent
if container and container.name == 'td':
paras = []
for p in container.find_all('p'):
t = p.get_text(strip=True)
if len(t) > 20 and not any(skip in t for skip in [
"编辑:", "发布时间:", "主题:", "新闻源:", "要点:", "开户", "保证金", "©"
]):
paras.append(t)
if paras:
return "\n".join(paras)
fallback = [p.get_text(strip=True) for p in soup.find_all('p') if 30 <= len(p.get_text()) <= 500]
return "\n".join(fallback[:15]) if fallback else "(提取失败)"
def sanitize_filename(name: str, max_len=80) -> str:
return re.sub(r'[\\/:*?"<>|\r\n\t]', ' ', name).strip()[:max_len] or "untitled"
def upload_to_knowledge_base_from_content(filename: str, content: str) -> dict:
print(f"📤 正在上传: {filename} (内存)")
try:
content_bytes = content.encode('utf-8')
with monitor.io_timer("network_write", f"upload: {filename}"):
files = {'file': (filename, content_bytes, 'text/markdown')}
data = {'indexId': KB_INDEX_ID}
headers = {'token': KB_TOKEN}
response = requests.post(KB_API_URL, files=files, data=data, headers=headers, timeout=30)
print(f" ← HTTP {response.status_code}")
try:
res_json = response.json()
code = res_json.get("code")
msg = res_json.get("message") or res_json.get("error") or "未知错误"
if code == 200 and res_json.get("fileId"):
print(f" ✅ 上传成功 → fileId: {res_json['fileId']}")
return {"code": 200, "fileId": res_json["fileId"], "message": "OK"}
else:
print(f" ⚠️ 业务失败 → code: {code}, msg: {msg}")
return {"code": code or -1, "fileId": "", "message": msg}
except Exception as json_e:
print(f" ❌ JSON 解析失败: {json_e}, 原始响应: {response.text[:200]}")
return {"code": -2, "fileId": "", "message": f"非JSON响应: {response.text[:100]}"}
except Exception as e:
print(f" ❌ 上传异常: {e}")
return {"code": -1, "fileId": "", "message": str(e)}
# ====== Main flow ======
def get_all_list_pages():
pages = [LIST_URL]
current_url = LIST_URL
visited = {LIST_URL}
print("🔗 探测分页中(从首页开始)...")
for i in range(1, MAX_PAGES):
html = fetch_page(current_url)
if not html:
break
soup = BeautifulSoup(html, 'lxml')
more_link = soup.find('a', string=re.compile(r'查看更多', re.IGNORECASE))
if not more_link:
more_link = soup.find('a', href=re.compile(r'news-list-\d+\.htm', re.IGNORECASE))
if not more_link or not more_link.get('href'):
break
next_href = more_link['href'].strip()
next_url = urljoin(current_url, next_href)
if not next_url.startswith(BASE_URL) or next_url in visited:
break
visited.add(next_url)
pages.append(next_url)
print(f" ➕ {len(pages):2d}. {next_url}")
current_url = next_url
return pages
def main():
overall_start = time.perf_counter()
print("▶ 启动高抗反爬抓取流程...")
    # 1. Fetch list pages (reusing the driver)
print("\n[阶段1] 获取新闻列表(复用浏览器上下文)")
list_pages = get_all_list_pages()
all_items = []
for i, url in enumerate(list_pages, 1):
print(f"[{i}/{len(list_pages)}] 解析 {urlparse(url).path or '/'}")
html = fetch_page(url)
if not html:
continue
base_for_links = BASE_URL if "/list/" not in url else urljoin(BASE_URL, "/list/")
items = parse_news_list(html, base_url=base_for_links)
all_items.extend(items)
print(f"✅ 共提取 {len(all_items)} 条原始新闻")
    # 2. Filter to today's items & deduplicate
print("\n[阶段2] 过滤今日 & 去重")
history = load_history()
new_items = []
for item in all_items:
if not is_today(item["发布时间"]):
continue
fp = generate_fingerprint(item["标题"], item["发布时间"])
if fp in history:
print(f"⏭️ 跳过重复: {item['标题'][:30]}...")
continue
new_items.append(item)
history.add(fp)
print(f"🆕 今日新增 {len(new_items)} 条新闻")
    # 3. Fetch article bodies + upload
print("\n[阶段3] 抓取正文 & 上传(内存上传)")
results = []
for i, item in enumerate(new_items, 1):
title = item["标题"]
print(f"\n[{i}/{len(new_items)}] {title[:50]}...")
try:
with monitor.io_timer("network_read", f"article: {title[:20]}"):
html = fetch_page(item["原文链接"])
content = extract_article_content(html) if html else "(访问失败)"
item["正文内容"] = content
            # Build the Markdown content (in memory)
md_content = f"""# {title}
- {item['分类']}
- {item['发布时间']}
- {item['原文链接']}
---
{content}
"""
            # Save to disk (optional, for auditing)
safe_title = sanitize_filename(title)
md_file = f"{i:02d}_{safe_title}.md"
md_path = os.path.join(OUTPUT_DIR, md_file)
with monitor.io_timer("file_write", f"save_md: {md_file}"):
with open(md_path, "w", encoding="utf-8") as f:
f.write(md_content)
print(f" 💾 已保存:{md_file}")
            # Upload (from memory)
res = upload_to_knowledge_base_from_content(md_file, md_content)
item.update({
"知识库FileId": res.get("fileId", ""),
"上传状态": "" if res.get("code") == 200 else "",
"上传信息": res.get("message", "")[:100],
"指纹": fp
})
results.append(item)
except Exception as e:
print(f"❌ 处理失败: {title[:30]} | {e}")
item.update({
"知识库FileId": "",
"上传状态": "❌ 处理失败",
"上传信息": str(e)[:100],
"指纹": fp
})
results.append(item)
continue
    # 4. Save & exit
print("\n[阶段4] 保存缓存 & Excel")
save_history(history)
if results:
with monitor.io_timer("file_write", "save_excel"):
df = pd.DataFrame(results)
df.to_excel(OUTPUT_EXCEL, index=False, engine='openpyxl')
print(f"\n🎉 完成!今日新增 {len(results)} 条,Excel: {OUTPUT_EXCEL}")
else:
print(f"\nℹ️ 今日暂无新新闻发布(已探测 {len(list_pages)} 页)")
    # Key: do not quit the driver; keep the context for the next run
    print("📌 浏览器上下文已保留,下次运行将复用(加速)")
    # Print the I/O summary
monitor.summary()
total_elapsed = time.perf_counter() - overall_start
print(f"\n🎯 总运行时间: {total_elapsed:.2f}s")
# ====== Graceful shutdown (driver kept alive) ======
def cleanup():
"""进程退出时清理(不 quit driver,除非强制)"""
global _driver_instance
if _driver_instance:
print("💡 提示:为加速下次运行,浏览器上下文已保留。")
print(" 如需彻底清理,请手动删除目录:", PERSISTENT_PROFILE_DIR)
        # Do not quit; keep the session state
# _driver_instance.quit()
import atexit
atexit.register(cleanup)
if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
print("\n\n🛑 用户中断,正在退出...")
cleanup()
except Exception as e:
print(f"\n💥 严重错误: {e}")
cleanup()
raise
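bailian.py, hangye.py (below), and zixun.py differ only in two configuration lines: LIST_URL (/news.htm, /trading.htm, /macro.htm) and PERSISTENT_PROFILE_DIR (bailian_config, hangye_config, zixun_config). A possible consolidation, shown here only as an editor's sketch based on that observation and not as part of the commit, is a single module parameterized by section:

# section_config.py -- hypothetical sketch, not included in this commit
import os
import sys

# Script name -> list page, taken from the three scripts' LIST_URL values.
SECTIONS = {
    "bailian": "/news.htm",
    "hangye": "/trading.htm",
    "zixun": "/macro.htm",
}

def configure(section: str):
    """Return (LIST_URL, PERSISTENT_PROFILE_DIR) for one section."""
    list_url = "http://marketmatrix.net" + SECTIONS[section]
    profile_dir = os.path.join(os.getcwd(), f"{section}_config")
    return list_url, profile_dir

if __name__ == "__main__":
    section = sys.argv[1] if len(sys.argv) > 1 else "bailian"
    print(configure(section))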

hangye.py (+525 lines)

@@ -0,0 +1,525 @@
# marketmatrix_today_upload_anti_bot.py
# Purpose: fetch today's news (deduplicated) + upload to the knowledge base + export to Excel + [anti-bot hardening + I/O monitoring]
import os
import re
import json
import hashlib
import time
import random
from datetime import datetime, timedelta
from urllib.parse import urljoin, urlparse
from contextlib import contextmanager
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
# ====== Configuration ======
KB_API_URL = "https://dcapi.homilychart.com/prod/deepchartapi/api/QwenKnowledge/add"
KB_TOKEN = "d20287d0bb0298c73e540da7e3e1d7e3"
KB_INDEX_ID = "30xe1fbox1"
BASE_URL = "http://marketmatrix.net"
LIST_URL = urljoin(BASE_URL, "/trading.htm")
OUTPUT_DIR = "today_news"
os.makedirs(OUTPUT_DIR, exist_ok=True)
OUTPUT_EXCEL = os.path.join(OUTPUT_DIR, f"today_{datetime.now().strftime('%Y%m%d')}.xlsx")
DUPLICATE_CACHE_FILE = os.path.join(OUTPUT_DIR, "today_history.json")
# Key: persistent browser profile directory
PERSISTENT_PROFILE_DIR = os.path.join(os.getcwd(), "hangye_config")
PROFILE_CLEAN_THRESHOLD_DAYS = 1  # auto-clean profiles older than 1 day
MAX_PAGES = 30  # kept moderate to avoid triggering risk controls on deep pages
print(f"📅 系统当前日期: {datetime.now().strftime('%Y-%m-%d')}")
# ====== Global driver singleton (core of the anti-bot approach) ======
_driver_instance = None
def stealth_driver(driver):
"""注入 Stealth JS,绕过常见 Bot 检测"""
try:
# 移除 webdriver 标志
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
# 隐藏语言特征
driver.execute_script("Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']})")
# 隐藏插件列表
driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3]})")
# 隐藏硬件并发数(伪装普通设备)
driver.execute_script("Object.defineProperty(navigator, 'hardwareConcurrency', {get: () => 8})")
# 隐藏设备内存(GB)
driver.execute_script("Object.defineProperty(navigator, 'deviceMemory', {get: () => 8})")
print("🛡️ Stealth JS 注入成功")
except Exception as e:
print(f"⚠️ Stealth JS 注入失败: {e}")
def init_persistent_driver():
"""初始化持久化 Chrome 实例(仅首次调用)"""
global _driver_instance
if _driver_instance is not None:
return _driver_instance
print("🔧 初始化持久化浏览器(首次启动,需过反爬,请稍候...)")
    # Auto-clean an expired profile
    _clean_old_profile()
    # Configure Chrome
chrome_options = Options()
chrome_options.add_argument(f"--user-data-dir={PERSISTENT_PROFILE_DIR}")
chrome_options.add_argument("--profile-directory=Default")
chrome_options.add_argument("--headless=new")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-plugins-discovery")
chrome_options.add_argument("--disable-features=VizDisplayCompositor")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
# chrome_options.binary_location = "/usr/bin/chromium-browser"
    # Launch the browser
service = Service(ChromeDriverManager().install())
try:
_driver_instance = webdriver.Chrome(service=service, options=chrome_options)
_driver_instance.set_page_load_timeout(30)
_driver_instance.implicitly_wait(10)
        # Key step: hit the homepage first to "warm up" and clear the anti-bot check
        print("🌐 正在访问首页以通过反爬验证...")
        _driver_instance.get(BASE_URL)
        time.sleep(2 + random.uniform(0.5, 1.5))
        # Inject stealth JS
        stealth_driver(_driver_instance)
        # Visit the list page once more to make sure the session is stable
_driver_instance.get(LIST_URL)
time.sleep(1.5 + random.uniform(0.5, 1.0))
print("✅ 浏览器初始化完成,后续请求将复用上下文")
return _driver_instance
except Exception as e:
if _driver_instance:
_driver_instance.quit()
_driver_instance = None
raise RuntimeError(f"浏览器初始化失败: {e}")
def _clean_old_profile():
"""清理过期 Profile(防积累)"""
if not os.path.exists(PERSISTENT_PROFILE_DIR):
return
try:
profile_age = time.time() - os.path.getctime(PERSISTENT_PROFILE_DIR)
if profile_age > PROFILE_CLEAN_THRESHOLD_DAYS * 86400:
print(f"🧹 Profile 目录已超 {PROFILE_CLEAN_THRESHOLD_DAYS} 天,正在清理...")
import shutil
shutil.rmtree(PERSISTENT_PROFILE_DIR, ignore_errors=True)
os.makedirs(PERSISTENT_PROFILE_DIR, exist_ok=True)
print("✅ 已重建干净 Profile")
except Exception as e:
print(f"⚠️ Profile 清理失败(继续使用现有): {e}")
def fetch_page(url, max_retries=2):
"""带重试的页面获取(✅ 复用全局 driver)"""
global _driver_instance
for attempt in range(max_retries + 1):
try:
if _driver_instance is None:
_driver_instance = init_persistent_driver()
            # Humanized pacing: randomized delay between page loads
            _driver_instance.get(url)
            time.sleep(0.8 + random.uniform(0.3, 0.7))
            # Check whether the request was blocked
page_source = _driver_instance.page_source
if "403 Forbidden" in page_source or "challenge-platform" in page_source.lower():
raise Exception("被反爬拦截(403/Challenge)")
return page_source
except Exception as e:
print(f" ⚠️ 尝试 {attempt + 1}/{max_retries + 1} 失败: {e}")
if attempt < max_retries:
time.sleep(3 + random.uniform(1, 2))
                # Restart the driver (last resort)
if _driver_instance:
try:
_driver_instance.quit()
except:
pass
_driver_instance = None
else:
return None
# ====== I/O monitor (retained) ======
class IOMonitor:
def __init__(self):
self.records = {}
@contextmanager
def io_timer(self, io_type: str, desc: str = ""):
if io_type not in self.records:
self.records[io_type] = []
start = time.perf_counter()
try:
yield
finally:
duration = time.perf_counter() - start
self.records[io_type].append((duration, desc))
def summary(self):
print("\n" + "=" * 60)
print("📊 I/O 耗时总览(反爬优化版)")
print("=" * 60)
total_time = 0.0
for io_type, records in sorted(self.records.items()):
count = len(records)
total = sum(t for t, _ in records)
avg = total / count if count else 0
total_time += total
print(f"✅ {io_type:<15} | 调用 {count:2d} 次 | 总耗时 {total:6.2f}s | 平均 {avg:5.3f}s")
if count > 3:
slowest = sorted(records, key=lambda x: x[0], reverse=True)[:2]
for i, (t, d) in enumerate(slowest, 1):
print(f" └─ #{i} 慢: {t:5.2f}s → {d}")
print("-" * 60)
print(f"⏱️ I/O 总耗时: {total_time:6.2f}s")
print("=" * 60)
monitor = IOMonitor()
# ====== Utility functions (instrumented) ======
def load_history():
with monitor.io_timer("file_read", f"load_history: {DUPLICATE_CACHE_FILE}"):
if os.path.exists(DUPLICATE_CACHE_FILE):
try:
with open(DUPLICATE_CACHE_FILE, 'r', encoding='utf-8') as f:
return set(json.load(f))
except Exception as e:
print(f"⚠️ 历史缓存加载失败: {e}")
return set()
def save_history(history_set):
with monitor.io_timer("file_write", f"save_history: {DUPLICATE_CACHE_FILE}"):
try:
with open(DUPLICATE_CACHE_FILE, 'w', encoding='utf-8') as f:
json.dump(list(history_set), f, ensure_ascii=False, indent=2)
print(f"💾 缓存已更新:{len(history_set)} 条")
except Exception as e:
print(f"❌ 缓存保存失败: {e}")
def generate_fingerprint(title, pub_time):
raw = f"{title.strip()}|{pub_time.strip()}"
return hashlib.sha1(raw.encode('utf-8')).hexdigest()[:16]
def is_today(pub_time_str: str) -> bool:
if not pub_time_str:
return False
try:
m = re.search(r'(\d{4})[^\d]+(\d{1,2})[^\d]+(\d{1,2})', pub_time_str)
if not m:
return False
year, month, day = int(m.group(1)), int(m.group(2)), int(m.group(3))
curr_year = datetime.now().year
if year > curr_year + 1 or year < curr_year - 5:
year = curr_year
pub_date = datetime(year, month, day).date()
return pub_date == datetime.now().date()
except Exception as e:
print(f"⚠️ 日期解析失败: '{pub_time_str}' → {e}")
return False
def parse_news_list(html, base_url=BASE_URL):
soup = BeautifulSoup(html, 'lxml')
items = []
for table in soup.find_all('table', width="800", border="0"):
title_a = table.select_one('font[face="微软雅黑"][style*="font-size: 15pt"] a[href]')
if not title_a:
continue
title = title_a.get_text(strip=True)
link = urljoin(base_url, title_a['href'])
parsed = urlparse(link)
if not parsed.netloc.endswith("marketmatrix.net"):
continue
if "/topnews/" not in link:
continue
meta_fonts = table.select('font[size="2"]')
meta_combined = " ".join(f.get_text(strip=True) for f in meta_fonts)
time_match = ""
m = re.search(r'(\d{4})[^\d]+(\d{1,2})[^\d]+(\d{1,2})(?:[^\d]+(\d{1,2}:\d{2}))?', meta_combined)
if m:
date_part = f"{m.group(1)}.{int(m.group(2)):02d}.{int(m.group(3)):02d}"
time_part = m.group(4) or "00:00"
time_match = f"{date_part} {time_part}"
else:
m2 = re.search(r'(\d{4})[^\d]+(\d{1,2})[^\d]+(\d{1,2})', meta_combined)
if m2:
time_match = f"{m2.group(1)}.{int(m2.group(2)):02d}.{int(m2.group(3)):02d}"
category = ""
for txt in meta_fonts:
t = txt.get_text(strip=True).replace("主题:", "")
if re.search(r'\d{4}|编辑|新闻源|要点', t):
continue
if 2 < len(t) < 30:
category = t
break
items.append({
"标题": title,
"分类": category,
"发布时间": time_match,
"原文链接": link
})
return items
def extract_article_content(html):
if not html:
return "(访问失败)"
soup = BeautifulSoup(html, 'lxml')
editor_p = None
for p in soup.find_all(['p', 'font']):
txt = p.get_text()
if "编辑:" in txt and "发布时间:" in txt:
editor_p = p
break
if editor_p:
container = editor_p
for _ in range(3):
if container.parent and container.parent.name == 'td':
container = container.parent
break
container = container.parent
if container and container.name == 'td':
paras = []
for p in container.find_all('p'):
t = p.get_text(strip=True)
if len(t) > 20 and not any(skip in t for skip in [
"编辑:", "发布时间:", "主题:", "新闻源:", "要点:", "开户", "保证金", "©"
]):
paras.append(t)
if paras:
return "\n".join(paras)
fallback = [p.get_text(strip=True) for p in soup.find_all('p') if 30 <= len(p.get_text()) <= 500]
return "\n".join(fallback[:15]) if fallback else "(提取失败)"
def sanitize_filename(name: str, max_len=80) -> str:
return re.sub(r'[\\/:*?"<>|\r\n\t]', ' ', name).strip()[:max_len] or "untitled"
def upload_to_knowledge_base_from_content(filename: str, content: str) -> dict:
print(f"📤 正在上传: {filename} (内存)")
try:
content_bytes = content.encode('utf-8')
with monitor.io_timer("network_write", f"upload: {filename}"):
files = {'file': (filename, content_bytes, 'text/markdown')}
data = {'indexId': KB_INDEX_ID}
headers = {'token': KB_TOKEN}
response = requests.post(KB_API_URL, files=files, data=data, headers=headers, timeout=30)
print(f" ← HTTP {response.status_code}")
try:
res_json = response.json()
code = res_json.get("code")
msg = res_json.get("message") or res_json.get("error") or "未知错误"
if code == 200 and res_json.get("fileId"):
print(f" ✅ 上传成功 → fileId: {res_json['fileId']}")
return {"code": 200, "fileId": res_json["fileId"], "message": "OK"}
else:
print(f" ⚠️ 业务失败 → code: {code}, msg: {msg}")
return {"code": code or -1, "fileId": "", "message": msg}
except Exception as json_e:
print(f" ❌ JSON 解析失败: {json_e}, 原始响应: {response.text[:200]}")
return {"code": -2, "fileId": "", "message": f"非JSON响应: {response.text[:100]}"}
except Exception as e:
print(f" ❌ 上传异常: {e}")
return {"code": -1, "fileId": "", "message": str(e)}
# ====== Main flow ======
def get_all_list_pages():
pages = [LIST_URL]
current_url = LIST_URL
visited = {LIST_URL}
print("🔗 探测分页中(从首页开始)...")
for i in range(1, MAX_PAGES):
html = fetch_page(current_url)
if not html:
break
soup = BeautifulSoup(html, 'lxml')
more_link = soup.find('a', string=re.compile(r'查看更多', re.IGNORECASE))
if not more_link:
more_link = soup.find('a', href=re.compile(r'news-list-\d+\.htm', re.IGNORECASE))
if not more_link or not more_link.get('href'):
break
next_href = more_link['href'].strip()
next_url = urljoin(current_url, next_href)
if not next_url.startswith(BASE_URL) or next_url in visited:
break
visited.add(next_url)
pages.append(next_url)
print(f" ➕ {len(pages):2d}. {next_url}")
current_url = next_url
return pages
def main():
overall_start = time.perf_counter()
print("▶ 启动高抗反爬抓取流程...")
    # 1. Fetch list pages (reusing the driver)
print("\n[阶段1] 获取新闻列表(复用浏览器上下文)")
list_pages = get_all_list_pages()
all_items = []
for i, url in enumerate(list_pages, 1):
print(f"[{i}/{len(list_pages)}] 解析 {urlparse(url).path or '/'}")
html = fetch_page(url)
if not html:
continue
base_for_links = BASE_URL if "/list/" not in url else urljoin(BASE_URL, "/list/")
items = parse_news_list(html, base_url=base_for_links)
all_items.extend(items)
print(f"✅ 共提取 {len(all_items)} 条原始新闻")
    # 2. Filter to today's items & deduplicate
print("\n[阶段2] 过滤今日 & 去重")
history = load_history()
new_items = []
for item in all_items:
if not is_today(item["发布时间"]):
continue
fp = generate_fingerprint(item["标题"], item["发布时间"])
if fp in history:
print(f"⏭️ 跳过重复: {item['标题'][:30]}...")
continue
new_items.append(item)
history.add(fp)
print(f"🆕 今日新增 {len(new_items)} 条新闻")
    # 3. Fetch article bodies + upload
print("\n[阶段3] 抓取正文 & 上传(内存上传)")
results = []
for i, item in enumerate(new_items, 1):
title = item["标题"]
print(f"\n[{i}/{len(new_items)}] {title[:50]}...")
try:
with monitor.io_timer("network_read", f"article: {title[:20]}"):
html = fetch_page(item["原文链接"])
content = extract_article_content(html) if html else "(访问失败)"
item["正文内容"] = content
            # Build the Markdown content (in memory)
md_content = f"""# {title}
- {item['分类']}
- {item['发布时间']}
- {item['原文链接']}
---
{content}
"""
            # Save to disk (optional, for auditing)
safe_title = sanitize_filename(title)
md_file = f"{i:02d}_{safe_title}.md"
md_path = os.path.join(OUTPUT_DIR, md_file)
with monitor.io_timer("file_write", f"save_md: {md_file}"):
with open(md_path, "w", encoding="utf-8") as f:
f.write(md_content)
print(f" 💾 已保存:{md_file}")
            # Upload (from memory)
res = upload_to_knowledge_base_from_content(md_file, md_content)
item.update({
"知识库FileId": res.get("fileId", ""),
"上传状态": "" if res.get("code") == 200 else "",
"上传信息": res.get("message", "")[:100],
"指纹": fp
})
results.append(item)
except Exception as e:
print(f"❌ 处理失败: {title[:30]} | {e}")
item.update({
"知识库FileId": "",
"上传状态": "❌ 处理失败",
"上传信息": str(e)[:100],
"指纹": fp
})
results.append(item)
continue
    # 4. Save & exit
print("\n[阶段4] 保存缓存 & Excel")
save_history(history)
if results:
with monitor.io_timer("file_write", "save_excel"):
df = pd.DataFrame(results)
df.to_excel(OUTPUT_EXCEL, index=False, engine='openpyxl')
print(f"\n🎉 完成!今日新增 {len(results)} 条,Excel: {OUTPUT_EXCEL}")
else:
print(f"\nℹ️ 今日暂无新新闻发布(已探测 {len(list_pages)} 页)")
    # Key: do not quit the driver; keep the context for the next run
    print("📌 浏览器上下文已保留,下次运行将复用(加速)")
    # Print the I/O summary
monitor.summary()
total_elapsed = time.perf_counter() - overall_start
print(f"\n🎯 总运行时间: {total_elapsed:.2f}s")
# ====== Graceful shutdown (driver kept alive) ======
def cleanup():
"""进程退出时清理(不 quit driver,除非强制)"""
global _driver_instance
if _driver_instance:
print("💡 提示:为加速下次运行,浏览器上下文已保留。")
print(" 如需彻底清理,请手动删除目录:", PERSISTENT_PROFILE_DIR)
        # Do not quit; keep the session state
# _driver_instance.quit()
import atexit
atexit.register(cleanup)
if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
print("\n\n🛑 用户中断,正在退出...")
cleanup()
except Exception as e:
print(f"\n💥 严重错误: {e}")
cleanup()
raise
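A note on shared outputs: all three scripts use OUTPUT_DIR = "today_news", the same today_YYYYMMDD.xlsx name, and the same today_history.json, so when they run back to back the later Excel export overwrites the earlier one and the dedup history is pooled across sections. If separate per-section files are wanted, a per-script prefix is one option; the lines below are an editor's sketch under that assumption, not part of the commit:

# Hypothetical per-script output paths (editor's sketch); SCRIPT_NAME is assumed.
import os
from datetime import datetime

SCRIPT_NAME = "hangye"
OUTPUT_DIR = "today_news"
OUTPUT_EXCEL = os.path.join(OUTPUT_DIR, f"{SCRIPT_NAME}_{datetime.now().strftime('%Y%m%d')}.xlsx")
DUPLICATE_CACHE_FILE = os.path.join(OUTPUT_DIR, f"{SCRIPT_NAME}_history.json")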

zixun.py (+525 lines)

@@ -0,0 +1,525 @@
# marketmatrix_today_upload_anti_bot.py
# Purpose: fetch today's news (deduplicated) + upload to the knowledge base + export to Excel + [anti-bot hardening + I/O monitoring]
import os
import re
import json
import hashlib
import time
import random
from datetime import datetime, timedelta
from urllib.parse import urljoin, urlparse
from contextlib import contextmanager
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
# ====== Configuration ======
KB_API_URL = "https://dcapi.homilychart.com/prod/deepchartapi/api/QwenKnowledge/add"
KB_TOKEN = "d20287d0bb0298c73e540da7e3e1d7e3"
KB_INDEX_ID = "30xe1fbox1"
BASE_URL = "http://marketmatrix.net"
LIST_URL = urljoin(BASE_URL, "/macro.htm")
OUTPUT_DIR = "today_news"
os.makedirs(OUTPUT_DIR, exist_ok=True)
OUTPUT_EXCEL = os.path.join(OUTPUT_DIR, f"today_{datetime.now().strftime('%Y%m%d')}.xlsx")
DUPLICATE_CACHE_FILE = os.path.join(OUTPUT_DIR, "today_history.json")
# Key: persistent browser profile directory
PERSISTENT_PROFILE_DIR = os.path.join(os.getcwd(), "zixun_config")
PROFILE_CLEAN_THRESHOLD_DAYS = 1  # auto-clean profiles older than 1 day
MAX_PAGES = 30  # kept moderate to avoid triggering risk controls on deep pages
print(f"📅 系统当前日期: {datetime.now().strftime('%Y-%m-%d')}")
# ====== Global driver singleton (core of the anti-bot approach) ======
_driver_instance = None
def stealth_driver(driver):
"""注入 Stealth JS,绕过常见 Bot 检测"""
try:
# 移除 webdriver 标志
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
# 隐藏语言特征
driver.execute_script("Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']})")
# 隐藏插件列表
driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3]})")
# 隐藏硬件并发数(伪装普通设备)
driver.execute_script("Object.defineProperty(navigator, 'hardwareConcurrency', {get: () => 8})")
# 隐藏设备内存(GB)
driver.execute_script("Object.defineProperty(navigator, 'deviceMemory', {get: () => 8})")
print("🛡️ Stealth JS 注入成功")
except Exception as e:
print(f"⚠️ Stealth JS 注入失败: {e}")
def init_persistent_driver():
"""初始化持久化 Chrome 实例(仅首次调用)"""
global _driver_instance
if _driver_instance is not None:
return _driver_instance
print("🔧 初始化持久化浏览器(首次启动,需过反爬,请稍候...)")
    # Auto-clean an expired profile
    _clean_old_profile()
    # Configure Chrome
chrome_options = Options()
chrome_options.add_argument(f"--user-data-dir={PERSISTENT_PROFILE_DIR}")
chrome_options.add_argument("--profile-directory=Default")
chrome_options.add_argument("--headless=new")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-plugins-discovery")
chrome_options.add_argument("--disable-features=VizDisplayCompositor")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
# chrome_options.binary_location = "/usr/bin/chromium-browser"
    # Launch the browser
service = Service(ChromeDriverManager().install())
try:
_driver_instance = webdriver.Chrome(service=service, options=chrome_options)
_driver_instance.set_page_load_timeout(30)
_driver_instance.implicitly_wait(10)
        # Key step: hit the homepage first to "warm up" and clear the anti-bot check
        print("🌐 正在访问首页以通过反爬验证...")
        _driver_instance.get(BASE_URL)
        time.sleep(2 + random.uniform(0.5, 1.5))
        # Inject stealth JS
        stealth_driver(_driver_instance)
        # Visit the list page once more to make sure the session is stable
_driver_instance.get(LIST_URL)
time.sleep(1.5 + random.uniform(0.5, 1.0))
print("✅ 浏览器初始化完成,后续请求将复用上下文")
return _driver_instance
except Exception as e:
if _driver_instance:
_driver_instance.quit()
_driver_instance = None
raise RuntimeError(f"浏览器初始化失败: {e}")
def _clean_old_profile():
"""清理过期 Profile(防积累)"""
if not os.path.exists(PERSISTENT_PROFILE_DIR):
return
try:
profile_age = time.time() - os.path.getctime(PERSISTENT_PROFILE_DIR)
if profile_age > PROFILE_CLEAN_THRESHOLD_DAYS * 86400:
print(f"🧹 Profile 目录已超 {PROFILE_CLEAN_THRESHOLD_DAYS} 天,正在清理...")
import shutil
shutil.rmtree(PERSISTENT_PROFILE_DIR, ignore_errors=True)
os.makedirs(PERSISTENT_PROFILE_DIR, exist_ok=True)
print("✅ 已重建干净 Profile")
except Exception as e:
print(f"⚠️ Profile 清理失败(继续使用现有): {e}")
def fetch_page(url, max_retries=2):
"""带重试的页面获取(✅ 复用全局 driver)"""
global _driver_instance
for attempt in range(max_retries + 1):
try:
if _driver_instance is None:
_driver_instance = init_persistent_driver()
            # Humanized pacing: randomized delay between page loads
            _driver_instance.get(url)
            time.sleep(0.8 + random.uniform(0.3, 0.7))
            # Check whether the request was blocked
page_source = _driver_instance.page_source
if "403 Forbidden" in page_source or "challenge-platform" in page_source.lower():
raise Exception("被反爬拦截(403/Challenge)")
return page_source
except Exception as e:
print(f" ⚠️ 尝试 {attempt + 1}/{max_retries + 1} 失败: {e}")
if attempt < max_retries:
time.sleep(3 + random.uniform(1, 2))
                # Restart the driver (last resort)
if _driver_instance:
try:
_driver_instance.quit()
except:
pass
_driver_instance = None
else:
return None
# ====== I/O monitor (retained) ======
class IOMonitor:
def __init__(self):
self.records = {}
@contextmanager
def io_timer(self, io_type: str, desc: str = ""):
if io_type not in self.records:
self.records[io_type] = []
start = time.perf_counter()
try:
yield
finally:
duration = time.perf_counter() - start
self.records[io_type].append((duration, desc))
def summary(self):
print("\n" + "=" * 60)
print("📊 I/O 耗时总览(反爬优化版)")
print("=" * 60)
total_time = 0.0
for io_type, records in sorted(self.records.items()):
count = len(records)
total = sum(t for t, _ in records)
avg = total / count if count else 0
total_time += total
print(f"✅ {io_type:<15} | 调用 {count:2d} 次 | 总耗时 {total:6.2f}s | 平均 {avg:5.3f}s")
if count > 3:
slowest = sorted(records, key=lambda x: x[0], reverse=True)[:2]
for i, (t, d) in enumerate(slowest, 1):
print(f" └─ #{i} 慢: {t:5.2f}s → {d}")
print("-" * 60)
print(f"⏱️ I/O 总耗时: {total_time:6.2f}s")
print("=" * 60)
monitor = IOMonitor()
# ====== Utility functions (instrumented) ======
def load_history():
with monitor.io_timer("file_read", f"load_history: {DUPLICATE_CACHE_FILE}"):
if os.path.exists(DUPLICATE_CACHE_FILE):
try:
with open(DUPLICATE_CACHE_FILE, 'r', encoding='utf-8') as f:
return set(json.load(f))
except Exception as e:
print(f"⚠️ 历史缓存加载失败: {e}")
return set()
def save_history(history_set):
with monitor.io_timer("file_write", f"save_history: {DUPLICATE_CACHE_FILE}"):
try:
with open(DUPLICATE_CACHE_FILE, 'w', encoding='utf-8') as f:
json.dump(list(history_set), f, ensure_ascii=False, indent=2)
print(f"💾 缓存已更新:{len(history_set)} 条")
except Exception as e:
print(f"❌ 缓存保存失败: {e}")
def generate_fingerprint(title, pub_time):
raw = f"{title.strip()}|{pub_time.strip()}"
return hashlib.sha1(raw.encode('utf-8')).hexdigest()[:16]
def is_today(pub_time_str: str) -> bool:
if not pub_time_str:
return False
try:
m = re.search(r'(\d{4})[^\d]+(\d{1,2})[^\d]+(\d{1,2})', pub_time_str)
if not m:
return False
year, month, day = int(m.group(1)), int(m.group(2)), int(m.group(3))
curr_year = datetime.now().year
if year > curr_year + 1 or year < curr_year - 5:
year = curr_year
pub_date = datetime(year, month, day).date()
return pub_date == datetime.now().date()
except Exception as e:
print(f"⚠️ 日期解析失败: '{pub_time_str}' → {e}")
return False
def parse_news_list(html, base_url=BASE_URL):
soup = BeautifulSoup(html, 'lxml')
items = []
for table in soup.find_all('table', width="800", border="0"):
title_a = table.select_one('font[face="微软雅黑"][style*="font-size: 15pt"] a[href]')
if not title_a:
continue
title = title_a.get_text(strip=True)
link = urljoin(base_url, title_a['href'])
parsed = urlparse(link)
if not parsed.netloc.endswith("marketmatrix.net"):
continue
if "/topnews/" not in link:
continue
meta_fonts = table.select('font[size="2"]')
meta_combined = " ".join(f.get_text(strip=True) for f in meta_fonts)
time_match = ""
m = re.search(r'(\d{4})[^\d]+(\d{1,2})[^\d]+(\d{1,2})(?:[^\d]+(\d{1,2}:\d{2}))?', meta_combined)
if m:
date_part = f"{m.group(1)}.{int(m.group(2)):02d}.{int(m.group(3)):02d}"
time_part = m.group(4) or "00:00"
time_match = f"{date_part} {time_part}"
else:
m2 = re.search(r'(\d{4})[^\d]+(\d{1,2})[^\d]+(\d{1,2})', meta_combined)
if m2:
time_match = f"{m2.group(1)}.{int(m2.group(2)):02d}.{int(m2.group(3)):02d}"
category = ""
for txt in meta_fonts:
t = txt.get_text(strip=True).replace("主题:", "")
if re.search(r'\d{4}|编辑|新闻源|要点', t):
continue
if 2 < len(t) < 30:
category = t
break
items.append({
"标题": title,
"分类": category,
"发布时间": time_match,
"原文链接": link
})
return items
def extract_article_content(html):
if not html:
return "(访问失败)"
soup = BeautifulSoup(html, 'lxml')
editor_p = None
for p in soup.find_all(['p', 'font']):
txt = p.get_text()
if "编辑:" in txt and "发布时间:" in txt:
editor_p = p
break
if editor_p:
container = editor_p
for _ in range(3):
if container.parent and container.parent.name == 'td':
container = container.parent
break
container = container.parent
if container and container.name == 'td':
paras = []
for p in container.find_all('p'):
t = p.get_text(strip=True)
if len(t) > 20 and not any(skip in t for skip in [
"编辑:", "发布时间:", "主题:", "新闻源:", "要点:", "开户", "保证金", "©"
]):
paras.append(t)
if paras:
return "\n".join(paras)
fallback = [p.get_text(strip=True) for p in soup.find_all('p') if 30 <= len(p.get_text()) <= 500]
return "\n".join(fallback[:15]) if fallback else "(提取失败)"
def sanitize_filename(name: str, max_len=80) -> str:
return re.sub(r'[\\/:*?"<>|\r\n\t]', ' ', name).strip()[:max_len] or "untitled"
def upload_to_knowledge_base_from_content(filename: str, content: str) -> dict:
print(f"📤 正在上传: {filename} (内存)")
try:
content_bytes = content.encode('utf-8')
with monitor.io_timer("network_write", f"upload: {filename}"):
files = {'file': (filename, content_bytes, 'text/markdown')}
data = {'indexId': KB_INDEX_ID}
headers = {'token': KB_TOKEN}
response = requests.post(KB_API_URL, files=files, data=data, headers=headers, timeout=30)
print(f" ← HTTP {response.status_code}")
try:
res_json = response.json()
code = res_json.get("code")
msg = res_json.get("message") or res_json.get("error") or "未知错误"
if code == 200 and res_json.get("fileId"):
print(f" ✅ 上传成功 → fileId: {res_json['fileId']}")
return {"code": 200, "fileId": res_json["fileId"], "message": "OK"}
else:
print(f" ⚠️ 业务失败 → code: {code}, msg: {msg}")
return {"code": code or -1, "fileId": "", "message": msg}
except Exception as json_e:
print(f" ❌ JSON 解析失败: {json_e}, 原始响应: {response.text[:200]}")
return {"code": -2, "fileId": "", "message": f"非JSON响应: {response.text[:100]}"}
except Exception as e:
print(f" ❌ 上传异常: {e}")
return {"code": -1, "fileId": "", "message": str(e)}
# ====== Main flow ======
def get_all_list_pages():
pages = [LIST_URL]
current_url = LIST_URL
visited = {LIST_URL}
print("🔗 探测分页中(从首页开始)...")
for i in range(1, MAX_PAGES):
html = fetch_page(current_url)
if not html:
break
soup = BeautifulSoup(html, 'lxml')
more_link = soup.find('a', string=re.compile(r'查看更多', re.IGNORECASE))
if not more_link:
more_link = soup.find('a', href=re.compile(r'news-list-\d+\.htm', re.IGNORECASE))
if not more_link or not more_link.get('href'):
break
next_href = more_link['href'].strip()
next_url = urljoin(current_url, next_href)
if not next_url.startswith(BASE_URL) or next_url in visited:
break
visited.add(next_url)
pages.append(next_url)
print(f" ➕ {len(pages):2d}. {next_url}")
current_url = next_url
return pages
def main():
overall_start = time.perf_counter()
print("▶ 启动高抗反爬抓取流程...")
    # 1. Fetch list pages (reusing the driver)
print("\n[阶段1] 获取新闻列表(复用浏览器上下文)")
list_pages = get_all_list_pages()
all_items = []
for i, url in enumerate(list_pages, 1):
print(f"[{i}/{len(list_pages)}] 解析 {urlparse(url).path or '/'}")
html = fetch_page(url)
if not html:
continue
base_for_links = BASE_URL if "/list/" not in url else urljoin(BASE_URL, "/list/")
items = parse_news_list(html, base_url=base_for_links)
all_items.extend(items)
print(f"✅ 共提取 {len(all_items)} 条原始新闻")
    # 2. Filter to today's items & deduplicate
print("\n[阶段2] 过滤今日 & 去重")
history = load_history()
new_items = []
for item in all_items:
if not is_today(item["发布时间"]):
continue
fp = generate_fingerprint(item["标题"], item["发布时间"])
if fp in history:
print(f"⏭️ 跳过重复: {item['标题'][:30]}...")
continue
new_items.append(item)
history.add(fp)
print(f"🆕 今日新增 {len(new_items)} 条新闻")
    # 3. Fetch article bodies + upload
print("\n[阶段3] 抓取正文 & 上传(内存上传)")
results = []
for i, item in enumerate(new_items, 1):
title = item["标题"]
print(f"\n[{i}/{len(new_items)}] {title[:50]}...")
try:
with monitor.io_timer("network_read", f"article: {title[:20]}"):
html = fetch_page(item["原文链接"])
content = extract_article_content(html) if html else "(访问失败)"
item["正文内容"] = content
            # Build the Markdown content (in memory)
md_content = f"""# {title}
- {item['分类']}
- {item['发布时间']}
- {item['原文链接']}
---
{content}
"""
            # Save to disk (optional, for auditing)
safe_title = sanitize_filename(title)
md_file = f"{i:02d}_{safe_title}.md"
md_path = os.path.join(OUTPUT_DIR, md_file)
with monitor.io_timer("file_write", f"save_md: {md_file}"):
with open(md_path, "w", encoding="utf-8") as f:
f.write(md_content)
print(f" 💾 已保存:{md_file}")
            # Upload (from memory)
res = upload_to_knowledge_base_from_content(md_file, md_content)
item.update({
"知识库FileId": res.get("fileId", ""),
"上传状态": "" if res.get("code") == 200 else "",
"上传信息": res.get("message", "")[:100],
"指纹": fp
})
results.append(item)
except Exception as e:
print(f"❌ 处理失败: {title[:30]} | {e}")
item.update({
"知识库FileId": "",
"上传状态": "❌ 处理失败",
"上传信息": str(e)[:100],
"指纹": fp
})
results.append(item)
continue
    # 4. Save & exit
print("\n[阶段4] 保存缓存 & Excel")
save_history(history)
if results:
with monitor.io_timer("file_write", "save_excel"):
df = pd.DataFrame(results)
df.to_excel(OUTPUT_EXCEL, index=False, engine='openpyxl')
print(f"\n🎉 完成!今日新增 {len(results)} 条,Excel: {OUTPUT_EXCEL}")
else:
print(f"\nℹ️ 今日暂无新新闻发布(已探测 {len(list_pages)} 页)")
    # Key: do not quit the driver; keep the context for the next run
    print("📌 浏览器上下文已保留,下次运行将复用(加速)")
    # Print the I/O summary
monitor.summary()
total_elapsed = time.perf_counter() - overall_start
print(f"\n🎯 总运行时间: {total_elapsed:.2f}s")
# ====== Graceful shutdown (driver kept alive) ======
def cleanup():
"""进程退出时清理(不 quit driver,除非强制)"""
global _driver_instance
if _driver_instance:
print("💡 提示:为加速下次运行,浏览器上下文已保留。")
print(" 如需彻底清理,请手动删除目录:", PERSISTENT_PROFILE_DIR)
        # Do not quit; keep the session state
# _driver_instance.quit()
import atexit
atexit.register(cleanup)
if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
print("\n\n🛑 用户中断,正在退出...")
cleanup()
except Exception as e:
print(f"\n💥 严重错误: {e}")
cleanup()
raise
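Since each script keeps its own persistent Chrome profile (bailian_config, hangye_config, zixun_config), they can be run sequentially from a single entry point; the runner below is a minimal editor's sketch, not something shipped in the commit:

# run_all.py -- hypothetical runner, not part of this commit
import subprocess
import sys

for script in ("bailian.py", "hangye.py", "zixun.py"):
    print(f"=== running {script} ===")
    subprocess.run([sys.executable, script], check=False)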