4 changed files with 1594 additions and 0 deletions
Dockerfile (+19)
bailian.py (+525)
hangye.py (+525)
zixun.py (+525)
Dockerfile
@@ -0,0 +1,19 @@
FROM python:3.14-slim

WORKDIR /app

ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# The gcc install step can be skipped if no C extensions need to be compiled
# Copy only requirements.txt and install the Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY ./app .

RUN useradd -m -r appuser && chown -R appuser /app
USER appuser

EXPOSE 8000
CMD ["python", "main.py"]
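Note: the image copies a requirements.txt and an ./app directory (with main.py) that are not part of this diff, and python:3.14-slim ships no Chrome/Chromium, so the browser the three scripts drive would also have to be installed into the image (webdriver-manager only downloads the chromedriver; the commented-out binary_location line in the scripts hints at chromium-browser). A hypothetical requirements.txt inferred from the imports the scripts actually use, not part of this diff:

```
# requirements.txt — sketch inferred from the scripts' imports
pandas
openpyxl
requests
beautifulsoup4
lxml
selenium
webdriver-manager
```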
bailian.py
@@ -0,0 +1,525 @@
# marketmatrix_today_upload_anti_bot.py |
|||
# 功能:抓今日新闻(去重)+ 上传知识库 + 输出 Excel + 【高抗反爬 + I/O 监控】 |
|||
|
|||
import os |
|||
import re |
|||
import json |
|||
import hashlib |
|||
import time |
|||
import random |
|||
from datetime import datetime, timedelta |
|||
from urllib.parse import urljoin, urlparse |
|||
from contextlib import contextmanager |
|||
|
|||
import pandas as pd |
|||
import requests |
|||
from bs4 import BeautifulSoup |
|||
from selenium import webdriver |
|||
from selenium.webdriver.chrome.options import Options |
|||
from selenium.webdriver.chrome.service import Service |
|||
from webdriver_manager.chrome import ChromeDriverManager |
|||
|
|||
# ====== 配置区 ====== |
|||
KB_API_URL = "https://dcapi.homilychart.com/prod/deepchartapi/api/QwenKnowledge/add" |
|||
KB_TOKEN = "d20287d0bb0298c73e540da7e3e1d7e3" |
|||
KB_INDEX_ID = "30xe1fbox1" |
|||
|
|||
BASE_URL = "http://marketmatrix.net" |
|||
LIST_URL = urljoin(BASE_URL, "/news.htm") |
|||
OUTPUT_DIR = "today_news" |
|||
os.makedirs(OUTPUT_DIR, exist_ok=True) |
|||
|
|||
OUTPUT_EXCEL = os.path.join(OUTPUT_DIR, f"today_{datetime.now().strftime('%Y%m%d')}.xlsx") |
|||
DUPLICATE_CACHE_FILE = os.path.join(OUTPUT_DIR, "today_history.json") |
|||
|
|||
# 关键:持久化浏览器配置目录 |
|||
PERSISTENT_PROFILE_DIR = os.path.join(os.getcwd(), "bailian_config") |
|||
PROFILE_CLEAN_THRESHOLD_DAYS = 1 # 1天以上自动清理 |
|||
|
|||
MAX_PAGES = 30 # 适当降低,防深层页风控 |
|||
print(f"📅 系统当前日期: {datetime.now().strftime('%Y-%m-%d')}") |
|||
|
|||
# ====== 全局 driver 单例(✅ 反爬核心)====== |
|||
_driver_instance = None |
|||
|
|||
|
|||
def stealth_driver(driver): |
|||
"""注入 Stealth JS,绕过常见 Bot 检测""" |
|||
try: |
|||
# 移除 webdriver 标志 |
|||
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})") |
|||
# 隐藏语言特征 |
|||
driver.execute_script("Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']})") |
|||
# 隐藏插件列表 |
|||
driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3]})") |
|||
# 隐藏硬件并发数(伪装普通设备) |
|||
driver.execute_script("Object.defineProperty(navigator, 'hardwareConcurrency', {get: () => 8})") |
|||
# 隐藏设备内存(GB) |
|||
driver.execute_script("Object.defineProperty(navigator, 'deviceMemory', {get: () => 8})") |
|||
print("🛡️ Stealth JS 注入成功") |
|||
except Exception as e: |
|||
print(f"⚠️ Stealth JS 注入失败: {e}") |
|||
|
|||
|
|||
def init_persistent_driver(): |
|||
"""初始化持久化 Chrome 实例(仅首次调用)""" |
|||
global _driver_instance |
|||
if _driver_instance is not None: |
|||
return _driver_instance |
|||
|
|||
print("🔧 初始化持久化浏览器(首次启动,需过反爬,请稍候...)") |
|||
|
|||
# 自动清理过期 Profile |
|||
_clean_old_profile() |
|||
|
|||
# 配置 Chrome |
|||
chrome_options = Options() |
|||
chrome_options.add_argument(f"--user-data-dir={PERSISTENT_PROFILE_DIR}") |
|||
chrome_options.add_argument("--profile-directory=Default") |
|||
chrome_options.add_argument("--headless=new") |
|||
chrome_options.add_argument("--no-sandbox") |
|||
chrome_options.add_argument("--disable-dev-shm-usage") |
|||
chrome_options.add_argument("--disable-blink-features=AutomationControlled") |
|||
chrome_options.add_argument("--disable-extensions") |
|||
chrome_options.add_argument("--disable-plugins-discovery") |
|||
chrome_options.add_argument("--disable-features=VizDisplayCompositor") |
|||
chrome_options.add_argument("--disable-gpu") |
|||
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"]) |
|||
chrome_options.add_experimental_option('useAutomationExtension', False) |
|||
# chrome_options.binary_location = "/usr/bin/chromium-browser" |
|||
# 启动 |
|||
service = Service(ChromeDriverManager().install()) |
|||
try: |
|||
_driver_instance = webdriver.Chrome(service=service, options=chrome_options) |
|||
_driver_instance.set_page_load_timeout(30) |
|||
_driver_instance.implicitly_wait(10) |
|||
|
|||
# 关键:访问首页“暖机”,触发反爬验证 |
|||
print("🌐 正在访问首页以通过反爬验证...") |
|||
_driver_instance.get(BASE_URL) |
|||
time.sleep(2 + random.uniform(0.5, 1.5)) |
|||
|
|||
# 注入 Stealth |
|||
stealth_driver(_driver_instance) |
|||
|
|||
# 再访问一次列表页,确保状态稳定 |
|||
_driver_instance.get(LIST_URL) |
|||
time.sleep(1.5 + random.uniform(0.5, 1.0)) |
|||
|
|||
print("✅ 浏览器初始化完成,后续请求将复用上下文") |
|||
return _driver_instance |
|||
except Exception as e: |
|||
if _driver_instance: |
|||
_driver_instance.quit() |
|||
_driver_instance = None |
|||
raise RuntimeError(f"浏览器初始化失败: {e}") |
|||
|
|||
|
|||
def _clean_old_profile(): |
|||
"""清理过期 Profile(防积累)""" |
|||
if not os.path.exists(PERSISTENT_PROFILE_DIR): |
|||
return |
|||
try: |
|||
profile_age = time.time() - os.path.getctime(PERSISTENT_PROFILE_DIR) |
|||
if profile_age > PROFILE_CLEAN_THRESHOLD_DAYS * 86400: |
|||
print(f"🧹 Profile 目录已超 {PROFILE_CLEAN_THRESHOLD_DAYS} 天,正在清理...") |
|||
import shutil |
|||
shutil.rmtree(PERSISTENT_PROFILE_DIR, ignore_errors=True) |
|||
os.makedirs(PERSISTENT_PROFILE_DIR, exist_ok=True) |
|||
print("✅ 已重建干净 Profile") |
|||
except Exception as e: |
|||
print(f"⚠️ Profile 清理失败(继续使用现有): {e}") |
|||
|
|||
|
|||
def fetch_page(url, max_retries=2): |
|||
"""带重试的页面获取(✅ 复用全局 driver)""" |
|||
global _driver_instance |
|||
for attempt in range(max_retries + 1): |
|||
try: |
|||
if _driver_instance is None: |
|||
_driver_instance = init_persistent_driver() |
|||
|
|||
# 拟人化:随机滚动 + 延迟 |
|||
_driver_instance.get(url) |
|||
time.sleep(0.8 + random.uniform(0.3, 0.7)) |
|||
|
|||
# 检查是否被拦截 |
|||
page_source = _driver_instance.page_source |
|||
if "403 Forbidden" in page_source or "challenge-platform" in page_source.lower(): |
|||
raise Exception("被反爬拦截(403/Challenge)") |
|||
|
|||
return page_source |
|||
|
|||
except Exception as e: |
|||
print(f" ⚠️ 尝试 {attempt + 1}/{max_retries + 1} 失败: {e}") |
|||
if attempt < max_retries: |
|||
time.sleep(3 + random.uniform(1, 2)) |
|||
# 重启 driver(极端情况) |
|||
if _driver_instance: |
|||
try: |
|||
_driver_instance.quit() |
|||
except: |
|||
pass |
|||
_driver_instance = None |
|||
else: |
|||
return None |
|||
|
|||
|
|||
# ====== I/O 监控器(保留)====== |
|||
class IOMonitor: |
|||
def __init__(self): |
|||
self.records = {} |
|||
|
|||
@contextmanager |
|||
def io_timer(self, io_type: str, desc: str = ""): |
|||
if io_type not in self.records: |
|||
self.records[io_type] = [] |
|||
start = time.perf_counter() |
|||
try: |
|||
yield |
|||
finally: |
|||
duration = time.perf_counter() - start |
|||
self.records[io_type].append((duration, desc)) |
|||
|
|||
def summary(self): |
|||
print("\n" + "=" * 60) |
|||
print("📊 I/O 耗时总览(反爬优化版)") |
|||
print("=" * 60) |
|||
total_time = 0.0 |
|||
for io_type, records in sorted(self.records.items()): |
|||
count = len(records) |
|||
total = sum(t for t, _ in records) |
|||
avg = total / count if count else 0 |
|||
total_time += total |
|||
print(f"✅ {io_type:<15} | 调用 {count:2d} 次 | 总耗时 {total:6.2f}s | 平均 {avg:5.3f}s") |
|||
if count > 3: |
|||
slowest = sorted(records, key=lambda x: x[0], reverse=True)[:2] |
|||
for i, (t, d) in enumerate(slowest, 1): |
|||
print(f" └─ #{i} 慢: {t:5.2f}s → {d}") |
|||
print("-" * 60) |
|||
print(f"⏱️ I/O 总耗时: {total_time:6.2f}s") |
|||
print("=" * 60) |
|||
|
|||
|
|||
monitor = IOMonitor() |
|||
|
|||
|
|||
# ====== 工具函数(加监控)====== |
|||
def load_history(): |
|||
with monitor.io_timer("file_read", f"load_history: {DUPLICATE_CACHE_FILE}"): |
|||
if os.path.exists(DUPLICATE_CACHE_FILE): |
|||
try: |
|||
with open(DUPLICATE_CACHE_FILE, 'r', encoding='utf-8') as f: |
|||
return set(json.load(f)) |
|||
except Exception as e: |
|||
print(f"⚠️ 历史缓存加载失败: {e}") |
|||
return set() |
|||
|
|||
|
|||
def save_history(history_set): |
|||
with monitor.io_timer("file_write", f"save_history: {DUPLICATE_CACHE_FILE}"): |
|||
try: |
|||
with open(DUPLICATE_CACHE_FILE, 'w', encoding='utf-8') as f: |
|||
json.dump(list(history_set), f, ensure_ascii=False, indent=2) |
|||
print(f"💾 缓存已更新:{len(history_set)} 条") |
|||
except Exception as e: |
|||
print(f"❌ 缓存保存失败: {e}") |
|||
|
|||
|
|||
def generate_fingerprint(title, pub_time): |
|||
raw = f"{title.strip()}|{pub_time.strip()}" |
|||
return hashlib.sha1(raw.encode('utf-8')).hexdigest()[:16] |
|||
|
|||
|
|||
def is_today(pub_time_str: str) -> bool: |
|||
if not pub_time_str: |
|||
return False |
|||
try: |
|||
m = re.search(r'(\d{4})[^\d]+(\d{1,2})[^\d]+(\d{1,2})', pub_time_str) |
|||
if not m: |
|||
return False |
|||
year, month, day = int(m.group(1)), int(m.group(2)), int(m.group(3)) |
|||
curr_year = datetime.now().year |
|||
if year > curr_year + 1 or year < curr_year - 5: |
|||
year = curr_year |
|||
pub_date = datetime(year, month, day).date() |
|||
return pub_date == datetime.now().date() |
|||
except Exception as e: |
|||
print(f"⚠️ 日期解析失败: '{pub_time_str}' → {e}") |
|||
return False |
|||
|
|||
|
|||
def parse_news_list(html, base_url=BASE_URL): |
|||
soup = BeautifulSoup(html, 'lxml') |
|||
items = [] |
|||
for table in soup.find_all('table', width="800", border="0"): |
|||
title_a = table.select_one('font[face="微软雅黑"][style*="font-size: 15pt"] a[href]') |
|||
if not title_a: |
|||
continue |
|||
title = title_a.get_text(strip=True) |
|||
link = urljoin(base_url, title_a['href']) |
|||
parsed = urlparse(link) |
|||
if not parsed.netloc.endswith("marketmatrix.net"): |
|||
continue |
|||
if "/topnews/" not in link: |
|||
continue |
|||
meta_fonts = table.select('font[size="2"]') |
|||
meta_combined = " ".join(f.get_text(strip=True) for f in meta_fonts) |
|||
time_match = "" |
|||
m = re.search(r'(\d{4})[^\d]+(\d{1,2})[^\d]+(\d{1,2})(?:[^\d]+(\d{1,2}:\d{2}))?', meta_combined) |
|||
if m: |
|||
date_part = f"{m.group(1)}.{int(m.group(2)):02d}.{int(m.group(3)):02d}" |
|||
time_part = m.group(4) or "00:00" |
|||
time_match = f"{date_part} {time_part}" |
|||
else: |
|||
m2 = re.search(r'(\d{4})[^\d]+(\d{1,2})[^\d]+(\d{1,2})', meta_combined) |
|||
if m2: |
|||
time_match = f"{m2.group(1)}.{int(m2.group(2)):02d}.{int(m2.group(3)):02d}" |
|||
category = "" |
|||
for txt in meta_fonts: |
|||
t = txt.get_text(strip=True).replace("主题:", "") |
|||
if re.search(r'\d{4}|编辑|新闻源|要点', t): |
|||
continue |
|||
if 2 < len(t) < 30: |
|||
category = t |
|||
break |
|||
items.append({ |
|||
"标题": title, |
|||
"分类": category, |
|||
"发布时间": time_match, |
|||
"原文链接": link |
|||
}) |
|||
return items |
|||
|
|||
|
|||
def extract_article_content(html): |
|||
if not html: |
|||
return "(访问失败)" |
|||
soup = BeautifulSoup(html, 'lxml') |
|||
editor_p = None |
|||
for p in soup.find_all(['p', 'font']): |
|||
txt = p.get_text() |
|||
if "编辑:" in txt and "发布时间:" in txt: |
|||
editor_p = p |
|||
break |
|||
if editor_p: |
|||
container = editor_p |
|||
for _ in range(3): |
|||
if container.parent and container.parent.name == 'td': |
|||
container = container.parent |
|||
break |
|||
container = container.parent |
|||
if container and container.name == 'td': |
|||
paras = [] |
|||
for p in container.find_all('p'): |
|||
t = p.get_text(strip=True) |
|||
if len(t) > 20 and not any(skip in t for skip in [ |
|||
"编辑:", "发布时间:", "主题:", "新闻源:", "要点:", "开户", "保证金", "©" |
|||
]): |
|||
paras.append(t) |
|||
if paras: |
|||
return "\n".join(paras) |
|||
fallback = [p.get_text(strip=True) for p in soup.find_all('p') if 30 <= len(p.get_text()) <= 500] |
|||
return "\n".join(fallback[:15]) if fallback else "(提取失败)" |
|||
|
|||
|
|||
def sanitize_filename(name: str, max_len=80) -> str: |
|||
return re.sub(r'[\\/:*?"<>|\r\n\t]', ' ', name).strip()[:max_len] or "untitled" |
|||
|
|||
|
|||
def upload_to_knowledge_base_from_content(filename: str, content: str) -> dict: |
|||
print(f"📤 正在上传: {filename} (内存)") |
|||
try: |
|||
content_bytes = content.encode('utf-8') |
|||
with monitor.io_timer("network_write", f"upload: {filename}"): |
|||
files = {'file': (filename, content_bytes, 'text/markdown')} |
|||
data = {'indexId': KB_INDEX_ID} |
|||
headers = {'token': KB_TOKEN} |
|||
response = requests.post(KB_API_URL, files=files, data=data, headers=headers, timeout=30) |
|||
print(f" ← HTTP {response.status_code}") |
|||
try: |
|||
res_json = response.json() |
|||
code = res_json.get("code") |
|||
msg = res_json.get("message") or res_json.get("error") or "未知错误" |
|||
if code == 200 and res_json.get("fileId"): |
|||
print(f" ✅ 上传成功 → fileId: {res_json['fileId']}") |
|||
return {"code": 200, "fileId": res_json["fileId"], "message": "OK"} |
|||
else: |
|||
print(f" ⚠️ 业务失败 → code: {code}, msg: {msg}") |
|||
return {"code": code or -1, "fileId": "", "message": msg} |
|||
except Exception as json_e: |
|||
print(f" ❌ JSON 解析失败: {json_e}, 原始响应: {response.text[:200]}") |
|||
return {"code": -2, "fileId": "", "message": f"非JSON响应: {response.text[:100]}"} |
|||
except Exception as e: |
|||
print(f" ❌ 上传异常: {e}") |
|||
return {"code": -1, "fileId": "", "message": str(e)} |
|||
|
|||
|
|||
# ====== 主流程 ====== |
|||
def get_all_list_pages(): |
|||
pages = [LIST_URL] |
|||
current_url = LIST_URL |
|||
visited = {LIST_URL} |
|||
print("🔗 探测分页中(从首页开始)...") |
|||
|
|||
for i in range(1, MAX_PAGES): |
|||
html = fetch_page(current_url) |
|||
if not html: |
|||
break |
|||
|
|||
soup = BeautifulSoup(html, 'lxml') |
|||
more_link = soup.find('a', string=re.compile(r'查看更多', re.IGNORECASE)) |
|||
if not more_link: |
|||
more_link = soup.find('a', href=re.compile(r'news-list-\d+\.htm', re.IGNORECASE)) |
|||
if not more_link or not more_link.get('href'): |
|||
break |
|||
|
|||
next_href = more_link['href'].strip() |
|||
next_url = urljoin(current_url, next_href) |
|||
if not next_url.startswith(BASE_URL) or next_url in visited: |
|||
break |
|||
|
|||
visited.add(next_url) |
|||
pages.append(next_url) |
|||
print(f" ➕ {len(pages):2d}. {next_url}") |
|||
current_url = next_url |
|||
|
|||
return pages |
|||
|
|||
|
|||
def main(): |
|||
overall_start = time.perf_counter() |
|||
print("▶ 启动高抗反爬抓取流程...") |
|||
|
|||
# 1. 获取列表页(复用 driver) |
|||
print("\n[阶段1] 获取新闻列表(复用浏览器上下文)") |
|||
list_pages = get_all_list_pages() |
|||
all_items = [] |
|||
|
|||
for i, url in enumerate(list_pages, 1): |
|||
print(f"[{i}/{len(list_pages)}] 解析 {urlparse(url).path or '/'}") |
|||
html = fetch_page(url) |
|||
if not html: |
|||
continue |
|||
base_for_links = BASE_URL if "/list/" not in url else urljoin(BASE_URL, "/list/") |
|||
items = parse_news_list(html, base_url=base_for_links) |
|||
all_items.extend(items) |
|||
|
|||
print(f"✅ 共提取 {len(all_items)} 条原始新闻") |
|||
|
|||
# 2. 过滤今日 & 去重 |
|||
print("\n[阶段2] 过滤今日 & 去重") |
|||
history = load_history() |
|||
new_items = [] |
|||
for item in all_items: |
|||
if not is_today(item["发布时间"]): |
|||
continue |
|||
fp = generate_fingerprint(item["标题"], item["发布时间"]) |
|||
if fp in history: |
|||
print(f"⏭️ 跳过重复: {item['标题'][:30]}...") |
|||
continue |
|||
new_items.append(item) |
|||
history.add(fp) |
|||
|
|||
print(f"🆕 今日新增 {len(new_items)} 条新闻") |
|||
|
|||
# 3. 抓取正文 + 上传 |
|||
print("\n[阶段3] 抓取正文 & 上传(内存上传)") |
|||
results = [] |
|||
    for i, item in enumerate(new_items, 1):
        title = item["标题"]
        # recompute the fingerprint for this item; the original reused the stale fp from the phase-2 loop, so every row got the same value
        fp = generate_fingerprint(item["标题"], item["发布时间"])
        print(f"\n[{i}/{len(new_items)}] {title[:50]}...")
|||
|
|||
try: |
|||
with monitor.io_timer("network_read", f"article: {title[:20]}"): |
|||
html = fetch_page(item["原文链接"]) |
|||
content = extract_article_content(html) if html else "(访问失败)" |
|||
item["正文内容"] = content |
|||
|
|||
# 构建 Markdown 内容(内存中) |
|||
md_content = f"""# {title} |
|||
|
|||
- 分类:{item['分类']} |
|||
- 发布时间:{item['发布时间']} |
|||
- 原文链接:{item['原文链接']} |
|||
|
|||
--- |
|||
|
|||
{content} |
|||
""" |
|||
|
|||
# 保存到磁盘(可选,用于审计) |
|||
safe_title = sanitize_filename(title) |
|||
md_file = f"{i:02d}_{safe_title}.md" |
|||
md_path = os.path.join(OUTPUT_DIR, md_file) |
|||
with monitor.io_timer("file_write", f"save_md: {md_file}"): |
|||
with open(md_path, "w", encoding="utf-8") as f: |
|||
f.write(md_content) |
|||
print(f" 💾 已保存:{md_file}") |
|||
|
|||
# 上传(内存) |
|||
res = upload_to_knowledge_base_from_content(md_file, md_content) |
|||
item.update({ |
|||
"知识库FileId": res.get("fileId", ""), |
|||
"上传状态": "✅" if res.get("code") == 200 else "❌", |
|||
"上传信息": res.get("message", "")[:100], |
|||
"指纹": fp |
|||
}) |
|||
results.append(item) |
|||
|
|||
except Exception as e: |
|||
print(f"❌ 处理失败: {title[:30]} | {e}") |
|||
item.update({ |
|||
"知识库FileId": "", |
|||
"上传状态": "❌ 处理失败", |
|||
"上传信息": str(e)[:100], |
|||
"指纹": fp |
|||
}) |
|||
results.append(item) |
|||
continue |
|||
|
|||
# 4. 保存 & 退出 |
|||
print("\n[阶段4] 保存缓存 & Excel") |
|||
save_history(history) |
|||
if results: |
|||
with monitor.io_timer("file_write", "save_excel"): |
|||
df = pd.DataFrame(results) |
|||
df.to_excel(OUTPUT_EXCEL, index=False, engine='openpyxl') |
|||
print(f"\n🎉 完成!今日新增 {len(results)} 条,Excel: {OUTPUT_EXCEL}") |
|||
else: |
|||
print(f"\nℹ️ 今日暂无新新闻发布(已探测 {len(list_pages)} 页)") |
|||
|
|||
# 关键:不 quit driver!保留上下文供下次使用 |
|||
print("📌 浏览器上下文已保留,下次运行将复用(加速)") |
|||
|
|||
# 输出 I/O 总结 |
|||
monitor.summary() |
|||
total_elapsed = time.perf_counter() - overall_start |
|||
print(f"\n🎯 总运行时间: {total_elapsed:.2f}s") |
|||
|
|||
|
|||
# ====== 优雅退出(保留 driver)====== |
|||
def cleanup(): |
|||
"""进程退出时清理(不 quit driver,除非强制)""" |
|||
global _driver_instance |
|||
if _driver_instance: |
|||
print("💡 提示:为加速下次运行,浏览器上下文已保留。") |
|||
print(" 如需彻底清理,请手动删除目录:", PERSISTENT_PROFILE_DIR) |
|||
# 不 quit,保留状态 |
|||
# _driver_instance.quit() |
|||
|
|||
|
|||
import atexit |
|||
|
|||
atexit.register(cleanup) |
|||
|
|||
if __name__ == "__main__": |
|||
try: |
|||
main() |
|||
except KeyboardInterrupt: |
|||
print("\n\n🛑 用户中断,正在退出...") |
|||
cleanup() |
|||
except Exception as e: |
|||
print(f"\n💥 严重错误: {e}") |
|||
cleanup() |
|||
raise |
|||
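The next two files, hangye.py and zixun.py, are copies of bailian.py that differ only in LIST_URL (/news.htm vs /trading.htm vs /macro.htm) and PERSISTENT_PROFILE_DIR (bailian_config / hangye_config / zixun_config). A minimal sketch of collapsing the three into one parameterized config block, assuming a CHANNEL environment variable (the variable name and the CHANNELS mapping are illustrative, not part of the diff):

```python
import os
from urllib.parse import urljoin

BASE_URL = "http://marketmatrix.net"

# channel name -> (list page, persistent Chrome profile dir), taken from the three copies
CHANNELS = {
    "bailian": ("/news.htm", "bailian_config"),
    "hangye": ("/trading.htm", "hangye_config"),
    "zixun": ("/macro.htm", "zixun_config"),
}

channel = os.getenv("CHANNEL", "bailian")
list_path, profile_dir = CHANNELS[channel]

LIST_URL = urljoin(BASE_URL, list_path)
PERSISTENT_PROFILE_DIR = os.path.join(os.getcwd(), profile_dir)
```

Everything below the config block (driver handling, parsing, upload, Excel export) could then be shared instead of maintained in triplicate.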
hangye.py
@@ -0,0 +1,525 @@
# marketmatrix_today_upload_anti_bot.py |
|||
# 功能:抓今日新闻(去重)+ 上传知识库 + 输出 Excel + 【高抗反爬 + I/O 监控】 |
|||
|
|||
import os |
|||
import re |
|||
import json |
|||
import hashlib |
|||
import time |
|||
import random |
|||
from datetime import datetime, timedelta |
|||
from urllib.parse import urljoin, urlparse |
|||
from contextlib import contextmanager |
|||
|
|||
import pandas as pd |
|||
import requests |
|||
from bs4 import BeautifulSoup |
|||
from selenium import webdriver |
|||
from selenium.webdriver.chrome.options import Options |
|||
from selenium.webdriver.chrome.service import Service |
|||
from webdriver_manager.chrome import ChromeDriverManager |
|||
|
|||
# ====== 配置区 ====== |
|||
KB_API_URL = "https://dcapi.homilychart.com/prod/deepchartapi/api/QwenKnowledge/add" |
|||
KB_TOKEN = "d20287d0bb0298c73e540da7e3e1d7e3" |
|||
KB_INDEX_ID = "30xe1fbox1" |
|||
|
|||
BASE_URL = "http://marketmatrix.net" |
|||
LIST_URL = urljoin(BASE_URL, "/trading.htm") |
|||
OUTPUT_DIR = "today_news" |
|||
os.makedirs(OUTPUT_DIR, exist_ok=True) |
|||
|
|||
OUTPUT_EXCEL = os.path.join(OUTPUT_DIR, f"today_{datetime.now().strftime('%Y%m%d')}.xlsx") |
|||
DUPLICATE_CACHE_FILE = os.path.join(OUTPUT_DIR, "today_history.json") |
|||
|
|||
# 关键:持久化浏览器配置目录 |
|||
PERSISTENT_PROFILE_DIR = os.path.join(os.getcwd(), "hangye_config") |
|||
PROFILE_CLEAN_THRESHOLD_DAYS = 1 # 1天以上自动清理 |
|||
|
|||
MAX_PAGES = 30 # 适当降低,防深层页风控 |
|||
print(f"📅 系统当前日期: {datetime.now().strftime('%Y-%m-%d')}") |
|||
|
|||
# ====== 全局 driver 单例(✅ 反爬核心)====== |
|||
_driver_instance = None |
|||
|
|||
|
|||
def stealth_driver(driver): |
|||
"""注入 Stealth JS,绕过常见 Bot 检测""" |
|||
try: |
|||
# 移除 webdriver 标志 |
|||
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})") |
|||
# 隐藏语言特征 |
|||
driver.execute_script("Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']})") |
|||
# 隐藏插件列表 |
|||
driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3]})") |
|||
# 隐藏硬件并发数(伪装普通设备) |
|||
driver.execute_script("Object.defineProperty(navigator, 'hardwareConcurrency', {get: () => 8})") |
|||
# 隐藏设备内存(GB) |
|||
driver.execute_script("Object.defineProperty(navigator, 'deviceMemory', {get: () => 8})") |
|||
print("🛡️ Stealth JS 注入成功") |
|||
except Exception as e: |
|||
print(f"⚠️ Stealth JS 注入失败: {e}") |
|||
|
|||
|
|||
def init_persistent_driver(): |
|||
"""初始化持久化 Chrome 实例(仅首次调用)""" |
|||
global _driver_instance |
|||
if _driver_instance is not None: |
|||
return _driver_instance |
|||
|
|||
print("🔧 初始化持久化浏览器(首次启动,需过反爬,请稍候...)") |
|||
|
|||
# 自动清理过期 Profile |
|||
_clean_old_profile() |
|||
|
|||
# 配置 Chrome |
|||
chrome_options = Options() |
|||
chrome_options.add_argument(f"--user-data-dir={PERSISTENT_PROFILE_DIR}") |
|||
chrome_options.add_argument("--profile-directory=Default") |
|||
chrome_options.add_argument("--headless=new") |
|||
chrome_options.add_argument("--no-sandbox") |
|||
chrome_options.add_argument("--disable-dev-shm-usage") |
|||
chrome_options.add_argument("--disable-blink-features=AutomationControlled") |
|||
chrome_options.add_argument("--disable-extensions") |
|||
chrome_options.add_argument("--disable-plugins-discovery") |
|||
chrome_options.add_argument("--disable-features=VizDisplayCompositor") |
|||
chrome_options.add_argument("--disable-gpu") |
|||
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"]) |
|||
chrome_options.add_experimental_option('useAutomationExtension', False) |
|||
# chrome_options.binary_location = "/usr/bin/chromium-browser" |
|||
# 启动 |
|||
service = Service(ChromeDriverManager().install()) |
|||
try: |
|||
_driver_instance = webdriver.Chrome(service=service, options=chrome_options) |
|||
_driver_instance.set_page_load_timeout(30) |
|||
_driver_instance.implicitly_wait(10) |
|||
|
|||
# 关键:访问首页“暖机”,触发反爬验证 |
|||
print("🌐 正在访问首页以通过反爬验证...") |
|||
_driver_instance.get(BASE_URL) |
|||
time.sleep(2 + random.uniform(0.5, 1.5)) |
|||
|
|||
# 注入 Stealth |
|||
stealth_driver(_driver_instance) |
|||
|
|||
# 再访问一次列表页,确保状态稳定 |
|||
_driver_instance.get(LIST_URL) |
|||
time.sleep(1.5 + random.uniform(0.5, 1.0)) |
|||
|
|||
print("✅ 浏览器初始化完成,后续请求将复用上下文") |
|||
return _driver_instance |
|||
except Exception as e: |
|||
if _driver_instance: |
|||
_driver_instance.quit() |
|||
_driver_instance = None |
|||
raise RuntimeError(f"浏览器初始化失败: {e}") |
|||
|
|||
|
|||
def _clean_old_profile(): |
|||
"""清理过期 Profile(防积累)""" |
|||
if not os.path.exists(PERSISTENT_PROFILE_DIR): |
|||
return |
|||
try: |
|||
profile_age = time.time() - os.path.getctime(PERSISTENT_PROFILE_DIR) |
|||
if profile_age > PROFILE_CLEAN_THRESHOLD_DAYS * 86400: |
|||
print(f"🧹 Profile 目录已超 {PROFILE_CLEAN_THRESHOLD_DAYS} 天,正在清理...") |
|||
import shutil |
|||
shutil.rmtree(PERSISTENT_PROFILE_DIR, ignore_errors=True) |
|||
os.makedirs(PERSISTENT_PROFILE_DIR, exist_ok=True) |
|||
print("✅ 已重建干净 Profile") |
|||
except Exception as e: |
|||
print(f"⚠️ Profile 清理失败(继续使用现有): {e}") |
|||
|
|||
|
|||
def fetch_page(url, max_retries=2): |
|||
"""带重试的页面获取(✅ 复用全局 driver)""" |
|||
global _driver_instance |
|||
for attempt in range(max_retries + 1): |
|||
try: |
|||
if _driver_instance is None: |
|||
_driver_instance = init_persistent_driver() |
|||
|
|||
# 拟人化:随机滚动 + 延迟 |
|||
_driver_instance.get(url) |
|||
time.sleep(0.8 + random.uniform(0.3, 0.7)) |
|||
|
|||
# 检查是否被拦截 |
|||
page_source = _driver_instance.page_source |
|||
if "403 Forbidden" in page_source or "challenge-platform" in page_source.lower(): |
|||
raise Exception("被反爬拦截(403/Challenge)") |
|||
|
|||
return page_source |
|||
|
|||
except Exception as e: |
|||
print(f" ⚠️ 尝试 {attempt + 1}/{max_retries + 1} 失败: {e}") |
|||
if attempt < max_retries: |
|||
time.sleep(3 + random.uniform(1, 2)) |
|||
# 重启 driver(极端情况) |
|||
if _driver_instance: |
|||
try: |
|||
_driver_instance.quit() |
|||
except: |
|||
pass |
|||
_driver_instance = None |
|||
else: |
|||
return None |
|||
|
|||
|
|||
# ====== I/O 监控器(保留)====== |
|||
class IOMonitor: |
|||
def __init__(self): |
|||
self.records = {} |
|||
|
|||
@contextmanager |
|||
def io_timer(self, io_type: str, desc: str = ""): |
|||
if io_type not in self.records: |
|||
self.records[io_type] = [] |
|||
start = time.perf_counter() |
|||
try: |
|||
yield |
|||
finally: |
|||
duration = time.perf_counter() - start |
|||
self.records[io_type].append((duration, desc)) |
|||
|
|||
def summary(self): |
|||
print("\n" + "=" * 60) |
|||
print("📊 I/O 耗时总览(反爬优化版)") |
|||
print("=" * 60) |
|||
total_time = 0.0 |
|||
for io_type, records in sorted(self.records.items()): |
|||
count = len(records) |
|||
total = sum(t for t, _ in records) |
|||
avg = total / count if count else 0 |
|||
total_time += total |
|||
print(f"✅ {io_type:<15} | 调用 {count:2d} 次 | 总耗时 {total:6.2f}s | 平均 {avg:5.3f}s") |
|||
if count > 3: |
|||
slowest = sorted(records, key=lambda x: x[0], reverse=True)[:2] |
|||
for i, (t, d) in enumerate(slowest, 1): |
|||
print(f" └─ #{i} 慢: {t:5.2f}s → {d}") |
|||
print("-" * 60) |
|||
print(f"⏱️ I/O 总耗时: {total_time:6.2f}s") |
|||
print("=" * 60) |
|||
|
|||
|
|||
monitor = IOMonitor() |
|||
|
|||
|
|||
# ====== 工具函数(加监控)====== |
|||
def load_history(): |
|||
with monitor.io_timer("file_read", f"load_history: {DUPLICATE_CACHE_FILE}"): |
|||
if os.path.exists(DUPLICATE_CACHE_FILE): |
|||
try: |
|||
with open(DUPLICATE_CACHE_FILE, 'r', encoding='utf-8') as f: |
|||
return set(json.load(f)) |
|||
except Exception as e: |
|||
print(f"⚠️ 历史缓存加载失败: {e}") |
|||
return set() |
|||
|
|||
|
|||
def save_history(history_set): |
|||
with monitor.io_timer("file_write", f"save_history: {DUPLICATE_CACHE_FILE}"): |
|||
try: |
|||
with open(DUPLICATE_CACHE_FILE, 'w', encoding='utf-8') as f: |
|||
json.dump(list(history_set), f, ensure_ascii=False, indent=2) |
|||
print(f"💾 缓存已更新:{len(history_set)} 条") |
|||
except Exception as e: |
|||
print(f"❌ 缓存保存失败: {e}") |
|||
|
|||
|
|||
def generate_fingerprint(title, pub_time): |
|||
raw = f"{title.strip()}|{pub_time.strip()}" |
|||
return hashlib.sha1(raw.encode('utf-8')).hexdigest()[:16] |
|||
|
|||
|
|||
def is_today(pub_time_str: str) -> bool: |
|||
if not pub_time_str: |
|||
return False |
|||
try: |
|||
m = re.search(r'(\d{4})[^\d]+(\d{1,2})[^\d]+(\d{1,2})', pub_time_str) |
|||
if not m: |
|||
return False |
|||
year, month, day = int(m.group(1)), int(m.group(2)), int(m.group(3)) |
|||
curr_year = datetime.now().year |
|||
if year > curr_year + 1 or year < curr_year - 5: |
|||
year = curr_year |
|||
pub_date = datetime(year, month, day).date() |
|||
return pub_date == datetime.now().date() |
|||
except Exception as e: |
|||
print(f"⚠️ 日期解析失败: '{pub_time_str}' → {e}") |
|||
return False |
|||
|
|||
|
|||
def parse_news_list(html, base_url=BASE_URL): |
|||
soup = BeautifulSoup(html, 'lxml') |
|||
items = [] |
|||
for table in soup.find_all('table', width="800", border="0"): |
|||
title_a = table.select_one('font[face="微软雅黑"][style*="font-size: 15pt"] a[href]') |
|||
if not title_a: |
|||
continue |
|||
title = title_a.get_text(strip=True) |
|||
link = urljoin(base_url, title_a['href']) |
|||
parsed = urlparse(link) |
|||
if not parsed.netloc.endswith("marketmatrix.net"): |
|||
continue |
|||
if "/topnews/" not in link: |
|||
continue |
|||
meta_fonts = table.select('font[size="2"]') |
|||
meta_combined = " ".join(f.get_text(strip=True) for f in meta_fonts) |
|||
time_match = "" |
|||
m = re.search(r'(\d{4})[^\d]+(\d{1,2})[^\d]+(\d{1,2})(?:[^\d]+(\d{1,2}:\d{2}))?', meta_combined) |
|||
if m: |
|||
date_part = f"{m.group(1)}.{int(m.group(2)):02d}.{int(m.group(3)):02d}" |
|||
time_part = m.group(4) or "00:00" |
|||
time_match = f"{date_part} {time_part}" |
|||
else: |
|||
m2 = re.search(r'(\d{4})[^\d]+(\d{1,2})[^\d]+(\d{1,2})', meta_combined) |
|||
if m2: |
|||
time_match = f"{m2.group(1)}.{int(m2.group(2)):02d}.{int(m2.group(3)):02d}" |
|||
category = "" |
|||
for txt in meta_fonts: |
|||
t = txt.get_text(strip=True).replace("主题:", "") |
|||
if re.search(r'\d{4}|编辑|新闻源|要点', t): |
|||
continue |
|||
if 2 < len(t) < 30: |
|||
category = t |
|||
break |
|||
items.append({ |
|||
"标题": title, |
|||
"分类": category, |
|||
"发布时间": time_match, |
|||
"原文链接": link |
|||
}) |
|||
return items |
|||
|
|||
|
|||
def extract_article_content(html): |
|||
if not html: |
|||
return "(访问失败)" |
|||
soup = BeautifulSoup(html, 'lxml') |
|||
editor_p = None |
|||
for p in soup.find_all(['p', 'font']): |
|||
txt = p.get_text() |
|||
if "编辑:" in txt and "发布时间:" in txt: |
|||
editor_p = p |
|||
break |
|||
if editor_p: |
|||
container = editor_p |
|||
for _ in range(3): |
|||
if container.parent and container.parent.name == 'td': |
|||
container = container.parent |
|||
break |
|||
container = container.parent |
|||
if container and container.name == 'td': |
|||
paras = [] |
|||
for p in container.find_all('p'): |
|||
t = p.get_text(strip=True) |
|||
if len(t) > 20 and not any(skip in t for skip in [ |
|||
"编辑:", "发布时间:", "主题:", "新闻源:", "要点:", "开户", "保证金", "©" |
|||
]): |
|||
paras.append(t) |
|||
if paras: |
|||
return "\n".join(paras) |
|||
fallback = [p.get_text(strip=True) for p in soup.find_all('p') if 30 <= len(p.get_text()) <= 500] |
|||
return "\n".join(fallback[:15]) if fallback else "(提取失败)" |
|||
|
|||
|
|||
def sanitize_filename(name: str, max_len=80) -> str: |
|||
return re.sub(r'[\\/:*?"<>|\r\n\t]', ' ', name).strip()[:max_len] or "untitled" |
|||
|
|||
|
|||
def upload_to_knowledge_base_from_content(filename: str, content: str) -> dict: |
|||
print(f"📤 正在上传: {filename} (内存)") |
|||
try: |
|||
content_bytes = content.encode('utf-8') |
|||
with monitor.io_timer("network_write", f"upload: {filename}"): |
|||
files = {'file': (filename, content_bytes, 'text/markdown')} |
|||
data = {'indexId': KB_INDEX_ID} |
|||
headers = {'token': KB_TOKEN} |
|||
response = requests.post(KB_API_URL, files=files, data=data, headers=headers, timeout=30) |
|||
print(f" ← HTTP {response.status_code}") |
|||
try: |
|||
res_json = response.json() |
|||
code = res_json.get("code") |
|||
msg = res_json.get("message") or res_json.get("error") or "未知错误" |
|||
if code == 200 and res_json.get("fileId"): |
|||
print(f" ✅ 上传成功 → fileId: {res_json['fileId']}") |
|||
return {"code": 200, "fileId": res_json["fileId"], "message": "OK"} |
|||
else: |
|||
print(f" ⚠️ 业务失败 → code: {code}, msg: {msg}") |
|||
return {"code": code or -1, "fileId": "", "message": msg} |
|||
except Exception as json_e: |
|||
print(f" ❌ JSON 解析失败: {json_e}, 原始响应: {response.text[:200]}") |
|||
return {"code": -2, "fileId": "", "message": f"非JSON响应: {response.text[:100]}"} |
|||
except Exception as e: |
|||
print(f" ❌ 上传异常: {e}") |
|||
return {"code": -1, "fileId": "", "message": str(e)} |
|||
|
|||
|
|||
# ====== 主流程 ====== |
|||
def get_all_list_pages(): |
|||
pages = [LIST_URL] |
|||
current_url = LIST_URL |
|||
visited = {LIST_URL} |
|||
print("🔗 探测分页中(从首页开始)...") |
|||
|
|||
for i in range(1, MAX_PAGES): |
|||
html = fetch_page(current_url) |
|||
if not html: |
|||
break |
|||
|
|||
soup = BeautifulSoup(html, 'lxml') |
|||
more_link = soup.find('a', string=re.compile(r'查看更多', re.IGNORECASE)) |
|||
if not more_link: |
|||
more_link = soup.find('a', href=re.compile(r'news-list-\d+\.htm', re.IGNORECASE)) |
|||
if not more_link or not more_link.get('href'): |
|||
break |
|||
|
|||
next_href = more_link['href'].strip() |
|||
next_url = urljoin(current_url, next_href) |
|||
if not next_url.startswith(BASE_URL) or next_url in visited: |
|||
break |
|||
|
|||
visited.add(next_url) |
|||
pages.append(next_url) |
|||
print(f" ➕ {len(pages):2d}. {next_url}") |
|||
current_url = next_url |
|||
|
|||
return pages |
|||
|
|||
|
|||
def main(): |
|||
overall_start = time.perf_counter() |
|||
print("▶ 启动高抗反爬抓取流程...") |
|||
|
|||
# 1. 获取列表页(复用 driver) |
|||
print("\n[阶段1] 获取新闻列表(复用浏览器上下文)") |
|||
list_pages = get_all_list_pages() |
|||
all_items = [] |
|||
|
|||
for i, url in enumerate(list_pages, 1): |
|||
print(f"[{i}/{len(list_pages)}] 解析 {urlparse(url).path or '/'}") |
|||
html = fetch_page(url) |
|||
if not html: |
|||
continue |
|||
base_for_links = BASE_URL if "/list/" not in url else urljoin(BASE_URL, "/list/") |
|||
items = parse_news_list(html, base_url=base_for_links) |
|||
all_items.extend(items) |
|||
|
|||
print(f"✅ 共提取 {len(all_items)} 条原始新闻") |
|||
|
|||
# 2. 过滤今日 & 去重 |
|||
print("\n[阶段2] 过滤今日 & 去重") |
|||
history = load_history() |
|||
new_items = [] |
|||
for item in all_items: |
|||
if not is_today(item["发布时间"]): |
|||
continue |
|||
fp = generate_fingerprint(item["标题"], item["发布时间"]) |
|||
if fp in history: |
|||
print(f"⏭️ 跳过重复: {item['标题'][:30]}...") |
|||
continue |
|||
new_items.append(item) |
|||
history.add(fp) |
|||
|
|||
print(f"🆕 今日新增 {len(new_items)} 条新闻") |
|||
|
|||
# 3. 抓取正文 + 上传 |
|||
print("\n[阶段3] 抓取正文 & 上传(内存上传)") |
|||
results = [] |
|||
    for i, item in enumerate(new_items, 1):
        title = item["标题"]
        # recompute the fingerprint for this item; the original reused the stale fp from the phase-2 loop, so every row got the same value
        fp = generate_fingerprint(item["标题"], item["发布时间"])
        print(f"\n[{i}/{len(new_items)}] {title[:50]}...")
|||
|
|||
try: |
|||
with monitor.io_timer("network_read", f"article: {title[:20]}"): |
|||
html = fetch_page(item["原文链接"]) |
|||
content = extract_article_content(html) if html else "(访问失败)" |
|||
item["正文内容"] = content |
|||
|
|||
# 构建 Markdown 内容(内存中) |
|||
md_content = f"""# {title} |
|||
|
|||
- 分类:{item['分类']} |
|||
- 发布时间:{item['发布时间']} |
|||
- 原文链接:{item['原文链接']} |
|||
|
|||
--- |
|||
|
|||
{content} |
|||
""" |
|||
|
|||
# 保存到磁盘(可选,用于审计) |
|||
safe_title = sanitize_filename(title) |
|||
md_file = f"{i:02d}_{safe_title}.md" |
|||
md_path = os.path.join(OUTPUT_DIR, md_file) |
|||
with monitor.io_timer("file_write", f"save_md: {md_file}"): |
|||
with open(md_path, "w", encoding="utf-8") as f: |
|||
f.write(md_content) |
|||
print(f" 💾 已保存:{md_file}") |
|||
|
|||
# 上传(内存) |
|||
res = upload_to_knowledge_base_from_content(md_file, md_content) |
|||
item.update({ |
|||
"知识库FileId": res.get("fileId", ""), |
|||
"上传状态": "✅" if res.get("code") == 200 else "❌", |
|||
"上传信息": res.get("message", "")[:100], |
|||
"指纹": fp |
|||
}) |
|||
results.append(item) |
|||
|
|||
except Exception as e: |
|||
print(f"❌ 处理失败: {title[:30]} | {e}") |
|||
item.update({ |
|||
"知识库FileId": "", |
|||
"上传状态": "❌ 处理失败", |
|||
"上传信息": str(e)[:100], |
|||
"指纹": fp |
|||
}) |
|||
results.append(item) |
|||
continue |
|||
|
|||
# 4. 保存 & 退出 |
|||
print("\n[阶段4] 保存缓存 & Excel") |
|||
save_history(history) |
|||
if results: |
|||
with monitor.io_timer("file_write", "save_excel"): |
|||
df = pd.DataFrame(results) |
|||
df.to_excel(OUTPUT_EXCEL, index=False, engine='openpyxl') |
|||
print(f"\n🎉 完成!今日新增 {len(results)} 条,Excel: {OUTPUT_EXCEL}") |
|||
else: |
|||
print(f"\nℹ️ 今日暂无新新闻发布(已探测 {len(list_pages)} 页)") |
|||
|
|||
# 关键:不 quit driver!保留上下文供下次使用 |
|||
print("📌 浏览器上下文已保留,下次运行将复用(加速)") |
|||
|
|||
# 输出 I/O 总结 |
|||
monitor.summary() |
|||
total_elapsed = time.perf_counter() - overall_start |
|||
print(f"\n🎯 总运行时间: {total_elapsed:.2f}s") |
|||
|
|||
|
|||
# ====== 优雅退出(保留 driver)====== |
|||
def cleanup(): |
|||
"""进程退出时清理(不 quit driver,除非强制)""" |
|||
global _driver_instance |
|||
if _driver_instance: |
|||
print("💡 提示:为加速下次运行,浏览器上下文已保留。") |
|||
print(" 如需彻底清理,请手动删除目录:", PERSISTENT_PROFILE_DIR) |
|||
# 不 quit,保留状态 |
|||
# _driver_instance.quit() |
|||
|
|||
|
|||
import atexit |
|||
|
|||
atexit.register(cleanup) |
|||
|
|||
if __name__ == "__main__": |
|||
try: |
|||
main() |
|||
except KeyboardInterrupt: |
|||
print("\n\n🛑 用户中断,正在退出...") |
|||
cleanup() |
|||
except Exception as e: |
|||
print(f"\n💥 严重错误: {e}") |
|||
cleanup() |
|||
raise |
|||
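All three copies also share OUTPUT_DIR = "today_news", the same Excel name (today_YYYYMMDD.xlsx), the same Markdown file names, and the same today_history.json dedup cache, so whichever script runs last overwrites the others' Excel and fingerprints are deduplicated across channels. If per-channel outputs are wanted, a sketch under the same hypothetical CHANNEL assumption as above:

```python
import os
from datetime import datetime

CHANNEL = os.getenv("CHANNEL", "hangye")

# keep each channel's files in its own subdirectory, e.g. today_news/hangye/
OUTPUT_DIR = os.path.join("today_news", CHANNEL)
os.makedirs(OUTPUT_DIR, exist_ok=True)

OUTPUT_EXCEL = os.path.join(OUTPUT_DIR, f"today_{datetime.now().strftime('%Y%m%d')}.xlsx")
DUPLICATE_CACHE_FILE = os.path.join(OUTPUT_DIR, "today_history.json")
```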
zixun.py
@@ -0,0 +1,525 @@
# marketmatrix_today_upload_anti_bot.py |
|||
# 功能:抓今日新闻(去重)+ 上传知识库 + 输出 Excel + 【高抗反爬 + I/O 监控】 |
|||
|
|||
import os |
|||
import re |
|||
import json |
|||
import hashlib |
|||
import time |
|||
import random |
|||
from datetime import datetime, timedelta |
|||
from urllib.parse import urljoin, urlparse |
|||
from contextlib import contextmanager |
|||
|
|||
import pandas as pd |
|||
import requests |
|||
from bs4 import BeautifulSoup |
|||
from selenium import webdriver |
|||
from selenium.webdriver.chrome.options import Options |
|||
from selenium.webdriver.chrome.service import Service |
|||
from webdriver_manager.chrome import ChromeDriverManager |
|||
|
|||
# ====== 配置区 ====== |
|||
KB_API_URL = "https://dcapi.homilychart.com/prod/deepchartapi/api/QwenKnowledge/add" |
|||
KB_TOKEN = "d20287d0bb0298c73e540da7e3e1d7e3" |
|||
KB_INDEX_ID = "30xe1fbox1" |
|||
|
|||
BASE_URL = "http://marketmatrix.net" |
|||
LIST_URL = urljoin(BASE_URL, "/macro.htm") |
|||
OUTPUT_DIR = "today_news" |
|||
os.makedirs(OUTPUT_DIR, exist_ok=True) |
|||
|
|||
OUTPUT_EXCEL = os.path.join(OUTPUT_DIR, f"today_{datetime.now().strftime('%Y%m%d')}.xlsx") |
|||
DUPLICATE_CACHE_FILE = os.path.join(OUTPUT_DIR, "today_history.json") |
|||
|
|||
# 关键:持久化浏览器配置目录 |
|||
PERSISTENT_PROFILE_DIR = os.path.join(os.getcwd(), "zixun_config") |
|||
PROFILE_CLEAN_THRESHOLD_DAYS = 1 # 1天以上自动清理 |
|||
|
|||
MAX_PAGES = 30 # 适当降低,防深层页风控 |
|||
print(f"📅 系统当前日期: {datetime.now().strftime('%Y-%m-%d')}") |
|||
|
|||
# ====== 全局 driver 单例(✅ 反爬核心)====== |
|||
_driver_instance = None |
|||
|
|||
|
|||
def stealth_driver(driver): |
|||
"""注入 Stealth JS,绕过常见 Bot 检测""" |
|||
try: |
|||
# 移除 webdriver 标志 |
|||
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})") |
|||
# 隐藏语言特征 |
|||
driver.execute_script("Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']})") |
|||
# 隐藏插件列表 |
|||
driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3]})") |
|||
# 隐藏硬件并发数(伪装普通设备) |
|||
driver.execute_script("Object.defineProperty(navigator, 'hardwareConcurrency', {get: () => 8})") |
|||
# 隐藏设备内存(GB) |
|||
driver.execute_script("Object.defineProperty(navigator, 'deviceMemory', {get: () => 8})") |
|||
print("🛡️ Stealth JS 注入成功") |
|||
except Exception as e: |
|||
print(f"⚠️ Stealth JS 注入失败: {e}") |
|||
|
|||
|
|||
def init_persistent_driver(): |
|||
"""初始化持久化 Chrome 实例(仅首次调用)""" |
|||
global _driver_instance |
|||
if _driver_instance is not None: |
|||
return _driver_instance |
|||
|
|||
print("🔧 初始化持久化浏览器(首次启动,需过反爬,请稍候...)") |
|||
|
|||
# 自动清理过期 Profile |
|||
_clean_old_profile() |
|||
|
|||
# 配置 Chrome |
|||
chrome_options = Options() |
|||
chrome_options.add_argument(f"--user-data-dir={PERSISTENT_PROFILE_DIR}") |
|||
chrome_options.add_argument("--profile-directory=Default") |
|||
chrome_options.add_argument("--headless=new") |
|||
chrome_options.add_argument("--no-sandbox") |
|||
chrome_options.add_argument("--disable-dev-shm-usage") |
|||
chrome_options.add_argument("--disable-blink-features=AutomationControlled") |
|||
chrome_options.add_argument("--disable-extensions") |
|||
chrome_options.add_argument("--disable-plugins-discovery") |
|||
chrome_options.add_argument("--disable-features=VizDisplayCompositor") |
|||
chrome_options.add_argument("--disable-gpu") |
|||
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"]) |
|||
chrome_options.add_experimental_option('useAutomationExtension', False) |
|||
# chrome_options.binary_location = "/usr/bin/chromium-browser" |
|||
# 启动 |
|||
service = Service(ChromeDriverManager().install()) |
|||
try: |
|||
_driver_instance = webdriver.Chrome(service=service, options=chrome_options) |
|||
_driver_instance.set_page_load_timeout(30) |
|||
_driver_instance.implicitly_wait(10) |
|||
|
|||
# 关键:访问首页“暖机”,触发反爬验证 |
|||
print("🌐 正在访问首页以通过反爬验证...") |
|||
_driver_instance.get(BASE_URL) |
|||
time.sleep(2 + random.uniform(0.5, 1.5)) |
|||
|
|||
# 注入 Stealth |
|||
stealth_driver(_driver_instance) |
|||
|
|||
# 再访问一次列表页,确保状态稳定 |
|||
_driver_instance.get(LIST_URL) |
|||
time.sleep(1.5 + random.uniform(0.5, 1.0)) |
|||
|
|||
print("✅ 浏览器初始化完成,后续请求将复用上下文") |
|||
return _driver_instance |
|||
except Exception as e: |
|||
if _driver_instance: |
|||
_driver_instance.quit() |
|||
_driver_instance = None |
|||
raise RuntimeError(f"浏览器初始化失败: {e}") |
|||
|
|||
|
|||
def _clean_old_profile(): |
|||
"""清理过期 Profile(防积累)""" |
|||
if not os.path.exists(PERSISTENT_PROFILE_DIR): |
|||
return |
|||
try: |
|||
profile_age = time.time() - os.path.getctime(PERSISTENT_PROFILE_DIR) |
|||
if profile_age > PROFILE_CLEAN_THRESHOLD_DAYS * 86400: |
|||
print(f"🧹 Profile 目录已超 {PROFILE_CLEAN_THRESHOLD_DAYS} 天,正在清理...") |
|||
import shutil |
|||
shutil.rmtree(PERSISTENT_PROFILE_DIR, ignore_errors=True) |
|||
os.makedirs(PERSISTENT_PROFILE_DIR, exist_ok=True) |
|||
print("✅ 已重建干净 Profile") |
|||
except Exception as e: |
|||
print(f"⚠️ Profile 清理失败(继续使用现有): {e}") |
|||
|
|||
|
|||
def fetch_page(url, max_retries=2): |
|||
"""带重试的页面获取(✅ 复用全局 driver)""" |
|||
global _driver_instance |
|||
for attempt in range(max_retries + 1): |
|||
try: |
|||
if _driver_instance is None: |
|||
_driver_instance = init_persistent_driver() |
|||
|
|||
# 拟人化:随机滚动 + 延迟 |
|||
_driver_instance.get(url) |
|||
time.sleep(0.8 + random.uniform(0.3, 0.7)) |
|||
|
|||
# 检查是否被拦截 |
|||
page_source = _driver_instance.page_source |
|||
if "403 Forbidden" in page_source or "challenge-platform" in page_source.lower(): |
|||
raise Exception("被反爬拦截(403/Challenge)") |
|||
|
|||
return page_source |
|||
|
|||
except Exception as e: |
|||
print(f" ⚠️ 尝试 {attempt + 1}/{max_retries + 1} 失败: {e}") |
|||
if attempt < max_retries: |
|||
time.sleep(3 + random.uniform(1, 2)) |
|||
# 重启 driver(极端情况) |
|||
if _driver_instance: |
|||
try: |
|||
_driver_instance.quit() |
|||
except: |
|||
pass |
|||
_driver_instance = None |
|||
else: |
|||
return None |
|||
|
|||
|
|||
# ====== I/O 监控器(保留)====== |
|||
class IOMonitor: |
|||
def __init__(self): |
|||
self.records = {} |
|||
|
|||
@contextmanager |
|||
def io_timer(self, io_type: str, desc: str = ""): |
|||
if io_type not in self.records: |
|||
self.records[io_type] = [] |
|||
start = time.perf_counter() |
|||
try: |
|||
yield |
|||
finally: |
|||
duration = time.perf_counter() - start |
|||
self.records[io_type].append((duration, desc)) |
|||
|
|||
def summary(self): |
|||
print("\n" + "=" * 60) |
|||
print("📊 I/O 耗时总览(反爬优化版)") |
|||
print("=" * 60) |
|||
total_time = 0.0 |
|||
for io_type, records in sorted(self.records.items()): |
|||
count = len(records) |
|||
total = sum(t for t, _ in records) |
|||
avg = total / count if count else 0 |
|||
total_time += total |
|||
print(f"✅ {io_type:<15} | 调用 {count:2d} 次 | 总耗时 {total:6.2f}s | 平均 {avg:5.3f}s") |
|||
if count > 3: |
|||
slowest = sorted(records, key=lambda x: x[0], reverse=True)[:2] |
|||
for i, (t, d) in enumerate(slowest, 1): |
|||
print(f" └─ #{i} 慢: {t:5.2f}s → {d}") |
|||
print("-" * 60) |
|||
print(f"⏱️ I/O 总耗时: {total_time:6.2f}s") |
|||
print("=" * 60) |
|||
|
|||
|
|||
monitor = IOMonitor() |
|||
|
|||
|
|||
# ====== 工具函数(加监控)====== |
|||
def load_history(): |
|||
with monitor.io_timer("file_read", f"load_history: {DUPLICATE_CACHE_FILE}"): |
|||
if os.path.exists(DUPLICATE_CACHE_FILE): |
|||
try: |
|||
with open(DUPLICATE_CACHE_FILE, 'r', encoding='utf-8') as f: |
|||
return set(json.load(f)) |
|||
except Exception as e: |
|||
print(f"⚠️ 历史缓存加载失败: {e}") |
|||
return set() |
|||
|
|||
|
|||
def save_history(history_set): |
|||
with monitor.io_timer("file_write", f"save_history: {DUPLICATE_CACHE_FILE}"): |
|||
try: |
|||
with open(DUPLICATE_CACHE_FILE, 'w', encoding='utf-8') as f: |
|||
json.dump(list(history_set), f, ensure_ascii=False, indent=2) |
|||
print(f"💾 缓存已更新:{len(history_set)} 条") |
|||
except Exception as e: |
|||
print(f"❌ 缓存保存失败: {e}") |
|||
|
|||
|
|||
def generate_fingerprint(title, pub_time): |
|||
raw = f"{title.strip()}|{pub_time.strip()}" |
|||
return hashlib.sha1(raw.encode('utf-8')).hexdigest()[:16] |
|||
|
|||
|
|||
def is_today(pub_time_str: str) -> bool: |
|||
if not pub_time_str: |
|||
return False |
|||
try: |
|||
m = re.search(r'(\d{4})[^\d]+(\d{1,2})[^\d]+(\d{1,2})', pub_time_str) |
|||
if not m: |
|||
return False |
|||
year, month, day = int(m.group(1)), int(m.group(2)), int(m.group(3)) |
|||
curr_year = datetime.now().year |
|||
if year > curr_year + 1 or year < curr_year - 5: |
|||
year = curr_year |
|||
pub_date = datetime(year, month, day).date() |
|||
return pub_date == datetime.now().date() |
|||
except Exception as e: |
|||
print(f"⚠️ 日期解析失败: '{pub_time_str}' → {e}") |
|||
return False |
|||
|
|||
|
|||
def parse_news_list(html, base_url=BASE_URL): |
|||
soup = BeautifulSoup(html, 'lxml') |
|||
items = [] |
|||
for table in soup.find_all('table', width="800", border="0"): |
|||
title_a = table.select_one('font[face="微软雅黑"][style*="font-size: 15pt"] a[href]') |
|||
if not title_a: |
|||
continue |
|||
title = title_a.get_text(strip=True) |
|||
link = urljoin(base_url, title_a['href']) |
|||
parsed = urlparse(link) |
|||
if not parsed.netloc.endswith("marketmatrix.net"): |
|||
continue |
|||
if "/topnews/" not in link: |
|||
continue |
|||
meta_fonts = table.select('font[size="2"]') |
|||
meta_combined = " ".join(f.get_text(strip=True) for f in meta_fonts) |
|||
time_match = "" |
|||
m = re.search(r'(\d{4})[^\d]+(\d{1,2})[^\d]+(\d{1,2})(?:[^\d]+(\d{1,2}:\d{2}))?', meta_combined) |
|||
if m: |
|||
date_part = f"{m.group(1)}.{int(m.group(2)):02d}.{int(m.group(3)):02d}" |
|||
time_part = m.group(4) or "00:00" |
|||
time_match = f"{date_part} {time_part}" |
|||
else: |
|||
m2 = re.search(r'(\d{4})[^\d]+(\d{1,2})[^\d]+(\d{1,2})', meta_combined) |
|||
if m2: |
|||
time_match = f"{m2.group(1)}.{int(m2.group(2)):02d}.{int(m2.group(3)):02d}" |
|||
category = "" |
|||
for txt in meta_fonts: |
|||
t = txt.get_text(strip=True).replace("主题:", "") |
|||
if re.search(r'\d{4}|编辑|新闻源|要点', t): |
|||
continue |
|||
if 2 < len(t) < 30: |
|||
category = t |
|||
break |
|||
items.append({ |
|||
"标题": title, |
|||
"分类": category, |
|||
"发布时间": time_match, |
|||
"原文链接": link |
|||
}) |
|||
return items |
|||
|
|||
|
|||
def extract_article_content(html): |
|||
if not html: |
|||
return "(访问失败)" |
|||
soup = BeautifulSoup(html, 'lxml') |
|||
editor_p = None |
|||
for p in soup.find_all(['p', 'font']): |
|||
txt = p.get_text() |
|||
if "编辑:" in txt and "发布时间:" in txt: |
|||
editor_p = p |
|||
break |
|||
if editor_p: |
|||
container = editor_p |
|||
for _ in range(3): |
|||
if container.parent and container.parent.name == 'td': |
|||
container = container.parent |
|||
break |
|||
container = container.parent |
|||
if container and container.name == 'td': |
|||
paras = [] |
|||
for p in container.find_all('p'): |
|||
t = p.get_text(strip=True) |
|||
if len(t) > 20 and not any(skip in t for skip in [ |
|||
"编辑:", "发布时间:", "主题:", "新闻源:", "要点:", "开户", "保证金", "©" |
|||
]): |
|||
paras.append(t) |
|||
if paras: |
|||
return "\n".join(paras) |
|||
fallback = [p.get_text(strip=True) for p in soup.find_all('p') if 30 <= len(p.get_text()) <= 500] |
|||
return "\n".join(fallback[:15]) if fallback else "(提取失败)" |
|||
|
|||
|
|||
def sanitize_filename(name: str, max_len=80) -> str: |
|||
return re.sub(r'[\\/:*?"<>|\r\n\t]', ' ', name).strip()[:max_len] or "untitled" |
|||
|
|||
|
|||
def upload_to_knowledge_base_from_content(filename: str, content: str) -> dict: |
|||
print(f"📤 正在上传: {filename} (内存)") |
|||
try: |
|||
content_bytes = content.encode('utf-8') |
|||
with monitor.io_timer("network_write", f"upload: {filename}"): |
|||
files = {'file': (filename, content_bytes, 'text/markdown')} |
|||
data = {'indexId': KB_INDEX_ID} |
|||
headers = {'token': KB_TOKEN} |
|||
response = requests.post(KB_API_URL, files=files, data=data, headers=headers, timeout=30) |
|||
print(f" ← HTTP {response.status_code}") |
|||
try: |
|||
res_json = response.json() |
|||
code = res_json.get("code") |
|||
msg = res_json.get("message") or res_json.get("error") or "未知错误" |
|||
if code == 200 and res_json.get("fileId"): |
|||
print(f" ✅ 上传成功 → fileId: {res_json['fileId']}") |
|||
return {"code": 200, "fileId": res_json["fileId"], "message": "OK"} |
|||
else: |
|||
print(f" ⚠️ 业务失败 → code: {code}, msg: {msg}") |
|||
return {"code": code or -1, "fileId": "", "message": msg} |
|||
except Exception as json_e: |
|||
print(f" ❌ JSON 解析失败: {json_e}, 原始响应: {response.text[:200]}") |
|||
return {"code": -2, "fileId": "", "message": f"非JSON响应: {response.text[:100]}"} |
|||
except Exception as e: |
|||
print(f" ❌ 上传异常: {e}") |
|||
return {"code": -1, "fileId": "", "message": str(e)} |
|||
|
|||
|
|||
# ====== 主流程 ====== |
|||
def get_all_list_pages(): |
|||
pages = [LIST_URL] |
|||
current_url = LIST_URL |
|||
visited = {LIST_URL} |
|||
print("🔗 探测分页中(从首页开始)...") |
|||
|
|||
for i in range(1, MAX_PAGES): |
|||
html = fetch_page(current_url) |
|||
if not html: |
|||
break |
|||
|
|||
soup = BeautifulSoup(html, 'lxml') |
|||
more_link = soup.find('a', string=re.compile(r'查看更多', re.IGNORECASE)) |
|||
if not more_link: |
|||
more_link = soup.find('a', href=re.compile(r'news-list-\d+\.htm', re.IGNORECASE)) |
|||
if not more_link or not more_link.get('href'): |
|||
break |
|||
|
|||
next_href = more_link['href'].strip() |
|||
next_url = urljoin(current_url, next_href) |
|||
if not next_url.startswith(BASE_URL) or next_url in visited: |
|||
break |
|||
|
|||
visited.add(next_url) |
|||
pages.append(next_url) |
|||
print(f" ➕ {len(pages):2d}. {next_url}") |
|||
current_url = next_url |
|||
|
|||
return pages |
|||
|
|||
|
|||
def main(): |
|||
overall_start = time.perf_counter() |
|||
print("▶ 启动高抗反爬抓取流程...") |
|||
|
|||
# 1. 获取列表页(复用 driver) |
|||
print("\n[阶段1] 获取新闻列表(复用浏览器上下文)") |
|||
list_pages = get_all_list_pages() |
|||
all_items = [] |
|||
|
|||
for i, url in enumerate(list_pages, 1): |
|||
print(f"[{i}/{len(list_pages)}] 解析 {urlparse(url).path or '/'}") |
|||
html = fetch_page(url) |
|||
if not html: |
|||
continue |
|||
base_for_links = BASE_URL if "/list/" not in url else urljoin(BASE_URL, "/list/") |
|||
items = parse_news_list(html, base_url=base_for_links) |
|||
all_items.extend(items) |
|||
|
|||
print(f"✅ 共提取 {len(all_items)} 条原始新闻") |
|||
|
|||
# 2. 过滤今日 & 去重 |
|||
print("\n[阶段2] 过滤今日 & 去重") |
|||
history = load_history() |
|||
new_items = [] |
|||
for item in all_items: |
|||
if not is_today(item["发布时间"]): |
|||
continue |
|||
fp = generate_fingerprint(item["标题"], item["发布时间"]) |
|||
if fp in history: |
|||
print(f"⏭️ 跳过重复: {item['标题'][:30]}...") |
|||
continue |
|||
new_items.append(item) |
|||
history.add(fp) |
|||
|
|||
print(f"🆕 今日新增 {len(new_items)} 条新闻") |
|||
|
|||
# 3. 抓取正文 + 上传 |
|||
print("\n[阶段3] 抓取正文 & 上传(内存上传)") |
|||
results = [] |
|||
    for i, item in enumerate(new_items, 1):
        title = item["标题"]
        # recompute the fingerprint for this item; the original reused the stale fp from the phase-2 loop, so every row got the same value
        fp = generate_fingerprint(item["标题"], item["发布时间"])
        print(f"\n[{i}/{len(new_items)}] {title[:50]}...")
|||
|
|||
try: |
|||
with monitor.io_timer("network_read", f"article: {title[:20]}"): |
|||
html = fetch_page(item["原文链接"]) |
|||
content = extract_article_content(html) if html else "(访问失败)" |
|||
item["正文内容"] = content |
|||
|
|||
# 构建 Markdown 内容(内存中) |
|||
md_content = f"""# {title} |
|||
|
|||
- 分类:{item['分类']} |
|||
- 发布时间:{item['发布时间']} |
|||
- 原文链接:{item['原文链接']} |
|||
|
|||
--- |
|||
|
|||
{content} |
|||
""" |
|||
|
|||
# 保存到磁盘(可选,用于审计) |
|||
safe_title = sanitize_filename(title) |
|||
md_file = f"{i:02d}_{safe_title}.md" |
|||
md_path = os.path.join(OUTPUT_DIR, md_file) |
|||
with monitor.io_timer("file_write", f"save_md: {md_file}"): |
|||
with open(md_path, "w", encoding="utf-8") as f: |
|||
f.write(md_content) |
|||
print(f" 💾 已保存:{md_file}") |
|||
|
|||
# 上传(内存) |
|||
res = upload_to_knowledge_base_from_content(md_file, md_content) |
|||
item.update({ |
|||
"知识库FileId": res.get("fileId", ""), |
|||
"上传状态": "✅" if res.get("code") == 200 else "❌", |
|||
"上传信息": res.get("message", "")[:100], |
|||
"指纹": fp |
|||
}) |
|||
results.append(item) |
|||
|
|||
except Exception as e: |
|||
print(f"❌ 处理失败: {title[:30]} | {e}") |
|||
item.update({ |
|||
"知识库FileId": "", |
|||
"上传状态": "❌ 处理失败", |
|||
"上传信息": str(e)[:100], |
|||
"指纹": fp |
|||
}) |
|||
results.append(item) |
|||
continue |
|||
|
|||
# 4. 保存 & 退出 |
|||
print("\n[阶段4] 保存缓存 & Excel") |
|||
save_history(history) |
|||
if results: |
|||
with monitor.io_timer("file_write", "save_excel"): |
|||
df = pd.DataFrame(results) |
|||
df.to_excel(OUTPUT_EXCEL, index=False, engine='openpyxl') |
|||
print(f"\n🎉 完成!今日新增 {len(results)} 条,Excel: {OUTPUT_EXCEL}") |
|||
else: |
|||
print(f"\nℹ️ 今日暂无新新闻发布(已探测 {len(list_pages)} 页)") |
|||
|
|||
# 关键:不 quit driver!保留上下文供下次使用 |
|||
print("📌 浏览器上下文已保留,下次运行将复用(加速)") |
|||
|
|||
# 输出 I/O 总结 |
|||
monitor.summary() |
|||
total_elapsed = time.perf_counter() - overall_start |
|||
print(f"\n🎯 总运行时间: {total_elapsed:.2f}s") |
|||
|
|||
|
|||
# ====== 优雅退出(保留 driver)====== |
|||
def cleanup(): |
|||
"""进程退出时清理(不 quit driver,除非强制)""" |
|||
global _driver_instance |
|||
if _driver_instance: |
|||
print("💡 提示:为加速下次运行,浏览器上下文已保留。") |
|||
print(" 如需彻底清理,请手动删除目录:", PERSISTENT_PROFILE_DIR) |
|||
# 不 quit,保留状态 |
|||
# _driver_instance.quit() |
|||
|
|||
|
|||
import atexit |
|||
|
|||
atexit.register(cleanup) |
|||
|
|||
if __name__ == "__main__": |
|||
try: |
|||
main() |
|||
except KeyboardInterrupt: |
|||
print("\n\n🛑 用户中断,正在退出...") |
|||
cleanup() |
|||
except Exception as e: |
|||
print(f"\n💥 严重错误: {e}") |
|||
cleanup() |
|||
raise |
|||
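One more note on the stealth handling shared by the three scripts: stealth_driver() patches navigator via execute_script only once, right after the warm-up page load, and properties defined that way live in the current document only, so every later driver.get() in fetch_page starts with an unpatched navigator.webdriver again. For Chromium-based drivers, a sketch of registering the same patch through CDP so it is evaluated before every new document (an alternative approach, not what this diff does):

```python
# Mirrors the properties patched in stealth_driver(), but survives navigations.
STEALTH_JS = """
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']});
Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3]});
Object.defineProperty(navigator, 'hardwareConcurrency', {get: () => 8});
Object.defineProperty(navigator, 'deviceMemory', {get: () => 8});
"""

def install_stealth(driver):
    # Registered once per driver; Chrome evaluates the script before each document loads.
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": STEALTH_JS})
```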