2246 lines
91 KiB
Python
2246 lines
91 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
import requests
|
||
from typing import Dict, Any, List, Optional, Tuple
|
||
from urllib.parse import urlencode
|
||
import uuid
|
||
from loguru import logger
|
||
import os
|
||
import time
|
||
import random
|
||
import json
|
||
import sys
|
||
import re
|
||
|
||
from requests.exceptions import ProxyError
|
||
|
||
# Base URL of the token backend; can be overridden with the API_BASE_URL
# environment variable.
API_BASE_URL = os.getenv('API_BASE_URL', 'http://124.222.106.226:9999')

# The log directory must exist before the file sink below is registered.
os.makedirs("logs", exist_ok=True)

# File sink configured with a date-stamped name, INFO level, "00:00" rotation,
# "30 days" retention and an enqueued writer.
logger.add(
    "logs/log_{time:YYYY-MM-DD}.log",
    level="INFO",
    rotation="00:00",
    retention="30 days",
    enqueue=True,
)
|
||
|
||
|
||
def sleep_random_between() -> float:
    """Sleep for a random interval and report how long the pause was.

    The bounds come from the SLEEP_MIN_SECONDS / SLEEP_MAX_SECONDS environment
    variables (defaults 10 and 20). The lower bound is never allowed below
    10 seconds, and an upper bound smaller than the lower bound is replaced by
    ``lower + 10``. If the bounds cannot be computed, a fixed 10 second pause
    is used instead.

    Returns:
        float: Number of seconds actually slept.
    """
    try:
        lower = float(os.getenv('SLEEP_MIN_SECONDS', '10'))
        upper = float(os.getenv('SLEEP_MAX_SECONDS', '20'))
        # Enforce the 10 second floor on the lower bound.
        lower = 10 if lower < 10 else lower
        # Repair an inverted range instead of failing.
        upper = lower + 10 if upper < lower else upper
        delay = random.uniform(lower, upper)
    except Exception:
        delay = 10.0
    time.sleep(delay)
    return delay
|
||
|
||
|
||
class IPStrategyConfig:
    """Thresholds used by the IP anomaly detector and the route manager."""

    def __init__(self,
                 response_time_threshold_sec: int = int(os.getenv('IP_RESP_TIME_THRESHOLD', '5')),
                 proxy_failure_threshold: int = int(os.getenv('IP_PROXY_FAIL_THRESHOLD', '3')),
                 local_cooldown_sec: int = int(os.getenv('IP_LOCAL_COOLDOWN_SEC', '1800')),
                 local_failure_threshold: int = int(os.getenv('IP_LOCAL_FAIL_THRESHOLD', '2'))):
        """Store the strategy thresholds.

        The defaults are read from environment variables once, when the class
        body is executed.

        Args:
            response_time_threshold_sec (int): Per-request duration threshold in seconds.
            proxy_failure_threshold (int): Consecutive failures on one proxy that
                trigger a switch.
            local_cooldown_sec (int): Cooldown in seconds between uses of the local IP.
            local_failure_threshold (int): Consecutive local failures after which the
                manager goes back to the proxy pool.

        Returns:
            None
        """
        (
            self.response_time_threshold_sec,
            self.proxy_failure_threshold,
            self.local_cooldown_sec,
            self.local_failure_threshold,
        ) = (
            response_time_threshold_sec,
            proxy_failure_threshold,
            local_cooldown_sec,
            local_failure_threshold,
        )

    def update(self, updates: Dict[str, Any]) -> None:
        """Overwrite existing settings in place; unknown keys are ignored."""
        for name in updates:
            if not hasattr(self, name):
                continue
            setattr(self, name, updates[name])
|
||
|
||
|
||
class IPAnomalyDetector:
    """Classifies a response (or error text) as an IP-related anomaly."""

    def __init__(self, cfg: IPStrategyConfig):
        """Keep a reference to the strategy configuration.

        Args:
            cfg (IPStrategyConfig): Strategy configuration; only
                ``response_time_threshold_sec`` is read by ``detect``.

        Returns:
            None
        """
        self.cfg = cfg

    def detect(self, status_code: Optional[int], elapsed_sec: float, resp_json: Optional[Dict], error_text: str = "") -> Optional[str]:
        """Return a reason tag when the exchange looks like an IP problem.

        Checks are applied in order: blocking HTTP status, slow response,
        ban markers in the JSON body, ban markers in the error text.

        Args:
            status_code (Optional[int]): HTTP status code; may be None on errors.
            elapsed_sec (float): Request duration in seconds.
            resp_json (Optional[Dict]): Decoded JSON body, if any.
            error_text (str): Text of a raised error, if any.

        Returns:
            Optional[str]: ``"http_<code>"``, ``"slow_response"``, ``"ip_banned"``
            or None when nothing suspicious was found.
        """
        blocking_statuses = (403, 429, 407)
        if status_code in blocking_statuses:
            return f"http_{status_code}"

        if elapsed_sec > self.cfg.response_time_threshold_sec:
            return "slow_response"

        def mentions_ip_anomaly(text):
            # Both markers must be present somewhere in the text.
            return "IP" in text and "异常" in text

        if resp_json:
            message = str(resp_json.get("message", ""))
            if (resp_json.get("code") == 35
                    or "IP地址存在异常" in message
                    or mentions_ip_anomaly(message)):
                return "ip_banned"

        return "ip_banned" if (error_text and mentions_ip_anomaly(error_text)) else None
|
||
|
||
|
||
class SmartIPManager:
    """Tracks failures and decides whether to route via a proxy or the local IP."""

    def __init__(self, proxy_pool: Optional[List[Dict[str, str]]], cfg: IPStrategyConfig):
        """Create the manager.

        Args:
            proxy_pool (Optional[List[Dict[str,str]]]): Proxy pool; each element is a
                requests-compatible proxies mapping. None or empty means local only.
            cfg (IPStrategyConfig): Strategy configuration.

        Returns:
            None
        """
        self.cfg = cfg
        self.proxy_pool: List[Dict[str, str]] = proxy_pool or []
        # Indexes into proxy_pool that reached the failure threshold.
        self.eliminated: set = set()
        self.current_mode: str = 'proxy' if self.proxy_pool else 'local'
        self.current_index: int = 0
        self.proxy_failures_current: int = 0
        self.local_failures: int = 0
        # Timestamps below are compared with time.monotonic(), whose reference
        # point is undefined. "Never used" is therefore stored as -inf, not 0.0,
        # so a fresh manager reports the local route as available regardless of
        # how small the current monotonic reading is.
        self.last_local_use_time: float = float('-inf')
        self.local_disabled_until: float = 0.0

    def current_route(self) -> Tuple[str, Optional[Dict[str, str]]]:
        """Return the current route mode and its proxy mapping (None for local)."""
        if self.current_mode == 'proxy' and self.proxy_pool:
            return 'proxy', self.proxy_pool[self.current_index]
        return 'local', None

    def mark_success(self) -> None:
        """Reset the failure counter of the route currently in use."""
        if self.current_mode == 'proxy':
            self.proxy_failures_current = 0
        else:
            self.local_failures = 0

    def mark_failure(self, reason: str = "") -> None:
        """Count a failure on the current route; eliminate a proxy at the threshold.

        Args:
            reason (str): Failure reason tag; accepted but not used here.
        """
        if self.current_mode == 'proxy':
            self.proxy_failures_current += 1
            if self.proxy_failures_current >= self.cfg.proxy_failure_threshold:
                self.eliminated.add(self.current_index)
        else:
            self.local_failures += 1

    def select_next_route(self) -> Tuple[str, Optional[Dict[str, str]]]:
        """Pick the route for the next request and update the internal mode.

        In proxy mode, once the failure threshold is reached the local IP is
        preferred when it is available, then the next non-eliminated proxy,
        and finally the local IP as a last resort. In local mode the manager
        moves back to a proxy only after the local failure threshold is reached
        and a non-eliminated proxy exists.

        Returns:
            Tuple[str, Optional[Dict[str, str]]]: ``('proxy', mapping)`` or
            ``('local', None)``.
        """
        now = time.monotonic()
        if self.current_mode == 'proxy':
            if self.proxy_failures_current >= self.cfg.proxy_failure_threshold:
                if self._local_available(now):
                    self.current_mode = 'local'
                    self.last_local_use_time = now
                    self.proxy_failures_current = 0
                    return 'local', None
                next_idx = self._next_proxy_index()
                if next_idx is not None:
                    self.current_index = next_idx
                    self.proxy_failures_current = 0
                    return 'proxy', self.proxy_pool[self.current_index]
                # Every proxy is eliminated: fall back to local even if it is
                # cooling down or temporarily disabled.
                self.current_mode = 'local'
                self.last_local_use_time = now
                self.proxy_failures_current = 0
                return 'local', None
            if self.proxy_pool:
                return 'proxy', self.proxy_pool[self.current_index]
            self.current_mode = 'local'
            return 'local', None
        else:
            if self.local_failures >= self.cfg.local_failure_threshold:
                next_idx = self._next_proxy_index()
                if next_idx is not None:
                    self.current_mode = 'proxy'
                    self.current_index = next_idx
                    self.local_failures = 0
                    return 'proxy', self.proxy_pool[self.current_index]
            return 'local', None

    def _next_proxy_index(self) -> Optional[int]:
        """Return the next non-eliminated proxy index after the current one, or None.

        The search wraps around and considers the current index last.
        """
        if not self.proxy_pool:
            return None
        n = len(self.proxy_pool)
        for step in range(1, n + 1):
            cand = (self.current_index + step) % n
            if cand not in self.eliminated:
                return cand
        return None

    def _local_available(self, now: float) -> bool:
        """Tell whether the local IP may be used at monotonic time ``now``.

        It is unavailable while temporarily disabled or while the cooldown
        since its last use has not elapsed.
        """
        if now < self.local_disabled_until:
            return False
        return (now - self.last_local_use_time) >= self.cfg.local_cooldown_sec

    def disable_local_temporarily(self, seconds: int) -> None:
        """Disable the local IP for ``seconds`` (negative values count as 0)."""
        self.local_disabled_until = time.monotonic() + max(0, seconds)

    def manual_switch_to_proxy(self, index: int) -> None:
        """Switch to the proxy at ``index``; ignored if out of range or eliminated."""
        if 0 <= index < len(self.proxy_pool) and index not in self.eliminated:
            self.current_mode = 'proxy'
            self.current_index = index
            self.proxy_failures_current = 0

    def enable_local(self) -> None:
        """Clear any temporary disablement of the local IP."""
        self.local_disabled_until = 0.0
|
||
|
||
class BossZhipinAPI:
|
||
|
||
def __init__(self, proxy_config: Optional[Dict] = None, proxy_pool: Optional[List[Dict[str, str]]] = None, ip_strategy_config: Optional[Dict[str, Any]] = None):
    """Set up the HTTP session, proxy routing, default headers, cookies and login token.

    Args:
        proxy_config: Optional requests-style proxies mapping. String values are
            stripped of surrounding whitespace and backticks before use.
        proxy_pool: Optional list of proxies mappings handed to SmartIPManager;
            ``proxy_config`` (when set) is appended to it.
        ip_strategy_config: Optional keyword arguments forwarded to IPStrategyConfig.

    Note:
        Construction ends by calling ``load_token_from_api()``, which issues an
        HTTP request.
    """
    self.app_id = 10002
    self.zp_product_id = 10002
    self.serve_domain = "https://www.zhipin.com"
    self.api_domain = "https://wxapp.zhipin.com"

    self.session = requests.Session()
    # Do not let the session read proxy settings from the environment.
    self.session.trust_env = False
    # NOTE(review): this is stored as a default request header named "no_proxy",
    # not as proxy-bypass configuration — confirm that this is intended.
    self.session.headers.update({'no_proxy': '10.0.0.0/16,example.com,.example.com'})

    self.proxy_config = proxy_config

    # Sanitise proxy values: drop surrounding whitespace and backticks.
    if isinstance(self.proxy_config, dict):
        cleaned = {}
        for k, v in self.proxy_config.items():
            if isinstance(v, str):
                cleaned[k] = v.strip().strip('`')
            else:
                cleaned[k] = v
        self.proxy_config = cleaned

    if self.proxy_config:
        self.session.proxies.update(self.proxy_config)
        print(f"✅ 已设置代理: {self.proxy_config}")

    # State of the legacy "local mode" fallback used by the retry helpers.
    self.local_mode = False
    self.local_success_count = 0
    self.local_mode_since = 0
    self.local_fail_count = 0

    cfg = IPStrategyConfig(**(ip_strategy_config or {}))
    pool: List[Dict[str, str]] = []
    if proxy_pool:
        pool.extend(proxy_pool)
    if self.proxy_config:
        pool.append(self.proxy_config)
    self.ip_cfg = cfg
    self.ip_detector = IPAnomalyDetector(cfg)
    self.ip_manager = SmartIPManager(pool, cfg)
    # The manager's initial route replaces whatever proxies were set above.
    route_mode, route_cfg = self.ip_manager.current_route()
    if route_mode == 'proxy' and route_cfg:
        self.session.proxies = route_cfg
        print(f"🚦 初始路由: 代理 {route_cfg}")
    else:
        self.session.proxies = {}
        print("🚦 初始路由: 本机直连")

    self.device_id = str(uuid.uuid4())
    self.wx_version = "8.0.43"
    self.mini_version = "1.0.0"
    self.scene = 1001

    # Headers copied into every request built by build_request_headers().
    self.default_headers = {
        "Accept": "*/*",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded",
        "Host": "www.zhipin.com",
        "Referer": "https://servicewechat.com/wxa8da525af05281f3/571/page-frame.html",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.10(0x13080a10) XWEB/1227",
        "X-Requested-With": "XMLHttpRequest",
        "platform": "zhipin/mac",
        "zp_app_id": str(self.app_id),
        "ver": "100.0000",
        "mini_ver": "100.0000",
        "ua": json.dumps({"model": "Mac16,8", "platform": "mac"}),
        "zp_product_id": str(self.zp_product_id),
        "scene": "1006",
        "xweb_xhr": "1",
        "sec-fetch-site": "cross-site",
        "sec-fetch-mode": "cors",
        "sec-fetch-dest": "empty"
    }

    self.init_cookies()

    self.login_data = {
        "mpt": "",
        "wt2": "",
        "openId": "",
        "traceid": "F-77d05bnXuMVrHIB3"
    }

    # Fetch the login token from the backend API.
    self.current_token_id = None  # id of the token currently in use
    self.load_token_from_api()
|
||
|
||
def load_token_from_api(self, api_base_url: str = None):
    """Fetch a usable Boss token from the backend API and install it.

    Args:
        api_base_url: Backend base URL; defaults to the module-level API_BASE_URL.

    Returns:
        bool: True when a token was installed (directly or through the
        IP-exception retry helper). False otherwise, in which case the
        hard-coded default token values are installed instead.
    """
    if api_base_url is None:
        api_base_url = API_BASE_URL

    try:
        # Fetch the list of available tokens through the generic data endpoint.
        response = requests.get(f"{api_base_url}/api/v1/token/tokens?page=1&page_size=10", timeout=10)
        # NOTE(review): prints the raw response body, which presumably contains
        # token values — consider removing or redacting.
        print(response.text)
        if response.status_code == 200:
            data = response.json()
            if data.get("data"):
                tokens = data["data"]
                # NOTE(review): `tokens` is always truthy here because of the
                # enclosing check, so the "no tokens" branch below is unreachable;
                # an empty list falls into the error branch instead.
                if tokens:
                    token_info = tokens[0]
                    token_id = token_info.get("id")
                    mpt = token_info.get("mpt")
                    wt2 = token_info.get("wt2")

                    if mpt and wt2 and token_id:
                        self.current_token_id = token_id  # remember the token id in use
                        self.set_login_data(mpt, wt2)
                        logger.info(f"✅ 成功从API获取token: id={token_id}, mpt={mpt[:20]}..., wt2={wt2[:50]}...")
                        return True
                    else:
                        logger.warning("⚠️ API返回的token数据不完整")
                else:
                    logger.warning("⚠️ 没有可用的token")
            else:
                current_ip = self.get_current_ip()
                error_msg = data.get('message', '未知错误')
                logger.error(f"❌ API返回错误: {error_msg}")
                logger.error(f"🌐 当前IP地址: {current_ip}")

                # An IP-anomaly message triggers the dedicated retry helper.
                if 'IP' in error_msg and '异常' in error_msg:
                    logger.info(f"🔄 检测到IP异常,开始重试处理...")
                    retry_success = self.handle_ip_exception_retry_for_token(api_base_url)
                    if retry_success:
                        return True  # retry succeeded
        else:
            logger.error(f"❌ API请求失败: {response.status_code}")

    except Exception as e:
        import traceback
        traceback.print_exc()
        logger.error(f"❌ 从API获取token失败: {str(e)}")

    # Every failure path ends here: fall back to the built-in default token.
    logger.warning("⚠️ 使用默认token值")
    self.set_login_data(
        "1179681000a187f48bab5c526e25baff",
        "ELifb5J2w04JC_7-2QsHpI_tYDl-_XmGcLAjCQB4MdxlfeQF673MfEjBCZB2ncMyfO-a0SC-PUFfLS36iFZARNA~~"
    )
    return False
|
||
|
||
def generate_boss_trace_id(self) -> str:
    """Build a trace id of the form ``F-<hex tail><random suffix>``.

    The hex tail is the last six hexadecimal digits of the current time in
    milliseconds. The suffix is ten characters drawn at random from digits,
    lowercase letters and uppercase letters.
    """
    import time
    import random

    # Last six hex digits of the millisecond timestamp.
    millis = int(time.time() * 1000)
    stamp_tail = format(millis, 'x')[-6:]

    # Digits + lowercase + uppercase.
    alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

    suffix_chars = []
    for _ in range(10):
        suffix_chars.append(random.choice(alphabet))

    return "F-" + stamp_tail + ''.join(suffix_chars)
|
||
|
||
def get_current_ip(self) -> str:
    """Return the IP reported by the lookup service, or a failure description.

    One lookup is attempted; if it does not yield an IP (or raises), a single
    best-effort second lookup is made.

    Returns:
        str: The reported IP, ``'HTTP <status>'`` of the first response when no
        lookup yielded an IP, or ``'获取IP失败: <error>'`` when the first lookup
        raised and the second did not yield an IP.
    """
    lookup_url = 'http://v2.api.juliangip.com/v2/dps/ip_search?ip=0.0.0.0'

    def query():
        # One lookup: (response, whether an IP was found, the IP value).
        reply = self.session.get(lookup_url, timeout=10)
        if reply.status_code == 200:
            payload = reply.json()
            if payload.get('code') == 0 and payload.get('data'):
                return reply, True, payload['data'].get('ip', 'Unknown')
        return reply, False, None

    def query_quietly():
        # Same lookup, but any error is swallowed and counts as "not found".
        try:
            _, found, value = query()
        except Exception:
            return False, None
        return found, value

    try:
        first_reply, found, value = query()
    except Exception as e:
        found, value = query_quietly()
        if found:
            return value
        return f'获取IP失败: {str(e)}'

    if found:
        return value
    found, value = query_quietly()
    if found:
        return value
    return f'HTTP {first_reply.status_code}'
|
||
|
||
def handle_ip_exception_retry(self, page_params: dict, page_num: int, max_retries: int = 3) -> bool:
    """Retry a job-list request after an IP anomaly.

    Each attempt waits, rebuilds the session and cookies, and re-issues the
    request. When the failure message mentions an IP anomaly, or on the last
    attempt, one extra request is made after switching to local mode.

    Args:
        page_params: Parameters passed to get_job_list_by_keyword.
        page_num: Page number, used only in the success message.
        max_retries: Maximum number of attempts.

    Returns:
        bool: True if any attempt returned a dict with ``code == 0``.
    """
    for retry_count in range(max_retries):
        print(f"⏳ 第 {retry_count + 1}/{max_retries} 次重试...")

        # Growing wait: 10 s, then 15 s, 20 s, ... (never less than 10 s).
        wait_time = max(10, (retry_count + 1) * 5 + 5)
        print(f"⏰ 等待 {wait_time} 秒后重试(让隧道代理切换IP)...")
        time.sleep(wait_time)

        # Rebuild the session and cookies.
        # NOTE(review): reinit_session() is called without a route, which resets
        # session.proxies to {} (direct connection) — confirm this matches the
        # intent of retrying through a fresh proxy IP.
        self.reinit_session()
        self.init_cookies()

        # Report the IP now in use.
        new_ip = self.get_current_ip()
        print(f"🌐 新IP地址: {new_ip}")

        # Re-issue the request.
        try:
            result = self.get_job_list_by_keyword(page_params, skip_init=True)
            if result and isinstance(result, dict) and result.get('code') == 0:
                print(f"✅ 重试成功!第 {page_num} 页数据获取成功")
                return True
            else:
                error_msg = result.get('message', 'Unknown error') if result else 'No response'
                print(f"❌ 重试失败: {error_msg}")
                # Fall back to local mode on an IP-anomaly message or on the last attempt.
                need_local = ('IP' in error_msg and '异常' in error_msg) or (retry_count + 1 >= max_retries)
                if need_local:
                    self.enable_local_mode()
                    try:
                        local_result = self.get_job_list_by_keyword(page_params, skip_init=True)
                        if local_result and isinstance(local_result, dict) and local_result.get('code') == 0:
                            print("✅ 本机直连模式重试成功")
                            self.local_success_count += 1
                            self.try_restore_proxy()
                            return True
                    except Exception as e2:
                        print(f"❌ 本机直连模式重试异常: {str(e2)}")
        except Exception as e:
            print(f"❌ 重试异常: {str(e)}")

    print(f"💥 重试 {max_retries} 次后仍然失败,跳过当前页")
    return False
|
||
|
||
def reinit_session(self, route: Optional[Dict[str, str]] = None):
    """Replace the HTTP session with a fresh one bound to the given route.

    Args:
        route (Optional[Dict[str,str]]): Proxies mapping to use; None (or an
            empty mapping) means a direct connection.

    Returns:
        None
    """
    # Close the previous session, if one exists.
    if hasattr(self, 'session'):
        self.session.close()

    fresh = requests.Session()
    # Do not read proxy settings from the environment, so they cannot override ours.
    fresh.trust_env = False
    self.session = fresh

    if not route:
        self.session.proxies = {}
        print("🔄 Session已重新初始化,未配置代理,使用直连")
        return

    self.session.proxies = route
    print("🔄 Session已重新初始化,将使用新的代理连接")
|
||
|
||
def enable_local_mode(self):
    """Switch to direct (no proxy) mode and reset the local-mode counters.

    Best effort: any error raised while switching is swallowed.
    """
    try:
        self.local_mode = True
        self.local_mode_since = time.time()
        # Both counters restart from zero on every switch.
        self.local_success_count = self.local_fail_count = 0
        self.session.proxies = {}
        print("🔁 已切换为本机直连模式")
    except Exception:
        pass
|
||
|
||
def enable_proxy_mode(self):
    """Leave local mode and route through ``proxy_config`` again.

    Best effort: any error raised while switching is swallowed.
    """
    try:
        configured = self.proxy_config
        if configured:
            self.session.proxies = configured
        # NOTE(review): the local-mode flags are cleared even when no proxy is
        # configured — confirm that this is intended.
        self.local_mode, self.local_fail_count = False, 0
        print("🔁 已切换为代理模式")
    except Exception:
        pass
|
||
|
||
def try_restore_proxy(self):
    """Return to proxy mode once local mode has lasted or succeeded long enough.

    The limits come from PROXY_RESTORE_AFTER_SECONDS (default 300) and
    PROXY_RESTORE_AFTER_SUCCESS (default 3). If either value cannot be parsed,
    both defaults are used.
    """
    try:
        max_age = int(os.getenv('PROXY_RESTORE_AFTER_SECONDS', '300'))
        needed = int(os.getenv('PROXY_RESTORE_AFTER_SUCCESS', '3'))
    except Exception:
        max_age, needed = 300, 3

    if not self.local_mode:
        return

    # A falsy start timestamp means the age criterion cannot apply.
    aged_out = bool(self.local_mode_since) and (time.time() - self.local_mode_since >= max_age)
    if aged_out or self.local_success_count >= needed:
        self.enable_proxy_mode()
|
||
|
||
def handle_ip_exception_retry_for_auto_crawl(self, params):
    """Retry a job-list request for auto_crawl after an IP anomaly.

    Up to three attempts are made. Each one waits, rebuilds the session and
    cookies, and re-issues the request. On the last attempt, or when the
    response carries code 35 or the IP-anomaly message, one extra request is
    made after switching to local mode.

    Args:
        params: Parameters passed to get_job_list_by_keyword.

    Returns:
        The first response dict with ``code == 0``, or None if every attempt failed.
    """
    max_retries = 3

    for retry_count in range(1, max_retries + 1):
        # Growing wait: 10 s, then 15 s, 20 s (never less than 10 s).
        wait_time = max(10, retry_count * 5 + 5)
        print(f"⏳ 第 {retry_count} 次重试,等待 {wait_time} 秒...")
        time.sleep(wait_time)

        # Rebuild the session and cookies.
        # NOTE(review): reinit_session() is called without a route, which resets
        # session.proxies to {} (direct connection) — confirm this is intended.
        self.reinit_session()
        self.init_cookies()

        # Report the IP now in use.
        new_ip = self.get_current_ip()
        print(f"🌐 新IP地址: {new_ip}")

        # Re-issue get_job_list_by_keyword.
        try:
            result = self.get_job_list_by_keyword(params, skip_init=True)
            if result and isinstance(result, dict) and result.get('code') == 0:
                print(f"✅ 第 {retry_count} 次重试成功!")
                return result
            else:
                print(f"❌ 第 {retry_count} 次重试仍然失败")
                # Fall back to local mode on the last attempt or on an IP-ban response.
                need_local = (retry_count >= max_retries) or (isinstance(result, dict) and (result.get('code') == 35 or 'IP地址存在异常' in result.get('message', '')))
                if need_local:
                    self.enable_local_mode()
                    try:
                        local_result = self.get_job_list_by_keyword(params, skip_init=True)
                        if local_result and isinstance(local_result, dict) and local_result.get('code') == 0:
                            print("✅ 本机直连模式重试成功")
                            self.local_success_count += 1
                            self.try_restore_proxy()
                            return local_result
                    except Exception as e2:
                        print(f"❌ 本机直连模式重试异常: {e2}")
        except Exception as e:
            print(f"❌ 第 {retry_count} 次重试出现异常: {e}")

    print(f"❌ 经过 {max_retries} 次重试仍然失败")
    return None
|
||
|
||
def handle_ip_exception_retry_for_token(self, api_base_url):
    """Retry token retrieval after an IP anomaly.

    Up to three attempts are made against ``<base>/api/token``. After the last
    failed attempt, one more request is made in local mode.

    Args:
        api_base_url: Backend base URL; a falsy value falls back to API_BASE_URL.

    Returns:
        bool: True if a response contained both ``token`` and ``cookies``.
    """
    max_retries = 3

    for retry_count in range(1, max_retries + 1):
        # Growing wait: 10 s, then 15 s, 20 s (never less than 10 s).
        wait_time = max(10, retry_count * 5 + 5)
        logger.info(f"⏳ 第 {retry_count} 次重试,等待 {wait_time} 秒...")
        time.sleep(wait_time)

        # Rebuild the session.
        # NOTE(review): reinit_session() is called without a route, which resets
        # session.proxies to {} (direct connection) — confirm this is intended.
        self.reinit_session()

        # Report the IP now in use.
        new_ip = self.get_current_ip()
        logger.info(f"🌐 新IP地址: {new_ip}")

        # Fetch the token directly here rather than calling load_token_from_api
        # again (avoids recursion).
        try:
            api_url = api_base_url or API_BASE_URL
            response = self.session.get(f"{api_url}/api/token", timeout=10)
            if response.status_code == 200:
                data = response.json()
                if data.get('code') == 0 and 'data' in data:
                    token_data = data['data']
                    if 'token' in token_data and 'cookies' in token_data:
                        # NOTE(review): self.token / self.cookies are not set in the
                        # visible __init__; if self.cookies is missing, the update
                        # raises and is swallowed by the except below. Also,
                        # login_data is not refreshed on this path — confirm.
                        self.token = token_data['token']
                        self.cookies.update(token_data['cookies'])
                        logger.info(f"✅ 第 {retry_count} 次重试成功!")
                        return True
            logger.warning(f"❌ 第 {retry_count} 次重试仍然失败")
            if retry_count >= max_retries:
                # Last attempt failed: try once more in local mode.
                self.enable_local_mode()
                try:
                    response2 = self.session.get(f"{api_url}/api/token", timeout=10)
                    if response2.status_code == 200:
                        data2 = response2.json()
                        if data2.get('code') == 0 and 'data' in data2:
                            token_data2 = data2['data']
                            if 'token' in token_data2 and 'cookies' in token_data2:
                                self.token = token_data2['token']
                                self.cookies.update(token_data2['cookies'])
                                logger.info("✅ 本机直连模式获取token成功")
                                self.local_success_count += 1
                                self.try_restore_proxy()
                                return True
                except Exception as e2:
                    logger.error(f"❌ 本机直连模式获取token异常: {e2}")
        except Exception as e:
            logger.error(f"❌ 第 {retry_count} 次重试出现异常: {e}")

    logger.error(f"❌ 经过 {max_retries} 次重试仍然失败")
    return False
|
||
|
||
def init_cookies(self):
    """Seed the session cookie jar with the baseline ``.zhipin.com`` cookies.

    The token cookie is freshly generated, the two ``Hm_*`` cookies carry the
    current Unix time, and ``__c`` is the first eight characters of the device
    id. The remaining values are fixed.
    """
    seeded = [
        ('__zp_stoken__', self.generate_token()),
        ('Hm_lvt_194df3105ad7148dcf2b98a91b5e727a', str(int(time.time()))),
        ('Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a', str(int(time.time()))),
        ('__c', self.device_id[:8]),
        ('__g', '-'),
        ('__l', 'l=%2Fwww.zhipin.com%2F&r=&friend_source=0&s=3&friend_source=0'),
        ('lastCity', '101010100'),
        ('cityName', '%E5%8C%97%E4%BA%AC'),
        ('__zp_sseed__', 'btHZ0bjBq8m//WNwlVrPUnVcIvini5J5P5LQUbflM24='),
        ('__zp_sname__', '3998243a'),
        ('__zp_sts__', '1753719971615'),
    ]

    for cookie_name, cookie_value in seeded:
        self.session.cookies.set(cookie_name, cookie_value, domain='.zhipin.com')
|
||
|
||
def generate_token(self) -> str:
    """Return 32 random lowercase hexadecimal characters."""
    hex_digits = "0123456789abcdef"
    picked = [random.choice(hex_digits) for _ in range(32)]
    return ''.join(picked)
|
||
|
||
def test_proxy_connection(self):
    """Check that the configured proxy can reach the IP lookup service and the target site.

    Steps: resolve the current IP, query the lookup endpoint once more, then
    fetch ``https://www.zhipin.com/robots.txt``. If the target-site step fails
    with a non-200 status or a proxy error, a switch to socks5h is attempted
    and the result of the re-test is returned.

    Returns:
        bool: True when no proxy is configured or all checks pass; False otherwise.
    """
    if not hasattr(self, 'proxy_config') or not self.proxy_config:
        return True

    try:
        print("🔍 测试代理连接...")
        current_ip = self.get_current_ip()
        # get_current_ip() reports failures as strings with these prefixes.
        if not current_ip or current_ip.startswith(('获取IP失败', 'HTTP', 'API返回错误')):
            print(f"❌ 代理连接测试失败: {current_ip}")
            return False

        print(f"✅ 代理连接成功,当前IP: {current_ip}")

        try:
            resp = self.session.get('http://v2.api.juliangip.com/v2/dps/ip_search?ip=0.0.0.0', timeout=10)
            if resp.status_code == 200:
                data = resp.json()
                https_ip = data.get('data', {}).get('ip', '') if data.get('code') == 0 else ''
                print(f"✅ HTTPS代理连接成功,当前IP: {https_ip}")
            else:
                print(f"❌ HTTPS代理连接失败: HTTP {resp.status_code}")
                return False
        except ProxyError as pe:
            print(f"❌ HTTPS代理认证失败(407): {pe}")
            return False
        except Exception as e:
            print(f"❌ HTTPS代理连接异常: {e}")
            return False

        try:
            rbt = self.session.get('https://www.zhipin.com/robots.txt', timeout=10)
            if rbt.status_code == 200:
                print("✅ 目标域预检成功: www.zhipin.com")
                return True
            else:
                print(f"❌ 目标域预检失败: HTTP {rbt.status_code}")
                if self._try_switch_to_socks5h():
                    return self._retest_proxy_connection()
                return False
        # Use the ProxyError imported from requests.exceptions at file top. The
        # top-level `requests` package does not export ProxyError, so
        # `except requests.ProxyError` raised AttributeError and the socks5h
        # fallback below never ran.
        except ProxyError as pe2:
            print(f"❌ 目标域预检代理认证失败(407): {pe2}")
            if self._try_switch_to_socks5h():
                return self._retest_proxy_connection()
            return False
        except Exception as e2:
            print(f"❌ 目标域预检异常: {e2}")
            return False
    except Exception as e:
        print(f"❌ 代理连接测试失败: {e}")
        return False
|
||
|
||
def init_session(self):
    """Warm up the session by fetching the mini-program config and the main site.

    When a proxy is configured and its connection test fails, a switch to
    socks5h is attempted first. The outcome is only reported; the warm-up
    requests are issued either way.

    Returns:
        bool: True if the main-site request returned HTTP 200; False on a
        non-200 status or any exception.
    """
    try:
        print("正在初始化微信小程序会话...")

        if hasattr(self, 'proxy_config') and self.proxy_config and not self.test_proxy_connection():
            print("❌ 代理连接失败,尝试切换 socks5h 并重测")
            switched = self._try_switch_to_socks5h()
            if switched:
                if not self._retest_proxy_connection():
                    print("❌ 重测代理仍失败")
            else:
                print("⚠️ 未切换代理方案,保留现有代理配置")

        # First request: mini-program config on the wxapp domain.
        wx_headers = self.default_headers.copy()
        wx_headers["Host"] = "wxapp.zhipin.com"
        wx_headers["Referer"] = "https://servicewechat.com/wx6c8d9b0c9ec51e7e/1/page-frame.html"

        response = self.session.get(
            f"{self.api_domain}/wapi/zpcommon/data/config.json",
            headers=wx_headers,
            timeout=30
        )

        if response.status_code == 200:
            print("✅ 微信小程序配置获取成功")

        # Second request: main site; its status decides the return value.
        main_headers = self.default_headers.copy()
        main_headers["Host"] = "www.zhipin.com"

        response2 = self.session.get(
            self.serve_domain,
            headers=main_headers,
            timeout=30
        )

        return response2.status_code == 200

    except Exception as e:
        print(f"会话初始化失败: {e}")
        return False
|
||
|
||
def _try_switch_to_socks5h(self) -> bool:
    """Rewrite ``http://`` proxy URLs to ``socks5h://`` and apply them to the session.

    Returns:
        bool: True if at least one entry was rewritten and applied. False when
        there is no proxy configuration, nothing to rewrite, or an error occurred.
    """
    http_prefix = 'http://'

    def is_http_url(value):
        return isinstance(value, str) and value.startswith(http_prefix)

    try:
        current = self.proxy_config
        if not current:
            return False
        if not any(is_http_url(value) for value in current.values()):
            return False

        converted = {
            key: ('socks5h://' + value[len(http_prefix):]) if is_http_url(value) else value
            for key, value in current.items()
        }
        self.proxy_config = converted
        self.session.proxies = converted
        print("🔁 已切换到 socks5h 代理方案")
        return True
    except Exception:
        print("❌ 切换 socks5h 失败,可能缺少依赖 requests[socks]")
        return False
|
||
|
||
def _retest_proxy_connection(self) -> bool:
    """Re-run the two connectivity probes with the current session.

    Returns:
        bool: True only if both the IP lookup endpoint and the target site's
        robots.txt answer with HTTP 200; False otherwise, including on errors.
    """
    try:
        probe = self.session.get('http://v2.api.juliangip.com/v2/dps/ip_search?ip=0.0.0.0', timeout=10)
        if probe.status_code == 200:
            target = self.session.get('https://www.zhipin.com/robots.txt', timeout=10)
            return target.status_code == 200
        return False
    except Exception:
        return False
|
||
|
||
def set_login_data(self, mpt: str, wt2: str, open_id: str = ""):
    """Store the login credentials and mirror non-empty ones into the cookie jar.

    Args:
        mpt: Value stored under ``login_data["mpt"]`` and, if non-empty, the ``mpt`` cookie.
        wt2: Value stored under ``login_data["wt2"]`` and, if non-empty, the ``wt2`` cookie.
        open_id: Value stored under ``login_data["openId"]``.
    """
    self.login_data.update(mpt=mpt, wt2=wt2, openId=open_id)

    # Cookies are written in this order: wt2 first, then mpt.
    for cookie_name, cookie_value in (('wt2', wt2), ('mpt', mpt)):
        if cookie_value:
            self.session.cookies.set(cookie_name, cookie_value, domain='.zhipin.com')
|
||
|
||
def update_login_from_curl(self, api_base_url: str = None):
    """Reload the login token from the API, reset the trace id and print a summary.

    Args:
        api_base_url: Backend base URL forwarded to load_token_from_api.

    Returns:
        Whatever load_token_from_api returned (truthy on success).
    """
    success = self.load_token_from_api(api_base_url)

    # The trace id is always reset to this fixed value.
    new_traceid = "F-77d05bnXuMVrHIB3"
    self.login_data["traceid"] = new_traceid

    headline = "✅ 已从API更新登录数据:" if success else "⚠️ API获取失败,使用默认登录数据:"
    print(headline)

    mpt_preview = self.login_data['mpt'][:20]
    print(f" mpt: {mpt_preview}...")
    wt2_preview = self.login_data['wt2'][:50]
    print(f" wt2: {wt2_preview}...")
    print(f" traceid: {new_traceid}")

    return success
|
||
|
||
def update_token_status(self, token_id: int, is_active: bool = False, increment_failed_count: bool = False,
                        api_base_url: str = None):
    """Push a token's status to the backend API.

    Args:
        token_id: Id of the token record to update.
        is_active: New value of the record's ``is_active`` field.
        increment_failed_count: When True, the current ``failed_count`` is read
            first and written back incremented by one.
        api_base_url: Backend base URL; defaults to the module-level API_BASE_URL.

    Returns:
        bool: True if the API confirmed the update; False on any failure or error.
    """
    if api_base_url is None:
        api_base_url = API_BASE_URL

    # Same bound as the other backend calls in this file. Without an explicit
    # timeout, requests can wait indefinitely on an unresponsive server.
    request_timeout = 10

    try:
        # Build the update payload.
        update_data = {"is_active": is_active}

        # To increment the failure count, read the current value first.
        # NOTE(review): read-then-write is not atomic; concurrent updaters may
        # lose increments.
        if increment_failed_count:
            get_response = requests.get(
                f"{api_base_url}/api/v1/universal/data/{token_id}?data_type=boss_token",
                timeout=request_timeout,
            )
            if get_response.status_code == 200:
                token_data = get_response.json()
                if token_data.get("success") and token_data.get("data"):
                    current_failed_count = token_data["data"].get("failed_count", 0)
                    update_data["failed_count"] = current_failed_count + 1
                    logger.info(f"📈 将失败次数从 {current_failed_count} 增加到 {current_failed_count + 1}")

        # Update the token through the generic data endpoint.
        universal_update_data = {
            "data_type": "boss_token",
            "platform": "boss",
            "data": update_data
        }
        response = requests.put(
            f"{api_base_url}/api/v1/universal/data/{token_id}",
            json=universal_update_data,
            timeout=request_timeout,
        )

        if response.status_code == 200:
            data = response.json()
            if data.get("success"):
                logger.info(f"✅ 成功更新token状态: token_id={token_id}, is_active={is_active}")
                return True
            else:
                logger.error(f"❌ 更新token状态失败: {data.get('message', '未知错误')}")
        else:
            logger.error(f"❌ 更新token状态请求失败: {response.status_code}")

    except Exception as e:
        logger.error(f"❌ 更新token状态异常: {str(e)}")

    return False
|
||
|
||
def handle_login_expired(self, api_base_url: str = None):
    """React to an expired login: retire the current token and load a new one.

    Args:
        api_base_url: Backend base URL; defaults to the module-level API_BASE_URL.

    Returns:
        The result of load_token_from_api for the replacement token.
    """
    if api_base_url is None:
        api_base_url = API_BASE_URL

    # Without a known token id there is nothing to retire.
    if not self.current_token_id:
        logger.warning("⚠️ 没有当前token ID,直接获取新token")
        return self.load_token_from_api(api_base_url)

    logger.warning(f"🚫 检测到登录失效,处理token: id={self.current_token_id}")

    # Mark the current token inactive and bump its failure count.
    marked = self.update_token_status(
        token_id=self.current_token_id,
        is_active=False,
        increment_failed_count=True,
        api_base_url=api_base_url
    )
    if marked:
        logger.info(f"✅ 已标记token失效: id={self.current_token_id}")
    else:
        logger.error(f"❌ 标记token失效失败: id={self.current_token_id}")

    # Forget the old id, then fetch a fresh token.
    self.current_token_id = None
    return self.load_token_from_api(api_base_url)
|
||
|
||
def mark_token_failed(self, api_base_url: str = None):
    """Backward-compatible alias that delegates to handle_login_expired."""
    outcome = self.handle_login_expired(api_base_url)
    return outcome
|
||
|
||
def build_request_headers(self, custom_headers: Optional[Dict] = None) -> Dict[str, str]:
    """Assemble the headers for one request.

    Starts from a copy of ``default_headers``, adds the per-request fields,
    then applies ``custom_headers``. When custom headers are given without a
    ``Host``, the host is derived from the resulting ``Referer``.

    Args:
        custom_headers: Optional overrides applied on top of the defaults.

    Returns:
        Dict[str, str]: A new header mapping; ``default_headers`` is not modified.
    """
    merged = self.default_headers.copy()

    merged["mpt"] = self.login_data.get("mpt", "")
    merged["scene"] = "1006"
    merged["wt2"] = ""
    merged["Traceid"] = self.generate_boss_trace_id()
    merged["timestamp"] = str(int(time.time() * 1000))

    if not custom_headers:
        return merged

    merged.update(custom_headers)
    if "Host" in custom_headers:
        return merged

    # The first matching domain wins; no match leaves Host untouched.
    referer = merged.get("Referer", "")
    for host in ("wxapp.zhipin.com", "www.zhipin.com"):
        if host in referer:
            merged["Host"] = host
            break

    return merged
|
||
|
||
def build_request_data(self, data: Optional[Dict] = None) -> Dict[str, Any]:
    """Build a request payload from the common fields plus caller-supplied data.

    Args:
        data: Optional extra fields; they override the common ones on key clashes.

    Returns:
        Dict[str, Any]: Payload with ``appId``, ``scene``, a millisecond
        ``timestamp`` and the extra fields.
    """
    payload = dict(
        appId=self.app_id,
        scene=self.scene,
        timestamp=int(time.time() * 1000),
    )
    payload.update(data or {})
    return payload
|
||
|
||
def batch_request_direct(self, batch_method_feed: str, app_id: str = "10002") -> Optional[Dict]:
    """Send one batch GET to ``/wapi/batch/batchRunV3`` with IP-anomaly handling.

    Flow: random pre-request sleep, request, anomaly detection. On a detected
    anomaly the route is switched, the session is rebuilt, and the request is
    repeated once. If ``handle_ip_abnormal_response`` then flags the result,
    the request is repeated once more.

    Args:
        batch_method_feed: Value sent as the ``batch_method_feed`` query parameter.
        app_id: Value sent as the ``appId`` query parameter.

    Returns:
        Optional[Dict]: Decoded JSON of the last successful response, or None
        on a non-200 status or any exception.
    """
    headers = self.build_request_headers({
        "Referer": "https://www.zhipin.com/web/geek/job"
    })

    data = {
        "batch_method_feed": batch_method_feed,
        "appId": app_id
    }

    try:
        sleep_random_between()
        # print(f"📡 批量请求数据: {headers}")  # header printing disabled
        # Timing starts after the sleep, so only the request itself is measured.
        start_t = time.monotonic()
        response = self.session.get(
            f"{self.serve_domain}/wapi/batch/batchRunV3",
            params=data,
            headers=headers,
            timeout=30
        )

        logger.info("RAW_RESPONSE method={} url={} status={} resp_size={}", "GET", f"{self.serve_domain}/wapi/batch/batchRunV3", response.status_code, len(response.content))

        if response.status_code == 200:
            elapsed = time.monotonic() - start_t
            result = response.json()
            reason = self.ip_detector.detect(response.status_code, elapsed, result)
            if reason:
                # Anomaly: record the failure, pick the next route, rebuild the
                # session on that route and retry once with a different User-Agent.
                logger.warning("IP_ANOMALY reason={} elapsed={:.2f}s status={}", reason, elapsed, response.status_code)
                self.ip_manager.mark_failure(reason)
                mode, cfg = self.ip_manager.select_next_route()
                self.reinit_session(cfg)
                self.init_cookies()
                logger.info("IP_SWITCH mode={} cfg={}", mode, cfg)
                wait_time = sleep_random_between()
                logger.info(f"⏳ IP异常,等待 {int(wait_time)} 秒后重试 (批量GET)")
                headers = self.build_request_headers({
                    "Referer": "https://www.zhipin.com/web/geek/job",
                    "User-Agent": self.get_random_user_agent()
                })
                response = self.session.get(
                    f"{self.serve_domain}/wapi/batch/batchRunV3",
                    params=data,
                    headers=headers,
                    timeout=30
                )
                if response.status_code != 200:
                    print(f"❌ 批量请求失败: {response.status_code}")
                    print(f"响应内容: {response.text[:500]}")
                    return None
                result = response.json()
            # Second check via handle_ip_abnormal_response (defined outside this
            # block); a truthy result triggers one more retry.
            if self.handle_ip_abnormal_response(result):
                wait_time = sleep_random_between()
                logger.info(f"⏳ IP异常,等待 {int(wait_time)} 秒后重试 (批量GET)")
                headers = self.build_request_headers({
                    "Referer": "https://www.zhipin.com/web/geek/job",
                    "User-Agent": self.get_random_user_agent()
                })
                response = self.session.get(
                    f"{self.serve_domain}/wapi/batch/batchRunV3",
                    params=data,
                    headers=headers,
                    timeout=30
                )
                if response.status_code != 200:
                    print(f"❌ 批量请求失败: {response.status_code}")
                    print(f"响应内容: {response.text[:500]}")
                    return None
                result = response.json()
            print(f"✅ 批量请求成功")
            self.ip_manager.mark_success()
            if hasattr(self, 'local_mode') and self.local_mode:
                self.local_success_count += 1
                self.try_restore_proxy()
            # Short extra pause after a successful request, to further reduce
            # the chance of triggering risk control.
            post_wait_time = random.uniform(2, 5)
            time.sleep(post_wait_time)
            return result
        else:
            print(f"❌ 批量请求失败: {response.status_code}")
            print(f"响应内容: {response.text[:500]}")
            return None

    except Exception as e:
        print(f"❌ 批量请求异常: {str(e)}")
        return None
|
||
|
||
def build_batch_method_feed(self, requests: List[Dict]) -> str:
    """Serialize sub-requests into the ``batch_method_feed`` JSON string.

    Every sub-request becomes one ``key=value&...`` string whose first pair
    is ``method=<url>``, followed by the request's own params. Values are
    stringified and percent-encoded with no safe characters; keys are
    emitted verbatim.

    Args:
        requests: Sub-request dicts with a required ``"url"`` entry and an
            optional ``"params"`` mapping. Note that this parameter name
            shadows the ``requests`` module inside the method.

    Returns:
        A JSON array (as a string) holding one encoded string per request.
    """
    from urllib.parse import quote

    def encode(entry: Dict) -> str:
        # "method" goes first; params may override it but keep its position.
        fields = {"method": entry["url"]}
        fields.update(entry.get("params", {}))
        return "&".join(
            f"{name}={quote(str(raw), safe='')}" for name, raw in fields.items()
        )

    return json.dumps([encode(entry) for entry in requests])
|
||
|
||
def get_job_list_by_keyword(self, params: Dict, skip_init: bool = False) -> Optional[Dict]:
    """Fetch one page of the job search list.

    Args:
        params: Search parameters. Recognised keys are copied into the
            request with the defaults listed below; ``appId`` is always
            ``'10002'``.
        skip_init: When False, login data is refreshed and the session is
            initialised first. When True, the call only waits the shared
            random delay before requesting.

    Returns:
        The decoded response (returned even when its ``code`` is non-zero),
        the retry result when an IP-anomaly retry succeeds, or None when the
        session cannot be initialised, the request fails or an exception is
        raised.
    """
    print("🚀 开始自动抓取...")

    if not skip_init:
        self.update_login_from_curl()
        if not self.init_session():
            print("❌ 会话初始化失败")
            return None
    else:
        print("⏱️ 连续请求,等待至少10秒...")
        wait_time = sleep_random_between()
        print(f"⏰ 已等待 {int(wait_time)} 秒")

    # Defaults for every forwarded key, in the order they are sent.
    defaults = {
        'pageSize': 15,
        'query': '',
        'city': '101010100',
        'source': '1',
        'sortType': '0',
        'subwayLineId': '',
        'subwayStationId': '',
        'districtCode': '',
        'businessCode': '',
        'longitude': '',
        'latitude': '',
        'position': '',
        'expectId': '',
        'expectPosition': '',
        'encryptExpectId': '',
        'page': 1,
    }
    search_params = {key: params.get(key, fallback) for key, fallback in defaults.items()}
    search_params['appId'] = '10002'

    try:
        result = self.single_request(
            "/wapi/zpgeek/miniapp/search/joblist.json",
            method="GET",
            data=search_params,
        )

        if not result:
            print("❌ 自动抓取失败")
            return None

        print("✅ 自动抓取完成!")

        if isinstance(result, dict) and 'code' in result and result.get('code') != 0:
            current_ip = self.get_current_ip()
            # A null/non-string "message" must not break the substring checks
            # below; otherwise the real API error is lost to the broad except.
            error_msg = str(result.get('message') or 'Unknown error')
            print(result)
            print(f"⚠️ API返回错误: {error_msg}")
            print(f"🌐 当前IP地址: {current_ip}")

            # IP anomaly: hand over to the dedicated retry routine.
            if 'IP' in error_msg and '异常' in error_msg:
                print("🔄 检测到IP异常,开始重试处理...")
                retry_result = self.handle_ip_exception_retry_for_auto_crawl(params)
                if retry_result:
                    return retry_result

            return result

        if isinstance(result, dict) and 'zpData' in result:
            zp_data = result['zpData']
            # zpData may be null; treat that like a payload without "list".
            if isinstance(zp_data, dict) and 'list' in zp_data:
                job_list = zp_data['list'] or []
                print(f"📊 获取到 {len(job_list)} 条职位数据")
            else:
                print("⚠️ 未找到 list 字段")
            return result

        print(f"⚠️ 响应格式异常: {list(result.keys()) if isinstance(result, dict) else type(result)}")
        return result

    except Exception as e:
        print(f"❌ 自动抓取异常: {str(e)}")
        return None
|
||
|
||
def get_job_list_multi_pages(self, params: Dict, max_pages: Optional[int] = None,
                             job_processor: Optional[callable] = None,
                             page_processor: Optional[callable] = None) -> Optional[List[Dict]]:
    """Fetch consecutive job-list pages for a search, processing as it goes.

    Args:
        params: Search parameters, same format as ``get_job_list_by_keyword``;
            ``params['page']`` (default 1) is the first page fetched.
        max_pages: Maximum number of pages to fetch. None means paging stops
            only on ``hasMore`` being False, an empty page or a failure.
        job_processor: Optional ``process_job(job)`` called for every job;
            exceptions it raises are printed and ignored.
        page_processor: Optional ``process_page(jobs)`` called once per page;
            exceptions it raises are printed and ignored.

    Returns:
        The list of all jobs when ``job_processor`` is None, otherwise None
        (streaming mode).
    """
    start_page = params.get('page', 1)
    if max_pages is None:
        print("🚀 开始自动获取多页数据,将优先根据hasMore控制,必要时退回到最大页数约束")
    else:
        print(f"🚀 开始自动获取多页数据,最大页数限制: {max_pages}")

    collect_all = job_processor is None
    all_jobs = [] if collect_all else None
    processed_count = 0

    current_page = start_page
    pages_fetched = 0

    while True:
        if max_pages is not None and pages_fetched >= max_pages:
            print(f"⏹ 已达到最大页数限制 {max_pages},停止翻页")
            break

        print(f"\n📄 正在获取第 {current_page} 页数据...")
        page_params = params.copy()
        page_params['page'] = current_page
        # Only the first page triggers login/session initialisation.
        skip_init = current_page > start_page

        result = self.get_job_list_by_keyword(page_params, skip_init=skip_init)
        if not (result and isinstance(result, dict)):
            print(f"❌ 第 {current_page} 页获取失败")
            break

        if result.get('code') != 0:
            current_ip = self.get_current_ip()
            # Null-safe: a "message": null payload must not raise here, since
            # nothing in this method would catch it.
            error_msg = str(result.get('message') or 'Unknown error')
            print(f"❌ 第 {current_page} 页API返回错误: {error_msg}")
            print(f"🌐 当前IP地址: {current_ip}")
            if 'IP' in error_msg and '异常' in error_msg:
                print("🔄 检测到IP异常,开始重试处理...")
                retry_success = self.handle_ip_exception_retry(page_params, current_page)
                if retry_success:
                    # NOTE(review): the same page is re-fetched without a retry
                    # cap at this level - confirm the retry helper bounds this.
                    continue
            break

        # ``zpData`` can be present but null, so ``.get(key, {})`` is not enough.
        zp_data = result.get('zpData') or {}
        job_list = zp_data.get('list') or zp_data.get('jobList') or []
        if not job_list:
            print(f"⚠️ 第 {current_page} 页未找到职位数据,停止翻页")
            break

        print(f"✅ 第 {current_page} 页获取到 {len(job_list)} 条职位数据")
        for job in job_list:
            if job_processor:
                try:
                    job_processor(job)
                except Exception as e:
                    print(f"❌ job_processor处理出错: {e}")
            if collect_all:
                all_jobs.append(job)
            processed_count += 1
            # NOTE(review): kept as a per-job delay; the original nesting of
            # this call could not be confirmed - verify it belongs in the loop.
            sleep_random_between()

        if page_processor:
            try:
                page_processor(job_list)
            except Exception as e:
                print(f"❌ page_processor处理出错: {e}")

        pages_fetched += 1

        has_more = zp_data.get('hasMore')
        if has_more is False:
            print(f"⏹ 接口返回 hasMore = False,在第 {current_page} 页停止翻页")
            break

        print("⏱️ 等待至少10秒后获取下一页...")
        wait_time = sleep_random_between()
        print(f"⏰ 已等待 {int(wait_time)} 秒")
        current_page += 1

    print(f"\n🎉 处理完成,总共处理 {processed_count} 条职位数据")
    return all_jobs if collect_all else None
|
||
|
||
def batch_request_v2(self, requests: List[Dict]) -> Optional[Dict]:
    """Batch request V2 (list form), modelled on the mini-program batch call.

    NOTE(review): a second ``batch_request_v2`` taking a dict is defined
    later in this file; if both live in the same class body, that later
    definition replaces this one - confirm which is intended.

    Args:
        requests: Sub-requests, e.g.
            ``[{"url": "/wapi/zpgeek/search/joblist.json",
                "params": {"query": "python", "city": "101010100"}}]``.
            The parameter name shadows the ``requests`` module here.

    Returns:
        ``zpData`` entries re-keyed as ``request_<index>`` for known URLs
        (unknown URLs keep their own key), or None on any failure.
    """
    feed = self.build_batch_method_feed(requests)

    # Mirrors the JS ``s[p.url] = l`` mapping, using the index as the key.
    key_by_url = {entry["url"]: f"request_{idx}" for idx, entry in enumerate(requests)}

    headers = self.build_request_headers({
        "Referer": "https://www.zhipin.com/web/geek/job"
    })
    data = self.build_request_data({
        "batch_method_feed": feed
    })

    try:
        sleep_random_between()

        endpoint = f"{self.serve_domain}/wapi/batch/batchRunV3"
        response = self.session.get(
            endpoint,
            headers=headers,
            params=data,
            timeout=30,
            proxies=self.proxy_config
        )

        logger.info("RAW_RESPONSE method={} url={} status={} resp_size={}", "GET", f"{self.serve_domain}/wapi/batch/batchRunV3", response.status_code, len(response.content))

        if response.status_code != 200:
            print(f"❌ HTTP错误: {response.status_code}")
            print(f"响应内容: {response.text[:200]}")
            return None

        # Reject empty bodies before attempting to decode them.
        if not response.content:
            print("❌ 响应内容为空")
            return None

        try:
            payload = response.json()
        except json.JSONDecodeError as e:
            print(f"❌ JSON解析失败: {e}")
            print(f"响应内容: {response.text[:200]}")
            return None

        # Same check as the JS: ``if (0 !== f.code || !f.zpData)``.
        if payload.get("code") != 0 or not payload.get("zpData"):
            print(f"❌ 批量请求失败: {payload.get('message', 'Unknown error')}")
            return None

        # Known URLs are renamed to their request key; others keep the URL.
        remapped = {
            key_by_url.get(sub_url, sub_url): sub_response
            for sub_url, sub_response in payload["zpData"].items()
        }

        print(f"✅ 批量请求成功,返回 {len(remapped)} 个结果")
        return remapped

    except Exception as e:
        print(f"❌ 批量请求异常: {str(e)}")
        return None
|
||
|
||
def get_job_detail_by_id(self, job_id: str, lid: str = "", security_id: str = "") -> Optional[Dict]:
    """Fetch a job's detail through the batch endpoint.

    Two sub-requests (detail and improvement query) are sent in one batch
    with identical parameters; only the detail entry is inspected.

    Args:
        job_id: Job id (encryptJobId). Must be non-empty.
        lid: Recruiter id.
        security_id: Security id (securityId).

    Returns:
        The ``job_detail`` entry when its ``code`` is 0, otherwise None.
    """
    print(f"🔍 获取招聘详情: {job_id}")

    if not job_id:
        print("❌ job_id不能为空")
        return None

    def shared_params() -> Dict:
        # Fresh dict per sub-request so the two entries never alias.
        return {
            "securityId": security_id,
            "jobId": job_id,
            "lid": lid,
            "source": "10",
            "scene": "1"
        }

    batch_requests = {
        "job_detail": {
            "url": "/wapi/zpgeek/miniapp/job/detail.json",
            "params": shared_params()
        },
        "job_improvement": {
            "url": "/wapi/zpgeek/miniapp/jobdetail/improvement/query.json",
            "params": shared_params()
        }
    }

    result = self.batch_request_v2(batch_requests)

    if not (result and "job_detail" in result):
        print("❌ 招聘详情获取失败: 无法获取有效响应")
        return None

    job_detail = result["job_detail"]
    if job_detail.get("code") == 0:
        return job_detail

    error_msg = job_detail.get("message", "未知错误")
    print(f"❌ 招聘详情获取失败: {error_msg}")
    return None
|
||
|
||
def boss_batch_request(self, job_id, security_id, lid):
    """Send the job-detail batch as a POST with a JSON body.

    If the first response is recognised as an IP anomaly, cookies are
    re-initialised and the request is sent once more with a random
    User-Agent after the shared random delay.

    :param job_id: Job id.
    :param security_id: Security id.
    :param lid: Recruiter id.
    :return: Decoded JSON response, or None when a requests exception occurs.
    """
    batch_url = "https://www.zhipin.com/wapi/batch/requests"
    referer = "https://servicewechat.com/wxa8da525af05281f3/585/page-frame.html"  # From curl

    detail_query = urlencode({
        "securityId": security_id,
        "jobId": job_id,
        "lid": lid,
        "source": "10"
    })
    improvement_query = urlencode({
        "securityId": security_id,
        "jobId": job_id,
        "lid": lid
    })

    # JSON body: the two GET sub-requests plus the app id.
    post_data = {
        "subReqs": [
            {
                "path": "/wapi/zpgeek/miniapp/job/detail.json",
                "method": "GET",
                "query": detail_query
            },
            {
                "path": "/wapi/zpgeek/miniapp/jobdetail/improvement/query.json",
                "method": "GET",
                "query": improvement_query
            }
        ],
        "appId": 10002
    }

    headers = self.build_request_headers({
        "Content-Type": "application/json",
        "Referer": referer
    })

    try:
        sleep_random_between()
        response = self.session.post(batch_url, json=post_data, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        logger.info("RAW_RESPONSE method={} url={} status={} resp_size={}", "POST", "https://www.zhipin.com/wapi/batch/requests", response.status_code, len(response.content))

        if self.handle_ip_abnormal_response(data):
            wait_time = sleep_random_between()
            logger.info(f"⏳ IP异常,等待 {int(wait_time)} 秒后重试 (批量POST)")
            self.init_cookies()
            # The retry additionally rotates the User-Agent.
            headers = self.build_request_headers({
                "Content-Type": "application/json",
                "Referer": referer,
                "User-Agent": self.get_random_user_agent()
            })
            response = self.session.post(batch_url, json=post_data, headers=headers, timeout=30)
            response.raise_for_status()
            data = response.json()

        # Short pause after a successful call as well, to reduce risk control.
        time.sleep(random.uniform(2, 5))
        return data
    except requests.exceptions.RequestException as e:
        print(f"请求失败: {str(e)}")
        return None
|
||
|
||
def get_company_detail_by_id(self, company_id: str) -> Optional[Dict]:
    """Fetch a company's (brand's) detail via the miniapp brand endpoint.

    Args:
        company_id: Company id, sent as ``brandId``.

    Returns:
        The decoded response when its ``code`` is 0, otherwise None.
    """
    print(f"🏢 获取公司详情: {company_id}")

    query = {
        "brandId": company_id,
        "appId": "10002"
    }
    headers = self.build_request_headers({
        "Referer": "https://servicewechat.com/wxa8da525af05281f3/574/page-frame.html"
    })

    result = self.single_request(
        url="/wapi/zpgeek/miniapp/brand/detail.json",
        method="GET",
        data=query,
        custom_headers=headers,
    )

    if not result or result.get("code") != 0:
        # A falsy result means the request itself failed.
        error_msg = result.get("message", "未知错误") if result else "请求失败"
        print(f"❌ 公司详情获取失败: {error_msg}")
        return None

    print("✅ 公司详情获取成功")
    return result
|
||
|
||
def batch_request_v2(self, batch_requests: Dict[str, Dict]) -> Optional[Dict]:
    """Run several sub-requests through ``/wapi/batch/batchRunV3`` in one GET.

    After the first response the call may repeat the request once because of
    a detected IP anomaly (after switching route and re-initialising
    cookies) and once more because of an anti-bot response.

    Args:
        batch_requests: Mapping of caller-chosen key to request config, e.g.
            ``{"request_key": {"url": "/wapi/zpgeek/search/joblist.json",
            "params": {"query": "python", "city": "101010100"}}}``.

    Returns:
        ``zpData`` entries re-keyed by the caller's request keys (entries for
        unknown URLs are dropped), or None when the API reports failure or a
        requests exception occurs.
    """
    # URL -> request key (JS: ``s[p.url] = l``) and the encoded method list.
    url_mapping = {}
    batch_methods = []

    for request_key, request_config in batch_requests.items():
        url = request_config["url"]
        url_mapping[url] = request_key

        # JS: ``d = n({method: p.url}, p.params)``
        method_params = {"method": url}
        method_params.update(request_config.get("params", {}))

        # NOTE(review): values are joined without percent-encoding here,
        # unlike build_batch_method_feed - confirm which form the API expects.
        batch_methods.append("&".join(f"{k}={v}" for k, v in method_params.items()))

    headers = self.build_request_headers({
        "Referer": "https://www.zhipin.com/web/geek/job"
    })
    data = self.build_request_data({
        "batch_method_feed": json.dumps(batch_methods)
    })

    def send(request_headers, **extra):
        """GET the batch endpoint and raise on HTTP error statuses."""
        reply = self.session.get(
            f"{self.serve_domain}/wapi/batch/batchRunV3",
            headers=request_headers,
            params=data,
            timeout=30,
            **extra
        )
        reply.raise_for_status()
        return reply

    def log_raw(reply):
        """Log status and size of a raw batch response."""
        logger.info("RAW_RESPONSE method={} url={} status={} resp_size={}", "GET", f"{self.serve_domain}/wapi/batch/batchRunV3", reply.status_code, len(reply.content))

    try:
        sleep_random_between()

        start_t = time.monotonic()
        # Only the first attempt passes the proxy config explicitly.
        response = send(headers, proxies=self.proxy_config)
        elapsed = time.monotonic() - start_t
        result = response.json()
        log_raw(response)

        reason = self.ip_detector.detect(response.status_code, elapsed, result)
        if reason:
            logger.warning("IP_ANOMALY reason={} elapsed={:.2f}s status={}", reason, elapsed, response.status_code)
            self.ip_manager.mark_failure(reason)
            mode, cfg = self.ip_manager.select_next_route()
            self.reinit_session(cfg)
            self.init_cookies()
            logger.info("IP_SWITCH mode={} cfg={}", mode, cfg)
            headers = self.build_request_headers({
                "Referer": "https://www.zhipin.com/web/geek/job"
            })
            response = send(headers)
            result = response.json()
            log_raw(response)

        # Anti-bot handling: refresh security cookies, wait, retry once.
        if self.handle_anti_bot_response(result):
            wait_time = sleep_random_between()
            print(f"🔄 批量请求检测到安全验证,等待 {int(wait_time)} 秒后重试...")

            headers = self.build_request_headers({
                "Referer": "https://www.zhipin.com/web/geek/job"
            })
            response = send(headers)
            result = response.json()
            # A second log call that used names not defined in this method
            # (method / request_data) was removed: it raised NameError here.
            log_raw(response)

        if result.get("code") == 0 and result.get("zpData"):
            # Keep only entries whose URL was part of this batch.
            mapped_results = {
                url_mapping[sub_url]: sub_response
                for sub_url, sub_response in result["zpData"].items()
                if sub_url in url_mapping
            }

            self.ip_manager.mark_success()
            # Short pause after a successful call as well, to reduce risk control.
            time.sleep(random.uniform(2, 5))
            return mapped_results

        print(f"批量请求失败: {result.get('message', '未知错误')}")
        return None

    except requests.RequestException as e:
        print(f"请求异常: {e}")
        if hasattr(e, 'response') and e.response is not None:
            print(f"响应状态码: {e.response.status_code}")
            print(f"响应内容: {e.response.text[:500]}")
        return None
|
||
|
||
def handle_anti_bot_response(self, response_data: Dict) -> bool:
    """Handle an anti-bot response (error code 37).

    For a code-37 response, the ``seed``, ``name`` and ``ts`` values found in
    ``zpData`` are stored as the ``__zp_sseed__``, ``__zp_sname__`` and
    ``__zp_sts__`` cookies on ``.zhipin.com``. When none of them is present
    (including a null or non-dict ``zpData``), a fresh millisecond timestamp
    is stored as ``__zp_sts__`` instead.

    Args:
        response_data: Decoded API response.

    Returns:
        True when the response had code 37 (the caller should retry),
        False otherwise.
    """
    if response_data.get("code") != 37:
        return False

    print("⚠️ 检测到反爬虫机制(错误码37),正在更新安全参数...")
    # ``zpData`` may be null or not a mapping; membership tests on it would
    # otherwise raise or behave as substring checks.
    raw_zp_data = response_data.get("zpData")
    zp_data = raw_zp_data if isinstance(raw_zp_data, dict) else {}
    cookies = self.session.cookies
    updated = False

    # Security seed.
    if "seed" in zp_data:
        cookies.set('__zp_sseed__', zp_data["seed"], domain='.zhipin.com')
        print(f"✅ 已更新安全种子: {zp_data['seed'][:20]}...")
        updated = True

    # Security name.
    if "name" in zp_data:
        cookies.set('__zp_sname__', zp_data["name"], domain='.zhipin.com')
        print(f"✅ 已更新安全名称: {zp_data['name']}")
        updated = True

    # Server timestamp.
    if "ts" in zp_data:
        cookies.set('__zp_sts__', str(zp_data["ts"]), domain='.zhipin.com')
        print(f"✅ 已更新时间戳: {zp_data['ts']}")
        updated = True

    # Nothing supplied by the server: fall back to a locally generated timestamp.
    if not updated:
        new_ts = str(int(time.time() * 1000))
        print(f"🔄 生成新的时间戳: ts={new_ts}")
        cookies.set('__zp_sts__', new_ts, domain='.zhipin.com')

    return True
|
||
|
||
def handle_ip_abnormal_response(self, response_data: Dict) -> bool:
    """Detect an IP-anomaly response and switch route when one is found.

    A response counts as an anomaly when its ``code`` is 35 or its
    ``message`` contains "IP地址存在异常". In that case the failure is
    recorded, the next route is selected, and the session and cookies are
    re-initialised.

    Args:
        response_data: Decoded API response.

    Returns:
        True when an anomaly was detected and the route was switched (the
        caller should retry), False otherwise.
    """
    # ``message`` may be present but null; normalise before the substring test.
    message = str(response_data.get("message") or "")
    if response_data.get("code") != 35 and "IP地址存在异常" not in message:
        return False

    reason = "ip_banned"
    logger.warning("⚠️ 检测到IP异常,准备切换IP并重试")
    self.ip_manager.mark_failure(reason)
    mode, cfg = self.ip_manager.select_next_route()
    self.reinit_session(cfg)
    self.init_cookies()
    logger.info("IP_SWITCH mode={} cfg={}", mode, cfg)
    return True
|
||
|
||
def validate_request_params(self) -> bool:
    """Check that the cookies and login data a request needs are present.

    Missing cookies trigger one ``init_cookies()`` call before the check is
    repeated. Login data entries must be present and truthy.

    Returns:
        True when everything required is available, otherwise False.
    """
    required_cookies = ['__zp_sseed__', '__zp_sname__', '__zp_sts__']
    required_login_data = ['mpt', 'wt2', 'traceid']

    def absent_cookies():
        jar = self.session.cookies
        return [name for name in required_cookies if name not in jar]

    # Give the session one chance to repopulate its cookies.
    if absent_cookies():
        self.init_cookies()
        still_missing = absent_cookies()
        if still_missing:
            print(f"⚠️ 缺少必要的Cookie: {still_missing[0]}")
            return False

    for key in required_login_data:
        if self.login_data.get(key):
            continue
        print(f"⚠️ 缺少必要的登录数据: {key}")
        return False

    print("✅ 请求参数验证通过")
    return True
|
||
|
||
def single_request(self, url: str, method: str = "GET", data: Optional[Dict] = None,
                   custom_headers: Optional[Dict] = None, max_retries: int = 5) -> Optional[Dict]:
    """Send one API request with anti-bot and IP-anomaly retry handling.

    Each attempt waits the shared random delay, sends the request and then:
    retries after an anti-bot response; switches route (proxy <-> local) and
    retries after a detected IP anomaly; or records success and returns.
    Request errors and undecodable JSON bodies are retried with a growing
    delay until ``max_retries`` attempts are used up.

    Args:
        url: Path appended to ``self.serve_domain``.
        method: HTTP method; anything other than GET is sent as a POST with a
            URL-encoded body.
        data: Request parameters, passed through ``build_request_data``.
        custom_headers: Extra headers merged over the default Referer.
        max_retries: Maximum number of attempts.

    Returns:
        The decoded JSON response, or None when validation fails or all
        attempts are exhausted.
    """
    if not self.validate_request_params():
        print("❌ 请求参数验证失败")
        return None

    # Default Referer, overridable by the caller's headers.
    default_custom_headers = {
        "Referer": "https://www.zhipin.com/web/geek/job"
    }
    if custom_headers:
        default_custom_headers.update(custom_headers)

    for attempt in range(max_retries):
        try:
            sleep_random_between()

            headers = self.build_request_headers(default_custom_headers)
            request_data = self.build_request_data(data)

            start_t = time.monotonic()
            if method.upper() == "GET":
                response = self.session.get(
                    f"{self.serve_domain}{url}",
                    headers=headers,
                    params=request_data,
                    timeout=30,
                )
            else:
                response = self.session.post(
                    f"{self.serve_domain}{url}",
                    headers=headers,
                    data=urlencode(request_data),
                    timeout=30
                )

            response.raise_for_status()
            elapsed = time.monotonic() - start_t
            result = response.json()

            # Anti-bot: security cookies were refreshed; rotate headers and retry.
            if self.handle_anti_bot_response(result):
                wait_time = sleep_random_between()
                print(f"🔄 检测到安全验证,等待 {int(wait_time)} 秒后重试... (尝试 {attempt + 1}/{max_retries})")
                default_custom_headers.update({
                    'User-Agent': self.get_random_user_agent(),
                    'X-Requested-With': 'XMLHttpRequest'
                })
                continue

            reason = self.ip_detector.detect(response.status_code, elapsed, result)
            if reason:
                logger.warning("IP_ANOMALY reason={} elapsed={:.2f}s status={}", reason, elapsed, response.status_code)
                if self.local_mode:
                    try:
                        self.local_fail_count += 1
                        thr = int(os.getenv('LOCAL_FAIL_THRESHOLD', '3'))
                    except Exception:
                        thr = 3
                    if self.proxy_config and self.local_fail_count >= thr:
                        # Too many local failures: go back to the proxy pool.
                        self.enable_proxy_mode()
                        self.ip_manager.mark_failure(reason)
                        mode, cfg = self.ip_manager.select_next_route()
                        if mode == 'proxy' and cfg:
                            self.reinit_session(cfg)
                            logger.info("IP_SWITCH mode={} cfg={}", "proxy", cfg)
                        else:
                            self.reinit_session(self.proxy_config)
                            logger.info("IP_SWITCH mode={} cfg={}", "proxy", self.proxy_config)
                        self.init_cookies()
                        continue
                    else:
                        # Stay local: rebuild the session and wait before retrying.
                        self.reinit_session()
                        self.init_cookies()
                        logger.info("IP_SWITCH mode={} cfg={}", "local", None)
                        wait_time = sleep_random_between()
                        logger.info(f"⏳ IP切换后等待 {int(wait_time)} 秒")
                        continue
                else:
                    self.ip_manager.mark_failure(reason)
                    mode, cfg = self.ip_manager.select_next_route()
                    if mode == 'proxy' and cfg:
                        self.reinit_session(cfg)
                        logger.info("IP_SWITCH mode={} cfg={}", mode, cfg)
                    else:
                        self.enable_local_mode()
                        self.reinit_session()
                        logger.info("IP_SWITCH mode={} cfg={}", "local", None)
                    self.init_cookies()
                    continue

            self.ip_manager.mark_success()
            if hasattr(self, 'local_mode') and self.local_mode:
                self.local_success_count += 1
                self.local_fail_count = 0
                self.try_restore_proxy()
            # Short pause after a successful call as well, to reduce risk control.
            post_wait_time = random.uniform(2, 5)
            time.sleep(post_wait_time)
            return result

        except (requests.RequestException, json.JSONDecodeError) as e:
            # json.JSONDecodeError is listed explicitly because older
            # ``requests`` releases raise it (not a RequestException) from
            # ``response.json()``; without it a non-JSON body escaped the
            # retry loop.
            status = getattr(getattr(e, 'response', None), 'status_code', None)
            err_text = ''
            try:
                err_text = getattr(getattr(e, 'response', None), 'text', '')[:500]
            except Exception:
                pass
            print(f"❌ 请求失败 (尝试 {attempt + 1}/{max_retries}): {e}")
            if status is not None:
                print(f"响应状态码: {status}")
                print(f"响应内容: {err_text}")
            # The elapsed value passed here is just above the configured
            # response-time threshold.
            reason = self.ip_detector.detect(status, float(self.ip_cfg.response_time_threshold_sec) + 0.01, None, err_text)
            if reason:
                self.ip_manager.mark_failure(reason)
                mode, cfg = self.ip_manager.select_next_route()
                self.reinit_session(cfg)
                self.init_cookies()
                logger.info("IP_SWITCH mode={} cfg={}", mode, cfg)

            if attempt < max_retries - 1:
                # Back off between 10 and 30 seconds, growing with the attempt.
                wait_time = max(10, min(2 ** attempt * 5, 30))
                print(f"⏳ 等待{wait_time}秒后重试...")
                time.sleep(wait_time)
            else:
                print("❌ 达到最大重试次数,请求失败")
                return None

    return None
|
||
|
||
def get_random_user_agent(self) -> str:
    """Return one User-Agent string picked at random from a fixed pool.

    The pool holds one WeChat-on-iPhone agent (embedding ``self.wx_version``)
    followed by three desktop browser agents.
    """
    wechat_ios = f"Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/{self.wx_version}(0x18002b2d) NetType/WIFI Language/zh_CN"
    desktop_agents = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
    )
    pool = [wechat_ios, *desktop_agents]
    return random.choice(pool)
|
||
|
||
|
||
def push_company(company_data: Dict[str, Any]) -> None:
    """Push one company record to the storage API.

    The record's ``zpData`` is wrapped in the universal batch format
    (``data_type="company"``, ``platform="boss"``) and posted to the
    ``batch-store-async`` endpoint. Every error is printed, never raised.

    Args:
        company_data: Company detail response containing ``zpData``.
    """
    try:
        zp_data = company_data.get('zpData', {})
        if not zp_data:
            print("❌ 公司数据格式错误")
            return

        # Universal storage envelope.
        payload = {
            "data_list": [zp_data],
            "data_type": "company",
            "platform": "boss"
        }
        request_headers = {
            'accept': 'application/json',
            'token': 'dev',
            'Content-Type': 'application/json'
        }

        response = requests.post(
            f'{API_BASE_URL}/api/v1/universal/data/batch-store-async',
            headers=request_headers,
            json=payload,
            timeout=30
        )
        logger.info("REPORT_DATA_STATUS {}", {"status": response.status_code, "size": len(response.content) if hasattr(response, 'content') else 0})

        if response.status_code != 200:
            print(f"❌ 推送公司数据失败: {response.status_code} - {response.text}")
            return

        # Use the real company name when the payload carries one.
        company_name = "未知公司"
        if zp_data and 'companyFullInfoVO' in zp_data:
            full_info = zp_data['companyFullInfoVO']
            if full_info and 'name' in full_info:
                company_name = full_info['name']
        print(f"✅ 推送公司数据成功: {company_name}")

    except Exception as e:
        print(f"❌ 推送公司数据失败: {e}")
|
||
|
||
|
||
def push_job(job_data: Dict[str, Any]) -> None:
    """Push one job record to the storage API.

    The record's ``zpData`` is wrapped in the universal batch format
    (``data_type="job"``, ``platform="boss"``) and posted to the
    ``batch-store-async`` endpoint. Every error is printed, never raised.

    Args:
        job_data: Job detail response containing ``zpData``.
    """
    try:
        zp_data = job_data.get('zpData', {})
        if not zp_data:
            print("❌ 职位数据格式错误")
            return

        def label(section: str, field: str, fallback: str) -> str:
            """Read ``zp_data[section][field]`` without raising on gaps."""
            block = zp_data.get(section) if isinstance(zp_data, dict) else None
            value = block.get(field) if isinstance(block, dict) else None
            return value or fallback

        # Universal storage envelope.
        universal_data = {
            "data_list": [
                zp_data
            ],
            "data_type": "job",
            "platform": "boss"
        }

        response = requests.post(
            f'{API_BASE_URL}/api/v1/universal/data/batch-store-async',
            headers={
                'accept': 'application/json',
                'token': 'dev',
                'Content-Type': 'application/json'
            },
            json=universal_data,
            timeout=30
        )
        logger.info("REPORT_DATA_STATUS {}", {"status": response.status_code, "size": len(response.content) if hasattr(response, 'content') else 0})

        if response.status_code == 200:
            # Safe lookups: a missing name must not turn a successful push
            # into a reported failure through the except branch below.
            position_name = label('jobBaseInfoVO', 'positionName', "未知职位")
            brand_name = label('brandComInfoVO', 'brandName', "未知公司")
            print(f"✅ 推送职位数据成功: {position_name} @ {brand_name}")
        else:
            print(f"❌ 推送职位数据失败: {response.status_code} - {response.text}")

    except Exception as e:
        print(f"❌ 推送职位数据失败: {e}")
|
||
|
||
|
||
import json
|
||
import random
|
||
|
||
|
||
def load_json_data():
    """Load the city and position lists used to build search parameters.

    Cities come from ``zpData.hotCityList`` in ``city.json``; positions come
    from the third nesting level of ``zpData.config`` in ``work.json``. Both
    files are read from the current working directory.

    Built-in defaults are used for both lists when loading fails, and for
    any single list that comes out empty, so callers can always pick a
    random element.

    Returns:
        tuple: ``(cities, positions)``, each a non-empty list of
        ``{'code': ..., 'name': ...}`` dicts.
    """
    default_cities = [
        {'code': '101010100', 'name': '北京'},
        {'code': '101020100', 'name': '上海'},
        {'code': '101280100', 'name': '广州'},
        {'code': '101280600', 'name': '深圳'},
        {'code': '101210100', 'name': '杭州'}
    ]
    default_positions = [
        {'code': 100101, 'name': 'Java'},
        {'code': 100109, 'name': 'Python'},
        {'code': 100901, 'name': '前端开发工程师'},
        {'code': 100202, 'name': 'Android'},
        {'code': 100203, 'name': 'iOS'}
    ]

    try:
        with open('city.json', 'r', encoding='utf-8') as f:
            city_data = json.load(f)

        cities = []
        if 'zpData' in city_data and 'hotCityList' in city_data['zpData']:
            cities = [
                {'code': city['code'], 'name': city['name']}
                for city in city_data['zpData']['hotCityList']
            ]

        with open('work.json', 'r', encoding='utf-8') as f:
            work_data = json.load(f)

        # Positions live two levels below each top-level category.
        positions = []
        if 'zpData' in work_data and 'config' in work_data['zpData']:
            for category in work_data['zpData']['config']:
                if not ('subLevelModelList' in category and category['subLevelModelList']):
                    continue
                for sub_category in category['subLevelModelList']:
                    if not ('subLevelModelList' in sub_category and sub_category['subLevelModelList']):
                        continue
                    for position in sub_category['subLevelModelList']:
                        positions.append({
                            'code': position['code'],
                            'name': position['name']
                        })

    except Exception as e:
        # Best-effort loader: report and fall back to the defaults.
        print(f"加载数据失败: {e}")
        return default_cities, default_positions

    # An empty list would make random.choice() fail downstream.
    return cities or default_cities, positions or default_positions
|
||
|
||
|
||
def generate_random_params() -> Dict[str, Any]:
    """Build one random set of crawl parameters.

    A city and a position are drawn at random from ``load_json_data()``.
    For a fixed set of language/platform names the query is the position
    name with "开发" appended; otherwise it is the position name itself.

    Returns:
        dict: ``query``, ``city`` (code), ``scene``, ``page`` and
        ``position`` (code).
    """
    cities, positions = load_json_data()

    city = random.choice(cities)
    position = random.choice(positions)

    # Names that read better as "<name>开发" in a search query.
    suffixed_names = (
        'Java', 'Python', 'PHP', 'C#', 'C/C++', 'Golang', 'Node.js',
        'Android', 'iOS',
    )
    name = position['name']
    query = f"{name}开发" if name in suffixed_names else name

    return {
        "query": query,
        "city": city['code'],
        "scene": 1,
        "page": 1,
        "position": position['code'],
    }
|
||
|
||
|
||
def generate_random_params_batch(count: int = 10) -> List[Dict[str, Any]]:
    """Generate several random parameter sets.

    Args:
        count: Number of parameter sets to generate.

    Returns:
        list: ``count`` results of ``generate_random_params()``.
    """
    batch = []
    for _ in range(count):
        batch.append(generate_random_params())
    return batch
|
||
|
||
|
||
def fetch_service_params() -> Optional[Dict[str, Any]]:
    """Reserve one keyword from the keyword service and turn it into params.

    One available item is requested for source ``boss``. When the item has
    an id, it is reported as used; a failure of that report is logged and
    otherwise ignored.

    Returns:
        ``{"query": job, "city": city, "scene": 1, "page": 1}``, or None when
        the service returns a non-200 status, has no items, the item lacks a
        city or job, or any error occurs.
    """
    try:
        url = f"{API_BASE_URL}/api/v1/keyword/available"
        r = requests.get(url, params={"source": "boss", "limit": 1, "reserve": True}, timeout=10)
        if r.status_code != 200:
            return None

        js = r.json()
        data = js.get("data") or {}
        items = data.get("items") or []
        if not items:
            return None

        item = items[0]
        item_id = item.get("id")
        if item_id:
            try:
                murl = f"{API_BASE_URL}/api/v1/keyword/mark-used"
                requests.post(murl, json={"source": "boss", "ids": [item_id]}, timeout=10)
            except Exception as e:
                # Best effort: the keyword is still used even if marking fails.
                logger.error(f"标记已用失败: {e}")

        # ``or ""`` keeps a null value from becoming the truthy string "None".
        city = str(item.get("city") or "")
        job = str(item.get("job") or "")
        if not city or not job:
            return None
        return {"query": job, "city": city, "scene": 1, "page": 1}
    except Exception as e:
        # Still best-effort, but no longer silent.
        logger.warning("fetch_service_params failed: {}", e)
        return None
|
||
|
||
|
||
# Usage example: script entry point.
if __name__ == "__main__":
    import time
    import random

    # Proxy settings (read from environment variables).
    username = os.getenv('PROXY_USERNAME')
    password = os.getenv('PROXY_PASSWORD')
    tunnel = os.getenv('PROXY_TUNNEL')
    scheme = os.getenv('PROXY_SCHEME')

    # Normalise the values: unset -> '', then trim surrounding whitespace and
    # backticks (presumably left over from copy-pasted values — confirm).
    username = (username or '').strip().strip('`')
    password = (password or '').strip().strip('`')
    tunnel = (tunnel or '').strip().strip('`')
    scheme = (scheme or '').strip().strip('`').lower()

    # proxies stays None (direct connection) unless a complete, well-formed
    # proxy URL can be built.
    proxies = None
    if username and password and tunnel:
        # Any scheme other than http / socks5h falls back to http.
        if scheme not in ('http', 'socks5h'):
            scheme = 'http'
        proxy_url = f"{scheme}://{username}:{password}@{tunnel}"
        # Expected shape: scheme://user:password@host:port
        pattern = r"^(http|socks5h)://[^:]+:[^@]+@[^:]+:\d+$"
        if re.match(pattern, proxy_url):
            proxies = {
                "http": proxy_url,
                "https": proxy_url
            }
            print(f"✅ 使用代理配置: {tunnel} ({scheme})")
        else:
            print("❌ 代理配置格式不正确,请检查用户名/密码/隧道地址")
    else:
        print("ℹ️ 未配置代理,使用直连")

    # Create the API instance, passing the proxy mapping (or None) to it.
    api = BossZhipinAPI(proxy_config=proxies)

    # Self-test mode (RUN_SELFTEST=1): run connectivity checks, then exit
    # without crawling.
    if os.getenv('RUN_SELFTEST') == '1':
        print("🌐 当前IP(初始化后):", api.get_current_ip())
        # Ask the IP lookup endpoint which exit IP it sees.
        # NOTE(review): the printed label says "HTTPS" but the URL is plain http.
        try:
            r = api.session.get('http://v2.api.juliangip.com/v2/dps/ip_search?ip=0.0.0.0', timeout=10)
            if r.status_code == 200:
                data = r.json()
                ip_v = data.get('data', {}).get('ip', '') if data.get('code') == 0 else ''
                print("🌐 HTTPS出口IP(初始化后):", ip_v)
            else:
                print("❌ HTTPS出口IP获取失败: HTTP", r.status_code)
        except Exception as e:
            print("❌ HTTPS出口IP获取异常:", e)

        # Pre-flight request against the target domain.
        try:
            rbt = api.session.get('https://www.zhipin.com/robots.txt', timeout=10)
            if rbt.status_code == 200:
                print("🌐 目标域预检(初始化后): 成功")
            else:
                print("❌ 目标域预检(初始化后)失败: HTTP", rbt.status_code)
        except Exception as e:
            print("❌ 目标域预检(初始化后)异常:", e)

        # Rebuild the session and repeat the same checks.
        api.reinit_session()
        print("🌐 当前IP(重建会话后):", api.get_current_ip())
        try:
            r2 = api.session.get('http://v2.api.juliangip.com/v2/dps/ip_search?ip=0.0.0.0', timeout=10)
            if r2.status_code == 200:
                data2 = r2.json()
                ip_v2 = data2.get('data', {}).get('ip', '') if data2.get('code') == 0 else ''
                print("🌐 HTTPS出口IP(重建会话后):", ip_v2)
            else:
                print("❌ HTTPS出口IP获取失败: HTTP", r2.status_code)
        except Exception as e2:
            print("❌ HTTPS出口IP获取异常:", e2)
        try:
            rbt2 = api.session.get('https://www.zhipin.com/robots.txt', timeout=10)
            if rbt2.status_code == 200:
                print("🌐 目标域预检(重建会话后): 成功")
            else:
                print("❌ 目标域预检(重建会话后)失败: HTTP", rbt2.status_code)
        except Exception as e2:
            print("❌ 目标域预检(重建会话后)异常:", e2)
        sys.exit(0)

    # Initialise the session first; a failure is reported but not fatal.
    print("=== 初始化会话 ===")
    try:
        if not api.init_session():
            print("会话初始化失败,但继续运行")
    except Exception as e:
        print(f"会话初始化异常: {e},继续运行")

    # Refresh the login data from the latest curl command data; a failure is
    # reported but not fatal.
    print("\n=== 更新登录数据 ===")
    try:
        api.update_login_from_curl()
    except Exception as e:
        print(f"更新登录数据异常: {e},继续运行")

    # Per-job streaming callback handed to get_job_list_multi_pages below.
    def process_job_detail(job: Dict) -> None:
        """Fetch and push the detail of a single job from the list results.

        Prints a summary line for the job, requests its detail through
        ``api.boss_batch_request`` and, when the response code is 0, also
        fetches and pushes the company detail (if a brand id is present)
        before pushing the job detail. On a failed detail request with code 35
        or an "IP地址存在异常" message, the session is rebuilt and a random delay
        is applied. Every exception is caught and printed.

        Args:
            job: One job entry from the list; the keys read here are
                ``encryptJobId``, ``lid``, ``securityId``, ``jobName``,
                ``brandName``, ``cityName`` and ``jobId``.

        Returns:
            None
        """
        try:
            job_id = job.get('encryptJobId')
            lid = job.get('lid', '')
            security_id = job.get('securityId', '')
            job_name = job.get('jobName', 'N/A')
            company_name = job.get('brandName', 'N/A')
            city_name = job.get('cityName') or ''
            # Display id: the encrypted id when present, else the plain jobId.
            job_id_display = job_id or job.get('jobId') or ''
            # Print only the parts of the summary that are available.
            if city_name and job_id_display:
                print(f"\n📋 处理职位: {city_name} | {job_name} | {job_id_display} @ {company_name}")
            elif city_name:
                print(f"\n📋 处理职位: {city_name} | {job_name} @ {company_name}")
            elif job_id_display:
                print(f"\n📋 处理职位: {job_name} | {job_id_display} @ {company_name}")
            else:
                print(f"\n📋 处理职位: {job_name} @ {company_name}")

            # Without an encrypted job id no detail request is made.
            if job_id:
                job_detail = api.boss_batch_request(job_id, security_id,lid)
                if job_detail and job_detail.get('code') == 0:
                    # Look up the company (brand) id inside the job detail payload.
                    company_id = job_detail.get('zpData', {}).get("/wapi/zpgeek/miniapp/job/detail.json",{}).get('zpData', {}).get('brandComInfoVO', {}).get('encryptBrandId')
                    if company_id:
                        try:
                            # NOTE(review): the delay has already elapsed when this
                            # message is printed, and the message mentions the job
                            # detail although the next call fetches the company.
                            wait_time = sleep_random_between()
                            print(f"⏰ 等待 {int(wait_time)} 秒后开始职位详情...")
                            company_detail = api.get_company_detail_by_id(company_id)
                            if company_detail and company_detail.get('zpData'):
                                push_company(company_detail)
                                # Fix: copy the company's full name into the job
                                # detail's brandName before the job is pushed.
                                # NOTE(review): nesting of this check under the
                                # enclosing `if` is assumed — confirm.
                                if (job_detail and 'zpData' in job_detail and
                                        '/wapi/zpgeek/miniapp/job/detail.json' in job_detail['zpData'] and
                                        job_detail['zpData']['/wapi/zpgeek/miniapp/job/detail.json'] and
                                        'zpData' in job_detail['zpData']['/wapi/zpgeek/miniapp/job/detail.json'] and
                                        'brandComInfoVO' in job_detail['zpData']['/wapi/zpgeek/miniapp/job/detail.json']['zpData'] and
                                        company_detail and 'zpData' in company_detail and
                                        'companyFullInfoVO' in company_detail['zpData'] and
                                        company_detail['zpData']['companyFullInfoVO'] and
                                        'name' in company_detail['zpData']['companyFullInfoVO']):
                                    job_detail['zpData']['/wapi/zpgeek/miniapp/job/detail.json']['zpData']['brandComInfoVO']['brandName'] = \
                                        company_detail['zpData']['companyFullInfoVO']['name']
                        except Exception as e:
                            print(f" ⚠️ 获取公司详情失败: {e}")

                    # Push only the inner job-detail payload.
                    try:
                        job_detail = job_detail.get("zpData",{}).get("/wapi/zpgeek/miniapp/job/detail.json",{})
                        push_job(job_detail)
                    except Exception as e:
                        print(f" ⚠️ 推送职位数据失败: {e}")
                else:
                    err_msg = job_detail.get('message', '未知错误') if isinstance(job_detail, dict) else '请求失败'
                    print(f" ❌ 获取职位详情失败: {err_msg}")
                    # Code 35 or an "IP地址存在异常" message: rebuild the session
                    # (best effort) and back off before continuing.
                    if isinstance(job_detail, dict) and (job_detail.get('code') == 35 or 'IP地址存在异常' in err_msg):
                        self_reinit_ok = False
                        try:
                            api.reinit_session()
                            self_reinit_ok = True
                        except Exception:
                            pass
                        wait_time = sleep_random_between()
                        print(f" ⏳ 等待 {int(wait_time)} 秒后继续...{'(已重建会话)' if self_reinit_ok else ''}")
        except Exception as e:
            print(f" ⚠️ 处理职位详情异常: {e}")

    # Endless crawl loop starts here.
    print("\n🔄 开始死循环抓取模式...")
    loop_count = 0

    while True:
        try:
            loop_count += 1
            print(f"\n{'=' * 50}")
            print(f"🚀 第 {loop_count} 轮抓取开始")
            print(f"{'=' * 50}")

            # Prefer a keyword from the service; fall back to random local
            # parameters when it returns nothing.
            crawl_params = fetch_service_params()
            if not crawl_params:
                print("⚠️ 服务未返回可用关键词,改用本地随机参数")
                crawl_params = generate_random_params()
            multi_page_params = crawl_params.copy()
            multi_page_params['pageSize'] = int(os.getenv('PAGE_SIZE', '15'))  # items per page

            print(f"📊 搜索参数: {multi_page_params}")

            # Run the crawl; each job is handed to process_job_detail.
            max_pages = int(os.getenv('MAX_PAGES', '3'))
            try:
                api.get_job_list_multi_pages(multi_page_params, max_pages=max_pages, job_processor=process_job_detail)
                print(f"✅ 第 {loop_count} 轮抓取完成")
            except Exception as e:
                print(f"❌ 第 {loop_count} 轮抓取失败: {e}")
                # If the error text looks like an expired login, try to handle it.
                if "登录" in str(e) or "login" in str(e).lower() or "401" in str(e):
                    try:
                        print("🔄 检测到登录失效,尝试处理...")
                        api.handle_login_expired()
                    except Exception as login_e:
                        print(f"⚠️ 处理登录失效失败: {login_e}")

            # Random delay between rounds to avoid overly frequent requests.
            wait_time = sleep_random_between()
            print(f"⏰ 等待 {int(wait_time)} 秒后开始下一轮...")
        except Exception as e:
            print(f"❌ 循环异常: {e},跳过本轮,继续下一轮")
            # Back off before the next round; this uses the same randomized
            # delay helper as the normal path.
            wait_time = sleep_random_between()
            print(f"⏰ 异常恢复等待 {int(wait_time)} 秒...")
            continue
|