JobData/jobs_spider/boss/boos_api.py
2026-01-20 15:42:47 +08:00

2246 lines
91 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
from typing import Dict, Any, List, Optional, Tuple
from urllib.parse import urlencode
import uuid
from loguru import logger
import os
import time
import random
import json
import sys
import re
from requests.exceptions import ProxyError
# Base URL of the backend that serves Boss tokens; overridable through the
# API_BASE_URL environment variable.
API_BASE_URL = os.getenv('API_BASE_URL', 'http://124.222.106.226:9999')
# Make sure the log directory exists before the file sink below is registered.
os.makedirs("logs", exist_ok=True)
# One log file per day at INFO level: rotated at midnight, kept for 30 days,
# written through loguru's queue (enqueue=True).
logger.add("logs/log_{time:YYYY-MM-DD}.log", level="INFO", rotation="00:00", retention="30 days", enqueue=True)
def sleep_random_between() -> float:
    """Sleep for a random interval of at least 10 seconds to reduce risk-control triggers.

    The bounds are read from the ``SLEEP_MIN_SECONDS`` and ``SLEEP_MAX_SECONDS``
    environment variables (defaults 10 and 20). The minimum is clamped to 10,
    and a maximum below the minimum is replaced by ``minimum + 10``.
    Unparsable or non-finite (``nan`` / ``inf``) values fall back to a fixed
    10-second sleep instead of reaching ``time.sleep`` and raising there.

    Returns:
        float: The number of seconds actually slept.
    """
    import math  # local import keeps this block self-contained

    wait_time = 10.0
    try:
        min_seconds = float(os.getenv('SLEEP_MIN_SECONDS', '10'))
        max_seconds = float(os.getenv('SLEEP_MAX_SECONDS', '20'))
        if math.isfinite(min_seconds) and math.isfinite(max_seconds):
            # Never go below the 10-second floor.
            if min_seconds < 10:
                min_seconds = 10
            if max_seconds < min_seconds:
                max_seconds = min_seconds + 10
            wait_time = random.uniform(min_seconds, max_seconds)
    except ValueError:
        # float() could not parse one of the environment values.
        wait_time = 10.0
    time.sleep(wait_time)
    return wait_time
class IPStrategyConfig:
    """Thresholds that drive the IP routing strategy.

    The constructor defaults are read from environment variables once, when
    the class body is evaluated (not on every instantiation).
    """

    def __init__(self,
                 response_time_threshold_sec: int = int(os.getenv('IP_RESP_TIME_THRESHOLD', '5')),
                 proxy_failure_threshold: int = int(os.getenv('IP_PROXY_FAIL_THRESHOLD', '3')),
                 local_cooldown_sec: int = int(os.getenv('IP_LOCAL_COOLDOWN_SEC', '1800')),
                 local_failure_threshold: int = int(os.getenv('IP_LOCAL_FAIL_THRESHOLD', '2'))):
        """Store the strategy thresholds.

        Args:
            response_time_threshold_sec (int): Per-request elapsed-time threshold, in seconds.
            proxy_failure_threshold (int): Consecutive failures on one proxy that trigger a switch.
            local_cooldown_sec (int): Cooldown between uses of the local IP, in seconds.
            local_failure_threshold (int): Consecutive local failures after which the proxy pool is used again.

        Returns:
            None
        """
        self.response_time_threshold_sec = response_time_threshold_sec
        self.proxy_failure_threshold = proxy_failure_threshold
        self.local_cooldown_sec = local_cooldown_sec
        self.local_failure_threshold = local_failure_threshold

    def update(self, updates: Dict[str, Any]) -> None:
        """Apply ``updates`` to existing configuration attributes.

        Only keys that name an attribute stored on this instance are applied.
        Unknown keys are ignored, and so are names of methods or dunder
        attributes, so a key such as ``"update"`` cannot overwrite the method.

        Args:
            updates (Dict[str, Any]): Mapping of attribute name to new value.

        Returns:
            None
        """
        known = vars(self)
        for key, value in updates.items():
            if key in known:
                setattr(self, key, value)
class IPAnomalyDetector:
    """Classify a request outcome as an IP anomaly, using the strategy thresholds."""

    def __init__(self, cfg: "IPStrategyConfig"):
        """Keep a reference to the strategy configuration.

        Args:
            cfg (IPStrategyConfig): Strategy configuration.

        Returns:
            None
        """
        self.cfg = cfg

    def detect(self, status_code: Optional[int], elapsed_sec: float, resp_json: Optional[Dict], error_text: str = "") -> Optional[str]:
        """Return a reason tag when the outcome looks like an IP problem.

        Checks run in this order: blocked HTTP status, slow response, ban
        markers in the JSON body, ban markers in the error text.

        Args:
            status_code (Optional[int]): HTTP status code; may be None when the request raised.
            elapsed_sec (float): Elapsed time of the request, in seconds.
            resp_json (Optional[Dict]): Decoded JSON body of the response.
            error_text (str): Text of an exception, if any.

        Returns:
            Optional[str]: ``"http_<code>"``, ``"slow_response"``, ``"ip_banned"``, or None when nothing is wrong.
        """
        blocked_statuses = (403, 429, 407)
        if status_code in blocked_statuses:
            return f"http_{status_code}"

        if elapsed_sec > self.cfg.response_time_threshold_sec:
            return "slow_response"

        if resp_json:
            message = str(resp_json.get("message", ""))
            banned_by_code = resp_json.get("code") == 35
            banned_by_text = "IP地址存在异常" in message or ("IP" in message and "异常" in message)
            if banned_by_code or banned_by_text:
                return "ip_banned"

        if error_text and "IP" in error_text and "异常" in error_text:
            return "ip_banned"
        return None
class SmartIPManager:
    """Choose between proxies from a pool and the local (direct) connection.

    The manager tracks consecutive failures per route, eliminates proxies that
    reach the failure threshold, and rate-limits use of the local IP through a
    cooldown and an optional temporary disable window.
    """

    def __init__(self, proxy_pool: Optional[List[Dict[str, str]]], cfg: "IPStrategyConfig"):
        """Initialise routing state.

        Args:
            proxy_pool (Optional[List[Dict[str, str]]]): Proxy pool; each item is a requests-style proxies dict.
            cfg (IPStrategyConfig): Strategy configuration.

        Returns:
            None
        """
        self.cfg = cfg
        self.proxy_pool: List[Dict[str, str]] = proxy_pool or []
        # Indices into proxy_pool that reached the failure threshold.
        self.eliminated: set = set()
        self.current_mode: str = 'proxy' if self.proxy_pool else 'local'
        self.current_index: int = 0
        self.proxy_failures_current: int = 0
        self.local_failures: int = 0
        # time.monotonic() has an undefined reference point, so 0.0 is not a
        # valid "never used" marker (on a freshly started clock the local
        # route would look like it is still cooling down). -inf makes the
        # first cooldown check pass regardless of the clock's origin.
        self.last_local_use_time: float = float('-inf')
        self.local_disabled_until: float = 0.0

    def current_route(self) -> Tuple[str, Optional[Dict[str, str]]]:
        """Return the current route as ``(mode, proxies)``; proxies is None for local."""
        if self.current_mode == 'proxy' and self.proxy_pool:
            return 'proxy', self.proxy_pool[self.current_index]
        return 'local', None

    def mark_success(self) -> None:
        """Reset the failure counter of the route currently in use."""
        if self.current_mode == 'proxy':
            self.proxy_failures_current = 0
        else:
            self.local_failures = 0

    def mark_failure(self, reason: str = "") -> None:
        """Count a failure on the current route; eliminate the proxy at the threshold.

        Args:
            reason (str): Failure reason tag; accepted but not used here.
        """
        if self.current_mode == 'proxy':
            self.proxy_failures_current += 1
            if self.proxy_failures_current >= self.cfg.proxy_failure_threshold:
                self.eliminated.add(self.current_index)
        else:
            self.local_failures += 1

    def select_next_route(self) -> Tuple[str, Optional[Dict[str, str]]]:
        """Pick the route for the next request and update the internal state.

        In proxy mode, once the failure threshold is reached the local route
        is preferred when it is available, then the next non-eliminated proxy,
        and finally the local route regardless of cooldown. In local mode,
        once the local failure threshold is reached the next non-eliminated
        proxy is used if one exists.

        Returns:
            Tuple[str, Optional[Dict[str, str]]]: ``(mode, proxies)``.
        """
        now = time.monotonic()
        if self.current_mode == 'proxy':
            if self.proxy_failures_current >= self.cfg.proxy_failure_threshold:
                if self._local_available(now):
                    self.current_mode = 'local'
                    self.last_local_use_time = now
                    self.proxy_failures_current = 0
                    return 'local', None
                next_idx = self._next_proxy_index()
                if next_idx is not None:
                    self.current_index = next_idx
                    self.proxy_failures_current = 0
                    return 'proxy', self.proxy_pool[self.current_index]
                # No usable proxy left: fall back to local even during cooldown.
                self.current_mode = 'local'
                self.last_local_use_time = now
                self.proxy_failures_current = 0
                return 'local', None
            if self.proxy_pool:
                return 'proxy', self.proxy_pool[self.current_index]
            self.current_mode = 'local'
            return 'local', None
        else:
            if self.local_failures >= self.cfg.local_failure_threshold:
                next_idx = self._next_proxy_index()
                if next_idx is not None:
                    self.current_mode = 'proxy'
                    self.current_index = next_idx
                    self.local_failures = 0
                    return 'proxy', self.proxy_pool[self.current_index]
            return 'local', None

    def _next_proxy_index(self) -> Optional[int]:
        """Return the next non-eliminated proxy index after the current one, or None."""
        if not self.proxy_pool:
            return None
        n = len(self.proxy_pool)
        # Walk the pool circularly; the current index is checked last.
        for step in range(1, n + 1):
            cand = (self.current_index + step) % n
            if cand not in self.eliminated:
                return cand
        return None

    def _local_available(self, now: float) -> bool:
        """Tell whether the local route may be used at monotonic time ``now``."""
        if now < self.local_disabled_until:
            return False
        return (now - self.last_local_use_time) >= self.cfg.local_cooldown_sec

    def disable_local_temporarily(self, seconds: int) -> None:
        """Disable the local route for ``seconds`` seconds (negative values count as 0)."""
        self.local_disabled_until = time.monotonic() + max(0, seconds)

    def manual_switch_to_proxy(self, index: int) -> None:
        """Switch to the proxy at ``index`` if it is in range and not eliminated."""
        if 0 <= index < len(self.proxy_pool) and index not in self.eliminated:
            self.current_mode = 'proxy'
            self.current_index = index
            self.proxy_failures_current = 0

    def enable_local(self) -> None:
        """Clear any temporary disable window on the local route."""
        self.local_disabled_until = 0.0
class BossZhipinAPI:
def __init__(self, proxy_config: Optional[Dict] = None, proxy_pool: Optional[List[Dict[str, str]]] = None, ip_strategy_config: Optional[Dict[str, Any]] = None):
    """Create the HTTP session, routing helpers, default headers and login data.

    Construction performs network I/O: after the local state is set up it
    calls ``load_token_from_api()``.

    Args:
        proxy_config (Optional[Dict]): A single requests-style proxies dict.
        proxy_pool (Optional[List[Dict[str, str]]]): Additional proxies dicts; ``proxy_config`` is appended after them.
        ip_strategy_config (Optional[Dict[str, Any]]): Keyword arguments forwarded to ``IPStrategyConfig``.
    """
    # --- static identifiers and endpoints ---
    self.app_id = 10002
    self.zp_product_id = 10002
    self.serve_domain = "https://www.zhipin.com"
    self.api_domain = "https://wxapp.zhipin.com"

    # --- HTTP session; environment proxy settings are ignored ---
    self.session = requests.Session()
    self.session.trust_env = False
    # NOTE(review): this sets an HTTP *header* called no_proxy, not a proxy
    # exclusion list — confirm that is intended.
    self.session.headers.update({'no_proxy': '10.0.0.0/16,example.com,.example.com'})

    # --- single proxy: strip surrounding whitespace and backticks from string values ---
    self.proxy_config = proxy_config
    if isinstance(self.proxy_config, dict):
        self.proxy_config = {
            key: (value.strip().strip('`') if isinstance(value, str) else value)
            for key, value in self.proxy_config.items()
        }
    if self.proxy_config:
        self.session.proxies.update(self.proxy_config)
        print(f"✅ 已设置代理: {self.proxy_config}")

    # --- local (direct connection) mode bookkeeping ---
    self.local_mode = False
    self.local_success_count = 0
    self.local_mode_since = 0
    self.local_fail_count = 0

    # --- IP strategy: the pool is proxy_pool followed by proxy_config ---
    cfg = IPStrategyConfig(**(ip_strategy_config or {}))
    pool: List[Dict[str, str]] = list(proxy_pool) if proxy_pool else []
    if self.proxy_config:
        pool.append(self.proxy_config)
    self.ip_cfg = cfg
    self.ip_detector = IPAnomalyDetector(cfg)
    self.ip_manager = SmartIPManager(pool, cfg)

    # The manager's initial route replaces whatever proxies were set above.
    route_mode, route_cfg = self.ip_manager.current_route()
    if route_mode == 'proxy' and route_cfg:
        self.session.proxies = route_cfg
        print(f"🚦 初始路由: 代理 {route_cfg}")
    else:
        self.session.proxies = {}
        print("🚦 初始路由: 本机直连")

    # --- client identity ---
    self.device_id = str(uuid.uuid4())
    self.wx_version = "8.0.43"
    self.mini_version = "1.0.0"
    self.scene = 1001
    self.default_headers = {
        "Accept": "*/*",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded",
        "Host": "www.zhipin.com",
        "Referer": "https://servicewechat.com/wxa8da525af05281f3/571/page-frame.html",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.10(0x13080a10) XWEB/1227",
        "X-Requested-With": "XMLHttpRequest",
        "platform": "zhipin/mac",
        "zp_app_id": str(self.app_id),
        "ver": "100.0000",
        "mini_ver": "100.0000",
        "ua": json.dumps({"model": "Mac16,8", "platform": "mac"}),
        "zp_product_id": str(self.zp_product_id),
        "scene": "1006",
        "xweb_xhr": "1",
        "sec-fetch-site": "cross-site",
        "sec-fetch-mode": "cors",
        "sec-fetch-dest": "empty"
    }
    self.init_cookies()

    # --- login data; filled by load_token_from_api() through set_login_data() ---
    self.login_data = {
        "mpt": "",
        "wt2": "",
        "openId": "",
        "traceid": "F-77d05bnXuMVrHIB3"
    }
    # Id of the token currently in use (None until one is loaded).
    self.current_token_id = None
    self.load_token_from_api()
def load_token_from_api(self, api_base_url: str = None):
    """Fetch a usable Boss token from the backend API and install it as login data.

    The first entry of the returned token list is used. An empty list is
    reported as "no token available". A response without a usable ``data``
    field is treated as an API error, and an error message mentioning an IP
    anomaly triggers ``handle_ip_exception_retry_for_token``. Whenever no
    token could be loaded, the hard-coded fallback token is installed.

    Args:
        api_base_url (str): Backend base URL; defaults to ``API_BASE_URL``.

    Returns:
        bool: True when a complete token was loaded or the IP-anomaly retry
        succeeded; False when the fallback token was installed.
    """
    if api_base_url is None:
        api_base_url = API_BASE_URL
    try:
        # Token list endpoint (first page, up to 10 entries).
        response = requests.get(f"{api_base_url}/api/v1/token/tokens?page=1&page_size=10", timeout=10)
        # NOTE(review): this prints the raw response, which includes the token values.
        print(response.text)
        if response.status_code != 200:
            logger.error(f"❌ API请求失败: {response.status_code}")
        else:
            data = response.json()
            tokens = data.get("data")
            if tokens:
                token_info = tokens[0]
                token_id = token_info.get("id")
                mpt = token_info.get("mpt")
                wt2 = token_info.get("wt2")
                if mpt and wt2 and token_id:
                    self.current_token_id = token_id  # remember which token is in use
                    self.set_login_data(mpt, wt2)
                    logger.info(f"✅ 成功从API获取token: id={token_id}, mpt={mpt[:20]}..., wt2={wt2[:50]}...")
                    return True
                logger.warning("⚠️ API返回的token数据不完整")
            elif isinstance(tokens, (list, tuple)):
                # The API answered normally but has no token to hand out.
                logger.warning("⚠️ 没有可用的token")
            else:
                current_ip = self.get_current_ip()
                error_msg = str(data.get('message', '未知错误'))
                logger.error(f"❌ API返回错误: {error_msg}")
                logger.error(f"🌐 当前IP地址: {current_ip}")
                # An IP anomaly gets its own retry procedure.
                if 'IP' in error_msg and '异常' in error_msg:
                    logger.info(f"🔄 检测到IP异常开始重试处理...")
                    retry_success = self.handle_ip_exception_retry_for_token(api_base_url)
                    if retry_success:
                        return True
    except Exception as e:
        import traceback
        traceback.print_exc()
        logger.error(f"❌ 从API获取token失败: {str(e)}")
    # Nothing usable came back: fall back to the built-in token.
    logger.warning("⚠️ 使用默认token值")
    self.set_login_data(
        "1179681000a187f48bab5c526e25baff",
        "ELifb5J2w04JC_7-2QsHpI_tYDl-_XmGcLAjCQB4MdxlfeQF673MfEjBCZB2ncMyfO-a0SC-PUFfLS36iFZARNA~~"
    )
    return False
def generate_boss_trace_id(self) -> str:
    """Build a trace id of the form ``F-<6 hex chars><10 alphanumeric chars>``.

    The hex part is the last six hexadecimal digits of the current time in
    milliseconds; the suffix is ten characters drawn at random from digits,
    lowercase and uppercase ASCII letters.

    Returns:
        str: The generated trace id.
    """
    alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    millis = int(time.time() * 1000)
    stamp = format(millis, 'x')[-6:]
    suffix = []
    for _ in range(10):
        suffix.append(random.choice(alphabet))
    return "F-" + stamp + ''.join(suffix)
def get_current_ip(self) -> str:
    """Ask the juliangip lookup endpoint which IP the current session appears as.

    One lookup is made; if it does not yield an IP, a second lookup is tried.
    If the first lookup raises, one more lookup is tried before giving up.

    Returns:
        str: The IP reported by the endpoint, ``'HTTP <status>'`` (status of
        the first response) when no lookup yielded an IP, or
        ``'获取IP失败: <error>'`` when the first lookup raised and the retry
        did not succeed.
    """
    lookup_url = 'http://v2.api.juliangip.com/v2/dps/ip_search?ip=0.0.0.0'

    def _attempt():
        """Run one lookup and return ``(response, found, ip)``."""
        reply = self.session.get(lookup_url, timeout=10)
        if reply.status_code == 200:
            body = reply.json()
            if body.get('code') == 0 and body.get('data'):
                return reply, True, body['data'].get('ip', 'Unknown')
        return reply, False, None

    try:
        first_reply, found, ip = _attempt()
        if found:
            return ip
        # Second chance; its own errors are ignored.
        try:
            _, found, ip = _attempt()
            if found:
                return ip
        except Exception:
            pass
        return f'HTTP {first_reply.status_code}'
    except Exception as e:
        # The first lookup itself failed: try once more before reporting it.
        try:
            _, found, ip = _attempt()
            if found:
                return ip
        except Exception:
            pass
        return f'获取IP失败: {str(e)}'
def handle_ip_exception_retry(self, page_params: dict, page_num: int, max_retries: int = 3) -> bool:
    """Retry a page request after an IP-anomaly response.

    Each attempt waits (10, 15, 20, ... seconds), rebuilds the session and
    cookies, and re-requests the page. The session is rebuilt with the route
    that was active before the rebuild; calling ``reinit_session()`` without
    a route would silently switch to a direct connection and defeat the
    "let the tunnel proxy rotate its IP" purpose of the wait. When a retry
    still reports an IP anomaly, or on the last attempt, one more request is
    made in local (direct) mode.

    Args:
        page_params: Search parameters of the page being fetched.
        page_num: Page number, used in messages only.
        max_retries: Maximum number of attempts.

    Returns:
        bool: True if any attempt returned ``code == 0``; False otherwise.
    """
    for retry_count in range(max_retries):
        print(f"⏳ 第 {retry_count + 1}/{max_retries} 次重试...")
        # Increasing wait: at least 10 seconds, then 15, 20, ...
        wait_time = max(10, (retry_count + 1) * 5 + 5)
        print(f"⏰ 等待 {wait_time} 秒后重试让隧道代理切换IP...")
        time.sleep(wait_time)
        # Capture the active route before the old session is discarded.
        active_proxies = self.session.proxies
        route = dict(active_proxies) if active_proxies else None
        self.reinit_session(route)
        self.init_cookies()
        new_ip = self.get_current_ip()
        print(f"🌐 新IP地址: {new_ip}")
        try:
            result = self.get_job_list_by_keyword(page_params, skip_init=True)
            if result and isinstance(result, dict) and result.get('code') == 0:
                print(f"✅ 重试成功!第 {page_num} 页数据获取成功")
                return True
            if result and isinstance(result, dict):
                error_msg = str(result.get('message', 'Unknown error'))
            else:
                error_msg = 'No response'
            print(f"❌ 重试失败: {error_msg}")
            need_local = ('IP' in error_msg and '异常' in error_msg) or (retry_count + 1 >= max_retries)
            if need_local:
                self.enable_local_mode()
                try:
                    local_result = self.get_job_list_by_keyword(page_params, skip_init=True)
                    if local_result and isinstance(local_result, dict) and local_result.get('code') == 0:
                        print("✅ 本机直连模式重试成功")
                        self.local_success_count += 1
                        self.try_restore_proxy()
                        return True
                except Exception as e2:
                    print(f"❌ 本机直连模式重试异常: {str(e2)}")
        except Exception as e:
            print(f"❌ 重试异常: {str(e)}")
    print(f"💥 重试 {max_retries} 次后仍然失败,跳过当前页")
    return False
def reinit_session(self, route: Optional[Dict[str, str]] = None):
    """Replace the HTTP session with a fresh one bound to ``route``.

    The previous session, if any, is closed. The new session ignores
    environment proxy settings so they cannot override the chosen route.

    Args:
        route (Optional[Dict[str, str]]): Proxies dict to use; None (or an empty dict) means direct connection.

    Returns:
        None
    """
    if hasattr(self, 'session'):
        self.session.close()

    fresh = requests.Session()
    fresh.trust_env = False
    if route:
        fresh.proxies = route
        status = "🔄 Session已重新初始化将使用新的代理连接"
    else:
        fresh.proxies = {}
        status = "🔄 Session已重新初始化未配置代理使用直连"
    self.session = fresh
    print(status)
def enable_local_mode(self):
    """Switch to local (direct) mode: reset its counters and clear the session proxies.

    Any exception raised while switching is swallowed.
    """
    try:
        self.local_mode = True
        self.local_mode_since = time.time()
        self.local_success_count = self.local_fail_count = 0
        self.session.proxies = {}
        print("🔁 已切换为本机直连模式")
    except Exception:
        return None
def enable_proxy_mode(self):
    """Leave local mode and route the session through ``self.proxy_config`` again.

    Any exception raised while switching is swallowed.
    """
    # NOTE(review): indentation was lost in the reviewed copy of this file;
    # it is assumed that only the proxies assignment is guarded by the `if`.
    # Confirm against the original whether the flag reset and message should
    # also be skipped when no proxy_config is set.
    try:
        if self.proxy_config:
            self.session.proxies = self.proxy_config
        self.local_mode = False
        self.local_fail_count = 0
        print("🔁 已切换为代理模式")
    except Exception:
        pass
def try_restore_proxy(self):
    """Return to proxy mode once local mode has lasted long enough or succeeded often enough.

    Thresholds come from ``PROXY_RESTORE_AFTER_SECONDS`` (default 300) and
    ``PROXY_RESTORE_AFTER_SUCCESS`` (default 3); if either cannot be parsed,
    both defaults are used. Does nothing when local mode is not active.
    """
    try:
        seconds = int(os.getenv('PROXY_RESTORE_AFTER_SECONDS', '300'))
        successes = int(os.getenv('PROXY_RESTORE_AFTER_SUCCESS', '3'))
    except Exception:
        seconds, successes = 300, 3

    if not self.local_mode:
        return
    # local_mode_since is falsy (0) when local mode has no recorded start time.
    waited_long_enough = self.local_mode_since and time.time() - self.local_mode_since >= seconds
    if waited_long_enough or self.local_success_count >= successes:
        self.enable_proxy_mode()
def handle_ip_exception_retry_for_auto_crawl(self, params):
    """Retry ``get_job_list_by_keyword`` after an IP-anomaly response.

    Up to three attempts are made, waiting 10, 15 and 20 seconds. Before each
    attempt the session and cookies are rebuilt with the route that was
    active before the rebuild; calling ``reinit_session()`` without a route
    would silently switch to a direct connection. When an attempt still
    reports an IP ban, or on the last attempt, one more request is made in
    local (direct) mode.

    Args:
        params: Search parameters passed through to ``get_job_list_by_keyword``.

    Returns:
        The first result whose ``code`` is 0, or None if every attempt failed.
    """
    max_retries = 3
    for retry_count in range(1, max_retries + 1):
        # Increasing wait: at least 10 seconds, then 15 and 20.
        wait_time = max(10, retry_count * 5 + 5)
        print(f"⏳ 第 {retry_count} 次重试,等待 {wait_time} 秒...")
        time.sleep(wait_time)
        # Capture the active route before the old session is discarded.
        active_proxies = self.session.proxies
        route = dict(active_proxies) if active_proxies else None
        self.reinit_session(route)
        self.init_cookies()
        new_ip = self.get_current_ip()
        print(f"🌐 新IP地址: {new_ip}")
        try:
            result = self.get_job_list_by_keyword(params, skip_init=True)
            if result and isinstance(result, dict) and result.get('code') == 0:
                print(f"✅ 第 {retry_count} 次重试成功!")
                return result
            print(f"❌ 第 {retry_count} 次重试仍然失败")
            ip_banned = isinstance(result, dict) and (
                result.get('code') == 35 or 'IP地址存在异常' in str(result.get('message', ''))
            )
            need_local = (retry_count >= max_retries) or ip_banned
            if need_local:
                self.enable_local_mode()
                try:
                    local_result = self.get_job_list_by_keyword(params, skip_init=True)
                    if local_result and isinstance(local_result, dict) and local_result.get('code') == 0:
                        print("✅ 本机直连模式重试成功")
                        self.local_success_count += 1
                        self.try_restore_proxy()
                        return local_result
                except Exception as e2:
                    print(f"❌ 本机直连模式重试异常: {e2}")
        except Exception as e:
            print(f"❌ 第 {retry_count} 次重试出现异常: {e}")
    print(f"❌ 经过 {max_retries} 次重试仍然失败")
    return None
def handle_ip_exception_retry_for_token(self, api_base_url):
    """Retry fetching a token after the token API reported an IP anomaly.

    Up to three attempts are made, waiting 10, 15 and 20 seconds. Before each
    attempt the session is rebuilt with the route that was active before the
    rebuild; calling ``reinit_session()`` without a route would silently
    switch to a direct connection. After the last failed attempt one more
    fetch is made in local (direct) mode.

    Args:
        api_base_url: Backend base URL; ``API_BASE_URL`` is used when falsy.

    Returns:
        bool: True if a token and cookies were obtained; False otherwise.
    """
    # NOTE(review): this hits ``/api/token`` and expects ``code`` /
    # ``data.token`` / ``data.cookies``, unlike load_token_from_api(), and a
    # success here does not call set_login_data() — confirm this is intended.
    max_retries = 3
    api_url = api_base_url or API_BASE_URL

    def _fetch_token_once() -> bool:
        """Request the token once; store it and its cookies on success."""
        response = self.session.get(f"{api_url}/api/token", timeout=10)
        if response.status_code != 200:
            return False
        data = response.json()
        if data.get('code') != 0 or 'data' not in data:
            return False
        token_data = data['data']
        if 'token' not in token_data or 'cookies' not in token_data:
            return False
        self.token = token_data['token']
        # ``self.cookies`` is not created by __init__; use it when some other
        # code has set it, otherwise store the cookies on the session.
        cookie_store = getattr(self, 'cookies', None)
        if cookie_store is None:
            cookie_store = self.session.cookies
        cookie_store.update(token_data['cookies'])
        return True

    for retry_count in range(1, max_retries + 1):
        # Increasing wait: at least 10 seconds, then 15 and 20.
        wait_time = max(10, retry_count * 5 + 5)
        logger.info(f"⏳ 第 {retry_count} 次重试,等待 {wait_time} 秒...")
        time.sleep(wait_time)
        # Capture the active route before the old session is discarded.
        active_proxies = self.session.proxies
        route = dict(active_proxies) if active_proxies else None
        self.reinit_session(route)
        new_ip = self.get_current_ip()
        logger.info(f"🌐 新IP地址: {new_ip}")
        # Fetch directly instead of calling load_token_from_api() to avoid recursion.
        try:
            if _fetch_token_once():
                logger.info(f"✅ 第 {retry_count} 次重试成功!")
                return True
            logger.warning(f"❌ 第 {retry_count} 次重试仍然失败")
            if retry_count >= max_retries:
                self.enable_local_mode()
                try:
                    if _fetch_token_once():
                        logger.info("✅ 本机直连模式获取token成功")
                        self.local_success_count += 1
                        self.try_restore_proxy()
                        return True
                except Exception as e2:
                    logger.error(f"❌ 本机直连模式获取token异常: {e2}")
        except Exception as e:
            logger.error(f"❌ 第 {retry_count} 次重试出现异常: {e}")
    logger.error(f"❌ 经过 {max_retries} 次重试仍然失败")
    return False
def init_cookies(self):
    """Seed the session cookie jar with the baseline ``.zhipin.com`` cookies.

    ``__zp_stoken__`` is a fresh random token, the two ``Hm_*`` cookies carry
    the current Unix time, ``__c`` is the first 8 characters of the device
    id; the remaining values are fixed.
    """
    seed_cookies = (
        ('__zp_stoken__', self.generate_token()),
        ('Hm_lvt_194df3105ad7148dcf2b98a91b5e727a', str(int(time.time()))),
        ('Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a', str(int(time.time()))),
        ('__c', self.device_id[:8]),
        ('__g', '-'),
        ('__l', 'l=%2Fwww.zhipin.com%2F&r=&friend_source=0&s=3&friend_source=0'),
        ('lastCity', '101010100'),
        ('cityName', '%E5%8C%97%E4%BA%AC'),
        ('__zp_sseed__', 'btHZ0bjBq8m//WNwlVrPUnVcIvini5J5P5LQUbflM24='),
        ('__zp_sname__', '3998243a'),
        ('__zp_sts__', '1753719971615'),
    )
    for cookie_name, cookie_value in seed_cookies:
        self.session.cookies.set(cookie_name, cookie_value, domain='.zhipin.com')
def generate_token(self) -> str:
    """Return 32 random lowercase hexadecimal characters."""
    hex_digits = "0123456789abcdef"
    picked = [random.choice(hex_digits) for _ in range(32)]
    return ''.join(picked)
def test_proxy_connection(self):
    """Check that the configured proxy can reach the IP lookup service and the target site.

    Steps: (1) resolve the current IP through ``get_current_ip``; (2) query
    the IP lookup endpoint directly; (3) fetch ``robots.txt`` from
    www.zhipin.com. If step 3 fails with a non-200 status or a proxy error,
    a switch to ``socks5h`` is attempted and the connection is re-tested.

    Returns:
        bool: True when no proxy is configured or all checks pass (possibly
        after the socks5h switch); False otherwise.
    """
    if not hasattr(self, 'proxy_config') or not self.proxy_config:
        return True
    try:
        print("🔍 测试代理连接...")
        current_ip = self.get_current_ip()
        if not current_ip or current_ip.startswith('获取IP失败') or current_ip.startswith('HTTP') or current_ip.startswith('API返回错误'):
            print(f"❌ 代理连接测试失败: {current_ip}")
            return False
        print(f"✅ 代理连接成功当前IP: {current_ip}")
        try:
            resp = self.session.get('http://v2.api.juliangip.com/v2/dps/ip_search?ip=0.0.0.0', timeout=10)
            if resp.status_code == 200:
                data = resp.json()
                https_ip = data.get('data', {}).get('ip', '') if data.get('code') == 0 else ''
                print(f"✅ HTTPS代理连接成功当前IP: {https_ip}")
            else:
                print(f"❌ HTTPS代理连接失败: HTTP {resp.status_code}")
                return False
        except ProxyError as pe:
            print(f"❌ HTTPS代理认证失败(407): {pe}")
            return False
        except Exception as e:
            print(f"❌ HTTPS代理连接异常: {e}")
            return False
        try:
            rbt = self.session.get('https://www.zhipin.com/robots.txt', timeout=10)
            if rbt.status_code == 200:
                print("✅ 目标域预检成功: www.zhipin.com")
                return True
            print(f"❌ 目标域预检失败: HTTP {rbt.status_code}")
            if self._try_switch_to_socks5h():
                return self._retest_proxy_connection()
            return False
        # Use the ProxyError imported from requests.exceptions: the top-level
        # ``requests`` package does not export ProxyError, so the previous
        # ``requests.ProxyError`` raised AttributeError while matching and the
        # socks5h fallback below was never reached.
        except ProxyError as pe2:
            print(f"❌ 目标域预检代理认证失败(407): {pe2}")
            if self._try_switch_to_socks5h():
                return self._retest_proxy_connection()
            return False
        except Exception as e2:
            print(f"❌ 目标域预检异常: {e2}")
            return False
    except Exception as e:
        print(f"❌ 代理连接测试失败: {e}")
        return False
def init_session(self):
    """Warm up the session: test the proxy, then request the mini-program config and the main site.

    Returns:
        bool: True when the request to ``self.serve_domain`` returns HTTP 200;
        False when it does not or when any step raises.
    """
    # NOTE(review): indentation was lost in the reviewed copy of this file;
    # the nesting below is reconstructed. In particular it is assumed that
    # the main-site request runs regardless of the config response status —
    # confirm against the original.
    try:
        print("正在初始化微信小程序会话...")
        # With a proxy configured, a failed connectivity test leads to one
        # attempt to switch to socks5h; the session is initialised either way.
        if hasattr(self, 'proxy_config') and self.proxy_config and not self.test_proxy_connection():
            print("❌ 代理连接失败,尝试切换 socks5h 并重测")
            switched = self._try_switch_to_socks5h()
            if switched:
                if not self._retest_proxy_connection():
                    print("❌ 重测代理仍失败")
            else:
                print("⚠️ 未切换代理方案,保留现有代理配置")
        # Request against the mini-program API host, with matching Host/Referer headers.
        wx_headers = self.default_headers.copy()
        wx_headers["Host"] = "wxapp.zhipin.com"
        wx_headers["Referer"] = "https://servicewechat.com/wx6c8d9b0c9ec51e7e/1/page-frame.html"
        response = self.session.get(
            f"{self.api_domain}/wapi/zpcommon/data/config.json",
            headers=wx_headers,
            timeout=30
        )
        if response.status_code == 200:
            print("✅ 微信小程序配置获取成功")
        # Request against the main site; its status decides the return value.
        main_headers = self.default_headers.copy()
        main_headers["Host"] = "www.zhipin.com"
        response2 = self.session.get(
            self.serve_domain,
            headers=main_headers,
            timeout=30
        )
        return response2.status_code == 200
    except Exception as e:
        print(f"会话初始化失败: {e}")
        return False
def _try_switch_to_socks5h(self) -> bool:
    """Rewrite every ``http://`` proxy URL in ``proxy_config`` to ``socks5h://``.

    The rewritten mapping replaces both ``self.proxy_config`` and the
    session's proxies. Values that are not strings or do not start with
    ``http://`` are kept unchanged.

    Returns:
        bool: True if at least one URL was rewritten; False when there is no
        proxy configuration, nothing to rewrite, or an error occurred.
    """
    try:
        if not self.proxy_config:
            return False

        def _is_http(target) -> bool:
            return isinstance(target, str) and target.startswith('http://')

        if not any(_is_http(target) for target in self.proxy_config.values()):
            return False
        rewritten = {
            scheme: ('socks5h://' + target[len('http://'):] if _is_http(target) else target)
            for scheme, target in self.proxy_config.items()
        }
        self.proxy_config = rewritten
        self.session.proxies = rewritten
        print("🔁 已切换到 socks5h 代理方案")
        return True
    except Exception:
        print("❌ 切换 socks5h 失败,可能缺少依赖 requests[socks]")
        return False
def _retest_proxy_connection(self) -> bool:
    """Re-check connectivity: the IP lookup endpoint first, then the target site's robots.txt.

    Returns:
        bool: True only if both requests return HTTP 200; False on any other
        status or on any exception. The second request is skipped when the
        first one fails.
    """
    probes = (
        'http://v2.api.juliangip.com/v2/dps/ip_search?ip=0.0.0.0',
        'https://www.zhipin.com/robots.txt',
    )
    try:
        for probe_url in probes:
            if self.session.get(probe_url, timeout=10).status_code != 200:
                return False
        return True
    except Exception:
        return False
def set_login_data(self, mpt: str, wt2: str, open_id: str = ""):
    """Record the login credentials and mirror the non-empty ones into session cookies.

    Args:
        mpt: Value stored under ``login_data["mpt"]`` and, if non-empty, as the ``mpt`` cookie.
        wt2: Value stored under ``login_data["wt2"]`` and, if non-empty, as the ``wt2`` cookie.
        open_id: Value stored under ``login_data["openId"]``.
    """
    self.login_data.update(mpt=mpt, wt2=wt2, openId=open_id)
    # wt2 is written before mpt, as before.
    for cookie_name, cookie_value in (('wt2', wt2), ('mpt', mpt)):
        if cookie_value:
            self.session.cookies.set(cookie_name, cookie_value, domain='.zhipin.com')
def update_login_from_curl(self, api_base_url: str = None):
    """Reload the login data through ``load_token_from_api`` and print a short summary.

    The stored ``traceid`` is reset to a fixed value on every call.

    Args:
        api_base_url: Backend base URL, forwarded to ``load_token_from_api``.

    Returns:
        Whatever ``load_token_from_api`` returned.
    """
    success = self.load_token_from_api(api_base_url)
    new_traceid = "F-77d05bnXuMVrHIB3"
    self.login_data["traceid"] = new_traceid

    headline = f"✅ 已从API更新登录数据:" if success else f"⚠️ API获取失败使用默认登录数据:"
    summary = (
        headline,
        f" mpt: {self.login_data['mpt'][:20]}...",
        f" wt2: {self.login_data['wt2'][:50]}...",
        f" traceid: {new_traceid}",
    )
    for line in summary:
        print(line)
    return success
def update_token_status(self, token_id: int, is_active: bool = False, increment_failed_count: bool = False,
                        api_base_url: str = None):
    """Write a token's status back to the backend through the universal-data endpoint.

    Both HTTP calls use a 10-second timeout, matching ``load_token_from_api``;
    without one a stalled backend would block the caller indefinitely.

    Args:
        token_id: Id of the token record to update.
        is_active: New value of the record's ``is_active`` field.
        increment_failed_count: When True, read the record's current
            ``failed_count`` first and send it back incremented by one.
        api_base_url: Backend base URL; defaults to ``API_BASE_URL``.

    Returns:
        bool: True if the backend confirmed the update; False on any failure.
    """
    if api_base_url is None:
        api_base_url = API_BASE_URL
    record_url = f"{api_base_url}/api/v1/universal/data/{token_id}"
    try:
        update_data = {"is_active": is_active}
        # Incrementing needs the current value, so read the record first.
        if increment_failed_count:
            get_response = requests.get(f"{record_url}?data_type=boss_token", timeout=10)
            if get_response.status_code == 200:
                token_data = get_response.json()
                if token_data.get("success") and token_data.get("data"):
                    # A missing or null failed_count counts as 0.
                    current_failed_count = token_data["data"].get("failed_count") or 0
                    update_data["failed_count"] = current_failed_count + 1
                    logger.info(f"📈 将失败次数从 {current_failed_count} 增加到 {current_failed_count + 1}")
        # Envelope expected by the universal-data endpoint.
        universal_update_data = {
            "data_type": "boss_token",
            "platform": "boss",
            "data": update_data
        }
        response = requests.put(record_url, json=universal_update_data, timeout=10)
        if response.status_code == 200:
            data = response.json()
            if data.get("success"):
                logger.info(f"✅ 成功更新token状态: token_id={token_id}, is_active={is_active}")
                return True
            logger.error(f"❌ 更新token状态失败: {data.get('message', '未知错误')}")
        else:
            logger.error(f"❌ 更新token状态请求失败: {response.status_code}")
    except Exception as e:
        logger.error(f"❌ 更新token状态异常: {str(e)}")
    return False
def handle_login_expired(self, api_base_url: str = None):
    """React to an expired login: flag the current token as failed, then load a new one.

    Args:
        api_base_url: Backend base URL; defaults to ``API_BASE_URL``.

    Returns:
        Whatever ``load_token_from_api`` returned for the replacement token.
    """
    if api_base_url is None:
        api_base_url = API_BASE_URL

    if not self.current_token_id:
        # Nothing to flag; just fetch a token.
        logger.warning("⚠️ 没有当前token ID直接获取新token")
        return self.load_token_from_api(api_base_url)

    logger.warning(f"🚫 检测到登录失效处理token: id={self.current_token_id}")
    # Deactivate the token and bump its failure counter.
    flagged = self.update_token_status(
        token_id=self.current_token_id,
        is_active=False,
        increment_failed_count=True,
        api_base_url=api_base_url
    )
    if flagged:
        logger.info(f"✅ 已标记token失效: id={self.current_token_id}")
    else:
        logger.error(f"❌ 标记token失效失败: id={self.current_token_id}")
    # Forget the old token before asking for a new one.
    self.current_token_id = None
    return self.load_token_from_api(api_base_url)
def mark_token_failed(self, api_base_url: str = None):
    """Backward-compatible alias: delegates to ``handle_login_expired``."""
    outcome = self.handle_login_expired(api_base_url)
    return outcome
def build_request_headers(self, custom_headers: Optional[Dict] = None) -> Dict[str, str]:
    """Build the headers for one request.

    Starts from a copy of ``default_headers``, adds the login ``mpt``, an
    empty ``wt2``, a fresh ``Traceid`` and a millisecond ``timestamp``, then
    applies ``custom_headers``. When custom headers are given without a
    ``Host``, the host is derived from the resulting ``Referer``.

    Args:
        custom_headers (Optional[Dict]): Headers that override the defaults.

    Returns:
        Dict[str, str]: The merged headers; ``default_headers`` is not modified.
    """
    headers = dict(self.default_headers)
    headers["mpt"] = self.login_data.get("mpt", "")
    headers["scene"] = "1006"
    headers["wt2"] = ""
    headers["Traceid"] = self.generate_boss_trace_id()
    headers["timestamp"] = str(int(time.time() * 1000))

    if not custom_headers:
        return headers

    headers.update(custom_headers)
    if "Host" in custom_headers:
        return headers
    # No explicit Host: pick the one matching the Referer's domain.
    referer = headers.get("Referer", "")
    for host in ("wxapp.zhipin.com", "www.zhipin.com"):
        if host in referer:
            headers["Host"] = host
            break
    return headers
def build_request_data(self, data: Optional[Dict] = None) -> Dict[str, Any]:
    """Return the base request parameters merged with ``data``.

    The base parameters are ``appId``, ``scene`` and a millisecond
    ``timestamp``; entries in ``data`` override them.

    Args:
        data (Optional[Dict]): Extra parameters.

    Returns:
        Dict[str, Any]: A new dict with the merged parameters.
    """
    base = {
        "appId": self.app_id,
        "scene": self.scene,
        "timestamp": int(time.time() * 1000)
    }
    return {**base, **data} if data else base
def batch_request_direct(self, batch_method_feed: str, app_id: str = "10002") -> Optional[Dict]:
    """Send a batch GET to ``/wapi/batch/batchRunV3`` and return its decoded JSON.

    After a 200 response the IP anomaly detector is consulted; on an anomaly
    the route is switched, the session and cookies are rebuilt, and the
    request is repeated once. A second repeat happens when
    ``handle_ip_abnormal_response`` reports the result as abnormal.

    Args:
        batch_method_feed: Encoded batch description (see ``build_batch_method_feed``).
        app_id: Value sent as the ``appId`` query parameter.

    Returns:
        Optional[Dict]: The decoded response, or None on a non-200 status or any exception.
    """
    # NOTE(review): indentation was lost in the reviewed copy of this file;
    # the nesting below is reconstructed — confirm against the original,
    # especially that the handle_ip_abnormal_response check is a sibling of
    # (not nested in) the `if reason:` block.
    headers = self.build_request_headers({
        "Referer": "https://www.zhipin.com/web/geek/job"
    })
    data = {
        "batch_method_feed": batch_method_feed,
        "appId": app_id
    }
    try:
        # Throttle before every request.
        sleep_random_between()
        # (debug print of the request headers intentionally disabled)
        start_t = time.monotonic()
        response = self.session.get(
            f"{self.serve_domain}/wapi/batch/batchRunV3",
            params=data,
            headers=headers,
            timeout=30
        )
        logger.info("RAW_RESPONSE method={} url={} status={} resp_size={}", "GET", f"{self.serve_domain}/wapi/batch/batchRunV3", response.status_code, len(response.content))
        if response.status_code == 200:
            elapsed = time.monotonic() - start_t
            result = response.json()
            # NOTE(review): the detector is only reached for status 200, so
            # its 403/429/407 branch cannot trigger from here — confirm intent.
            reason = self.ip_detector.detect(response.status_code, elapsed, result)
            if reason:
                logger.warning("IP_ANOMALY reason={} elapsed={:.2f}s status={}", reason, elapsed, response.status_code)
                # Record the failure, move to the next route and rebuild the session on it.
                self.ip_manager.mark_failure(reason)
                mode, cfg = self.ip_manager.select_next_route()
                self.reinit_session(cfg)
                self.init_cookies()
                logger.info("IP_SWITCH mode={} cfg={}", mode, cfg)
                wait_time = sleep_random_between()
                logger.info(f"⏳ IP异常等待 {int(wait_time)} 秒后重试 (批量GET)")
                # Repeat the request with a different User-Agent.
                headers = self.build_request_headers({
                    "Referer": "https://www.zhipin.com/web/geek/job",
                    "User-Agent": self.get_random_user_agent()
                })
                response = self.session.get(
                    f"{self.serve_domain}/wapi/batch/batchRunV3",
                    params=data,
                    headers=headers,
                    timeout=30
                )
                if response.status_code != 200:
                    print(f"❌ 批量请求失败: {response.status_code}")
                    print(f"响应内容: {response.text[:500]}")
                    return None
                result = response.json()
            # handle_ip_abnormal_response is defined outside the reviewed
            # part of the file; a truthy return triggers one more repeat.
            if self.handle_ip_abnormal_response(result):
                wait_time = sleep_random_between()
                logger.info(f"⏳ IP异常等待 {int(wait_time)} 秒后重试 (批量GET)")
                headers = self.build_request_headers({
                    "Referer": "https://www.zhipin.com/web/geek/job",
                    "User-Agent": self.get_random_user_agent()
                })
                response = self.session.get(
                    f"{self.serve_domain}/wapi/batch/batchRunV3",
                    params=data,
                    headers=headers,
                    timeout=30
                )
                if response.status_code != 200:
                    print(f"❌ 批量请求失败: {response.status_code}")
                    print(f"响应内容: {response.text[:500]}")
                    return None
                result = response.json()
            print(f"✅ 批量请求成功")
            self.ip_manager.mark_success()
            # In local mode, count the success towards returning to the proxy.
            if hasattr(self, 'local_mode') and self.local_mode:
                self.local_success_count += 1
                self.try_restore_proxy()
            # Short extra pause after a success to further reduce risk-control triggers.
            post_wait_time = random.uniform(2, 5)
            time.sleep(post_wait_time)
            return result
        else:
            print(f"❌ 批量请求失败: {response.status_code}")
            print(f"响应内容: {response.text[:500]}")
            return None
    except Exception as e:
        print(f"❌ 批量请求异常: {str(e)}")
        return None
def build_batch_method_feed(self, requests: List[Dict]) -> str:
    """Encode a list of sub-requests as the JSON string expected in ``batch_method_feed``.

    Each sub-request becomes ``method=<url>&<key>=<value>...`` with every
    value percent-encoded (no characters left unescaped); the resulting
    strings are serialised as a JSON array.

    Args:
        requests (List[Dict]): Items with a ``"url"`` and an optional ``"params"`` dict.

    Returns:
        str: JSON array of the encoded sub-request strings.
    """
    from urllib.parse import quote

    def _encode(item: Dict) -> str:
        # "method" comes first; a "method" key in params overrides its value.
        fields = {"method": item["url"], **item.get("params", {})}
        return "&".join(f"{name}={quote(str(value), safe='')}" for name, value in fields.items())

    return json.dumps([_encode(item) for item in requests])
def get_job_list_by_keyword(self, params: Dict, skip_init: bool = False) -> Optional[Dict]:
    """Fetch one page of the job search list.

    Unless ``skip_init`` is set, the login data is refreshed and the session
    is initialised first; otherwise the call is throttled with a random wait.
    If the API reports an IP anomaly, ``handle_ip_exception_retry_for_auto_crawl``
    is invoked once. That handler calls back into this method, so a guard
    flag prevents the nested calls from starting further retry rounds;
    without it the two methods recurse without bound while the IP stays
    flagged.

    Args:
        params (Dict): Search parameters; missing keys fall back to defaults.
        skip_init (bool): Skip login/session initialisation and wait instead.

    Returns:
        Optional[Dict]: The API response (possibly one with a non-zero
        ``code``), or None when initialisation fails, nothing is returned, or
        an exception occurs.
    """
    print("🚀 开始自动抓取...")
    if not skip_init:
        self.update_login_from_curl()
        if not self.init_session():
            print("❌ 会话初始化失败")
            return None
    else:
        print("⏱️ 连续请求等待至少10秒...")
        wait_time = sleep_random_between()
        print(f"⏰ 已等待 {int(wait_time)}")
    search_params = {
        'pageSize': params.get('pageSize', 15),
        'query': params.get('query', ''),
        'city': params.get('city', '101010100'),
        'source': params.get('source', '1'),
        'sortType': params.get('sortType', '0'),
        'subwayLineId': params.get('subwayLineId', ''),
        'subwayStationId': params.get('subwayStationId', ''),
        'districtCode': params.get('districtCode', ''),
        'businessCode': params.get('businessCode', ''),
        'longitude': params.get('longitude', ''),
        'latitude': params.get('latitude', ''),
        'position': params.get('position', ''),
        'expectId': params.get('expectId', ''),
        'expectPosition': params.get('expectPosition', ''),
        'encryptExpectId': params.get('encryptExpectId', ''),
        'page': params.get('page', 1),
        'appId': '10002'
    }
    try:
        result = self.single_request(
            "/wapi/zpgeek/miniapp/search/joblist.json",
            method="GET",
            data=search_params,
        )
        if not result:
            print("❌ 自动抓取失败")
            return None
        print("✅ 自动抓取完成!")
        if isinstance(result, dict) and 'code' in result:
            if result.get('code') != 0:
                current_ip = self.get_current_ip()
                error_msg = result.get('message', 'Unknown error')
                print(result)
                print(f"⚠️ API返回错误: {error_msg}")
                print(f"🌐 当前IP地址: {current_ip}")
                # IP anomaly: run the retry procedure, but only from the
                # outermost call.
                if 'IP' in error_msg and '异常' in error_msg:
                    if getattr(self, '_ip_retry_in_progress', False):
                        # Already inside a retry round: let it decide what to do.
                        return result
                    print(f"🔄 检测到IP异常开始重试处理...")
                    self._ip_retry_in_progress = True
                    try:
                        retry_result = self.handle_ip_exception_retry_for_auto_crawl(params)
                    finally:
                        self._ip_retry_in_progress = False
                    if retry_result:
                        return retry_result  # the retry succeeded
                return result
        if isinstance(result, dict) and 'zpData' in result:
            zp_data = result['zpData']
            if 'list' in zp_data:
                job_list = zp_data['list']
                print(f"📊 获取到 {len(job_list)} 条职位数据")
            else:
                print("⚠️ 未找到 list 字段")
            return result
        print(f"⚠️ 响应格式异常: {list(result.keys()) if isinstance(result, dict) else type(result)}")
        return result
    except Exception as e:
        print(f"❌ 自动抓取异常: {str(e)}")
        return None
def get_job_list_multi_pages(self, params: Dict, max_pages: Optional[int] = None,
                             job_processor: Optional[callable] = None,
                             page_processor: Optional[callable] = None) -> Optional[List[Dict]]:
    """Fetch the job list page by page, optionally processing jobs as they arrive.

    Args:
        params: Search parameters, same format as ``get_job_list_by_keyword``;
            ``params['page']`` (default 1) is the first page fetched.
        max_pages: Maximum number of pages to fetch; None means keep going
            until the API reports ``hasMore`` False or a page is empty.
        job_processor: Called once per job, ``process_job(job: Dict) -> None``.
        page_processor: Called once per page, ``process_page(jobs: List[Dict]) -> None``.

    Returns:
        The list of all jobs when ``job_processor`` is None; otherwise None
        (jobs were streamed to the processor).
    """
    # NOTE(review): indentation was lost in the reviewed copy of this file;
    # the nesting below is reconstructed — confirm against the original.
    start_page = params.get('page', 1)
    if max_pages is None:
        print("🚀 开始自动获取多页数据将优先根据hasMore控制必要时退回到最大页数约束")
    else:
        print(f"🚀 开始自动获取多页数据,最大页数限制: {max_pages}")
    # Jobs are collected only when no per-job processor was supplied.
    collect_all = job_processor is None
    all_jobs = [] if collect_all else None
    processed_count = 0
    current_page = start_page
    pages_fetched = 0
    while True:
        if max_pages is not None and pages_fetched >= max_pages:
            print(f"⏹ 已达到最大页数限制 {max_pages},停止翻页")
            break
        print(f"\n📄 正在获取第 {current_page} 页数据...")
        page_params = params.copy()
        page_params['page'] = current_page
        # Only the first page performs login/session initialisation.
        skip_init = current_page > start_page
        result = self.get_job_list_by_keyword(page_params, skip_init=skip_init)
        if not (result and isinstance(result, dict)):
            print(f"❌ 第 {current_page} 页获取失败")
            break
        if result.get('code') != 0:
            current_ip = self.get_current_ip()
            error_msg = result.get('message', 'Unknown error')
            print(f"❌ 第 {current_page} 页API返回错误: {error_msg}")
            print(f"🌐 当前IP地址: {current_ip}")
            if 'IP' in error_msg and '异常' in error_msg:
                print(f"🔄 检测到IP异常开始重试处理...")
                retry_success = self.handle_ip_exception_retry(page_params, current_page)
                if retry_success:
                    # The retry helper only returns a flag, so the same page
                    # is requested again on the next loop iteration.
                    continue
            break
        zp_data = result.get('zpData', {})
        # The job list may be under either key.
        job_list = zp_data.get('list') or zp_data.get('jobList') or []
        if not job_list:
            print(f"⚠️ 第 {current_page} 页未找到职位数据,停止翻页")
            break
        print(f"✅ 第 {current_page} 页获取到 {len(job_list)} 条职位数据")
        for job in job_list:
            if job_processor:
                # A failing processor must not stop the crawl.
                try:
                    job_processor(job)
                except Exception as e:
                    print(f"❌ job_processor处理出错: {e}")
            if collect_all:
                all_jobs.append(job)
            processed_count += 1
            # NOTE(review): assumed to run once per job (inside the loop);
            # it may originally have been once per page — confirm.
            sleep_random_between()
        if page_processor:
            try:
                page_processor(job_list)
            except Exception as e:
                print(f"❌ page_processor处理出错: {e}")
        pages_fetched += 1
        # Stop only on an explicit False; a missing hasMore keeps paging.
        has_more = zp_data.get('hasMore')
        if has_more is False:
            print(f"⏹ 接口返回 hasMore = False在第 {current_page} 页停止翻页")
            break
        print("⏱️ 等待至少10秒后获取下一页...")
        wait_time = sleep_random_between()
        print(f"⏰ 已等待 {int(wait_time)}")
        current_page += 1
    print(f"\n🎉 处理完成,总共处理 {processed_count} 条职位数据")
    return all_jobs if collect_all else None
def batch_request_v2(self, requests: List[Dict]) -> Optional[Dict]:
    """Send a list of sub-requests through ``/wapi/batch/batchRunV3`` in one GET.

    NOTE(review): another ``batch_request_v2`` that takes a dict is defined
    later in this file; if both live in the same scope the later definition
    replaces this one - confirm which variant is meant to be kept.

    Args:
        requests: List of sub-requests, each shaped like
            ``{"url": "/wapi/zpgeek/search/joblist.json",
               "params": {"query": "python", "city": "101010100"}}``.

    Returns:
        Dict of sub-responses. Entries whose URL was part of ``requests`` are
        keyed ``"request_<index>"``; any other entry keeps its URL as key.
        None on HTTP error, empty body, invalid JSON, API failure or exception.
    """
    # Serialise the sub-requests, then remember the placeholder key of every URL.
    feed = self.build_batch_method_feed(requests)
    key_by_url = {item["url"]: f"request_{idx}" for idx, item in enumerate(requests)}

    request_headers = self.build_request_headers({
        "Referer": "https://www.zhipin.com/web/geek/job"
    })
    query = self.build_request_data({
        "batch_method_feed": feed
    })

    try:
        sleep_random_between()
        endpoint = f"{self.serve_domain}/wapi/batch/batchRunV3"
        response = self.session.get(
            endpoint,
            headers=request_headers,
            params=query,
            timeout=30,
            proxies=self.proxy_config
        )
        logger.info("RAW_RESPONSE method={} url={} status={} resp_size={}", "GET", endpoint, response.status_code, len(response.content))

        if response.status_code != 200:
            print(f"❌ HTTP错误: {response.status_code}")
            print(f"响应内容: {response.text[:200]}")
            return None

        # An empty body cannot be decoded, so bail out before trying.
        if not response.content:
            print("❌ 响应内容为空")
            return None

        try:
            result = response.json()
        except json.JSONDecodeError as e:
            print(f"❌ JSON解析失败: {e}")
            print(f"响应内容: {response.text[:200]}")
            return None

        # Same success test as the mini-program JS: code must be 0 and zpData present.
        succeeded = result.get("code") == 0 and result.get("zpData")
        if not succeeded:
            print(f"❌ 批量请求失败: {result.get('message', 'Unknown error')}")
            return None

        # Re-key known URLs to their placeholder; unknown URLs keep their own key.
        mapped_results = {
            key_by_url.get(sub_url, sub_url): payload
            for sub_url, payload in result["zpData"].items()
        }
        print(f"✅ 批量请求成功返回 {len(mapped_results)} 个结果")
        return mapped_results
    except Exception as e:
        print(f"❌ 批量请求异常: {str(e)}")
        return None
def get_job_detail_by_id(self, job_id: str, lid: str = "", security_id: str = "") -> Optional[Dict]:
    """Fetch one job posting's detail through the batch interface.

    Args:
        job_id: Job posting id (encryptJobId).
        lid: Value sent as the ``lid`` request parameter.
        security_id: Security id (securityId).

    Returns:
        dict: The ``job_detail`` sub-response when its ``code`` is 0, else None.
    """
    print(f"🔍 获取招聘详情: {job_id}")
    if not job_id:
        print("❌ job_id不能为空")
        return None

    # Both sub-requests carry the same query parameters.
    shared_params = {
        "securityId": security_id,
        "jobId": job_id,
        "lid": lid,
        "source": "10",
        "scene": "1"
    }
    endpoints = (
        ("job_detail", "/wapi/zpgeek/miniapp/job/detail.json"),
        ("job_improvement", "/wapi/zpgeek/miniapp/jobdetail/improvement/query.json"),
    )
    batch_requests = {
        key: {"url": path, "params": dict(shared_params)}
        for key, path in endpoints
    }

    result = self.batch_request_v2(batch_requests)
    if not (result and "job_detail" in result):
        print("❌ 招聘详情获取失败: 无法获取有效响应")
        return None

    job_detail = result["job_detail"]
    if job_detail.get("code") == 0:
        return job_detail

    error_msg = job_detail.get("message", "未知错误")
    print(f"❌ 招聘详情获取失败: {error_msg}")
    return None
def boss_batch_request(self, job_id, security_id, lid):
    """Fetch job detail and "improvement" data with one batched POST (JSON body).

    The request goes to ``https://www.zhipin.com/wapi/batch/requests``. When
    ``handle_ip_abnormal_response`` reports an IP problem for the first answer,
    cookies are re-initialised and the request is sent once more with a
    random User-Agent.

    :param job_id: Job id, sent as ``jobId``.
    :param security_id: Security id, sent as ``securityId``.
    :param lid: Value sent as ``lid``.
    :return: Decoded JSON response, or None when the request fails or the
        body is not valid JSON.
    """
    batch_url = "https://www.zhipin.com/wapi/batch/requests"
    base_headers = {
        "Content-Type": "application/json",
        "Referer": "https://servicewechat.com/wxa8da525af05281f3/585/page-frame.html"  # From curl
    }

    # Each sub-request is described by path, method and an encoded query string.
    sub_reqs = [
        {
            "path": "/wapi/zpgeek/miniapp/job/detail.json",
            "method": "GET",
            "query": urlencode({
                "securityId": security_id,
                "jobId": job_id,
                "lid": lid,
                "source": "10"
            })
        },
        {
            "path": "/wapi/zpgeek/miniapp/jobdetail/improvement/query.json",
            "method": "GET",
            "query": urlencode({
                "securityId": security_id,
                "jobId": job_id,
                "lid": lid
            })
        }
    ]
    post_data = {
        "subReqs": sub_reqs,
        "appId": 10002
    }
    headers = self.build_request_headers(dict(base_headers))

    try:
        sleep_random_between()
        response = self.session.post(
            batch_url,
            json=post_data,
            headers=headers,
            timeout=30
        )
        response.raise_for_status()
        data = response.json()
        logger.info("RAW_RESPONSE method={} url={} status={} resp_size={}", "POST", batch_url, response.status_code, len(response.content))

        if self.handle_ip_abnormal_response(data):
            wait_time = sleep_random_between()
            logger.info(f"⏳ IP异常等待 {int(wait_time)} 秒后重试 (批量POST)")
            self.init_cookies()
            headers = self.build_request_headers({
                **base_headers,
                "User-Agent": self.get_random_user_agent()
            })
            response = self.session.post(
                batch_url,
                json=post_data,
                headers=headers,
                timeout=30
            )
            response.raise_for_status()
            data = response.json()
            # Log the retried response too, so both attempts leave a trace.
            logger.info("RAW_RESPONSE method={} url={} status={} resp_size={}", "POST", batch_url, response.status_code, len(response.content))

        # Short extra pause after a successful request to further reduce risk-control hits.
        post_wait_time = random.uniform(2, 5)
        time.sleep(post_wait_time)
        return data
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions whose
        # JSONDecodeError is not a RequestException subclass.
        print(f"请求失败: {str(e)}")
        return None
def get_company_detail_by_id(self, company_id: str) -> Optional[Dict]:
    """Fetch a company's detail record via the mini-program brand endpoint.

    Args:
        company_id: Company id, sent as ``brandId``.

    Returns:
        dict: The full response when its ``code`` is 0, otherwise None.
    """
    print(f"🏢 获取公司详情: {company_id}")

    query = {"brandId": company_id, "appId": "10002"}
    request_headers = self.build_request_headers({
        "Referer": "https://servicewechat.com/wxa8da525af05281f3/574/page-frame.html"
    })
    result = self.single_request(
        url="/wapi/zpgeek/miniapp/brand/detail.json",
        method="GET",
        data=query,
        custom_headers=request_headers,
    )

    # Work out the failure reason first; falling through means success.
    if not result:
        error_msg = "请求失败"
    elif result.get("code") != 0:
        error_msg = result.get("message", "未知错误")
    else:
        print("✅ 公司详情获取成功")
        return result

    print(f"❌ 公司详情获取失败: {error_msg}")
    return None
def batch_request_v2(self, batch_requests: Dict[str, Dict]) -> Optional[Dict]:
    """Run several sub-requests through ``/wapi/batch/batchRunV3`` in one GET.

    Flow: send the batch; if ``self.ip_detector`` flags the response, switch
    route and resend once; if the answer is an anti-bot challenge (handled by
    ``handle_anti_bot_response``), wait and resend once; finally re-key the
    sub-responses by the caller's request keys.

    Args:
        batch_requests: Mapping of caller-chosen key to sub-request, e.g.
            ``{"request_key": {"url": "/wapi/zpgeek/search/joblist.json",
                               "params": {"query": "python", "city": "101010100"}}}``.

    Returns:
        Dict mapping each request key to its sub-response (sub-responses whose
        URL was not requested are dropped), or None when the API reports a
        failure, the request fails, or the body is not valid JSON.
    """
    endpoint_path = "/wapi/batch/batchRunV3"
    referer_headers = {"Referer": "https://www.zhipin.com/web/geek/job"}

    url_mapping = {}    # sub-request URL -> caller's request key
    batch_methods = []  # one "method=<url>&k=v..." string per sub-request
    for request_key, request_config in batch_requests.items():
        url = request_config["url"]
        params = request_config.get("params", {})
        url_mapping[url] = request_key
        method_params = {"method": url}
        method_params.update(params)
        batch_methods.append("&".join(f"{k}={v}" for k, v in method_params.items()))

    headers = self.build_request_headers(dict(referer_headers))
    data = self.build_request_data({
        "batch_method_feed": json.dumps(batch_methods)
    })

    def _send(request_headers, **extra):
        """GET the batch endpoint once and raise on an HTTP error status."""
        resp = self.session.get(
            f"{self.serve_domain}{endpoint_path}",
            headers=request_headers,
            params=data,
            timeout=30,
            **extra
        )
        resp.raise_for_status()
        return resp

    def _log_raw(resp):
        """Write the standard RAW_RESPONSE log line for a batch response."""
        logger.info("RAW_RESPONSE method={} url={} status={} resp_size={}", "GET", f"{self.serve_domain}{endpoint_path}", resp.status_code, len(resp.content))

    try:
        sleep_random_between()
        start_t = time.monotonic()
        response = _send(headers, proxies=self.proxy_config)
        elapsed = time.monotonic() - start_t
        result = response.json()
        _log_raw(response)

        reason = self.ip_detector.detect(response.status_code, elapsed, result)
        if reason:
            logger.warning("IP_ANOMALY reason={} elapsed={:.2f}s status={}", reason, elapsed, response.status_code)
            self.ip_manager.mark_failure(reason)
            mode, cfg = self.ip_manager.select_next_route()
            self.reinit_session(cfg)
            self.init_cookies()
            logger.info("IP_SWITCH mode={} cfg={}", mode, cfg)
            # As before, the resend relies on the rebuilt session and passes
            # no explicit proxies argument.
            response = _send(self.build_request_headers(dict(referer_headers)))
            result = response.json()
            _log_raw(response)

        # Anti-bot challenge: the handler refreshes the security cookies, then retry once.
        if self.handle_anti_bot_response(result):
            wait_time = sleep_random_between()
            print(f"🔄 批量请求检测到安全验证等待 {int(wait_time)} 秒后重试...")
            response = _send(self.build_request_headers(dict(referer_headers)))
            result = response.json()
            # A second log call that referenced the undefined names `method`
            # and `request_data` used to sit here and raised NameError.
            _log_raw(response)

        if result.get("code") == 0 and result.get("zpData"):
            mapped_results = {
                url_mapping[sub_url]: response_data
                for sub_url, response_data in result["zpData"].items()
                if sub_url in url_mapping
            }
            self.ip_manager.mark_success()
            # Short extra pause after success to further reduce risk-control hits.
            post_wait_time = random.uniform(2, 5)
            time.sleep(post_wait_time)
            return mapped_results

        print(f"批量请求失败: {result.get('message', '未知错误')}")
        return None
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions whose
        # JSONDecodeError is not a RequestException subclass.
        print(f"请求异常: {e}")
        failed_response = getattr(e, 'response', None)
        if failed_response is not None:
            print(f"响应状态码: {failed_response.status_code}")
            print(f"响应内容: {failed_response.text[:500]}")
        return None
def handle_anti_bot_response(self, response_data: Dict) -> bool:
    """Handle an anti-bot challenge (response code 37) by refreshing security cookies.

    For a code-37 response, the ``seed``, ``name`` and ``ts`` values found in
    ``zpData`` are stored in the ``__zp_sseed__``, ``__zp_sname__`` and
    ``__zp_sts__`` cookies (domain ``.zhipin.com``). If none of them is present
    a fresh millisecond timestamp is written to ``__zp_sts__`` instead.

    Args:
        response_data: Decoded API response.

    Returns:
        True when the response was a code-37 challenge (the caller should
        retry), False otherwise.
    """
    if response_data.get("code") != 37:
        return False

    print("⚠️ 检测到反爬虫机制错误码37正在更新安全参数...")
    # zpData may be missing or null; treat anything that is not a dict as empty
    # so the membership tests below cannot raise.
    zp_data = response_data.get("zpData")
    if not isinstance(zp_data, dict):
        zp_data = {}

    cookie_domain = '.zhipin.com'
    updated = False

    if "seed" in zp_data:
        self.session.cookies.set('__zp_sseed__', zp_data["seed"], domain=cookie_domain)
        # str() keeps the preview safe when the seed is not a string.
        print(f"✅ 已更新安全种子: {str(zp_data['seed'])[:20]}...")
        updated = True

    if "name" in zp_data:
        self.session.cookies.set('__zp_sname__', zp_data["name"], domain=cookie_domain)
        print(f"✅ 已更新安全名称: {zp_data['name']}")
        updated = True

    if "ts" in zp_data:
        self.session.cookies.set('__zp_sts__', str(zp_data["ts"]), domain=cookie_domain)
        print(f"✅ 已更新时间戳: {zp_data['ts']}")
        updated = True

    # Nothing usable came back: fall back to a locally generated timestamp.
    if not updated:
        new_ts = str(int(time.time() * 1000))
        print(f"🔄 生成新的时间戳: ts={new_ts}")
        self.session.cookies.set('__zp_sts__', new_ts, domain=cookie_domain)

    return True
def handle_ip_abnormal_response(self, response_data: Dict) -> bool:
    """Detect an "IP abnormal" API answer and switch to the next route.

    A response counts as abnormal when its ``code`` is 35 or its ``message``
    contains "IP地址存在异常". In that case the failure is recorded on
    ``self.ip_manager``, the next route is selected, the session is rebuilt
    and cookies are re-initialised.

    Args:
        response_data: Decoded API response.

    Returns:
        True when an IP problem was detected and the route was switched (the
        caller should retry), False otherwise.
    """
    # The message may be absent or null; normalise it so the substring test
    # cannot raise TypeError.
    message = response_data.get("message") or ""
    ip_banned = response_data.get("code") == 35 or "IP地址存在异常" in str(message)
    if not ip_banned:
        return False

    reason = "ip_banned"
    logger.warning("⚠️ 检测到IP异常准备切换IP并重试")
    self.ip_manager.mark_failure(reason)
    mode, cfg = self.ip_manager.select_next_route()
    self.reinit_session(cfg)
    self.init_cookies()
    logger.info("IP_SWITCH mode={} cfg={}", mode, cfg)
    return True
def validate_request_params(self) -> bool:
    """Check that the security cookies and login fields needed for a request exist.

    When a cookie is missing, ``init_cookies`` is called once and the check is
    repeated before giving up.

    Returns:
        True when all required cookies and login fields are present, else False.
    """
    cookie_names = ('__zp_sseed__', '__zp_sname__', '__zp_sts__')
    login_keys = ('mpt', 'wt2', 'traceid')

    def absent_cookies():
        jar = self.session.cookies
        return [name for name in cookie_names if name not in jar]

    # First pass; on a miss, re-initialise cookies and look again.
    absent = absent_cookies()
    if absent:
        self.init_cookies()
        absent = absent_cookies()
    if absent:
        print(f"⚠️ 缺少必要的Cookie: {absent[0]}")
        return False

    # Report only the first login field that is missing or empty.
    first_gap = next((key for key in login_keys if not self.login_data.get(key)), None)
    if first_gap is not None:
        print(f"⚠️ 缺少必要的登录数据: {first_gap}")
        return False

    print("✅ 请求参数验证通过")
    return True
def single_request(self, url: str, method: str = "GET", data: Optional[Dict] = None,
                   custom_headers: Optional[Dict] = None, max_retries: int = 5) -> Optional[Dict]:
    """Send a single API request with retry, anti-bot and IP-switch handling.

    NOTE(review): the indentation of this method was reconstructed from a
    flattened source; confirm the nesting of the IP-anomaly branches.

    Args:
        url: Path appended to ``self.serve_domain``.
        method: HTTP method; "GET" (case-insensitive) sends the payload as
            query parameters, anything else sends it as a URL-encoded POST body.
        data: Request payload, passed through ``self.build_request_data``.
        custom_headers: Extra headers merged over the default Referer.
        max_retries: Maximum number of attempts.

    Returns:
        dict: Decoded JSON response, or None when parameter validation fails
        or no attempt succeeds.
    """
    # Validate the request prerequisites (cookies / login data)
    if not self.validate_request_params():
        print("❌ 请求参数验证失败")
        return None
    # Build the request headers, adding a more realistic Referer
    default_custom_headers = {
        "Referer": "https://www.zhipin.com/web/geek/job"
    }
    if custom_headers:
        default_custom_headers.update(custom_headers)
    for attempt in range(max_retries):
        try:
            # Every attempt starts with the shared random delay
            sleep_random_between()
            headers = self.build_request_headers(default_custom_headers)
            request_data = self.build_request_data(data)
            start_t = time.monotonic()
            if method.upper() == "GET":
                response = self.session.get(
                    f"{self.serve_domain}{url}",
                    headers=headers,
                    params=request_data,
                    timeout=30,
                )
            else:
                response = self.session.post(
                    f"{self.serve_domain}{url}",
                    headers=headers,
                    data=urlencode(request_data),
                    timeout=30
                )
            response.raise_for_status()
            # Elapsed time is handed to the IP detector below
            elapsed = time.monotonic() - start_t
            result = response.json()
            # Check whether an anti-bot challenge has to be handled
            if self.handle_anti_bot_response(result):
                wait_time = sleep_random_between()
                print(f"🔄 检测到安全验证等待 {int(wait_time)} 秒后重试... (尝试 {attempt + 1}/{max_retries})")
                # Change a few request headers before retrying
                default_custom_headers.update({
                    'User-Agent': self.get_random_user_agent(),
                    'X-Requested-With': 'XMLHttpRequest'
                })
                continue
            reason = self.ip_detector.detect(response.status_code, elapsed, result)
            if reason:
                logger.warning("IP_ANOMALY reason={} elapsed={:.2f}s status={}", reason, elapsed, response.status_code)
                # NOTE(review): local_mode is read directly here but guarded with
                # hasattr() further down - confirm it is always initialised.
                if self.local_mode:
                    try:
                        self.local_fail_count += 1
                        thr = int(os.getenv('LOCAL_FAIL_THRESHOLD', '3'))
                    except Exception:
                        thr = 3
                    # Enough local failures and a proxy is configured: switch to proxy mode
                    if self.proxy_config and self.local_fail_count >= thr:
                        self.enable_proxy_mode()
                        self.ip_manager.mark_failure(reason)
                        mode, cfg = self.ip_manager.select_next_route()
                        if mode == 'proxy' and cfg:
                            self.reinit_session(cfg)
                            logger.info("IP_SWITCH mode={} cfg={}", "proxy", cfg)
                        else:
                            self.reinit_session(self.proxy_config)
                            logger.info("IP_SWITCH mode={} cfg={}", "proxy", self.proxy_config)
                        self.init_cookies()
                        continue
                    else:
                        # Stay local: rebuild the session, wait, then retry
                        self.reinit_session()
                        self.init_cookies()
                        logger.info("IP_SWITCH mode={} cfg={}", "local", None)
                        wait_time = sleep_random_between()
                        logger.info(f"⏳ IP切换后等待 {int(wait_time)}")
                        continue
                else:
                    # Not in local mode: take the next route, or fall back to local mode
                    self.ip_manager.mark_failure(reason)
                    mode, cfg = self.ip_manager.select_next_route()
                    if mode == 'proxy' and cfg:
                        self.reinit_session(cfg)
                        logger.info("IP_SWITCH mode={} cfg={}", mode, cfg)
                    else:
                        self.enable_local_mode()
                        self.reinit_session()
                        logger.info("IP_SWITCH mode={} cfg={}", "local", None)
                    self.init_cookies()
                    continue
            self.ip_manager.mark_success()
            if hasattr(self, 'local_mode') and self.local_mode:
                self.local_success_count += 1
                self.local_fail_count = 0
                self.try_restore_proxy()
            # Short extra pause after a successful request to further reduce risk-control hits
            post_wait_time = random.uniform(2, 5)
            time.sleep(post_wait_time)
            return result
        except requests.RequestException as e:
            status = getattr(getattr(e, 'response', None), 'status_code', None)
            err_text = ''
            try:
                err_text = getattr(getattr(e, 'response', None), 'text', '')[:500]
            except Exception:
                pass
            print(f"❌ 请求失败 (尝试 {attempt + 1}/{max_retries}): {e}")
            if status is not None:
                print(f"响应状态码: {status}")
                print(f"响应内容: {err_text}")
            # No real timing exists for a failed request, so a value just above
            # the configured response-time threshold is passed to the detector.
            reason = self.ip_detector.detect(status, float(self.ip_cfg.response_time_threshold_sec) + 0.01, None, err_text)
            if reason:
                self.ip_manager.mark_failure(reason)
                mode, cfg = self.ip_manager.select_next_route()
                self.reinit_session(cfg)
                self.init_cookies()
                logger.info("IP_SWITCH mode={} cfg={}", mode, cfg)
            if attempt < max_retries - 1:
                # Wait at least 10 seconds; the wait grows with each retry
                wait_time = max(10, min(2 ** attempt * 5, 30))  # at least 10 s, at most 30 s
                print(f"⏳ 等待{wait_time}秒后重试...")
                time.sleep(wait_time)
            else:
                print("❌ 达到最大重试次数请求失败")
                return None
    return None
def get_random_user_agent(self) -> str:
    """Pick one User-Agent string at random from a fixed pool of four.

    The first candidate is a WeChat (MicroMessenger) UA built from
    ``self.wx_version``; the other three are desktop browser UAs.
    """
    wechat_ua = (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 "
        f"(KHTML, like Gecko) Mobile/15E148 MicroMessenger/{self.wx_version}(0x18002b2d) "
        "NetType/WIFI Language/zh_CN"
    )
    desktop_uas = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
    )
    # Same candidate order as before, so a seeded RNG yields the same pick.
    pool = [wechat_ua, *desktop_uas]
    return random.choice(pool)
def push_company(company_data: Dict[str, Any]) -> None:
    """
    Push one company record to the storage service.

    The ``zpData`` part of ``company_data`` is wrapped in the universal batch
    format and POSTed to ``{API_BASE_URL}/api/v1/universal/data/batch-store-async``.
    Any exception is caught and printed.
    """
    try:
        zp_data = company_data.get('zpData', {})
        if not zp_data:
            print("❌ 公司数据格式错误")
            return

        # Universal storage envelope: a one-element list plus type/platform tags.
        payload = {
            "data_list": [zp_data],
            "data_type": "company",
            "platform": "boss"
        }
        request_headers = {
            'accept': 'application/json',
            'token': 'dev',
            'Content-Type': 'application/json'
        }
        response = requests.post(
            f'{API_BASE_URL}/api/v1/universal/data/batch-store-async',
            headers=request_headers,
            json=payload,
            timeout=30
        )

        body_size = len(response.content) if hasattr(response, 'content') else 0
        logger.info("REPORT_DATA_STATUS {}", {"status": response.status_code, "size": body_size})

        if response.status_code != 200:
            print(f"❌ 推送公司数据失败: {response.status_code} - {response.text}")
            return

        # Use the company's full name for the message when it is available.
        company_name = "未知公司"
        full_info = zp_data['companyFullInfoVO'] if 'companyFullInfoVO' in zp_data else None
        if full_info and 'name' in full_info:
            company_name = full_info['name']
        print(f"✅ 推送公司数据成功: {company_name}")
    except Exception as e:
        print(f"❌ 推送公司数据失败: {e}")
def push_job(job_data: Dict[str, Any]) -> None:
    """
    Push one job record to the storage service.

    The ``zpData`` part of ``job_data`` is wrapped in the universal batch
    format and POSTed to ``{API_BASE_URL}/api/v1/universal/data/batch-store-async``.
    Any exception is caught and printed.

    The success message reads the position and brand names defensively, so a
    record without ``jobBaseInfoVO`` / ``brandComInfoVO`` is no longer reported
    as a failed push after the upload has succeeded.
    """
    def _field(record, section, key):
        """Return record[section][key], or 'N/A' when any level is missing."""
        block = record.get(section) if isinstance(record, dict) else None
        if isinstance(block, dict):
            return block.get(key, 'N/A')
        return 'N/A'

    try:
        zp_data = job_data.get('zpData', {})
        if not zp_data:
            print("❌ 职位数据格式错误")
            return

        # Universal storage envelope: a one-element list plus type/platform tags.
        universal_data = {
            "data_list": [
                zp_data
            ],
            "data_type": "job",
            "platform": "boss"
        }
        response = requests.post(
            f'{API_BASE_URL}/api/v1/universal/data/batch-store-async',
            headers={
                'accept': 'application/json',
                'token': 'dev',
                'Content-Type': 'application/json'
            },
            json=universal_data,
            timeout=30
        )
        logger.info("REPORT_DATA_STATUS {}", {"status": response.status_code, "size": len(response.content)})

        if response.status_code == 200:
            position_name = _field(zp_data, 'jobBaseInfoVO', 'positionName')
            brand_name = _field(zp_data, 'brandComInfoVO', 'brandName')
            print(f"✅ 推送职位数据成功: {position_name} @ {brand_name}")
        else:
            print(f"❌ 推送职位数据失败: {response.status_code} - {response.text}")
    except Exception as e:
        print(f"❌ 推送职位数据失败: {e}")
import json
import random
def load_json_data():
    """
    Load the city and position lists used to build random search parameters.

    Cities come from ``zpData.hotCityList`` in ``city.json``; positions are the
    third-level entries under ``zpData.config`` in ``work.json``. Both files
    are opened relative to the current working directory.

    Built-in defaults are used when loading fails, and also for any list that
    comes out empty, so callers can always pick a random element.

    Returns:
        tuple: (cities, positions), each a non-empty list of
        ``{'code': ..., 'name': ...}`` dicts.
    """
    default_cities = [
        {'code': '101010100', 'name': '北京'},
        {'code': '101020100', 'name': '上海'},
        {'code': '101280100', 'name': '广州'},
        {'code': '101280600', 'name': '深圳'},
        {'code': '101210100', 'name': '杭州'}
    ]
    default_positions = [
        {'code': 100101, 'name': 'Java'},
        {'code': 100109, 'name': 'Python'},
        {'code': 100901, 'name': '前端开发工程师'},
        {'code': 100202, 'name': 'Android'},
        {'code': 100203, 'name': 'iOS'}
    ]

    def _children(node):
        """Return node['subLevelModelList'] when present and non-empty, else ()."""
        if 'subLevelModelList' in node and node['subLevelModelList']:
            return node['subLevelModelList']
        return ()

    try:
        with open('city.json', 'r', encoding='utf-8') as f:
            city_data = json.load(f)
        cities = []
        if 'zpData' in city_data and 'hotCityList' in city_data['zpData']:
            cities = [
                {'code': city['code'], 'name': city['name']}
                for city in city_data['zpData']['hotCityList']
            ]

        with open('work.json', 'r', encoding='utf-8') as f:
            work_data = json.load(f)
        positions = []
        if 'zpData' in work_data and 'config' in work_data['zpData']:
            # Category -> sub-category -> position; only the leaves are kept.
            for category in work_data['zpData']['config']:
                for sub_category in _children(category):
                    for position in _children(sub_category):
                        positions.append({
                            'code': position['code'],
                            'name': position['name']
                        })
    except Exception as e:
        # Deliberate best-effort: any load/parse problem falls back to defaults.
        print(f"加载数据失败: {e}")
        return default_cities, default_positions

    # An empty list would break random.choice() downstream, so substitute defaults.
    return cities or default_cities, positions or default_positions
def generate_random_params() -> Dict[str, Any]:
    """
    Build one random set of crawl parameters.

    A city and a position are drawn at random from ``load_json_data()``. For a
    fixed set of technology names the query is the name followed by "开发";
    otherwise the position name is used unchanged.

    Returns:
        dict: Search parameters with keys query, city, scene, page, position.
    """
    suffixed_names = (
        'Java', 'Python', 'PHP', 'C#', 'C/C++', 'Golang', 'Node.js',
        'Android', 'iOS',
    )

    cities, positions = load_json_data()
    # Draw the city first, then the position (keeps RNG consumption order).
    chosen_city = random.choice(cities)
    chosen_position = random.choice(positions)

    title = chosen_position['name']
    query = f"{title}开发" if title in suffixed_names else title

    return {
        "query": query,
        "city": chosen_city['code'],
        "scene": 1,
        "page": 1,
        "position": chosen_position['code'],
    }
def generate_random_params_batch(count: int = 10) -> List[Dict[str, Any]]:
    """
    Build several random parameter sets.

    Args:
        count: How many parameter sets to generate.

    Returns:
        list: ``count`` results of ``generate_random_params()``.
    """
    batch: List[Dict[str, Any]] = []
    for _ in range(count):
        batch.append(generate_random_params())
    return batch
def fetch_service_params() -> Optional[Dict[str, Any]]:
    """Ask the keyword service for the next city/job pair to crawl.

    Requests one reserved keyword for source "boss" from
    ``{API_BASE_URL}/api/v1/keyword/available`` and, when the item has an id,
    reports it to ``/api/v1/keyword/mark-used`` (best effort).

    Returns:
        Search parameters ``{"query", "city", "scene", "page"}``, or None when
        the service is unavailable, returns no item, or the item lacks a city
        or job value.
    """
    try:
        r = requests.get(
            f"{API_BASE_URL}/api/v1/keyword/available",
            params={"source": "boss", "limit": 1, "reserve": True},
            timeout=10,
        )
        if r.status_code != 200:
            return None

        data = r.json().get("data") or {}
        items = data.get("items") or []
        if not items:
            return None
        item = items[0]

        item_id = item.get("id")
        if item_id:
            try:
                murl = f"{API_BASE_URL}/api/v1/keyword/mark-used"
                requests.post(murl, json={"source": "boss", "ids": [item_id]}, timeout=10)
            except Exception as e:
                # Marking is best effort; the keyword is still used.
                logger.error(f"标记已用失败: {e}")

        # Convert to str only when a value exists: str(None) would yield the
        # truthy text "None" and slip through the emptiness check below.
        raw_city = item.get("city")
        raw_job = item.get("job")
        city = "" if raw_city is None else str(raw_city)
        job = "" if raw_job is None else str(raw_job)
        if not city or not job:
            return None
        return {"query": job, "city": city, "scene": 1, "page": 1}
    except Exception as e:
        # Still best effort (the caller falls back to random parameters), but
        # leave a trace instead of failing silently.
        logger.warning("FETCH_SERVICE_PARAMS_FAILED error={}", e)
        return None
# 使用示例
if __name__ == "__main__":
    # Script entry point: configure the proxy, optionally run a connectivity
    # self-test, then crawl in an endless loop.
    # NOTE(review): indentation was reconstructed from a flattened source;
    # confirm the nesting, especially inside process_job_detail.
    import time
    import random
    # Proxy settings (read from environment variables)
    username = os.getenv('PROXY_USERNAME')
    password = os.getenv('PROXY_PASSWORD')
    tunnel = os.getenv('PROXY_TUNNEL')
    scheme = os.getenv('PROXY_SCHEME')
    # Normalise: treat unset as empty, trim whitespace and stray backticks
    username = (username or '').strip().strip('`')
    password = (password or '').strip().strip('`')
    tunnel = (tunnel or '').strip().strip('`')
    scheme = (scheme or '').strip().strip('`').lower()
    proxies = None
    if username and password and tunnel:
        # Any scheme other than http / socks5h falls back to http
        if scheme not in ('http', 'socks5h'):
            scheme = 'http'
        proxy_url = f"{scheme}://{username}:{password}@{tunnel}"
        # Expected shape: scheme://user:password@host:port
        pattern = r"^(http|socks5h)://[^:]+:[^@]+@[^:]+:\d+$"
        if re.match(pattern, proxy_url):
            proxies = {
                "http": proxy_url,
                "https": proxy_url
            }
            print(f"✅ 使用代理配置: {tunnel} ({scheme})")
        else:
            print("❌ 代理配置格式不正确请检查用户名/密码/隧道地址")
    else:
        print(" 未配置代理使用直连")
    # Create the API instance, passing the proxy so that requests go through it
    api = BossZhipinAPI(proxy_config=proxies)
    # Self-test mode: only check connectivity / egress IP, then exit
    if os.getenv('RUN_SELFTEST') == '1':
        print("🌐 当前IP(初始化后):", api.get_current_ip())
        try:
            # NOTE(review): the printed label says HTTPS but this URL is plain http
            r = api.session.get('http://v2.api.juliangip.com/v2/dps/ip_search?ip=0.0.0.0', timeout=10)
            if r.status_code == 200:
                data = r.json()
                ip_v = data.get('data', {}).get('ip', '') if data.get('code') == 0 else ''
                print("🌐 HTTPS出口IP(初始化后):", ip_v)
            else:
                print("❌ HTTPS出口IP获取失败: HTTP", r.status_code)
        except Exception as e:
            print("❌ HTTPS出口IP获取异常:", e)
        try:
            # Pre-flight check against the target domain
            rbt = api.session.get('https://www.zhipin.com/robots.txt', timeout=10)
            if rbt.status_code == 200:
                print("🌐 目标域预检(初始化后): 成功")
            else:
                print("❌ 目标域预检(初始化后)失败: HTTP", rbt.status_code)
        except Exception as e:
            print("❌ 目标域预检(初始化后)异常:", e)
        # Repeat the same checks after rebuilding the session
        api.reinit_session()
        print("🌐 当前IP(重建会话后):", api.get_current_ip())
        try:
            r2 = api.session.get('http://v2.api.juliangip.com/v2/dps/ip_search?ip=0.0.0.0', timeout=10)
            if r2.status_code == 200:
                data2 = r2.json()
                ip_v2 = data2.get('data', {}).get('ip', '') if data2.get('code') == 0 else ''
                print("🌐 HTTPS出口IP(重建会话后):", ip_v2)
            else:
                print("❌ HTTPS出口IP获取失败: HTTP", r2.status_code)
        except Exception as e2:
            print("❌ HTTPS出口IP获取异常:", e2)
        try:
            rbt2 = api.session.get('https://www.zhipin.com/robots.txt', timeout=10)
            if rbt2.status_code == 200:
                print("🌐 目标域预检(重建会话后): 成功")
            else:
                print("❌ 目标域预检(重建会话后)失败: HTTP", rbt2.status_code)
        except Exception as e2:
            print("❌ 目标域预检(重建会话后)异常:", e2)
        sys.exit(0)
    # Initialise the session first; failures are reported but not fatal
    print("=== 初始化会话 ===")
    try:
        if not api.init_session():
            print("会话初始化失败但继续运行")
    except Exception as e:
        print(f"会话初始化异常: {e}继续运行")
    # Refresh the login data from the latest curl command data
    print("\n=== 更新登录数据 ===")
    try:
        api.update_login_from_curl()
    except Exception as e:
        print(f"更新登录数据异常: {e}继续运行")

    # Streaming handler, passed as job_processor to get_job_list_multi_pages
    def process_job_detail(job: Dict) -> None:
        """Process one listed job: fetch its detail and company, then push both."""
        try:
            job_id = job.get('encryptJobId')
            lid = job.get('lid', '')
            security_id = job.get('securityId', '')
            job_name = job.get('jobName', 'N/A')
            company_name = job.get('brandName', 'N/A')
            city_name = job.get('cityName') or ''
            job_id_display = job_id or job.get('jobId') or ''
            # Progress line; city and job id are included only when available
            if city_name and job_id_display:
                print(f"\n📋 处理职位: {city_name} | {job_name} | {job_id_display} @ {company_name}")
            elif city_name:
                print(f"\n📋 处理职位: {city_name} | {job_name} @ {company_name}")
            elif job_id_display:
                print(f"\n📋 处理职位: {job_name} | {job_id_display} @ {company_name}")
            else:
                print(f"\n📋 处理职位: {job_name} @ {company_name}")
            if job_id:
                job_detail = api.boss_batch_request(job_id, security_id,lid)
                if job_detail and job_detail.get('code') == 0:
                    # Look up the company id inside the job-detail sub-response
                    company_id = job_detail.get('zpData', {}).get("/wapi/zpgeek/miniapp/job/detail.json",{}).get('zpData', {}).get('brandComInfoVO', {}).get('encryptBrandId')
                    if company_id:
                        try:
                            wait_time = sleep_random_between()
                            # NOTE(review): the message mentions job detail, but the next call fetches the company detail
                            print(f"⏰ 等待 {int(wait_time)} 秒后开始职位详情...")
                            company_detail = api.get_company_detail_by_id(company_id)
                            if company_detail and company_detail.get('zpData'):
                                push_company(company_detail)
                                # Fix-up: copy the company's full name into the job detail's brandName before the job is pushed
                                if (job_detail and 'zpData' in job_detail and
                                        '/wapi/zpgeek/miniapp/job/detail.json' in job_detail['zpData'] and
                                        job_detail['zpData']['/wapi/zpgeek/miniapp/job/detail.json'] and
                                        'zpData' in job_detail['zpData']['/wapi/zpgeek/miniapp/job/detail.json'] and
                                        'brandComInfoVO' in job_detail['zpData']['/wapi/zpgeek/miniapp/job/detail.json']['zpData'] and
                                        company_detail and 'zpData' in company_detail and
                                        'companyFullInfoVO' in company_detail['zpData'] and
                                        company_detail['zpData']['companyFullInfoVO'] and
                                        'name' in company_detail['zpData']['companyFullInfoVO']):
                                    job_detail['zpData']['/wapi/zpgeek/miniapp/job/detail.json']['zpData']['brandComInfoVO']['brandName'] = \
                                        company_detail['zpData']['companyFullInfoVO']['name']
                        except Exception as e:
                            print(f" ⚠️ 获取公司详情失败: {e}")
                    try:
                        # push_job reads 'zpData' itself, so hand it the job-detail sub-response
                        job_detail = job_detail.get("zpData",{}).get("/wapi/zpgeek/miniapp/job/detail.json",{})
                        push_job(job_detail)
                    except Exception as e:
                        print(f" ⚠️ 推送职位数据失败: {e}")
                else:
                    err_msg = job_detail.get('message', '未知错误') if isinstance(job_detail, dict) else '请求失败'
                    print(f" ❌ 获取职位详情失败: {err_msg}")
                    # IP flagged as abnormal: rebuild the session and pause before continuing
                    if isinstance(job_detail, dict) and (job_detail.get('code') == 35 or 'IP地址存在异常' in err_msg):
                        self_reinit_ok = False
                        try:
                            api.reinit_session()
                            self_reinit_ok = True
                        except Exception:
                            pass
                        wait_time = sleep_random_between()
                        print(f" ⏳ 等待 {int(wait_time)} 秒后继续...{'(已重建会话)' if self_reinit_ok else ''}")
        except Exception as e:
            print(f" ⚠️ 处理职位详情异常: {e}")

    # Endless crawl loop starts here
    print("\n🔄 开始死循环抓取模式...")
    loop_count = 0
    while True:
        try:
            loop_count += 1
            print(f"\n{'=' * 50}")
            print(f"🚀 第 {loop_count} 轮抓取开始")
            print(f"{'=' * 50}")
            # Prefer keywords from the service; fall back to local random parameters
            crawl_params = fetch_service_params()
            if not crawl_params:
                print("⚠️ 服务未返回可用关键词改用本地随机参数")
                crawl_params = generate_random_params()
            multi_page_params = crawl_params.copy()
            multi_page_params['pageSize'] = int(os.getenv('PAGE_SIZE', '15'))  # items per page
            print(f"📊 搜索参数: {multi_page_params}")
            # Run the crawl
            max_pages = int(os.getenv('MAX_PAGES', '3'))
            try:
                api.get_job_list_multi_pages(multi_page_params, max_pages=max_pages, job_processor=process_job_detail)
                print(f"✅ 第 {loop_count} 轮抓取完成")
            except Exception as e:
                print(f"❌ 第 {loop_count} 轮抓取失败: {e}")
                # If the error looks like an expired login, try to recover
                if "登录" in str(e) or "login" in str(e).lower() or "401" in str(e):
                    try:
                        print("🔄 检测到登录失效尝试处理...")
                        api.handle_login_expired()
                    except Exception as login_e:
                        print(f"⚠️ 处理登录失效失败: {login_e}")
            # Random wait between rounds to avoid requesting too frequently
            wait_time = sleep_random_between()
            print(f"⏰ 等待 {int(wait_time)} 秒后开始下一轮...")
        except Exception as e:
            print(f"❌ 循环异常: {e}跳过本轮继续下一轮")
            # Pause after an unexpected error before starting the next round
            wait_time = sleep_random_between()
            print(f"⏰ 异常恢复等待 {int(wait_time)} 秒...")
            continue