win 24918a272b feat: 爬虫优化 — company_desc 补全、Boss详情获取、URL修复
- 新增 company_enrichment.py: job 入库时自动补全 company_desc
  (优先查 MySQL,fallback 调平台 API 获取并入库)
- Boss 爬虫: 搜索列表后逐条调 batch 详情接口拿完整数据
  (jobBaseInfoVO/brandComInfoVO),每条获取后立即上报
- Boss push_mapper: 兼容新旧两种 API 格式(扁平/嵌套VO)
- Boss token: 启动时自动从后端 API 读取数据库中的 mpt/wt2
- Boss client: header 值 strip 防止空格导致请求失败
- qcwy URL: 用 jobId/coId 拼接 jobs.51job.com 格式
- 三个平台 max_pages 默认改为 100
2026-03-22 21:54:19 +08:00

123 lines
3.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Boss直聘 HTTP 客户端
在通用 HTTPClient 上叠加 Boss 特有的 headers 和 Traceid 注入
"""
from __future__ import annotations
from typing import Any, Optional
from crawler_core.http_client import HTTPClient
from crawler_core.boss.sign import BossSign
# Root URL that every Boss request path is resolved against.
BASE_URL = "https://www.zhipin.com"

# User-agent string sent with every request; kept as separate tokens so each
# component is easy to spot and update.
_MINI_PROGRAM_USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "(KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
        "MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI",
        "MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.7(0x13080712)",
        "UnifiedPCMacWechat(0xf2641702) XWEB/18788",
    )
)

# Default request headers specific to the Boss mini program.
# Insertion order is intentionally the same as the original definition.
BOSS_HEADERS = {
    "content-type": "application/x-www-form-urlencoded",
    "user-agent": _MINI_PROGRAM_USER_AGENT,
    "x-requested-with": "XMLHttpRequest",
    "xweb_xhr": "1",
    "zp_app_id": "10002",
    "zp_product_id": "10002",
    "ver": "14.0400",
    "mini_ver": "14.0400",
    "platform": "zhipin/mac",
    "ua": '{"model":"Mac16,8","platform":"mac"}',
    "scene": "1256",
    "sec-fetch-site": "cross-site",
    "sec-fetch-mode": "cors",
    "sec-fetch-dest": "empty",
    "referer": "https://servicewechat.com/wxa8da525af05281f3/601/page-frame.html",
    "accept-language": "zh-CN,zh;q=0.9",
}
class BossClient(HTTPClient):
    """
    HTTP client for Boss Zhipin.

    Extends the generic HTTPClient so that every request carries the
    signer's ``mpt`` / ``wt2`` values and a freshly generated ``Traceid``.

    Args:
        signer: BossSign instance (optional); a new BossSign() is created
            when none (or a falsy value) is supplied.
        tunnel_proxy: Tunnel proxy address (the IP changes on every request).
        proxy: Fixed proxy address.
        proxy_pool: List of proxy addresses forming a pool.
        timeout: Request timeout in seconds.
    """

    def __init__(
        self,
        signer: Optional[BossSign] = None,
        tunnel_proxy: Optional[str] = None,
        proxy: Optional[str] = None,
        proxy_pool: Optional[list[str]] = None,
        timeout: int = 10,
    ):
        # Base URL and default headers are fixed for Boss; only the proxy
        # settings and the timeout come from the caller.
        parent_options = {
            "base_url": BASE_URL,
            "default_headers": BOSS_HEADERS,
            "tunnel_proxy": tunnel_proxy,
            "proxy": proxy,
            "proxy_pool": proxy_pool,
            "timeout": timeout,
        }
        super().__init__(**parent_options)
        self.signer = signer if signer else BossSign()

    def _boss_headers(self) -> dict:
        """Build the Boss headers that must be regenerated for each request."""
        # Missing tokens become "", and surrounding whitespace is stripped
        # before the values are used as header values.
        mpt_value = (self.signer.mpt or "").strip()
        wt2_value = (self.signer.wt2 or "").strip()
        trace_id = BossSign.generate_traceid("M-W")
        return {"mpt": mpt_value, "wt2": wt2_value, "Traceid": trace_id}

    def post(self, path: str, body: dict, headers: Optional[dict] = None) -> tuple[int, Any]:
        """
        Send a POST request with the Boss headers injected automatically.

        Caller-supplied ``headers`` are applied on top of the generated ones,
        so they win when a key appears in both.
        """
        merged = self._boss_headers()
        merged.update(headers or {})
        return super().post(path, body, merged)

    def get(self, path: str, params: Optional[dict] = None, headers: Optional[dict] = None) -> tuple[int, Any]:
        """
        Send a GET request with the Boss headers injected automatically.

        Caller-supplied ``headers`` are applied on top of the generated ones,
        so they win when a key appears in both.
        """
        merged = self._boss_headers()
        merged.update(headers or {})
        return super().get(path, params, merged)

    def batch(self, sub_reqs: list[dict]) -> tuple[int, Any]:
        """
        Issue a batch request to /wapi/batch/requests.

        Args:
            sub_reqs: List of sub-requests, each element shaped like
                {"path": "/wapi/...", "method": "GET", "query": "key=val&..."}

        Returns:
            (http_code, response_json)
        """
        endpoint = "/wapi/batch/requests"
        payload = {"subReqs": sub_reqs, "appId": 10002}
        # This call overrides the content type with JSON.
        json_content_type = {"content-type": "application/json"}
        return self.post(endpoint, payload, headers=json_content_type)
def create_client(
    signer: Optional[BossSign] = None,
    tunnel_proxy: Optional[str] = None,
    proxy: Optional[str] = None,
    proxy_pool: Optional[list[str]] = None,
    timeout: int = 10,
) -> BossClient:
    """
    Create a Boss client.

    Thin factory around BossClient; every argument is forwarded unchanged
    as a keyword argument.

    Args:
        signer: BossSign instance (optional).
        tunnel_proxy: Tunnel proxy address.
        proxy: Fixed proxy address.
        proxy_pool: List of proxy addresses forming a pool.
        timeout: Request timeout in seconds. Defaults to 10, the same
            default BossClient uses, so existing callers are unaffected.

    Returns:
        A new BossClient instance.
    """
    return BossClient(
        signer=signer,
        tunnel_proxy=tunnel_proxy,
        proxy=proxy,
        proxy_pool=proxy_pool,
        timeout=timeout,
    )