JobData/app/services/ingest/company_enrichment.py
win 24918a272b feat: 爬虫优化 — company_desc 补全、Boss详情获取、URL修复
- 新增 company_enrichment.py: job 入库时自动补全 company_desc
  (优先查 MySQL,fallback 调平台 API 获取并入库)
- Boss 爬虫: 搜索列表后逐条调 batch 详情接口拿完整数据
  (jobBaseInfoVO/brandComInfoVO),每条获取后立即上报
- Boss push_mapper: 兼容新旧两种 API 格式(扁平/嵌套VO)
- Boss token: 启动时自动从后端 API 读取数据库中的 mpt/wt2
- Boss client: header 值 strip 防止空格导致请求失败
- qcwy URL: 用 jobId/coId 拼接 jobs.51job.com 格式
- 三个平台 max_pages 默认改为 100
2026-03-22 21:54:19 +08:00

177 lines
6.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Job 入库时自动补全 company_desc
每个平台独立处理:
- 从原始 job 数据提取 source_company_id(各平台 key 不同)
- 批量查 MySQL {Platform}Company 表
- 找到 → 直接填入 company_desc
- 没找到 → 调平台 API 获取公司详情 → 写入 MySQL → 回填 company_desc
"""
from __future__ import annotations
import asyncio
from typing import Any, Callable, Dict, List, Optional
from app.log import logger
from app.services.company_storage import company_storage, normalize_company_id
# ─────────────────────────────────────────────
# 各平台:从 raw job data 提取 source_company_id
# ─────────────────────────────────────────────
def _zhilian_company_id(raw: Dict[str, Any]) -> Optional[str]:
    """Extract the Zhilian company id from a raw job payload.

    Reads ``companyNumber`` (e.g. ``CZL1227180200``) and falls back to
    ``rootCompanyNumber`` when the former is missing or empty.

    Args:
        raw: Raw job payload as received from the platform.

    Returns:
        The id as a string, or ``None`` when neither key holds a truthy value.
    """
    val = raw.get("companyNumber") or raw.get("rootCompanyNumber")
    # Coerce to str / None, matching the Boss and qcwy extractors, so the
    # declared Optional[str] contract holds for non-string or empty values.
    return str(val) if val else None
def _boss_company_id(raw: Dict[str, Any]) -> Optional[str]:
    """Extract the Boss company id from a raw job payload.

    Prefers ``encryptBrandId`` inside the nested ``brandComInfoVO`` object and
    falls back to the top-level ``brandId``.

    Args:
        raw: Raw job payload as received from the platform.

    Returns:
        The first truthy candidate as a string, or ``None`` if there is none.
    """
    nested = raw.get("brandComInfoVO") or {}
    # Candidates in priority order: nested encrypted id, then flat brand id.
    for candidate in (nested.get("encryptBrandId"), raw.get("brandId")):
        if candidate:
            return str(candidate)
    return None
def _qcwy_company_id(raw: Dict[str, Any]) -> Optional[str]:
    """Extract the 51job (qcwy) company id from a raw job payload.

    Args:
        raw: Raw job payload as received from the platform.

    Returns:
        The ``coId`` value as a string, or ``None`` when it is missing or falsy.
    """
    co_id = raw.get("coId")
    if not co_id:
        return None
    return str(co_id)
# Platform key -> function that extracts the source company id from a raw job payload.
_EXTRACTORS: Dict[str, Callable[[Dict[str, Any]], Optional[str]]] = dict(
    zhilian=_zhilian_company_id,
    boss=_boss_company_id,
    qcwy=_qcwy_company_id,
)
# Maximum number of platform API calls per batch, so enrichment does not slow down ingestion.
_MAX_API_CALLS_PER_BATCH = 5
# ─────────────────────────────────────────────
# 主入口
# ─────────────────────────────────────────────
async def enrich_company_desc(
    platform: str,
    push_data_list: List[Dict[str, Any]],
) -> int:
    """Fill in ``company_desc`` for records in ``push_data_list`` that lack one.

    The dicts inside ``push_data_list`` are modified in place. Descriptions are
    looked up in storage first; companies with no stored description are then
    fetched through the platform API, limited to ``_MAX_API_CALLS_PER_BATCH``
    companies per call.

    Args:
        platform: Platform key. Platforms without a registered extractor are
            skipped and yield ``0``.
        push_data_list: Job records. The raw job payload is read from each
            record's ``base_data`` key.

    Returns:
        Number of records whose ``company_desc`` was filled in.
    """
    extractor = _EXTRACTORS.get(platform)
    if not extractor:
        return 0

    # 1. Collect the records that still need a description.
    need_enrich: List[tuple[int, str]] = []  # (index_in_list, normalized_company_id)
    for i, item in enumerate(push_data_list):
        if item.get("company_desc"):
            continue
        raw = item.get("base_data") or {}
        cid = extractor(raw)
        if not cid:
            continue
        normalized = normalize_company_id(platform, cid)
        if normalized:
            need_enrich.append((i, normalized))
    if not need_enrich:
        return 0

    # 2. De-duplicate, then look the ids up in one batch.
    # dict.fromkeys keeps first-seen order, so the API budget below is spent
    # on the same companies every run instead of following set hash order.
    unique_ids = list(dict.fromkeys(cid for _, cid in need_enrich))
    desc_map = await _batch_lookup_mysql(platform, unique_ids)

    # 3. Companies without a stored description -> fetch via API (capped).
    missing_ids = [cid for cid in unique_ids if cid not in desc_map]
    if missing_ids:
        api_ids = missing_ids[:_MAX_API_CALLS_PER_BATCH]
        if len(missing_ids) > _MAX_API_CALLS_PER_BATCH:
            logger.info(
                f"[enrichment] {platform} 待获取公司 {len(missing_ids)} 个,"
                f"本批次限制 {_MAX_API_CALLS_PER_BATCH}"
            )
        fetched = await _fetch_and_store_companies(platform, api_ids)
        desc_map.update(fetched)

    # 4. Write the descriptions back into the original records.
    enriched = 0
    for idx, cid in need_enrich:
        desc = desc_map.get(cid)
        if desc:
            push_data_list[idx]["company_desc"] = desc
            enriched += 1
    if enriched:
        logger.info(f"[enrichment] {platform} 补全 company_desc: {enriched}/{len(need_enrich)}")
    return enriched
# ─────────────────────────────────────────────
# MySQL 批量查询
# ─────────────────────────────────────────────
async def _batch_lookup_mysql(
    platform: str,
    company_ids: List[str],
) -> Dict[str, str]:
    """Look up stored company descriptions for a batch of company ids.

    Queries the model returned by ``company_storage.company_model(platform)``
    for rows whose ``source_company_id`` is in ``company_ids``.

    Args:
        platform: Platform key used to select the company model.
        company_ids: Source company ids to look up; an empty list skips the query.

    Returns:
        ``{source_company_id: description}`` with both sides converted to
        ``str``. Rows with an empty description are left out.
    """
    descriptions: Dict[str, str] = {}
    if company_ids:
        company_model = company_storage.company_model(platform)
        query = company_model.filter(source_company_id__in=company_ids)
        pairs = await query.values_list("source_company_id", "description")
        for source_id, description in pairs:
            if description:
                descriptions[str(source_id)] = str(description)
    return descriptions
# ─────────────────────────────────────────────
# API fallback:获取公司详情并写入 MySQL
# ─────────────────────────────────────────────
async def _fetch_and_store_companies(
    platform: str,
    company_ids: List[str],
) -> Dict[str, str]:
    """Fetch company details from the platform API one by one and store them.

    Each fetched company is written through ``company_storage.upsert_company``.
    A failure for one company is logged as a warning and does not stop the
    remaining ones.

    Args:
        platform: Platform key; for ``"boss"`` the token is ensured before each fetch.
        company_ids: Company ids to fetch.

    Returns:
        ``{company_id: description}`` for the companies whose stored record
        ended up with a non-empty description.
    """
    # Imported lazily to avoid a circular dependency.
    from app.services.company_cleaner import company_cleaner

    needs_boss_token = platform == "boss"
    descriptions: Dict[str, str] = {}
    for cid in company_ids:
        try:
            # Boss requests need the token to be loaded first.
            if needs_boss_token:
                await company_cleaner._ensure_boss_token_loaded()
            payload = await company_cleaner._fetch_company_data(platform, cid)
            if payload:
                stored = await company_storage.upsert_company(
                    platform, payload, company_id=cid,
                )
                row = stored.get("record")
                if row and row.description:
                    descriptions[cid] = row.description
                    logger.info(f"[enrichment] {platform} 公司 {cid} 从 API 获取成功")
        except Exception as e:
            logger.warning(f"[enrichment] {platform} 公司 {cid} API 获取失败: {e}")
    return descriptions