- 新增 company_enrichment.py: job 入库时自动补全 company_desc (优先查 MySQL,fallback 调平台 API 获取并入库) - Boss 爬虫: 搜索列表后逐条调 batch 详情接口拿完整数据 (jobBaseInfoVO/brandComInfoVO),每条获取后立即上报 - Boss push_mapper: 兼容新旧两种 API 格式(扁平/嵌套VO) - Boss token: 启动时自动从后端 API 读取数据库中的 mpt/wt2 - Boss client: header 值 strip 防止空格导致请求失败 - qcwy URL: 用 jobId/coId 拼接 jobs.51job.com 格式 - 三个平台 max_pages 默认改为 100
177 lines
6.1 KiB
Python
177 lines
6.1 KiB
Python
"""
|
||
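Automatically fill in company_desc when a job is ingested.

Each platform is handled independently:
- Extract source_company_id from the raw job data (the key differs per platform)
- Batch-query the MySQL {Platform}Company table
- Found -> fill in company_desc directly
- Not found -> call the platform API for company details -> write to MySQL -> backfill company_desc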
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
from typing import Any, Callable, Dict, List, Optional
|
||
|
||
from app.log import logger
|
||
from app.services.company_storage import company_storage, normalize_company_id
|
||
|
||
|
||
# ─────────────────────────────────────────────
|
||
# 各平台:从 raw job data 提取 source_company_id
|
||
# ─────────────────────────────────────────────
|
||
|
||
def _zhilian_company_id(raw: Dict[str, Any]) -> Optional[str]:
    """Return the Zhilian company identifier found in a raw job payload.

    Reads ``companyNumber`` (e.g. ``CZL1227180200``) and falls back to
    ``rootCompanyNumber`` when the former is missing or falsy.

    Args:
        raw: Raw job data dict.

    Returns:
        The identifier as a ``str``, or ``None`` when neither key holds a
        truthy value.
    """
    val = raw.get("companyNumber") or raw.get("rootCompanyNumber")
    # Coerce to str and map empty values to None so the result matches the
    # annotation and the behaviour of the other platform extractors.
    return str(val) if val else None
|
||
|
||
|
||
def _boss_company_id(raw: Dict[str, Any]) -> Optional[str]:
    """Return the Boss company identifier found in a raw job payload.

    Prefers ``encryptBrandId`` inside the nested ``brandComInfoVO`` object and
    falls back to the top-level ``brandId``.

    Args:
        raw: Raw job data dict.

    Returns:
        The first truthy candidate as a ``str``, or ``None`` when both are
        missing or falsy.
    """
    # A missing or null brandComInfoVO is treated as an empty mapping.
    nested = raw.get("brandComInfoVO") or {}
    for candidate in (nested.get("encryptBrandId"), raw.get("brandId")):
        if candidate:
            return str(candidate)
    return None
|
||
|
||
|
||
def _qcwy_company_id(raw: Dict[str, Any]) -> Optional[str]:
    """Return the 51job (qcwy) company identifier found in a raw job payload.

    Args:
        raw: Raw job data dict.

    Returns:
        The ``coId`` value as a ``str``, or ``None`` when it is missing or
        falsy.
    """
    co_id = raw.get("coId")
    if not co_id:
        return None
    return str(co_id)
|
||
|
||
|
||
# Dispatch table: platform key -> function that extracts the source company id
# from a raw job payload.
_EXTRACTORS: Dict[str, Callable[[Dict[str, Any]], Optional[str]]] = dict(
    zhilian=_zhilian_company_id,
    boss=_boss_company_id,
    qcwy=_qcwy_company_id,
)

# Maximum number of platform API calls per batch, so that enrichment does not
# slow down ingestion.
_MAX_API_CALLS_PER_BATCH = 5
|
||
|
||
|
||
# ─────────────────────────────────────────────
|
||
# 主入口
|
||
# ─────────────────────────────────────────────
|
||
|
||
async def enrich_company_desc(
    platform: str,
    push_data_list: List[Dict[str, Any]],
) -> int:
    """Fill in missing ``company_desc`` values on a batch of job push records.

    Records whose ``company_desc`` is already truthy are skipped. For the rest,
    the company id is extracted from ``base_data`` with the platform's
    extractor, normalized, and looked up in MySQL in a single batch. Ids for
    which MySQL yields no description are fetched from the platform API, at
    most ``_MAX_API_CALLS_PER_BATCH`` of them per call.

    The dicts in ``push_data_list`` are modified in place.

    Args:
        platform: Platform key; a platform with no entry in ``_EXTRACTORS``
            is a no-op.
        push_data_list: Job records to enrich.

    Returns:
        The number of records whose ``company_desc`` was filled in.
    """
    extractor = _EXTRACTORS.get(platform)
    if not extractor:
        return 0

    # 1. Collect the records that need enrichment.
    need_enrich: List[tuple[int, str]] = []  # (index_in_list, normalized_company_id)
    for i, item in enumerate(push_data_list):
        if item.get("company_desc"):
            continue
        raw = item.get("base_data") or {}
        cid = extractor(raw)
        if not cid:
            continue
        normalized = normalize_company_id(platform, cid)
        if normalized:
            need_enrich.append((i, normalized))

    if not need_enrich:
        return 0

    # 2. De-duplicate, then batch-query MySQL. dict.fromkeys keeps first-seen
    #    order (a set would not), so the API budget below is spent
    #    deterministically on the companies that appear earliest in the batch.
    unique_ids = list(dict.fromkeys(cid for _, cid in need_enrich))
    desc_map = await _batch_lookup_mysql(platform, unique_ids)

    # 3. Companies without a description in MySQL -> fetch via API (capped).
    missing_ids = [cid for cid in unique_ids if cid not in desc_map]
    if missing_ids:
        api_ids = missing_ids[:_MAX_API_CALLS_PER_BATCH]
        if len(missing_ids) > _MAX_API_CALLS_PER_BATCH:
            logger.info(
                f"[enrichment] {platform} 待获取公司 {len(missing_ids)} 个,"
                f"本批次限制 {_MAX_API_CALLS_PER_BATCH} 个"
            )
        fetched = await _fetch_and_store_companies(platform, api_ids)
        desc_map.update(fetched)

    # 4. Write the descriptions back onto the records.
    enriched = 0
    for idx, cid in need_enrich:
        desc = desc_map.get(cid)
        if desc:
            push_data_list[idx]["company_desc"] = desc
            enriched += 1

    if enriched:
        logger.info(f"[enrichment] {platform} 补全 company_desc: {enriched}/{len(need_enrich)}")

    return enriched
|
||
|
||
|
||
# ─────────────────────────────────────────────
|
||
# MySQL 批量查询
|
||
# ─────────────────────────────────────────────
|
||
|
||
async def _batch_lookup_mysql(
    platform: str,
    company_ids: List[str],
) -> Dict[str, str]:
    """Look up company descriptions in the platform's MySQL company table.

    Args:
        platform: Platform key passed to ``company_storage.company_model``.
        company_ids: Source company ids to query.

    Returns:
        A ``{source_company_id: description}`` mapping containing only rows
        whose description is truthy; both sides are converted to ``str``.
        Empty when ``company_ids`` is empty.
    """
    if not company_ids:
        return {}

    company_model = company_storage.company_model(platform)
    query = company_model.filter(source_company_id__in=company_ids)
    pairs = await query.values_list("source_company_id", "description")

    found: Dict[str, str] = {}
    for source_id, description in pairs:
        # Rows with an empty description are left out of the result.
        if description:
            found[str(source_id)] = str(description)
    return found
|
||
|
||
|
||
# ─────────────────────────────────────────────
|
||
# API fallback:获取公司详情并写入 MySQL
|
||
# ─────────────────────────────────────────────
|
||
|
||
async def _fetch_and_store_companies(
    platform: str,
    company_ids: List[str],
) -> Dict[str, str]:
    """Fetch company details from the platform API one by one and store them.

    Each company's data is fetched through ``company_cleaner`` and upserted
    via ``company_storage.upsert_company``. A failure for one company is
    logged as a warning and does not stop the remaining ones.

    Args:
        platform: Platform key.
        company_ids: Company ids to fetch.

    Returns:
        A ``{company_id: description}`` mapping for the companies whose
        upserted record has a truthy description.
    """
    # Imported lazily to avoid a circular dependency.
    from app.services.company_cleaner import company_cleaner

    needs_boss_token = platform == "boss"
    descriptions: Dict[str, str] = {}

    for company_id in company_ids:
        try:
            # Boss requires its token to be loaded before fetching.
            if needs_boss_token:
                await company_cleaner._ensure_boss_token_loaded()

            payload = await company_cleaner._fetch_company_data(platform, company_id)
            if payload:
                stored = await company_storage.upsert_company(
                    platform, payload, company_id=company_id,
                )
                row = stored.get("record")
                if row and row.description:
                    descriptions[company_id] = row.description
                    logger.info(f"[enrichment] {platform} 公司 {company_id} 从 API 获取成功")
        except Exception as exc:
            # Best effort: log and move on to the next company.
            logger.warning(f"[enrichment] {platform} 公司 {company_id} API 获取失败: {exc}")

    return descriptions
|