JobData/app/api/v1/job/job.py
win 6c8eb00a50 feat(06): quality & frontend (QUAL-02, QUAL-06)
Plan 01 - QUAL-02: 三平台解析函数单元测试:
- tests/ingest/test_configs_boss.py: 10 个测试
  (_extract_job_id, _extract_company_name, _build_boss_push)
- tests/ingest/test_configs_qcwy.py: 12 个测试
  (_extract_job_id, _extract_update_dt, _extract_company_name, _build_qcwy_push)
- tests/ingest/test_configs_zhilian.py: 12 个测试
  (_extract_number, _extract_fpt, _extract_company_name, _build_zhilian_push)

Plan 02 - QUAL-06: 爬虫入库统计 API + 前端监控区域:
- job.py: GET /job/data/stats 端点(总量/今日/最近入库时间/近7天趋势)
- web/src/api/index.js: getIngestStats() 方法
- monitoring.vue: 新增爬虫职位入库统计区域(三平台卡片 + 趋势表格)
- job.py: Optional 导入修复

QUAL-07: 确认 monitor.vue 已有完整清洗队列功能,无需改动

Full regression: 146 passed (112 existing + 34 new)
2026-03-21 22:56:24 +08:00

180 lines
6.1 KiB
Python

from typing import Dict, Any, Optional
from fastapi import APIRouter, BackgroundTasks, Depends
from app.core.clickhouse import clickhouse_manager
from app.schemas.ingest import (
IngestBatchRequest,
IngestSingleRequest,
PlatformType,
ChannelType,
DataType,
)
from app.services.ingest import IngestService
from app.log import logger
# Router for the ingest endpoints declared in this module; every route
# registered on it carries the "数据入库" tag.
router = APIRouter(tags=["数据入库"])
async def get_ingest_service() -> IngestService:
    """Build an ``IngestService`` around the shared ClickHouse client.

    Used as a FastAPI dependency by the handlers in this module: the client
    is fetched from ``clickhouse_manager`` and handed to a new service
    instance on every call.
    """
    ch_client = await clickhouse_manager.get_client()
    ingest_service = IngestService(ch_client)
    return ingest_service
@router.post("/data/store", summary="存储单条数据")
async def store_single(
    request: IngestSingleRequest,
    service: IngestService = Depends(get_ingest_service),
) -> Dict[str, Any]:
    """Store one record via the ingest service.

    Args:
        request: Body carrying the platform/channel/data-type enums, the
            record itself and the ``check_duplicate`` flag.
        service: Ingest service injected by ``get_ingest_service``.

    Returns:
        ``{"code": 200, "data": <service result>, "message": "ok"}``.
    """
    # The service takes plain values, so the enums are unwrapped here.
    store_kwargs = {
        "platform": request.platform.value,
        "channel": request.channel.value,
        "data_type": request.data_type.value,
        "data": request.data,
        "check_duplicate": request.check_duplicate,
    }
    stored = await service.store_single(**store_kwargs)
    return {"code": 200, "data": stored, "message": "ok"}
@router.post("/data/batch", summary="批量存储数据")
@router.post("/data/batch-store", summary="批量存储数据(兼容)")
async def store_batch(
    request: IngestBatchRequest,
    service: IngestService = Depends(get_ingest_service),
) -> Dict[str, Any]:
    """Store a batch of records and report the per-outcome counts.

    The handler is registered under two paths (``/data/batch`` and the
    compatibility alias ``/data/batch-store``).

    Args:
        request: Body carrying the platform/channel/data-type enums, the
            list of records and the ``check_duplicate`` flag.
        service: Ingest service injected by ``get_ingest_service``.

    Returns:
        ``{"code": 200, "data": <service result>, "message": <summary>}``.
        The summary is built from the ``success``, ``duplicate`` and
        ``failed`` entries of the service result.
    """
    outcome = await service.store_batch(
        platform=request.platform.value,
        channel=request.channel.value,
        data_type=request.data_type.value,
        data_list=request.data_list,
        check_duplicate=request.check_duplicate,
    )

    stored_count = outcome['success']
    duplicate_count = outcome['duplicate']
    failed_count = outcome['failed']
    summary = f"批量处理完成: 成功 {stored_count} 条, 重复 {duplicate_count} 条, 失败 {failed_count}"

    return {"code": 200, "data": outcome, "message": summary}
@router.post("/data/batch-async", summary="异步批量存储数据")
@router.post("/data/batch-store-async", summary="异步批量存储数据(兼容)")
async def store_batch_async(
    request: IngestBatchRequest,
    background_tasks: BackgroundTasks,
    service: IngestService = Depends(get_ingest_service),
) -> Dict[str, Any]:
    """Queue a batch store as a background task and acknowledge at once.

    The handler is registered under two paths (``/data/batch-async`` and the
    compatibility alias ``/data/batch-store-async``). It logs the incoming
    request, schedules the actual ``service.store_batch`` call on
    ``background_tasks`` and returns without waiting for it; the task logs
    the success/duplicate/failed counts when it finishes.

    Args:
        request: Body carrying the platform/channel/data-type enums, the
            list of records and the ``check_duplicate`` flag.
        background_tasks: FastAPI background-task collector for this request.
        service: Ingest service injected by ``get_ingest_service``.

    Returns:
        A dict whose ``code`` field is 202, with a message stating the batch
        size plus the request's ``platform`` and ``data_type``.
    """
    platform_key = request.platform.value
    # Human-readable platform labels for the log lines; unknown platforms
    # fall back to their raw value.
    display_names = {"boss": "Boss直聘", "qcwy": "前程无忧", "zhilian": "智联招聘"}
    display_name = display_names.get(platform_key, platform_key)
    batch_size = len(request.data_list)

    logger.info(f"收到批量请求: [{display_name}] {request.data_type.value} x{batch_size}")

    async def _run_batch():
        # Runs after the response has been returned to the client.
        outcome = await service.store_batch(
            platform=request.platform.value,
            channel=request.channel.value,
            data_type=request.data_type.value,
            data_list=request.data_list,
            check_duplicate=request.check_duplicate,
        )
        logger.info(
            f"批量处理完成: [{display_name}] 成功 {outcome['success']} 条, 重复 {outcome['duplicate']} 条, 失败 {outcome['failed']}"
        )

    background_tasks.add_task(_run_batch)

    acknowledgement: Dict[str, Any] = {
        "code": 202,
        "message": f"批量数据已加入异步处理队列,共 {batch_size}",
        "platform": request.platform,
        "data_type": request.data_type,
    }
    return acknowledgement
@router.get("/data", summary="查询数据")
async def query_data(
    platform: PlatformType,
    data_type: DataType,
    channel: ChannelType = ChannelType.MINI,
    page: int = 1,
    page_size: int = 20,
    service: IngestService = Depends(get_ingest_service),
) -> Dict[str, Any]:
    """Return one page of stored records for a platform/channel/data type.

    Args:
        platform: Platform whose data is queried.
        data_type: Kind of record to query.
        channel: Source channel; defaults to ``ChannelType.MINI``.
        page: 1-based page number. Values below 1 are treated as 1.
        page_size: Number of rows per page. Negative values are treated
            as 0.
        service: Ingest service injected by ``get_ingest_service``.

    Returns:
        ``{"code": 200, "data": {"items", "total", "page", "page_size"}}``.
        ``items`` and ``total`` come from the ``data`` and ``count`` entries
        of the service result (``[]`` / ``0`` when absent); ``page`` and
        ``page_size`` echo the effective, clamped values.
    """
    # Clamp the paging inputs so a page of 0 or a negative size can never
    # turn into a negative OFFSET/LIMIT handed to the service.
    page = max(page, 1)
    page_size = max(page_size, 0)
    offset = (page - 1) * page_size

    result = await service.query_data(
        platform=platform.value,
        channel=channel.value,
        data_type=data_type.value,
        limit=page_size,
        offset=offset,
    )
    return {
        "code": 200,
        "data": {
            "items": result.get("data", []),
            "total": result.get("count", 0),
            "page": page,
            "page_size": page_size,
        },
    }
@router.get("/platforms", summary="获取注册表信息")
async def get_platforms() -> Dict[str, Any]:
    """Return the registry description reported by ``IngestService``.

    Returns:
        ``{"code": 200, "data": IngestService.get_registry_info()}``.
    """
    registry_info = IngestService.get_registry_info()
    return {"code": 200, "data": registry_info}
@router.get("/data/stats", summary="各平台爬虫入库统计")
async def get_ingest_stats(
    platform: Optional[PlatformType] = None,
    days: int = 7,
) -> Dict[str, Any]:
    """Return per-platform ingest statistics read from ClickHouse.

    For every selected platform the handler reports the total row count,
    the number of rows created today, the most recent ``created_at`` value
    and a per-day row count for the recent window. It is intended for the
    crawler monitoring page of the frontend.

    Args:
        platform: Restrict the report to one platform; ``None`` reports on
            ``boss``, ``qcwy`` and ``zhilian``.
        days: How far back the daily trend reaches. The filter is
            ``created_at >= today() - days``, so today is included.
            Negative values are treated as 0.

    Returns:
        ``{"code": 200, "data": {<platform>: stats}}`` where ``stats`` has
        the keys ``total``, ``today``, ``last_ingest_at`` (string without
        fractional seconds, or ``None`` when the table holds no rows) and
        ``daily_counts`` (list of ``{"date", "count"}``, newest first).
        If the queries for a platform fail, that platform gets zeroed stats
        plus an ``error`` string instead of failing the whole request.
    """
    client = await clickhouse_manager.get_client()

    # ``days`` is interpolated into the SQL text below, so force it to a
    # plain non-negative integer first.
    window_days = max(int(days), 0)

    platforms = [platform.value] if platform else ["boss", "qcwy", "zhilian"]
    table_map = {"boss": "boss_job", "qcwy": "qcwy_job", "zhilian": "zhilian_job"}

    result: Dict[str, Any] = {}
    for p in platforms:
        table = f"job_data.{table_map[p]}"
        try:
            r_total = await client.query(f"SELECT count() FROM {table}")
            total = r_total.result_rows[0][0] if r_total.result_rows else 0

            r_today = await client.query(
                f"SELECT count() FROM {table} WHERE toDate(created_at) = today()"
            )
            today = r_today.result_rows[0][0] if r_today.result_rows else 0

            r_last = await client.query(f"SELECT max(created_at) FROM {table}")
            last_at_raw = r_last.result_rows[0][0] if r_last.result_rows else None
            # Over an empty table ClickHouse's max() yields the column
            # type's default (the epoch for a non-Nullable DateTime) rather
            # than NULL, so the value is only trusted when rows exist.
            if total and last_at_raw:
                # Drop fractional seconds from the rendered timestamp.
                last_at = str(last_at_raw).split(".")[0]
            else:
                last_at = None

            r_daily = await client.query(
                f"SELECT toDate(created_at) AS day, count() AS cnt "
                f"FROM {table} "
                f"WHERE created_at >= today() - {window_days} "
                f"GROUP BY day ORDER BY day DESC"
            )
            daily_counts = [
                {"date": str(row[0]), "count": int(row[1])}
                for row in r_daily.result_rows
            ]

            result[p] = {
                "total": total,
                "today": today,
                "last_ingest_at": last_at,
                "daily_counts": daily_counts,
            }
        except Exception as e:
            # Best effort: one platform's failure must not hide the others.
            logger.warning(f"stats query failed for {p}: {e}")
            result[p] = {"total": 0, "today": 0, "last_ingest_at": None, "daily_counts": [], "error": str(e)}

    return {"code": 200, "data": result}