Plan 01 - QUAL-02: 三平台解析函数单元测试: - tests/ingest/test_configs_boss.py: 10 个测试 (_extract_job_id, _extract_company_name, _build_boss_push) - tests/ingest/test_configs_qcwy.py: 12 个测试 (_extract_job_id, _extract_update_dt, _extract_company_name, _build_qcwy_push) - tests/ingest/test_configs_zhilian.py: 12 个测试 (_extract_number, _extract_fpt, _extract_company_name, _build_zhilian_push) Plan 02 - QUAL-06: 爬虫入库统计 API + 前端监控区域: - job.py: GET /job/data/stats 端点(总量/今日/最近入库时间/近7天趋势) - web/src/api/index.js: getIngestStats() 方法 - monitoring.vue: 新增爬虫职位入库统计区域(三平台卡片 + 趋势表格) - job.py: Optional 导入修复 QUAL-07: 确认 monitor.vue 已有完整清洗队列功能,无需改动 Full regression: 146 passed (112 existing + 34 new)
180 lines
6.1 KiB
Python
180 lines
6.1 KiB
Python
from typing import Dict, Any, Optional
|
|
|
|
from fastapi import APIRouter, BackgroundTasks, Depends
|
|
|
|
from app.core.clickhouse import clickhouse_manager
|
|
from app.schemas.ingest import (
|
|
IngestBatchRequest,
|
|
IngestSingleRequest,
|
|
PlatformType,
|
|
ChannelType,
|
|
DataType,
|
|
)
|
|
from app.services.ingest import IngestService
|
|
from app.log import logger
|
|
|
|
|
|
router = APIRouter(tags=["数据入库"])
|
|
|
|
|
|
async def get_ingest_service() -> IngestService:
    """Create an ``IngestService`` bound to the shared ClickHouse client.

    Awaits ``clickhouse_manager.get_client()`` and passes the resulting
    client to the ``IngestService`` constructor. The endpoints in this
    module consume it through ``Depends(get_ingest_service)``.
    """
    ch_client = await clickhouse_manager.get_client()
    service = IngestService(ch_client)
    return service
|
|
|
|
|
|
@router.post("/data/store", summary="存储单条数据")
async def store_single(
    request: IngestSingleRequest,
    service: IngestService = Depends(get_ingest_service),
) -> Dict[str, Any]:
    """Store a single record and return the service result in the response envelope.

    Args:
        request: Payload carrying platform, channel, data type, the record
            itself and the ``check_duplicate`` flag.
        service: Ingest service injected by FastAPI.

    Returns:
        ``{"code": 200, "data": <service result>, "message": "ok"}``.
    """
    # Enum members are unwrapped to their raw values before reaching the service.
    store_kwargs = {
        "platform": request.platform.value,
        "channel": request.channel.value,
        "data_type": request.data_type.value,
        "data": request.data,
        "check_duplicate": request.check_duplicate,
    }
    stored = await service.store_single(**store_kwargs)
    return {"code": 200, "data": stored, "message": "ok"}
|
|
|
|
|
|
@router.post("/data/batch", summary="批量存储数据")
@router.post("/data/batch-store", summary="批量存储数据(兼容)")
async def store_batch(
    request: IngestBatchRequest,
    service: IngestService = Depends(get_ingest_service),
) -> Dict[str, Any]:
    """Store a batch of records synchronously and report the per-outcome counts.

    Registered under two paths; the second is labelled as a compatibility
    alias in its summary.

    Args:
        request: Payload carrying platform, channel, data type, the list of
            records and the ``check_duplicate`` flag.
        service: Ingest service injected by FastAPI.

    Returns:
        Envelope with ``code`` 200, the raw service result under ``data`` and
        a message built from the result's ``success`` / ``duplicate`` /
        ``failed`` entries.
    """
    batch_kwargs = {
        "platform": request.platform.value,
        "channel": request.channel.value,
        "data_type": request.data_type.value,
        "data_list": request.data_list,
        "check_duplicate": request.check_duplicate,
    }
    outcome = await service.store_batch(**batch_kwargs)

    # Read the counters in the same order the message reports them.
    ok_count = outcome['success']
    dup_count = outcome['duplicate']
    fail_count = outcome['failed']
    summary_text = f"批量处理完成: 成功 {ok_count} 条, 重复 {dup_count} 条, 失败 {fail_count} 条"

    return {
        "code": 200,
        "data": outcome,
        "message": summary_text,
    }
|
|
|
|
|
|
@router.post("/data/batch-async", summary="异步批量存储数据")
@router.post("/data/batch-store-async", summary="异步批量存储数据(兼容)")
async def store_batch_async(
    request: IngestBatchRequest,
    background_tasks: BackgroundTasks,
    service: IngestService = Depends(get_ingest_service),
) -> Dict[str, Any]:
    """Queue a batch for storage in a background task and acknowledge immediately.

    Registered under two paths; the second is labelled as a compatibility
    alias in its summary.

    Args:
        request: Payload carrying platform, channel, data type, the list of
            records and the ``check_duplicate`` flag.
        background_tasks: FastAPI background task collector; the store job is
            added to it and runs after this handler returns.
        service: Ingest service injected by FastAPI.

    Returns:
        Acknowledgement dict with ``code`` 202, a message stating how many
        records were queued, and the request's platform and data type.
    """
    # Human-readable platform labels for log lines; unknown platforms fall
    # back to their raw value.
    platform_names = {"boss": "Boss直聘", "qcwy": "前程无忧", "zhilian": "智联招聘"}
    name = platform_names.get(request.platform.value, request.platform.value)
    logger.info(f"收到批量请求: [{name}] {request.data_type.value} x{len(request.data_list)} 条")

    async def _task() -> None:
        """Run the batch store and log its outcome."""
        try:
            r = await service.store_batch(
                platform=request.platform.value,
                channel=request.channel.value,
                data_type=request.data_type.value,
                data_list=request.data_list,
                check_duplicate=request.check_duplicate,
            )
        except Exception as e:
            # The client has already been answered, so this log line is the
            # only place the failure is tied to its platform and batch size.
            # Re-raise so the framework's own handling of background-task
            # errors is unchanged.
            logger.error(
                f"async batch store failed: [{name}] {request.data_type.value} "
                f"x{len(request.data_list)}: {e}"
            )
            raise
        logger.info(
            f"批量处理完成: [{name}] 成功 {r['success']} 条, 重复 {r['duplicate']} 条, 失败 {r['failed']} 条"
        )

    background_tasks.add_task(_task)
    return {
        "code": 202,
        "message": f"批量数据已加入异步处理队列,共 {len(request.data_list)} 条",
        "platform": request.platform,
        "data_type": request.data_type,
    }
|
|
|
|
|
|
@router.get("/data", summary="查询数据")
async def query_data(
    platform: PlatformType,
    data_type: DataType,
    channel: ChannelType = ChannelType.MINI,
    page: int = 1,
    page_size: int = 20,
    service: IngestService = Depends(get_ingest_service),
) -> Dict[str, Any]:
    """Return one page of stored records for a platform / channel / data type.

    Args:
        platform: Platform to query.
        data_type: Data type to query.
        channel: Channel to query; defaults to ``ChannelType.MINI``.
        page: 1-based page number. Values below 1 are treated as 1.
        page_size: Number of records per page. Negative values are treated
            as 0.
        service: Ingest service injected by FastAPI.

    Returns:
        Envelope with ``code`` 200 and a ``data`` dict holding ``items``,
        ``total``, and the effective ``page`` and ``page_size``.
    """
    # Guard the pagination arithmetic: page <= 0 would otherwise produce a
    # negative offset and a negative page_size a negative limit.
    # NOTE(review): the service may validate these itself — not visible here.
    page = max(page, 1)
    page_size = max(page_size, 0)

    offset = (page - 1) * page_size
    result = await service.query_data(
        platform=platform.value,
        channel=channel.value,
        data_type=data_type.value,
        limit=page_size,
        offset=offset,
    )
    return {
        "code": 200,
        "data": {
            "items": result.get("data", []),
            "total": result.get("count", 0),
            "page": page,
            "page_size": page_size,
        },
    }
|
|
|
|
|
|
@router.get("/platforms", summary="获取注册表信息")
async def get_platforms() -> Dict[str, Any]:
    """Return ``IngestService.get_registry_info()`` wrapped in a code-200 envelope."""
    registry_info = IngestService.get_registry_info()
    payload: Dict[str, Any] = {"code": 200, "data": registry_info}
    return payload
|
|
|
|
|
|
@router.get("/data/stats", summary="各平台爬虫入库统计")
async def get_ingest_stats(
    platform: Optional[PlatformType] = None,
    days: int = 7,
) -> Dict[str, Any]:
    """
    Return per-platform ingest statistics read from ClickHouse.

    For every selected platform the ``job_data.<platform>_job`` table is
    queried for: the total row count, the number of rows whose ``created_at``
    falls on today, the latest ``created_at``, and per-day row counts for
    rows with ``created_at >= today() - days``.

    Args:
        platform: Restrict the report to one platform; when omitted, every
            platform that has a table mapping is reported.
        days: How far back the daily trend starts, in days before today.

    Returns:
        ``{"code": 200, "data": {<platform>: {...}}}`` where each entry holds
        ``total``, ``today``, ``last_ingest_at`` and ``daily_counts``. A
        platform whose queries fail, or that has no table mapping, gets
        zeroed values plus an ``error`` string instead of failing the whole
        request.
    """
    client = await clickhouse_manager.get_client()

    table_map = {"boss": "boss_job", "qcwy": "qcwy_job", "zhilian": "zhilian_job"}
    # Derive the default platform list from the mapping so the two cannot drift.
    platforms = [platform.value] if platform else list(table_map)

    result: Dict[str, Any] = {}
    for p in platforms:
        table_name = table_map.get(p)
        if table_name is None:
            # A platform value without a job table is reported like any other
            # per-platform failure rather than raising KeyError (HTTP 500).
            logger.warning(f"stats requested for platform without a job table: {p}")
            result[p] = {
                "total": 0,
                "today": 0,
                "last_ingest_at": None,
                "daily_counts": [],
                "error": f"unsupported platform: {p}",
            }
            continue

        table = f"job_data.{table_name}"
        try:
            r_total = await client.query(f"SELECT count() FROM {table}")
            total = r_total.result_rows[0][0] if r_total.result_rows else 0

            r_today = await client.query(
                f"SELECT count() FROM {table} WHERE toDate(created_at) = today()"
            )
            today = r_today.result_rows[0][0] if r_today.result_rows else 0

            r_last = await client.query(f"SELECT max(created_at) FROM {table}")
            last_at_raw = r_last.result_rows[0][0] if r_last.result_rows else None
            # On an empty table max() over a non-Nullable column yields the
            # type's default (the epoch), which is truthy; only report a
            # timestamp when the table actually has rows.
            # NOTE(review): assumes created_at is not Nullable — confirm schema.
            has_rows = bool(total) and bool(last_at_raw)
            # Drop fractional seconds, if the string form carries any.
            last_at = str(last_at_raw).split(".")[0] if has_rows else None

            # `days` is declared as int in the signature and `table` comes
            # from the fixed mapping above, so neither interpolated value is
            # free-form text.
            r_daily = await client.query(
                f"SELECT toDate(created_at) AS day, count() AS cnt "
                f"FROM {table} "
                f"WHERE created_at >= today() - {days} "
                f"GROUP BY day ORDER BY day DESC"
            )
            daily_counts = [
                {"date": str(row[0]), "count": int(row[1])}
                for row in r_daily.result_rows
            ]

            result[p] = {
                "total": total,
                "today": today,
                "last_ingest_at": last_at,
                "daily_counts": daily_counts,
            }
        except Exception as e:
            # Best effort: one platform's failure must not hide the others.
            logger.warning(f"stats query failed for {p}: {e}")
            result[p] = {"total": 0, "today": 0, "last_ingest_at": None, "daily_counts": [], "error": str(e)}

    return {"code": 200, "data": result}
|
|
|