from typing import Dict, Any, Optional

from fastapi import APIRouter, BackgroundTasks, Depends

from app.core.clickhouse import clickhouse_manager
from app.schemas.ingest import (
    IngestBatchRequest,
    IngestSingleRequest,
    PlatformType,
    ChannelType,
    DataType,
)
from app.services.ingest import IngestService
from app.log import logger

router = APIRouter(tags=["数据入库"])

# Platform code -> table name inside the ``job_data`` database. This is the
# single source of truth for which platforms the stats endpoint reports on.
_STATS_TABLES: Dict[str, str] = {
    "boss": "boss_job",
    "qcwy": "qcwy_job",
    "zhilian": "zhilian_job",
}


async def get_ingest_service() -> IngestService:
    """FastAPI dependency: build an ``IngestService`` around the shared ClickHouse client."""
    client = await clickhouse_manager.get_client()
    return IngestService(client)


@router.post("/data/store", summary="存储单条数据")
async def store_single(
    request: IngestSingleRequest,
    service: IngestService = Depends(get_ingest_service),
) -> Dict[str, Any]:
    """Store one record and return the service result in the standard envelope."""
    result = await service.store_single(
        platform=request.platform.value,
        channel=request.channel.value,
        data_type=request.data_type.value,
        data=request.data,
        check_duplicate=request.check_duplicate,
    )
    return {"code": 200, "data": result, "message": "ok"}


@router.post("/data/batch", summary="批量存储数据")
@router.post("/data/batch-store", summary="批量存储数据(兼容)")
async def store_batch(
    request: IngestBatchRequest,
    service: IngestService = Depends(get_ingest_service),
) -> Dict[str, Any]:
    """Store a batch synchronously and report success/duplicate/failed counts.

    Registered under two paths; ``/data/batch-store`` is the compatibility alias.
    """
    result = await service.store_batch(
        platform=request.platform.value,
        channel=request.channel.value,
        data_type=request.data_type.value,
        data_list=request.data_list,
        check_duplicate=request.check_duplicate,
    )
    return {
        "code": 200,
        "data": result,
        "message": f"批量处理完成: 成功 {result['success']} 条, 重复 {result['duplicate']} 条, 失败 {result['failed']} 条",
    }


@router.post("/data/batch-async", summary="异步批量存储数据")
@router.post("/data/batch-store-async", summary="异步批量存储数据(兼容)")
async def store_batch_async(
    request: IngestBatchRequest,
    background_tasks: BackgroundTasks,
    service: IngestService = Depends(get_ingest_service),
) -> Dict[str, Any]:
    """Queue a batch for storage after the response is sent and reply with code 202.

    The actual ``store_batch`` call runs as a background task; its outcome is
    only visible in the logs, not in the HTTP response.
    """
    platform_names = {"boss": "Boss直聘", "qcwy": "前程无忧", "zhilian": "智联招聘"}
    # Fall back to the raw platform code when no display name is known.
    name = platform_names.get(request.platform.value, request.platform.value)
    logger.info(
        f"收到批量请求: [{name}] {request.data_type.value} x{len(request.data_list)} 条"
    )

    async def _task():
        try:
            r = await service.store_batch(
                platform=request.platform.value,
                channel=request.channel.value,
                data_type=request.data_type.value,
                data_list=request.data_list,
                check_duplicate=request.check_duplicate,
            )
        except Exception as e:
            # The client already received 202, so the log is the only place a
            # failure can surface; record it with platform context, then let
            # the framework's own error handling see the exception as before.
            logger.error(f"async batch store failed for [{name}]: {e}")
            raise
        logger.info(
            f"批量处理完成: [{name}] 成功 {r['success']} 条, 重复 {r['duplicate']} 条, 失败 {r['failed']} 条"
        )

    background_tasks.add_task(_task)
    return {
        "code": 202,
        "message": f"批量数据已加入异步处理队列,共 {len(request.data_list)} 条",
        "platform": request.platform,
        "data_type": request.data_type,
    }


@router.get("/data", summary="查询数据")
async def query_data(
    platform: PlatformType,
    data_type: DataType,
    channel: ChannelType = ChannelType.MINI,
    page: int = 1,
    page_size: int = 20,
    service: IngestService = Depends(get_ingest_service),
) -> Dict[str, Any]:
    """Return one page of stored records.

    ``page`` is 1-based. Values below 1 are treated as 1 and a negative
    ``page_size`` is treated as 0, so the service never receives a negative
    offset or limit. The response echoes the effective values.
    """
    page = max(page, 1)
    page_size = max(page_size, 0)
    offset = (page - 1) * page_size
    result = await service.query_data(
        platform=platform.value,
        channel=channel.value,
        data_type=data_type.value,
        limit=page_size,
        offset=offset,
    )
    return {
        "code": 200,
        "data": {
            "items": result.get("data", []),
            "total": result.get("count", 0),
            "page": page,
            "page_size": page_size,
        },
    }


@router.get("/platforms", summary="获取注册表信息")
async def get_platforms() -> Dict[str, Any]:
    """Return ``IngestService.get_registry_info()`` in the standard envelope."""
    return {"code": 200, "data": IngestService.get_registry_info()}


async def _collect_table_stats(client: Any, table: str, days: int) -> Dict[str, Any]:
    """Run the four stats queries against ``table`` and shape the result.

    ``table`` and ``days`` are interpolated into the SQL text, so callers must
    pass a trusted table name and an integer ``days``.
    """
    r_total = await client.query(f"SELECT count() FROM {table}")
    total = r_total.result_rows[0][0] if r_total.result_rows else 0

    r_today = await client.query(
        f"SELECT count() FROM {table} WHERE toDate(created_at) = today()"
    )
    today = r_today.result_rows[0][0] if r_today.result_rows else 0

    r_last = await client.query(f"SELECT max(created_at) FROM {table}")
    last_at_raw = r_last.result_rows[0][0] if r_last.result_rows else None
    # Drop everything after the first "." (fractional seconds, if present).
    last_at = str(last_at_raw).split(".")[0] if last_at_raw else None

    r_daily = await client.query(
        f"SELECT toDate(created_at) AS day, count() AS cnt "
        f"FROM {table} "
        f"WHERE created_at >= today() - {days} "
        f"GROUP BY day ORDER BY day DESC"
    )
    daily_counts = [
        {"date": str(row[0]), "count": int(row[1])} for row in r_daily.result_rows
    ]

    return {
        "total": total,
        "today": today,
        "last_ingest_at": last_at,
        "daily_counts": daily_counts,
    }


@router.get("/data/stats", summary="各平台爬虫入库统计")
async def get_ingest_stats(
    platform: Optional[PlatformType] = None,
    days: int = 7,
) -> Dict[str, Any]:
    """Report per-platform ClickHouse ingest statistics.

    For each platform: total row count, rows created today, latest
    ``created_at``, and a per-day count trend over the last ``days`` days.
    Used by the front-end crawler monitoring page.

    When ``platform`` is omitted, every platform in ``_STATS_TABLES`` is
    reported. A platform whose queries fail, or that has no table mapping,
    gets a zeroed entry with an ``error`` field instead of failing the whole
    request. Negative ``days`` values are treated as 0.
    """
    client = await clickhouse_manager.get_client()
    # ``days`` ends up inside SQL text: force a non-negative integer.
    days = max(int(days), 0)
    platforms = [platform.value] if platform else list(_STATS_TABLES)

    result: Dict[str, Any] = {}
    for p in platforms:
        table_name = _STATS_TABLES.get(p)
        if table_name is None:
            # A PlatformType member without a stats table must not turn into
            # an unhandled KeyError for the whole request.
            message = f"no stats table configured for platform {p}"
            logger.warning(f"stats query failed for {p}: {message}")
            result[p] = {
                "total": 0,
                "today": 0,
                "last_ingest_at": None,
                "daily_counts": [],
                "error": message,
            }
            continue
        try:
            result[p] = await _collect_table_stats(
                client, f"job_data.{table_name}", days
            )
        except Exception as e:
            # Best effort per platform: one broken table should not hide the
            # statistics of the others.
            logger.warning(f"stats query failed for {p}: {e}")
            result[p] = {
                "total": 0,
                "today": 0,
                "last_ingest_at": None,
                "daily_counts": [],
                "error": str(e),
            }
    return {"code": 200, "data": result}