from datetime import datetime
import json
from typing import Optional

from fastapi import APIRouter, File, UploadFile, Form, Body, Query
from tortoise.expressions import Q
from tortoise.functions import Count, Sum

from app.models.cleaning import CleaningTask
from app.models.company import CompanyCleaningQueue
from app.schemas import Success, SuccessExtra
from app.services.cleaning import CleaningService
from app.services.company_cleaner import company_cleaner
from app.services.company_storage import company_storage, normalize_company_id
from app.controllers.cleaning import cleaning_controller

router = APIRouter()
cleaning_service = CleaningService()

COMPANY_SOURCES = ("boss", "qcwy", "zhilian")

# Queue statuses reported per source by /stats and accepted as a filter by
# /companies. Order matters: it is the key order of each /stats detail entry.
_QUEUE_STATUSES = ("pending", "processing", "done", "failed")

# Per-company job counters that /stats sums per source.
_JOB_COUNTER_FIELDS = ("jobs_fetched", "jobs_stored", "jobs_duplicate", "jobs_failed")


def _iso(value):
    """Return ``value.isoformat()``, or ``None`` when *value* is falsy."""
    return value.isoformat() if value else None


def _serialize_queue_row(row):
    """Convert one ``CompanyCleaningQueue`` row into a JSON-friendly dict."""
    return {
        "source": row.source,
        "company_id": row.company_id,
        "company_name": row.company_name,
        "status": row.status,
        "error_msg": row.error_msg,
        "retry_count": row.retry_count,
        "started_at": _iso(row.started_at),
        "finished_at": _iso(row.finished_at),
        "jobs_fetched": row.jobs_fetched,
        "jobs_stored": row.jobs_stored,
        "jobs_duplicate": row.jobs_duplicate,
        "jobs_failed": row.jobs_failed,
        "jobs_error_msg": row.jobs_error_msg,
        "created_at": _iso(row.created_at),
        "updated_at": _iso(row.updated_at),
    }


@router.get("/stats", summary="获取公司清洗统计信息")
async def get_stats():
    """Return statistics for the company cleaning queue stored in MySQL.

    The payload contains the number of pending rows, the number of rows
    marked ``done`` since local midnight, and one entry per known source
    with its per-status counts, row total and summed job counters.
    """
    pending_count = await CompanyCleaningQueue.filter(status="pending").count()

    today_start = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    today_count = await CompanyCleaningQueue.filter(
        status="done",
        updated_at__gte=today_start,
    ).count()

    dist_rows = (
        await CompanyCleaningQueue.annotate(total=Count("id"))
        .group_by("source", "status")
        .values("source", "status", "total")
    )

    source_stats = {
        source: {
            **dict.fromkeys(_QUEUE_STATUSES, 0),
            "total": 0,
            **dict.fromkeys(_JOB_COUNTER_FIELDS, 0),
        }
        for source in COMPANY_SOURCES
    }

    for row in dist_rows:
        per_source = source_stats.get(row["source"])
        if per_source is None:
            # Rows from sources outside COMPANY_SOURCES are not reported.
            continue
        count = row["total"]
        # Compare against the explicit status set rather than the dict keys:
        # a status value such as "total" or "jobs_failed" must not overwrite
        # one of the aggregate counters.
        if row["status"] in _QUEUE_STATUSES:
            per_source[row["status"]] = count
        per_source["total"] += count

    job_dist_rows = (
        await CompanyCleaningQueue.annotate(
            jobs_fetched_sum=Sum("jobs_fetched"),
            jobs_stored_sum=Sum("jobs_stored"),
            jobs_duplicate_sum=Sum("jobs_duplicate"),
            jobs_failed_sum=Sum("jobs_failed"),
        )
        .group_by("source")
        .values(
            "source",
            "jobs_fetched_sum",
            "jobs_stored_sum",
            "jobs_duplicate_sum",
            "jobs_failed_sum",
        )
    )

    for row in job_dist_rows:
        per_source = source_stats.get(row["source"])
        if per_source is None:
            continue
        for field in _JOB_COUNTER_FIELDS:
            # A missing/NULL sum is reported as 0.
            per_source[field] = int(row[f"{field}_sum"] or 0)

    stats = {
        "total_pending": pending_count,
        "today_processed": today_count,
        "details": [{"source": name, **values} for name, values in source_stats.items()],
    }
    return Success(data=stats)


@router.get("/companies", summary="获取公司清洗列表")
async def get_companies_list(
    page: int = Query(1, ge=1),
    page_size: int = Query(20, ge=1, le=100),
    source: Optional[str] = Query(None),
    status: Optional[str] = Query(None),
):
    """Return one page of company cleaning queue rows, newest update first.

    An unknown ``source`` or ``status`` filter yields an empty page instead
    of an error.
    """
    empty_page = dict(data=[], total=0, page=page, page_size=page_size)

    queue_query = CompanyCleaningQueue.all()
    if source:
        if source not in COMPANY_SOURCES:
            return SuccessExtra(**empty_page)
        queue_query = queue_query.filter(source=source)
    if status:
        if status not in _QUEUE_STATUSES:
            return SuccessExtra(**empty_page)
        queue_query = queue_query.filter(status=status)

    total = await queue_query.count()
    offset = (page - 1) * page_size
    # "-id" breaks ties between rows sharing the same updated_at so that
    # consecutive pages never repeat or skip a row.
    rows = await queue_query.order_by("-updated_at", "-id").offset(offset).limit(page_size)

    data = [_serialize_queue_row(row) for row in rows]
    return SuccessExtra(data=data, total=total, page=page, page_size=page_size)


@router.get("/company-detail", summary="获取公司清洗详情")
async def get_company_cleaning_detail(
    source: str = Query(..., description="数据源"),
    company_id: str = Query(..., description="公司ID"),
    company_name: Optional[str] = Query(None, description="公司名称"),
):
    """Return the stored cleaning result for a single company.

    The row is looked up by normalized company id first; when that finds
    nothing and ``company_name`` was supplied, the name is tried instead.
    Responds with code 400 for an unsupported source and 404 when no row
    matches.
    """
    if source not in COMPANY_SOURCES:
        return Success(code=400, msg="不支持的数据源")

    model = company_storage.company_model(source)
    normalized_id = normalize_company_id(source, company_id)

    row = await model.get_or_none(source_company_id=normalized_id)
    if not row and company_name:
        row = await model.get_or_none(company_name=company_name)
    if not row:
        return Success(code=404, msg="未找到公司清洗结果")

    data = row.raw_json
    if isinstance(data, str):
        try:
            data = json.loads(data)
        except ValueError:
            # Not valid JSON (json.JSONDecodeError is a ValueError): hand the
            # raw text back wrapped in a dict rather than failing the request.
            data = {"raw": data}

    return Success(
        data={
            "source": source,
            "company_id": row.source_company_id,
            "company_name": row.company_name,
            "created_at": _iso(row.created_at),
            "updated_at": _iso(row.updated_at),
            "data": data,
        }
    )


@router.post("/collect-pending", summary="分析待处理数据")
async def collect_pending_companies_api(
    limit: int = Body(1000, embed=True, ge=1, le=10000),
    source: Optional[str] = Body(None, embed=True),
):
    """Analyze recruitment data and collect pending company IDs into the MySQL queue table.

    Delegates to ``company_cleaner.collect_pending_companies`` and returns
    its summary; the summary's ``total_created`` value is echoed in the
    response message.
    """
    summary = await company_cleaner.collect_pending_companies(limit=limit, source=source)
    return Success(
        msg=f"已完成数据分析,本次新增 {summary['total_created']} 条待处理公司",
        data=summary,
    )


@router.post("/run-pending", summary="手动执行待处理公司清洗")
async def run_pending_companies(
    limit: int = Body(100, embed=True, ge=1, le=5000),
    source: Optional[str] = Body(None, embed=True),
    proxy: Optional[str] = Body(None, embed=True),
    max_delay_seconds: int = Body(5, embed=True),
):
    """Manually trigger the cleaning run for pending companies.

    Only records whose current status is ``pending`` are processed; records
    that were already processed are not executed again.
    """
    await company_cleaner.process_pending_companies(
        limit=limit,
        source=source,
        proxy=proxy,
        max_delay_seconds=max_delay_seconds,
    )
    return Success(msg=f"已触发执行最近 {limit} 条待处理公司清洗任务")


@router.post("/crawl-execute", summary="爬取并执行待处理公司清洗")
async def crawl_execute_pending(
    limit: int = Body(100, embed=True, ge=1, le=5000),
    source: Optional[str] = Body(None, embed=True),
    proxy: Optional[str] = Body(None, embed=True),
    max_delay_seconds: int = Body(5, embed=True),
):
    """Collect pending companies, then immediately process the pending queue.

    Runs ``collect_pending_companies`` followed by
    ``process_pending_companies`` with the same ``limit`` and ``source``.
    """
    await company_cleaner.collect_pending_companies(limit=limit, source=source)
    await company_cleaner.process_pending_companies(
        limit=limit,
        source=source,
        proxy=proxy,
        max_delay_seconds=max_delay_seconds,
    )
    return Success(msg=f"已触发爬取并执行最近 {limit} 条待处理公司清洗任务")


@router.post("/update-company-status", summary="更新公司爬取状态(爬虫端调用)")
async def update_company_status(
    source: str = Body(..., embed=True),
    company_id: str = Body(..., embed=True),
    status: str = Body(..., embed=True),
    error_message: str = Body("", embed=True),
):
    """Update the MySQL queue status after the crawler finishes fetching a company.

    Only ``done`` and ``failed`` are accepted; anything else gets a code 400
    response. The queue row is created if it does not exist yet, and a
    ``failed`` report increments its ``retry_count``.
    """
    VALID_STATUSES = {"done", "failed"}
    if status not in VALID_STATUSES:
        return Success(msg=f"无效状态: {status},仅支持 {VALID_STATUSES}", code=400)

    normalized_id = normalize_company_id(source, company_id)
    queue, _ = await CompanyCleaningQueue.get_or_create(
        source=source,
        company_id=normalized_id,
        defaults={
            "company_name": "",
            "status": "pending",
            "error_msg": "",
            "retry_count": 0,
            "started_at": None,
            "finished_at": None,
        },
    )

    queue.status = status
    queue.error_msg = error_message or ""
    queue.finished_at = datetime.now()
    if status == "failed":
        queue.retry_count += 1
    await queue.save()

    return Success(
        msg="状态更新成功",
        data={"source": source, "company_id": company_id, "status": status},
    )


@router.post("/process-company", summary="执行单个公司清洗任务")
async def process_single_company_api(
    source: str = Body(..., embed=True),
    company_id: str = Body(..., embed=True),
    proxy: Optional[str] = Body(None, embed=True),
    max_delay_seconds: int = Body(5, embed=True),
):
    """Run the cleaning job for a single company and return its result.

    The response message reflects the truthiness of the result's
    ``success`` entry; the full result is returned as ``data``.
    """
    result = await company_cleaner.process_single_company(
        source=source,
        company_id=company_id,
        proxy=proxy,
        max_delay_seconds=max_delay_seconds,
    )
    msg = "任务执行成功" if result.get("success") else "任务执行失败"
    return Success(msg=msg, data=result)


@router.post("/upload", summary="上传文件并保存任务")
async def upload_file(
    file: UploadFile = File(...),
    clean_type: str = Form("auto"),
    platform: str = Form("auto"),
    proxy: Optional[str] = Form(None),
):
    """Parse an uploaded file into targets and store one pending task per target."""
    targets = await cleaning_service.parse_file(file)
    tasks = [
        CleaningTask(
            target=target,
            clean_type=clean_type,
            platform=platform,
            proxy=proxy,
            status="pending",
        )
        for target in targets
    ]
    if tasks:
        await CleaningTask.bulk_create(tasks)
    return Success(msg=f"Successfully imported {len(tasks)} tasks")


@router.get("/list", summary="获取清洗任务列表")
async def list_tasks(
    page: int = Query(1, ge=1, description="页码"),
    page_size: int = Query(10, ge=1, description="每页数量"),
    target: Optional[str] = Query(None, description="目标搜索"),
    status: Optional[str] = Query(None, description="状态筛选"),
    clean_type: Optional[str] = Query(None, description="清洗类型筛选"),
):
    """Return one page of cleaning tasks, newest first.

    ``target`` is a substring match; ``status`` and ``clean_type`` are exact
    matches. Filters that are empty or omitted are ignored.
    """
    q = Q()
    if target:
        q &= Q(target__contains=target)
    if status:
        q &= Q(status=status)
    if clean_type:
        q &= Q(clean_type=clean_type)

    total, tasks = await cleaning_controller.list(
        page=page,
        page_size=page_size,
        search=q,
        order=["-created_at"],
    )
    data = [await task.to_dict() for task in tasks]
    return SuccessExtra(data=data, total=total, page=page, page_size=page_size)


@router.post("/process/{task_id}", summary="处理单个任务")
async def process_task(task_id: int):
    """Process a single cleaning task and persist its outcome.

    The task is marked ``processing`` before the work starts. If processing
    raises, the task is marked ``fail`` with the error text saved and the
    exception is re-raised, so the task never stays stuck in ``processing``.
    """
    task = await cleaning_controller.get(id=task_id)
    if not task:
        return Success(code=404, msg="Task not found")

    task.status = "processing"
    await task.save()

    try:
        result = await cleaning_service.process_single_item(
            target=task.target,
            clean_type=task.clean_type,
            platform=task.platform,
            proxy=task.proxy,
        )
    except Exception as exc:
        # Record the failure before propagating; otherwise the row would
        # remain in "processing" indefinitely.
        task.status = "fail"
        task.error_msg = str(exc)
        await task.save()
        raise

    task.status = "success" if result.get("success") else "fail"
    task.storage_status = result.get("storage_status", "unknown")
    task.remote_sent = result.get("remote_sent", False)
    task.result_summary = result.get("data_summary")
    task.error_msg = result.get("error")
    await task.save()

    return Success(data=await task.to_dict(), msg="Task processed")


@router.delete("/delete", summary="删除任务")
async def delete_task(
    id: int = Query(..., description="任务ID"),
):
    """Delete the cleaning task with the given id."""
    # `id` shadows the builtin, but it is the public query-parameter name.
    await cleaning_controller.remove(id=id)
    return Success(msg="Deleted Successfully")


@router.post("/clear", summary="清空所有任务")
async def clear_tasks():
    """Delete every cleaning task."""
    await CleaningTask.all().delete()
    return Success(msg="All tasks cleared")