360 lines
12 KiB
Python
360 lines
12 KiB
Python
from datetime import datetime
|
||
import json
|
||
from typing import Optional
|
||
|
||
from fastapi import APIRouter, File, UploadFile, Form, Body, Query
|
||
from tortoise.expressions import Q
|
||
from tortoise.functions import Count, Sum
|
||
|
||
from app.models.cleaning import CleaningTask
|
||
from app.models.company import CompanyCleaningQueue
|
||
from app.schemas import Success, SuccessExtra
|
||
from app.services.cleaning import CleaningService
|
||
from app.services.company_cleaner import company_cleaner
|
||
from app.services.company_storage import company_storage, normalize_company_id
|
||
from app.controllers.cleaning import cleaning_controller
|
||
|
||
# Router on which every endpoint in this module is registered.
router = APIRouter()

# Shared service instance used by the file-upload and task-processing endpoints.
cleaning_service = CleaningService()

# Source identifiers that get a pre-seeded entry in the per-source statistics.
COMPANY_SOURCES = ("boss", "qcwy", "zhilian")
|
||
|
||
@router.get("/stats", summary="获取公司清洗统计信息")
async def get_stats():
    """Return overall and per-source statistics of the company cleaning queue.

    The payload contains:
      * ``total_pending``   - number of queue rows whose status is "pending";
      * ``today_processed`` - number of "done" rows updated since midnight
        (as given by the naive ``datetime.now()`` of the server);
      * ``details``         - one entry per source in ``COMPANY_SOURCES`` with
        per-status row counts and summed job counters.

    Rows whose source is not listed in ``COMPANY_SOURCES`` are ignored in
    ``details``.
    """
    pending_total = await CompanyCleaningQueue.filter(status="pending").count()

    midnight = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    done_today = await CompanyCleaningQueue.filter(
        status="done",
        updated_at__gte=midnight,
    ).count()

    # One row per (source, status) pair with the number of queue entries.
    status_rows = await (
        CompanyCleaningQueue.annotate(total=Count("id"))
        .group_by("source", "status")
        .values("source", "status", "total")
    )

    job_fields = ("jobs_fetched", "jobs_stored", "jobs_duplicate", "jobs_failed")
    counter_keys = ("pending", "processing", "done", "failed", "total") + job_fields
    # Every known source starts with all counters at zero so that sources
    # without any queue rows still appear in the response.
    per_source = {name: dict.fromkeys(counter_keys, 0) for name in COMPANY_SOURCES}

    for entry in status_rows:
        bucket = per_source.get(entry["source"])
        if bucket is None:
            continue
        state = entry["status"]
        # Only statuses that have a counter slot contribute to the totals.
        if state in bucket:
            bucket[state] = entry["total"]
            bucket["total"] += entry["total"]

    # One row per source with the summed job counters.
    sum_rows = await (
        CompanyCleaningQueue.annotate(
            jobs_fetched_sum=Sum("jobs_fetched"),
            jobs_stored_sum=Sum("jobs_stored"),
            jobs_duplicate_sum=Sum("jobs_duplicate"),
            jobs_failed_sum=Sum("jobs_failed"),
        )
        .group_by("source")
        .values(
            "source",
            "jobs_fetched_sum",
            "jobs_stored_sum",
            "jobs_duplicate_sum",
            "jobs_failed_sum",
        )
    )
    for entry in sum_rows:
        bucket = per_source.get(entry["source"])
        if bucket is None:
            continue
        for field in job_fields:
            # A sum may come back as None; report it as 0.
            bucket[field] = int(entry[f"{field}_sum"] or 0)

    return Success(
        data={
            "total_pending": pending_total,
            "today_processed": done_today,
            "details": [{"source": name, **bucket} for name, bucket in per_source.items()],
        }
    )
|
||
|
||
|
||
@router.get("/companies", summary="获取公司清洗列表")
async def get_companies_list(
    page: int = Query(1, ge=1),
    page_size: int = Query(20, ge=1, le=100),
    source: Optional[str] = Query(None),
    status: Optional[str] = Query(None),
):
    """Return one page of company cleaning queue rows, most recently updated first.

    Args:
        page: 1-based page number.
        page_size: Rows per page (1-100).
        source: Optional source filter; must be one of ``COMPANY_SOURCES``.
        status: Optional status filter; one of pending/processing/done/failed.

    Returns:
        ``SuccessExtra`` with the serialised rows plus ``total``, ``page`` and
        ``page_size``. An unknown ``source`` or ``status`` value yields an
        empty page with ``total=0`` rather than an error.
    """
    valid_statuses = {"pending", "processing", "done", "failed"}

    # An unrecognised filter value cannot match any row, so answer with an
    # empty page without touching the database.
    if (source and source not in COMPANY_SOURCES) or (status and status not in valid_statuses):
        return SuccessExtra(data=[], total=0, page=page, page_size=page_size)

    queue_query = CompanyCleaningQueue.all()
    if source:
        queue_query = queue_query.filter(source=source)
    if status:
        queue_query = queue_query.filter(status=status)

    total = await queue_query.count()
    # "-id" is a tie-breaker: updated_at alone is not unique, and without a
    # total order OFFSET/LIMIT paging may repeat or skip rows between pages.
    rows = (
        await queue_query.order_by("-updated_at", "-id")
        .offset((page - 1) * page_size)
        .limit(page_size)
    )

    def _iso(moment):
        """Format an optional datetime as ISO-8601, passing None through."""
        return moment.isoformat() if moment else None

    data = [
        {
            "source": row.source,
            "company_id": row.company_id,
            "company_name": row.company_name,
            "status": row.status,
            "error_msg": row.error_msg,
            "retry_count": row.retry_count,
            "started_at": _iso(row.started_at),
            "finished_at": _iso(row.finished_at),
            "jobs_fetched": row.jobs_fetched,
            "jobs_stored": row.jobs_stored,
            "jobs_duplicate": row.jobs_duplicate,
            "jobs_failed": row.jobs_failed,
            "jobs_error_msg": row.jobs_error_msg,
            "created_at": _iso(row.created_at),
            "updated_at": _iso(row.updated_at),
        }
        for row in rows
    ]

    return SuccessExtra(data=data, total=total, page=page, page_size=page_size)
|
||
|
||
|
||
@router.get("/company-detail", summary="获取公司清洗详情")
async def get_company_cleaning_detail(
    source: str = Query(..., description="数据源"),
    company_id: str = Query(..., description="公司ID"),
    company_name: Optional[str] = Query(None, description="公司名称"),
):
    """Return the stored cleaning result of a single company.

    The company is looked up by its normalised source company id first; if
    that finds nothing and ``company_name`` was supplied, the name is used as
    a fallback.

    Args:
        source: Data source; must be one of ``COMPANY_SOURCES``.
        company_id: Company id as known to the source (normalised before lookup).
        company_name: Optional company name used as a fallback lookup key.

    Returns:
        ``Success`` carrying the company record, or ``Success`` with code 400
        for an unsupported source / code 404 when no record is found.
    """
    if source not in COMPANY_SOURCES:
        return Success(code=400, msg="不支持的数据源")

    model = company_storage.company_model(source)
    normalized_id = normalize_company_id(source, company_id)

    row = await model.get_or_none(source_company_id=normalized_id)
    if row is None and company_name:
        # filter().first() instead of get_or_none(): several rows may share a
        # company name, in which case get_or_none() raises instead of returning.
        row = await model.filter(company_name=company_name).first()
    if row is None:
        return Success(code=404, msg="未找到公司清洗结果")

    data = row.raw_json
    if isinstance(data, str):
        try:
            data = json.loads(data)
        except (ValueError, RecursionError):
            # Not decodable as JSON: hand back the raw text rather than failing.
            data = {"raw": data}

    return Success(
        data={
            "source": source,
            "company_id": row.source_company_id,
            "company_name": row.company_name,
            "created_at": row.created_at.isoformat() if row.created_at else None,
            "updated_at": row.updated_at.isoformat() if row.updated_at else None,
            "data": data,
        }
    )
|
||
|
||
|
||
@router.post("/collect-pending", summary="分析待处理数据")
async def collect_pending_companies_api(
    limit: int = Body(1000, embed=True, ge=1, le=10000),
    source: Optional[str] = Body(None, embed=True),
):
    """Analyse recruitment data and collect pending company ids into the MySQL queue table.

    Delegates to ``company_cleaner.collect_pending_companies`` and returns its
    summary; the response message reports the summary's ``total_created`` value.
    """
    result = await company_cleaner.collect_pending_companies(limit=limit, source=source)
    created = result["total_created"]
    message = f"已完成数据分析,本次新增 {created} 条待处理公司"
    return Success(msg=message, data=result)
|
||
|
||
|
||
@router.post("/run-pending", summary="手动执行待处理公司清洗")
async def run_pending_companies(
    limit: int = Body(100, embed=True, ge=1, le=5000),
    source: Optional[str] = Body(None, embed=True),
    proxy: Optional[str] = Body(None, embed=True),
    max_delay_seconds: int = Body(5, embed=True),
):
    """Manually trigger the cleaning of pending companies.

    Only records whose current status is pending are processed; records that
    were already processed are not executed again.

    The request is forwarded unchanged to
    ``company_cleaner.process_pending_companies`` and awaited before replying.
    """
    options = {
        "limit": limit,
        "source": source,
        "proxy": proxy,
        "max_delay_seconds": max_delay_seconds,
    }
    await company_cleaner.process_pending_companies(**options)

    message = f"已触发执行最近 {limit} 条待处理公司清洗任务"
    return Success(msg=message)
|
||
|
||
|
||
@router.post("/crawl-execute", summary="爬取并执行待处理公司清洗")
async def crawl_execute_pending(
    limit: int = Body(100, embed=True, ge=1, le=5000),
    source: Optional[str] = Body(None, embed=True),
    proxy: Optional[str] = Body(None, embed=True),
    max_delay_seconds: int = Body(5, embed=True),
):
    """Collect pending companies, then process the pending queue in one request.

    Runs ``company_cleaner.collect_pending_companies`` followed by
    ``company_cleaner.process_pending_companies`` with the same ``limit`` and
    ``source``; both steps are awaited before the response is returned.
    """
    # Step 1: refresh the queue with newly discovered companies.
    await company_cleaner.collect_pending_companies(limit=limit, source=source)

    # Step 2: work through the pending entries.
    processing_options = {
        "limit": limit,
        "source": source,
        "proxy": proxy,
        "max_delay_seconds": max_delay_seconds,
    }
    await company_cleaner.process_pending_companies(**processing_options)

    message = f"已触发爬取并执行最近 {limit} 条待处理公司清洗任务"
    return Success(msg=message)
|
||
|
||
|
||
@router.post("/update-company-status", summary="更新公司爬取状态(爬虫端调用)")
async def update_company_status(
    source: str = Body(..., embed=True),
    company_id: str = Body(..., embed=True),
    status: str = Body(..., embed=True),
    error_message: str = Body("", embed=True),
):
    """Update the queue status of a company once the crawler has finished with it.

    Intended to be called by the crawler. The queue row identified by
    ``source`` and the normalised ``company_id`` is created if it does not
    exist yet, then marked with the reported status.

    Args:
        source: Data source of the company.
        company_id: Company id as known to the source (normalised before use).
        status: New status; only "done" and "failed" are accepted.
        error_message: Optional error text stored on the queue row.

    Returns:
        ``Success`` echoing source, company id and status, or ``Success`` with
        code 400 when ``status`` is not accepted.
    """
    valid_statuses = ("done", "failed")
    if status not in valid_statuses:
        # Join a fixed-order tuple: interpolating a set would make the order
        # of the listed values vary between processes.
        allowed = ", ".join(valid_statuses)
        return Success(msg=f"无效状态: {status},仅支持 {allowed}", code=400)

    # NOTE(review): `source` is not validated here, so a row is created for
    # any source string - confirm whether that is intended.
    normalized_id = normalize_company_id(source, company_id)
    queue, _ = await CompanyCleaningQueue.get_or_create(
        source=source,
        company_id=normalized_id,
        defaults={
            "company_name": "",
            "status": "pending",
            "error_msg": "",
            "retry_count": 0,
            "started_at": None,
            "finished_at": None,
        },
    )

    queue.status = status
    queue.error_msg = error_message or ""
    queue.finished_at = datetime.now()
    if status == "failed":
        # `or 0` keeps the increment working should an existing row hold None.
        queue.retry_count = (queue.retry_count or 0) + 1
    await queue.save()

    return Success(
        msg="状态更新成功",
        data={"source": source, "company_id": company_id, "status": status},
    )
|
||
|
||
|
||
@router.post("/process-company", summary="执行单个公司清洗任务")
async def process_single_company_api(
    source: str = Body(..., embed=True),
    company_id: str = Body(..., embed=True),
    proxy: Optional[str] = Body(None, embed=True),
    max_delay_seconds: int = Body(5, embed=True),
):
    """Run the cleaning task for one company and report the outcome.

    Forwards the arguments to ``company_cleaner.process_single_company`` and
    returns its result; the message depends on the truthiness of the result's
    ``success`` entry.
    """
    outcome = await company_cleaner.process_single_company(
        source=source,
        company_id=company_id,
        proxy=proxy,
        max_delay_seconds=max_delay_seconds,
    )

    if outcome.get("success"):
        message = "任务执行成功"
    else:
        message = "任务执行失败"
    return Success(msg=message, data=outcome)
|
||
|
||
@router.post("/upload", summary="上传文件并保存任务")
async def upload_file(
    file: UploadFile = File(...),
    clean_type: str = Form("auto"),
    platform: str = Form("auto"),
    proxy: Optional[str] = Form(None),
):
    """Parse an uploaded file and store one pending cleaning task per parsed target.

    The targets come from ``cleaning_service.parse_file``; every task shares
    the submitted ``clean_type``, ``platform`` and ``proxy``. Nothing is
    written when the file yields no targets.
    """
    parsed_targets = await cleaning_service.parse_file(file)

    pending_tasks = []
    for item in parsed_targets:
        pending_tasks.append(
            CleaningTask(
                target=item,
                clean_type=clean_type,
                platform=platform,
                proxy=proxy,
                status="pending",
            )
        )

    # Skip the bulk insert entirely for an empty batch.
    if pending_tasks:
        await CleaningTask.bulk_create(pending_tasks)

    imported = len(pending_tasks)
    return Success(msg=f"Successfully imported {imported} tasks")
|
||
|
||
@router.get("/list", summary="获取清洗任务列表")
async def list_tasks(
    page: int = Query(1, ge=1, description="页码"),
    page_size: int = Query(10, ge=1, description="每页数量"),
    target: Optional[str] = Query(None, description="目标搜索"),
    status: Optional[str] = Query(None, description="状态筛选"),
    clean_type: Optional[str] = Query(None, description="清洗类型筛选"),
):
    """Return one page of cleaning tasks, newest first.

    Args:
        page: 1-based page number; values below 1 are rejected by validation.
        page_size: Rows per page; values below 1 are rejected by validation.
        target: Optional substring that the task target must contain.
        status: Optional exact status filter.
        clean_type: Optional exact clean-type filter.

    Returns:
        ``SuccessExtra`` with the serialised tasks plus ``total``, ``page`` and
        ``page_size``.
    """
    # Start from an empty condition and AND in every filter that was supplied.
    q = Q()
    if target:
        q &= Q(target__contains=target)
    if status:
        q &= Q(status=status)
    if clean_type:
        q &= Q(clean_type=clean_type)

    total, tasks = await cleaning_controller.list(
        page=page,
        page_size=page_size,
        search=q,
        order=["-created_at"],
    )
    data = [await t.to_dict() for t in tasks]
    return SuccessExtra(data=data, total=total, page=page, page_size=page_size)
|
||
|
||
@router.post("/process/{task_id}", summary="处理单个任务")
async def process_task(task_id: int):
    """Process one stored cleaning task and persist the outcome on it.

    The task is marked "processing" before the work starts. Afterwards its
    status becomes "success" or "fail" and the storage status, remote-sent
    flag, result summary and error message are copied from the service result.

    Args:
        task_id: Primary key of the task to process.

    Returns:
        ``Success`` with the updated task, or ``Success`` with code 404 when
        the controller returns no task.

    Raises:
        Exception: Whatever ``cleaning_service.process_single_item`` raises is
            propagated after the task has been marked as failed.
    """
    task = await cleaning_controller.get(id=task_id)
    if not task:
        return Success(code=404, msg="Task not found")

    task.status = "processing"
    await task.save()

    try:
        result = await cleaning_service.process_single_item(
            target=task.target,
            clean_type=task.clean_type,
            platform=task.platform,
            proxy=task.proxy,
        )
    except Exception as exc:
        # Without this the task would stay in "processing" forever; record
        # the failure, then let the error propagate unchanged.
        task.status = "fail"
        task.error_msg = str(exc)
        await task.save()
        raise

    task.status = "success" if result.get("success") else "fail"
    task.storage_status = result.get("storage_status", "unknown")
    task.remote_sent = result.get("remote_sent", False)
    task.result_summary = result.get("data_summary")
    task.error_msg = result.get("error")
    await task.save()

    return Success(data=await task.to_dict(), msg="Task processed")
|
||
|
||
@router.delete("/delete", summary="删除任务")
async def delete_task(
    id: int = Query(..., description="任务ID"),
):
    """Delete the cleaning task with the given id via ``cleaning_controller.remove``."""
    # `id` is the public query-parameter name, so it cannot be renamed here.
    task_id = id
    await cleaning_controller.remove(id=task_id)
    return Success(msg="Deleted Successfully")
|
||
|
||
@router.post("/clear", summary="清空所有任务")
async def clear_tasks():
    """Delete every stored cleaning task."""
    every_task = CleaningTask.all()
    await every_task.delete()
    return Success(msg="All tasks cleared")
|