JobData/app/api/v1/cleaning/cleaning.py
2026-03-22 23:22:30 +08:00

360 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from datetime import datetime
import json
from typing import Optional
from fastapi import APIRouter, File, UploadFile, Form, Body, Query
from tortoise.expressions import Q
from tortoise.functions import Count, Sum
from app.models.cleaning import CleaningTask
from app.models.company import CompanyCleaningQueue
from app.schemas import Success, SuccessExtra
from app.services.cleaning import CleaningService
from app.services.company_cleaner import company_cleaner
from app.services.company_storage import company_storage, normalize_company_id
from app.controllers.cleaning import cleaning_controller
# Router on which every endpoint in this module registers itself.
router = APIRouter()
# Service instance used by the file-upload and task-processing endpoints.
cleaning_service = CleaningService()
# Source identifiers for which per-source queue statistics are reported.
COMPANY_SOURCES = ("boss", "qcwy", "zhilian")
@router.get("/stats", summary="获取公司清洗统计信息")
async def get_stats():
    """Return statistics about the company cleaning queue.

    The payload contains the number of ``pending`` rows, the number of
    ``done`` rows updated since local midnight, and one ``details`` entry per
    source in ``COMPANY_SOURCES`` holding per-status row counts plus summed
    job counters. Rows whose source is not in ``COMPANY_SOURCES`` are ignored
    in ``details``.
    """
    pending_total = await CompanyCleaningQueue.filter(status="pending").count()

    midnight = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    done_today = await CompanyCleaningQueue.filter(
        status="done",
        updated_at__gte=midnight,
    ).count()

    status_rows = await (
        CompanyCleaningQueue.annotate(total=Count("id"))
        .group_by("source", "status")
        .values("source", "status", "total")
    )

    # Key order here is the key order of every entry in "details".
    job_fields = ("jobs_fetched", "jobs_stored", "jobs_duplicate", "jobs_failed")
    counter_names = ("pending", "processing", "done", "failed", "total") + job_fields
    per_source = {name: dict.fromkeys(counter_names, 0) for name in COMPANY_SOURCES}

    for entry in status_rows:
        bucket = per_source.get(entry["source"])
        state = entry["status"]
        amount = entry["total"]
        if bucket is None:
            continue
        if state in bucket:
            bucket[state] = amount
        # Every row counts toward the source total, even for unlisted statuses.
        bucket["total"] += amount

    sum_rows = await (
        CompanyCleaningQueue.annotate(
            jobs_fetched_sum=Sum("jobs_fetched"),
            jobs_stored_sum=Sum("jobs_stored"),
            jobs_duplicate_sum=Sum("jobs_duplicate"),
            jobs_failed_sum=Sum("jobs_failed"),
        )
        .group_by("source")
        .values(
            "source",
            "jobs_fetched_sum",
            "jobs_stored_sum",
            "jobs_duplicate_sum",
            "jobs_failed_sum",
        )
    )

    for entry in sum_rows:
        bucket = per_source.get(entry["source"])
        if bucket is None:
            continue
        for field in job_fields:
            # A falsy aggregate (e.g. None) is reported as 0.
            bucket[field] = int(entry[f"{field}_sum"] or 0)

    payload = {
        "total_pending": pending_total,
        "today_processed": done_today,
        "details": [{"source": name, **counters} for name, counters in per_source.items()],
    }
    return Success(data=payload)
@router.get("/companies", summary="获取公司清洗列表")
async def get_companies_list(
    page: int = Query(1, ge=1),
    page_size: int = Query(20, ge=1, le=100),
    source: Optional[str] = Query(None),
    status: Optional[str] = Query(None)
):
    """Return one page of company cleaning queue rows, newest update first.

    ``source`` and ``status`` are optional filters. A non-empty value that is
    not one of the recognised sources/statuses yields an empty page rather
    than an error.
    """
    known_sources = {"boss", "qcwy", "zhilian"}
    known_statuses = {"pending", "processing", "done", "failed"}

    def _iso(moment):
        # Timestamps are serialised as ISO strings; missing values stay None.
        return moment.isoformat() if moment else None

    # Unknown filter values short-circuit to an empty result.
    if source and source not in known_sources:
        return SuccessExtra(data=[], total=0, page=page, page_size=page_size)
    if status and status not in known_statuses:
        return SuccessExtra(data=[], total=0, page=page, page_size=page_size)

    selection = CompanyCleaningQueue.all()
    if source:
        selection = selection.filter(source=source)
    if status:
        selection = selection.filter(status=status)

    total = await selection.count()
    start = (page - 1) * page_size
    records = await selection.order_by("-updated_at").offset(start).limit(page_size)

    data = [
        {
            "source": record.source,
            "company_id": record.company_id,
            "company_name": record.company_name,
            "status": record.status,
            "error_msg": record.error_msg,
            "retry_count": record.retry_count,
            "started_at": _iso(record.started_at),
            "finished_at": _iso(record.finished_at),
            "jobs_fetched": record.jobs_fetched,
            "jobs_stored": record.jobs_stored,
            "jobs_duplicate": record.jobs_duplicate,
            "jobs_failed": record.jobs_failed,
            "jobs_error_msg": record.jobs_error_msg,
            "created_at": _iso(record.created_at),
            "updated_at": _iso(record.updated_at),
        }
        for record in records
    ]
    return SuccessExtra(data=data, total=total, page=page, page_size=page_size)
@router.get("/company-detail", summary="获取公司清洗详情")
async def get_company_cleaning_detail(
    source: str = Query(..., description="数据源"),
    company_id: str = Query(..., description="公司ID"),
    company_name: Optional[str] = Query(None, description="公司名称"),
):
    """Return the stored cleaning result for one company.

    The company is looked up by its normalized id first. If that finds nothing
    and ``company_name`` was supplied, the lookup falls back to the name.

    Returns:
        ``Success`` with code 400 when ``source`` is not in
        ``COMPANY_SOURCES``, code 404 when no row matches, otherwise the row's
        identifiers, timestamps and its ``raw_json`` payload under ``data``.
    """
    if source not in COMPANY_SOURCES:
        return Success(code=400, msg="不支持的数据源")

    model = company_storage.company_model(source)
    normalized_id = normalize_company_id(source, company_id)
    row = await model.get_or_none(source_company_id=normalized_id)
    if not row and company_name:
        # Several companies may share a name. get_or_none() raises
        # MultipleObjectsReturned in that case, so take the first match
        # instead; which duplicate is returned is not specified.
        row = await model.filter(company_name=company_name).first()
    if not row:
        return Success(code=404, msg="未找到公司清洗结果")

    data = row.raw_json
    if isinstance(data, str):
        try:
            data = json.loads(data)
        except (ValueError, RecursionError):
            # Not decodable as JSON (JSONDecodeError is a ValueError, and
            # extreme nesting raises RecursionError): return the raw text.
            data = {"raw": data}

    return Success(
        data={
            "source": source,
            "company_id": row.source_company_id,
            "company_name": row.company_name,
            "created_at": row.created_at.isoformat() if row.created_at else None,
            "updated_at": row.updated_at.isoformat() if row.updated_at else None,
            "data": data,
        }
    )
@router.post("/collect-pending", summary="分析待处理数据")
async def collect_pending_companies_api(
    limit: int = Body(1000, embed=True, ge=1, le=10000),
    source: Optional[str] = Body(None, embed=True)
):
    """Analyze recruitment data and queue pending company ids in MySQL.

    Delegates to ``company_cleaner.collect_pending_companies`` and returns its
    summary, reporting ``summary['total_created']`` in the response message.
    """
    summary = await company_cleaner.collect_pending_companies(
        limit=limit,
        source=source,
    )
    message = f"已完成数据分析,本次新增 {summary['total_created']} 条待处理公司"
    return Success(msg=message, data=summary)
@router.post("/run-pending", summary="手动执行待处理公司清洗")
async def run_pending_companies(
    limit: int = Body(100, embed=True, ge=1, le=5000),
    source: Optional[str] = Body(None, embed=True),
    proxy: Optional[str] = Body(None, embed=True),
    max_delay_seconds: int = Body(5, embed=True),
):
    """Manually trigger cleaning of pending companies.

    Only records whose current status is ``pending`` are processed; records
    that were already handled are not executed again.
    """
    options = {
        "limit": limit,
        "source": source,
        "proxy": proxy,
        "max_delay_seconds": max_delay_seconds,
    }
    await company_cleaner.process_pending_companies(**options)
    message = f"已触发执行最近 {limit} 条待处理公司清洗任务"
    return Success(msg=message)
@router.post("/crawl-execute", summary="爬取并执行待处理公司清洗")
async def crawl_execute_pending(
    limit: int = Body(100, embed=True, ge=1, le=5000),
    source: Optional[str] = Body(None, embed=True),
    proxy: Optional[str] = Body(None, embed=True),
    max_delay_seconds: int = Body(5, embed=True),
):
    """Collect pending companies, then process them, in a single request.

    Runs ``collect_pending_companies`` followed by
    ``process_pending_companies`` with the same ``limit`` and ``source``.
    """
    # Step 1: refresh the queue of pending companies.
    await company_cleaner.collect_pending_companies(limit=limit, source=source)

    # Step 2: run the cleaning over the pending entries.
    processing_options = {
        "limit": limit,
        "source": source,
        "proxy": proxy,
        "max_delay_seconds": max_delay_seconds,
    }
    await company_cleaner.process_pending_companies(**processing_options)

    message = f"已触发爬取并执行最近 {limit} 条待处理公司清洗任务"
    return Success(msg=message)
@router.post("/update-company-status", summary="更新公司爬取状态(爬虫端调用)")
async def update_company_status(
    source: str = Body(..., embed=True),
    company_id: str = Body(..., embed=True),
    status: str = Body(..., embed=True),
    error_message: str = Body("", embed=True),
):
    """Update the MySQL queue row after a crawler finishes a company.

    Args:
        source: Data source; must be one of ``COMPANY_SOURCES``.
        company_id: Company id as reported by the crawler; it is normalized
            before the queue row is looked up or created.
        status: Final status, either ``done`` or ``failed``.
        error_message: Optional failure description stored on the row.

    Returns:
        ``Success`` with code 400 for an invalid ``status`` or an unsupported
        ``source``; otherwise a confirmation echoing the request values.
    """
    VALID_STATUSES = {"done", "failed"}
    if status not in VALID_STATUSES:
        return Success(msg=f"无效状态: {status},仅支持 {VALID_STATUSES}", code=400)
    # Reject unknown sources before get_or_create so arbitrary strings cannot
    # create queue rows that no other endpoint in this module would report.
    if source not in COMPANY_SOURCES:
        return Success(msg="不支持的数据源", code=400)

    normalized_id = normalize_company_id(source, company_id)
    queue, _ = await CompanyCleaningQueue.get_or_create(
        source=source,
        company_id=normalized_id,
        defaults={
            "company_name": "",
            "status": "pending",
            "error_msg": "",
            "retry_count": 0,
            "started_at": None,
            "finished_at": None,
        },
    )

    queue.status = status
    queue.error_msg = error_message or ""
    queue.finished_at = datetime.now()
    if status == "failed":
        # Each reported failure counts as one more attempt.
        queue.retry_count += 1
    await queue.save()

    return Success(msg="状态更新成功", data={"source": source, "company_id": company_id, "status": status})
@router.post("/process-company", summary="执行单个公司清洗任务")
async def process_single_company_api(
    source: str = Body(..., embed=True),
    company_id: str = Body(..., embed=True),
    proxy: Optional[str] = Body(None, embed=True),
    max_delay_seconds: int = Body(5, embed=True),
):
    """Run the cleaning task for one company and return its result.

    The response message reflects whether the result's ``success`` entry is
    truthy; the full result is returned as ``data`` either way.
    """
    outcome = await company_cleaner.process_single_company(
        source=source,
        company_id=company_id,
        proxy=proxy,
        max_delay_seconds=max_delay_seconds,
    )
    if outcome.get("success"):
        message = "任务执行成功"
    else:
        message = "任务执行失败"
    return Success(msg=message, data=outcome)
@router.post("/upload", summary="上传文件并保存任务")
async def upload_file(
    file: UploadFile = File(...),
    clean_type: str = Form("auto"),
    platform: str = Form("auto"),
    proxy: Optional[str] = Form(None)
):
    """Parse an uploaded file into targets and store one pending task each.

    Every target returned by ``cleaning_service.parse_file`` becomes a
    ``CleaningTask`` with status ``pending``; nothing is written when the
    file yields no targets.
    """
    targets = await cleaning_service.parse_file(file)

    tasks = []
    for entry in targets:
        tasks.append(
            CleaningTask(
                target=entry,
                clean_type=clean_type,
                platform=platform,
                proxy=proxy,
                status="pending",
            )
        )

    # Skip the bulk insert entirely for an empty batch.
    if tasks:
        await CleaningTask.bulk_create(tasks)
    return Success(msg=f"Successfully imported {len(tasks)} tasks")
@router.get("/list", summary="获取清洗任务列表")
async def list_tasks(
    page: int = Query(1, description="页码"),
    page_size: int = Query(10, description="每页数量"),
    target: str = Query(None, description="目标搜索"),
    status: str = Query(None, description="状态筛选"),
    clean_type: str = Query(None, description="清洗类型筛选")
):
    """Return one page of cleaning tasks, newest first.

    ``target`` is matched as a substring; ``status`` and ``clean_type`` are
    matched exactly. Empty filters are skipped.
    """
    criteria = (
        ("target__contains", target),
        ("status", status),
        ("clean_type", clean_type),
    )
    search = Q()
    for lookup, value in criteria:
        if value:
            search &= Q(**{lookup: value})

    total, tasks = await cleaning_controller.list(
        page=page,
        page_size=page_size,
        search=search,
        order=["-created_at"],
    )

    data = []
    for task in tasks:
        data.append(await task.to_dict())
    return SuccessExtra(data=data, total=total, page=page, page_size=page_size)
@router.post("/process/{task_id}", summary="处理单个任务")
async def process_task(task_id: int):
    """Process one stored cleaning task and persist its outcome.

    The task is marked ``processing`` before the work starts. When processing
    finishes, the status becomes ``success`` or ``fail`` according to the
    result's ``success`` entry, and the storage, remote-send, summary and
    error fields are copied from the result.

    Returns:
        ``Success`` with code 404 when the task lookup yields nothing,
        otherwise the updated task as a dict.

    Raises:
        Exception: Whatever ``cleaning_service.process_single_item`` raises is
            re-raised after the task has been marked ``fail``.
    """
    task = await cleaning_controller.get(id=task_id)
    if not task:
        return Success(code=404, msg="Task not found")

    task.status = "processing"
    await task.save()

    try:
        result = await cleaning_service.process_single_item(
            target=task.target,
            clean_type=task.clean_type,
            platform=task.platform,
            proxy=task.proxy,
        )
    except Exception as exc:
        # Without this the task would stay in "processing" forever after an
        # unexpected error; record the failure, then let the error propagate.
        task.status = "fail"
        task.error_msg = str(exc)
        await task.save()
        raise

    task.status = "success" if result.get("success") else "fail"
    task.storage_status = result.get("storage_status", "unknown")
    task.remote_sent = result.get("remote_sent", False)
    task.result_summary = result.get("data_summary")
    task.error_msg = result.get("error")
    await task.save()
    return Success(data=await task.to_dict(), msg="Task processed")
@router.delete("/delete", summary="删除任务")
async def delete_task(
    id: int = Query(..., description="任务ID")
):
    """Delete the cleaning task identified by the ``id`` query parameter."""
    task_id = id  # local alias; the public parameter name must remain ``id``
    await cleaning_controller.remove(id=task_id)
    return Success(msg="Deleted Successfully")
@router.post("/clear", summary="清空所有任务")
async def clear_tasks():
    """Delete every stored cleaning task."""
    every_task = CleaningTask.all()
    await every_task.delete()
    return Success(msg="All tasks cleared")