JobData/app/services/company_cleaner.py
2026-03-22 23:22:30 +08:00

340 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import asyncio
import random
import time
from typing import Any, Optional
from loguru import logger
from app.core.clickhouse import clickhouse_manager
from app.models.company import CompanyCleaningQueue
from app.models.token import BossToken
from app.services.company_jobs_sync import CompanyJobsSyncService
from app.services.company_storage import company_storage, normalize_company_id
from app.services.crawler.boss import BossService
from app.services.crawler.qcwy import QcwyService
from app.services.crawler.zhilian import ZhilianService
# Per-platform collection settings, keyed by source name.
#   job_table          table under the ``job_data`` database that holds raw jobs
#   company_id_expr    SQL expression extracting the company id from ``json_data``
#   company_name_expr  SQL expression extracting the company name from ``json_data``
#   days_back          how many days of recent jobs the candidate query scans
#   max_query_limit    upper bound for the candidate query LIMIT (None = no bound)
SOURCE_CONFIGS: dict[str, dict[str, Any]] = {
    "zhilian": dict(
        job_table="zhilian_job",
        company_id_expr="JSONExtractString(json_data, 'companyNumber')",
        company_name_expr="JSONExtractString(json_data, 'companyName')",
        days_back=30,
        max_query_limit=None,
    ),
    "qcwy": dict(
        job_table="qcwy_job",
        company_id_expr="JSONExtractString(json_data, 'coId')",
        company_name_expr="JSONExtractString(json_data, 'companyName')",
        days_back=30,
        max_query_limit=5000,
    ),
    "boss": dict(
        job_table="boss_job",
        company_id_expr="JSONExtractString(json_data, 'brandComInfoVO', 'encryptBrandId')",
        company_name_expr="JSONExtractString(json_data, 'brandComInfoVO', 'brandName')",
        days_back=30,
        max_query_limit=None,
    ),
}
class CompanyCleaner:
    """Discover companies in the job tables, queue them, and fetch their details.

    The workflow has two halves:

    * ``collect_pending_companies`` queries the per-platform job tables for
      company ids that are neither queued nor stored yet and enqueues them.
    * ``process_single_company`` / ``process_pending_companies`` fetch the
      company detail from the platform crawler, persist it, sync the
      company's jobs, and record the outcome on the queue row.
    """

    # Seconds after which the cached Boss login token is re-read from the database.
    _TOKEN_REFRESH_INTERVAL = 3600

    def __init__(self):
        self.boss_service = BossService()
        self.qcwy_service = QcwyService()
        self.zhilian_service = ZhilianService()
        self.company_jobs_sync = CompanyJobsSyncService()
        self._boss_token_loaded = False
        self._token_loaded_at: float = 0

    def _apply_proxy(self, proxy: Optional[str]) -> None:
        """Forward ``proxy`` to every crawler service and to the job-sync service."""
        self.boss_service.set_proxy(proxy)
        self.qcwy_service.set_proxy(proxy)
        self.zhilian_service.set_proxy(proxy)
        self.company_jobs_sync.set_proxy(proxy)

    async def _ensure_boss_token_loaded(self) -> None:
        """Load the newest active ``BossToken`` into the Boss service when needed.

        The token is reused while it has been loaded, the service still holds
        an ``mpt`` value, and it is younger than ``_TOKEN_REFRESH_INTERVAL``.
        When no active token exists a warning is logged and the service is
        left unchanged.
        """
        now = time.time()
        if (
            self._boss_token_loaded
            and self.boss_service.login_data.get("mpt")
            and now - self._token_loaded_at < self._TOKEN_REFRESH_INTERVAL
        ):
            return
        token_obj = await BossToken.filter(is_active=True).order_by("-updated_at").first()
        if not token_obj:
            logger.warning("BossToken not found or inactive in CompanyCleaner")
            return
        self.boss_service.set_login_data(token_obj.mpt or "", "")
        self._boss_token_loaded = True
        self._token_loaded_at = now

    async def collect_pending_companies(self, limit: int = 1000, source: Optional[str] = None) -> dict[str, Any]:
        """Enqueue not-yet-known companies for one platform or for all of them.

        Args:
            limit: Maximum number of companies to enqueue per platform.
            source: A key of ``SOURCE_CONFIGS``; ``None`` collects every platform.

        Returns:
            ``{"total_created": int, "sources": {source: per-source summary}}``.
            A platform whose collection failed gets ``created_count`` 0 and an
            ``error`` string instead of aborting the other platforms.
        """
        client = await clickhouse_manager.get_client()
        logger.info(f"Starting to collect pending companies (limit={limit}, source={source or 'all'})...")
        summary: dict[str, Any] = {
            "total_created": 0,
            "sources": {},
        }
        sources = [s for s in SOURCE_CONFIGS if source is None or source == s]
        # Collect all platforms concurrently.
        tasks = [self._collect_source(client, s, limit) for s in sources]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for s, result in zip(sources, results):
            # With return_exceptions=True a cancelled child is returned as
            # CancelledError, which derives from BaseException (not Exception),
            # so the broader check is required before indexing the result.
            if isinstance(result, BaseException):
                logger.error(f"Error collecting {s}: {result}")
                summary["sources"][s] = {"source": s, "created_count": 0, "error": str(result)}
            else:
                summary["sources"][s] = result
                summary["total_created"] += result["created_count"]
        logger.info("Finished collecting pending companies.")
        return summary

    async def _collect_source(self, client, source: str, limit: int) -> dict[str, Any]:
        """Query one platform's job table and enqueue companies not seen before.

        Returns a summary dict with ``source``, ``query_count``,
        ``deduped_count``, ``existing_count``, ``queued_count`` and
        ``created_count``.
        """
        config = SOURCE_CONFIGS[source]
        # Load every already-queued / already-stored company id for this
        # platform up front so they can be excluded cheaply on the Python side.
        all_queued = set(await CompanyCleaningQueue.filter(source=source).values_list("company_id", flat=True))
        all_existing = await company_storage.get_all_company_ids(source)
        exclude_ids = all_queued | all_existing
        logger.info(f"Loaded {len(exclude_ids)} known {source} company IDs for exclusion")
        # Skip as many rows as there are known companies to reach new ones.
        # NOTE(review): the DISTINCT query has no ORDER BY, so which rows the
        # OFFSET skips is not defined by the query itself — confirm this
        # heuristic is intended.
        offset = len(exclude_ids)
        result = await self._query_candidate_rows(
            client=client,
            table=config["job_table"],
            company_id_expr=config["company_id_expr"],
            company_name_expr=config["company_name_expr"],
            days_back=config["days_back"],
            limit=limit,
            max_query_limit=config["max_query_limit"],
            offset=offset,
        )
        if not result.result_rows:
            logger.info(f"No new {source} companies found in job table query.")
            return {
                "source": source,
                "query_count": 0,
                "deduped_count": 0,
                "existing_count": len(all_existing),
                "queued_count": len(all_queued),
                "created_count": 0,
            }
        deduped_candidates: list[dict[str, str]] = []
        seen_ids: set[str] = set()
        for raw_company_id, company_name in result.result_rows:
            company_id = normalize_company_id(source, raw_company_id)
            if not company_id or company_id in seen_ids or company_id in exclude_ids:
                continue
            seen_ids.add(company_id)
            deduped_candidates.append(
                {
                    "company_id": company_id,
                    "company_name": (company_name or "").strip(),
                }
            )
            if len(deduped_candidates) >= limit:
                break
        created_count = await company_storage.enqueue_companies(source, deduped_candidates)
        logger.info(f"Added {created_count} {source} companies to MySQL queue.")
        return {
            "source": source,
            "query_count": len(result.result_rows),
            "deduped_count": len(deduped_candidates),
            "existing_count": len(all_existing),
            "queued_count": len(all_queued),
            "created_count": created_count,
        }

    async def _query_candidate_rows(
        self,
        *,
        client,
        table: str,
        company_id_expr: str,
        company_name_expr: str,
        days_back: int,
        limit: int,
        max_query_limit: Optional[int],
        offset: int = 0,
    ):
        """Select distinct (company id, company name) pairs from a job table.

        Up to three attempts are made; only errors whose message mentions
        "memory" trigger a retry, anything else propagates immediately:

        1. the requested window, ``limit * 5`` rows (capped by
           ``max_query_limit``) and the caller's ``offset``;
        2. at most 3 days, a smaller row limit, offset 0;
        3. the narrowed query of attempt 2 plus ``SAMPLE 0.1``.

        Returns:
            Whatever ``client.query`` returns for the first successful attempt.

        Raises:
            Exception: the error of the last attempt when all three fail, or
                any non-memory error as soon as it occurs.
        """
        max_attempts = 3
        current_days = days_back
        current_limit = limit * 5
        if max_query_limit is not None:
            current_limit = min(current_limit, max_query_limit)
        for attempt in range(max_attempts):
            sample_sql = " SAMPLE 0.1" if attempt == 2 else ""
            # The caller's offset is sized for the full window. Every retry
            # runs against the narrowed (and later sampled) window, so it must
            # start from 0; previously the final retry restored the offset.
            current_offset = offset if attempt == 0 else 0
            if attempt == 1:
                current_days = max(1, min(current_days, 3))
                current_limit = min(current_limit, max(limit, 50))
            query = f"""
                SELECT DISTINCT
                    {company_id_expr} AS cid,
                    {company_name_expr} AS cname
                FROM job_data.{table}{sample_sql}
                PREWHERE created_at > now() - INTERVAL {current_days} DAY
                    AND json_data != ''
                WHERE {company_id_expr} != ''
                LIMIT {current_limit} OFFSET {current_offset}
            """
            try:
                logger.info(
                    f"Querying company candidates from {table} "
                    f"(days={current_days}, limit={current_limit}, attempt={attempt + 1})"
                )
                return await client.query(query)
            except Exception as exc:
                if "memory" not in str(exc).lower():
                    raise
                logger.warning(f"Memory-sensitive query retry for {table}: {exc}")
                if attempt == max_attempts - 1:
                    # Retries exhausted: surface the last memory error.
                    raise

    async def process_single_company(
        self,
        source: str,
        company_id: str,
        proxy: Optional[str] = None,
        max_delay_seconds: int = 5,
    ) -> dict[str, Any]:
        """Enqueue (if needed) and immediately process one company.

        Args:
            source: Platform key (``zhilian``, ``qcwy`` or ``boss``).
            company_id: Raw company id; it is normalized before use.
            proxy: When truthy, applied to all services before fetching.
            max_delay_seconds: When positive, sleep a random 1..N seconds first.

        Returns:
            A result dict with ``success``, ``source``, ``company_id``,
            ``company_name``, ``status`` and ``error_msg``; on success also
            ``created`` and ``jobs_summary``. Failures are reported through
            the dict (and the queue row), not raised.
        """
        normalized_id = normalize_company_id(source, company_id)
        queue, _ = await company_storage.enqueue_company(source, normalized_id)
        if proxy:
            self._apply_proxy(proxy)
        if max_delay_seconds and max_delay_seconds > 0:
            await asyncio.sleep(random.randint(1, max_delay_seconds))
        await company_storage.mark_queue_processing(queue)
        try:
            persist_result = await self._fetch_and_save(source, normalized_id)
            jobs_result = await self._sync_company_jobs(source, normalized_id)
            if persist_result["company_name"] and queue.company_name != persist_result["company_name"]:
                queue.company_name = persist_result["company_name"]
            await company_storage.mark_queue_result(queue, status="done", jobs_summary=jobs_result)
            return {
                "success": True,
                "source": source,
                "company_id": normalized_id,
                "company_name": persist_result["company_name"],
                "status": "done",
                "error_msg": "",
                "created": persist_result["created"],
                "jobs_summary": jobs_result,
            }
        except Exception as exc:
            logger.error(f"Error processing {source} {normalized_id}: {exc}")
            await company_storage.mark_queue_result(
                queue,
                status="failed",
                error_msg=str(exc),
                increment_retry=True,
            )
            return {
                "success": False,
                "source": source,
                "company_id": normalized_id,
                "company_name": queue.company_name or "",
                "status": "failed",
                "error_msg": str(exc),
            }

    async def process_pending_companies(
        self,
        limit: int = 100,
        source: Optional[str] = None,
        proxy: Optional[str] = None,
        max_delay_seconds: int = 0,
    ) -> None:
        """Process up to ``limit`` pending queue rows, oldest first.

        Each row is marked processing, fetched, persisted, job-synced and then
        marked ``done``; any exception marks that row ``failed`` (with the
        retry counter incremented) and processing continues with the next row.

        Args:
            limit: Maximum number of queue rows to process.
            source: Restrict processing to one platform when given.
            proxy: When truthy, applied to all services before fetching.
            max_delay_seconds: When positive, sleep a random 1..N seconds
                before each fetch.
        """
        logger.info(f"Processing pending companies (limit={limit}, source={source or 'all'})...")
        if proxy:
            self._apply_proxy(proxy)
        query = CompanyCleaningQueue.filter(status="pending")
        if source:
            query = query.filter(source=source)
        queue_rows = await query.order_by("created_at").limit(limit)
        if not queue_rows:
            logger.info("No pending companies to process.")
            return
        for queue in queue_rows:
            logger.info(f"Processing {queue.source} company: {queue.company_name or ''} ({queue.company_id})")
            await company_storage.mark_queue_processing(queue)
            try:
                if max_delay_seconds and max_delay_seconds > 0:
                    await asyncio.sleep(random.randint(1, max_delay_seconds))
                persist_result = await self._fetch_and_save(queue.source, queue.company_id)
                jobs_result = await self._sync_company_jobs(queue.source, queue.company_id)
                # Use .get() so a summary lacking a counter cannot raise
                # KeyError here and flip an already-persisted company to failed.
                logger.info(
                    f"Synced {queue.source} company jobs: "
                    f"fetched={jobs_result.get('jobs_fetched', 0)} stored={jobs_result.get('stored_success', 0)} "
                    f"duplicate={jobs_result.get('duplicate', 0)} failed={jobs_result.get('failed', 0)}"
                )
                if persist_result["company_name"] and queue.company_name != persist_result["company_name"]:
                    queue.company_name = persist_result["company_name"]
                await company_storage.mark_queue_result(queue, status="done", jobs_summary=jobs_result)
            except Exception as exc:
                logger.error(f"Error processing {queue.source} {queue.company_id}: {exc}")
                await company_storage.mark_queue_result(
                    queue,
                    status="failed",
                    error_msg=str(exc),
                    increment_retry=True,
                )

    async def _fetch_and_save(self, source: str, company_id: str) -> dict[str, Any]:
        """Fetch the company detail and upsert it into company storage.

        Raises:
            ValueError: when the crawler returned no data (or the source is
                unsupported).
        """
        data = await self._fetch_company_data(source, company_id)
        if not data:
            raise ValueError(f"No data returned from source={source} company_id={company_id}")
        return await company_storage.upsert_company(source, data, company_id=company_id)

    async def _sync_company_jobs(self, source: str, company_id: str) -> dict[str, Any]:
        """Sync the company's jobs on a best-effort basis.

        A failure is logged as a warning and reported as a zeroed summary
        with ``success`` False and an ``error`` message; it is never raised.
        """
        try:
            return await self.company_jobs_sync.sync_company_jobs(source, company_id)
        except Exception as exc:
            logger.warning(f"Sync company jobs failed for {source} {company_id}: {exc}")
            return {
                "success": False,
                "source": source,
                "company_id": company_id,
                "jobs_fetched": 0,
                "stored_success": 0,
                "duplicate": 0,
                "failed": 0,
                "error": str(exc),
            }

    async def _fetch_company_data(self, source: str, company_id: str) -> dict[str, Any]:
        """Call the platform crawler for ``company_id`` in a worker thread.

        Returns the crawler's data, or ``{}`` when it returned a falsy value.

        Raises:
            ValueError: when ``source`` is not zhilian, qcwy or boss.
        """
        if source == "zhilian":
            data = await asyncio.to_thread(self.zhilian_service.get_company_detail, company_id)
            return data or {}
        if source == "qcwy":
            data = await asyncio.to_thread(self.qcwy_service.get_company_info, company_id)
            return data or {}
        if source == "boss":
            # The Boss crawler needs login data; make sure it is loaded first.
            await self._ensure_boss_token_loaded()
            data = await asyncio.to_thread(self.boss_service.get_company_detail_by_id, company_id)
            return data or {}
        raise ValueError(f"unsupported source: {source}")

    async def cleanup_old_records(self):
        """Delete every queue row whose status is ``done`` or ``failed``.

        No age filter is applied: all finished rows are removed.
        """
        logger.info("Starting cleanup of processed pending companies...")
        await CompanyCleaningQueue.filter(status__in=["done", "failed"]).delete()
# Shared module-level instance; constructing it instantiates the three crawler
# services and the job-sync service.
company_cleaner = CompanyCleaner()