340 lines
14 KiB
Python
340 lines
14 KiB
Python
import asyncio
|
||
import random
|
||
import time
|
||
from typing import Any, Optional
|
||
|
||
from loguru import logger
|
||
|
||
from app.core.clickhouse import clickhouse_manager
|
||
from app.models.company import CompanyCleaningQueue
|
||
from app.models.token import BossToken
|
||
from app.services.company_jobs_sync import CompanyJobsSyncService
|
||
from app.services.company_storage import company_storage, normalize_company_id
|
||
from app.services.crawler.boss import BossService
|
||
from app.services.crawler.qcwy import QcwyService
|
||
from app.services.crawler.zhilian import ZhilianService
|
||
|
||
|
||
# Per-platform settings used when collecting company candidates.
#
# Keys of each entry:
#   job_table          ClickHouse table name, queried as ``job_data.<job_table>``.
#   company_id_expr    SQL expression extracting the company ID from ``json_data``.
#   company_name_expr  SQL expression extracting the company name from ``json_data``.
#   days_back          Look-back window, in days, applied to ``created_at``.
#   max_query_limit    Upper bound for the query LIMIT, or None for no cap.
SOURCE_CONFIGS: dict[str, dict[str, Any]] = dict(
    zhilian=dict(
        job_table="zhilian_job",
        company_id_expr="JSONExtractString(json_data, 'companyNumber')",
        company_name_expr="JSONExtractString(json_data, 'companyName')",
        days_back=30,
        max_query_limit=None,
    ),
    qcwy=dict(
        job_table="qcwy_job",
        company_id_expr="JSONExtractString(json_data, 'coId')",
        company_name_expr="JSONExtractString(json_data, 'companyName')",
        days_back=30,
        max_query_limit=5000,
    ),
    boss=dict(
        job_table="boss_job",
        company_id_expr="JSONExtractString(json_data, 'brandComInfoVO', 'encryptBrandId')",
        company_name_expr="JSONExtractString(json_data, 'brandComInfoVO', 'brandName')",
        days_back=30,
        max_query_limit=None,
    ),
)
|
||
|
||
|
||
class CompanyCleaner:
    """Collect company IDs from the job tables, queue them, and crawl each one.

    The workflow visible in this class is:

    1. ``collect_pending_companies`` reads distinct company IDs from the
       per-platform ClickHouse job tables and enqueues the unknown ones.
    2. ``process_pending_companies`` / ``process_single_company`` fetch the
       company detail through the platform crawler service, persist it via
       ``company_storage`` and then sync that company's jobs.
    3. ``cleanup_old_records`` deletes finished queue rows.
    """

    # Seconds a loaded Boss token is trusted before it is re-read.
    _TOKEN_REFRESH_INTERVAL = 3600

    # Number of attempts made by ``_query_candidate_rows`` on memory errors.
    _QUERY_ATTEMPTS = 3

    def __init__(self):
        """Create the crawler services and reset the Boss token cache state."""
        self.boss_service = BossService()
        self.qcwy_service = QcwyService()
        self.zhilian_service = ZhilianService()
        self.company_jobs_sync = CompanyJobsSyncService()
        self._boss_token_loaded = False
        # ``time.monotonic()`` reading taken when the token was last loaded.
        self._token_loaded_at: float = 0

    def _apply_proxy(self, proxy: Optional[str]) -> None:
        """Forward ``proxy`` to every crawler service and to the jobs syncer."""
        for service in (
            self.boss_service,
            self.qcwy_service,
            self.zhilian_service,
            self.company_jobs_sync,
        ):
            service.set_proxy(proxy)

    async def _ensure_boss_token_loaded(self) -> None:
        """Load the most recently updated active ``BossToken`` into the Boss service.

        Nothing is done while a token is already loaded, the service still
        holds a non-empty ``mpt`` and the token is younger than
        ``_TOKEN_REFRESH_INTERVAL``. If no active token exists a warning is
        logged and the service is left untouched.
        """
        # Monotonic clock: wall-clock adjustments must not make the token look
        # fresher or staler than it really is.
        now = time.monotonic()
        token_is_fresh = (
            self._boss_token_loaded
            and self.boss_service.login_data.get("mpt")
            and now - self._token_loaded_at < self._TOKEN_REFRESH_INTERVAL
        )
        if token_is_fresh:
            return

        token_obj = await BossToken.filter(is_active=True).order_by("-updated_at").first()
        if not token_obj:
            logger.warning("BossToken not found or inactive in CompanyCleaner")
            return

        self.boss_service.set_login_data(token_obj.mpt or "", "")
        self._boss_token_loaded = True
        self._token_loaded_at = now

    async def collect_pending_companies(self, limit: int = 1000, source: Optional[str] = None) -> dict[str, Any]:
        """Enqueue unknown companies found in the job tables.

        Args:
            limit: Maximum number of companies to enqueue per platform.
            source: Restrict collection to one key of ``SOURCE_CONFIGS``;
                ``None`` collects from every configured platform.

        Returns:
            ``{"total_created": int, "sources": {source: per-source summary}}``.
            A platform whose collection raised gets
            ``{"source", "created_count": 0, "error"}`` instead of the normal
            summary; the other platforms are unaffected.
        """
        client = await clickhouse_manager.get_client()
        logger.info(f"Starting to collect pending companies (limit={limit}, source={source or 'all'})...")
        summary: dict[str, Any] = {
            "total_created": 0,
            "sources": {},
        }
        sources = [s for s in SOURCE_CONFIGS if source is None or source == s]
        if source is not None and not sources:
            # Previously a silent no-op; make the misconfiguration visible.
            logger.warning(f"Unknown source {source!r}; nothing to collect.")

        # Collect from each platform in parallel.
        tasks = [self._collect_source(client, s, limit) for s in sources]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for s, result in zip(sources, results):
            # gather() may hand back BaseException instances (for example
            # CancelledError), which are not Exception subclasses.
            if isinstance(result, BaseException):
                logger.error(f"Error collecting {s}: {result}")
                summary["sources"][s] = {"source": s, "created_count": 0, "error": str(result)}
            else:
                summary["sources"][s] = result
                summary["total_created"] += result["created_count"]
        logger.info("Finished collecting pending companies.")
        return summary

    async def _collect_source(self, client, source: str, limit: int) -> dict[str, Any]:
        """Collect and enqueue new companies for one platform.

        Args:
            client: ClickHouse client exposing an awaitable ``query(sql)``.
            source: Key of ``SOURCE_CONFIGS``.
            limit: Maximum number of companies to enqueue.

        Returns:
            Summary dict with the keys ``source``, ``query_count``,
            ``deduped_count``, ``existing_count``, ``queued_count`` and
            ``created_count``.
        """
        config = SOURCE_CONFIGS[source]

        # Load every company_id of this platform that is already queued or
        # already stored, so candidates can be filtered cheaply in Python.
        all_queued = set(await CompanyCleaningQueue.filter(source=source).values_list("company_id", flat=True))
        all_existing = await company_storage.get_all_company_ids(source)
        exclude_ids = all_queued | all_existing
        logger.info(f"Loaded {len(exclude_ids)} known {source} company IDs for exclusion")

        def build_summary(query_count: int, deduped_count: int, created_count: int) -> dict[str, Any]:
            return {
                "source": source,
                "query_count": query_count,
                "deduped_count": deduped_count,
                "existing_count": len(all_existing),
                "queued_count": len(all_queued),
                "created_count": created_count,
            }

        # Use OFFSET to skip as many rows as there are known companies.
        # NOTE(review): the query has no ORDER BY, so the skipped rows are not
        # guaranteed to be the known companies, and known IDs that fall outside
        # the look-back window can push the offset past the end of the result.
        # Behaviour is kept as-is; confirm whether this heuristic is intended.
        offset = len(exclude_ids)
        result = await self._query_candidate_rows(
            client=client,
            table=config["job_table"],
            company_id_expr=config["company_id_expr"],
            company_name_expr=config["company_name_expr"],
            days_back=config["days_back"],
            limit=limit,
            max_query_limit=config["max_query_limit"],
            offset=offset,
        )
        if not result.result_rows:
            logger.info(f"No new {source} companies found in job table query.")
            return build_summary(0, 0, 0)

        deduped_candidates: list[dict[str, str]] = []
        seen_ids: set[str] = set()
        for raw_company_id, company_name in result.result_rows:
            company_id = normalize_company_id(source, raw_company_id)
            if not company_id or company_id in seen_ids or company_id in exclude_ids:
                continue
            seen_ids.add(company_id)
            deduped_candidates.append(
                {
                    "company_id": company_id,
                    "company_name": (company_name or "").strip(),
                }
            )
            if len(deduped_candidates) >= limit:
                break

        created_count = await company_storage.enqueue_companies(source, deduped_candidates)
        logger.info(f"Added {created_count} {source} companies to MySQL queue.")
        return build_summary(len(result.result_rows), len(deduped_candidates), created_count)

    async def _query_candidate_rows(
        self,
        *,
        client,
        table: str,
        company_id_expr: str,
        company_name_expr: str,
        days_back: int,
        limit: int,
        max_query_limit: Optional[int],
        offset: int = 0,
    ):
        """Query distinct ``(company_id, company_name)`` pairs from a job table.

        The query is attempted up to three times, and only errors whose message
        mentions "memory" are retried:

        1. Full window: ``days_back`` days, ``limit * 5`` rows (capped by
           ``max_query_limit``), caller-supplied ``offset``.
        2. Reduced: at most 3 days, at most ``max(limit, 50)`` rows, offset 0.
        3. Same reduced settings, plus ``SAMPLE 0.1``.

        Args:
            client: ClickHouse client exposing an awaitable ``query(sql)``.
            table: Table name inside the ``job_data`` database.
            company_id_expr: SQL expression yielding the company ID.
            company_name_expr: SQL expression yielding the company name.
            days_back: Look-back window in days for ``created_at``.
            limit: Number of companies the caller ultimately wants.
            max_query_limit: Optional hard cap on the SQL LIMIT.
            offset: SQL OFFSET for the first attempt.

        Returns:
            Whatever ``client.query`` returns for the first successful attempt.

        Raises:
            Exception: A non-memory error is re-raised immediately; if every
                attempt fails with a memory error, the last one is raised.
        """
        current_days = days_back
        current_limit = limit * 5
        if max_query_limit is not None:
            current_limit = min(current_limit, max_query_limit)
        current_offset = offset
        sample_sql = ""

        last_error: Optional[Exception] = None
        for attempt in range(self._QUERY_ATTEMPTS):
            if attempt == 1:
                # First fallback: shrink the scan. The reduced settings,
                # including OFFSET 0, stay in force for later attempts; the
                # caller's offset was sized for the full window and would skip
                # past a reduced or sampled result.
                current_days = max(1, min(current_days, 3))
                current_limit = min(current_limit, max(limit, 50))
                current_offset = 0
            elif attempt == 2:
                # Last fallback: additionally read only a 10% sample.
                sample_sql = " SAMPLE 0.1"

            query = f"""
                SELECT DISTINCT
                    {company_id_expr} AS cid,
                    {company_name_expr} AS cname
                FROM job_data.{table}{sample_sql}
                PREWHERE created_at > now() - INTERVAL {current_days} DAY
                    AND json_data != ''
                WHERE {company_id_expr} != ''
                LIMIT {current_limit} OFFSET {current_offset}
            """
            try:
                logger.info(
                    f"Querying company candidates from {table} "
                    f"(days={current_days}, limit={current_limit}, attempt={attempt + 1})"
                )
                return await client.query(query)
            except Exception as exc:
                if "memory" not in str(exc).lower():
                    raise
                last_error = exc
                logger.warning(f"Memory-sensitive query retry for {table}: {exc}")

        # Only reachable after every attempt failed with a memory error.
        if last_error is None:
            raise RuntimeError(f"Candidate query for {table} finished without a result or an error")
        raise last_error

    async def _sleep_random_delay(self, max_delay_seconds: int) -> None:
        """Sleep a random whole number of seconds in ``[1, max_delay_seconds]``.

        Does nothing when ``max_delay_seconds`` is falsy or not positive.
        """
        if max_delay_seconds and max_delay_seconds > 0:
            await asyncio.sleep(random.randint(1, max_delay_seconds))

    async def _record_failure(self, queue, source: str, company_id: str, exc: Exception) -> None:
        """Log a processing error and mark the queue row failed with a retry bump."""
        logger.error(f"Error processing {source} {company_id}: {exc}")
        await company_storage.mark_queue_result(
            queue,
            status="failed",
            error_msg=str(exc),
            increment_retry=True,
        )

    async def process_single_company(
        self,
        source: str,
        company_id: str,
        proxy: Optional[str] = None,
        max_delay_seconds: int = 5,
    ) -> dict[str, Any]:
        """Enqueue (if needed) and immediately process one company.

        Args:
            source: Platform key (``zhilian``, ``qcwy`` or ``boss``).
            company_id: Raw company ID; normalised before use.
            proxy: Optional proxy applied to all services when truthy.
            max_delay_seconds: Upper bound of a random delay slept before
                processing; ``0`` disables the delay.

        Returns:
            On success: ``success=True`` plus ``source``, ``company_id``,
            ``company_name``, ``status="done"``, ``error_msg=""``, ``created``
            and ``jobs_summary``. On failure: ``success=False`` plus
            ``source``, ``company_id``, ``company_name``, ``status="failed"``
            and ``error_msg``. Crawl and storage errors are reported through
            the return value, not raised.
        """
        normalized_id = normalize_company_id(source, company_id)
        queue, _ = await company_storage.enqueue_company(source, normalized_id)
        if proxy:
            self._apply_proxy(proxy)
        await self._sleep_random_delay(max_delay_seconds)

        await company_storage.mark_queue_processing(queue)
        try:
            persist_result = await self._fetch_and_save(source, normalized_id)
            jobs_result = await self._sync_company_jobs(source, normalized_id)
            if persist_result["company_name"] and queue.company_name != persist_result["company_name"]:
                queue.company_name = persist_result["company_name"]
            await company_storage.mark_queue_result(queue, status="done", jobs_summary=jobs_result)
            return {
                "success": True,
                "source": source,
                "company_id": normalized_id,
                "company_name": persist_result["company_name"],
                "status": "done",
                "error_msg": "",
                "created": persist_result["created"],
                "jobs_summary": jobs_result,
            }
        except Exception as exc:
            await self._record_failure(queue, source, normalized_id, exc)
            return {
                "success": False,
                "source": source,
                "company_id": normalized_id,
                "company_name": queue.company_name or "",
                "status": "failed",
                "error_msg": str(exc),
            }

    async def process_pending_companies(
        self,
        limit: int = 100,
        source: Optional[str] = None,
        proxy: Optional[str] = None,
        max_delay_seconds: int = 0,
    ):
        """Process up to ``limit`` pending queue rows, oldest first.

        Each row is marked processing, crawled, persisted, job-synced and then
        marked ``done``; any exception during those steps marks that row
        ``failed`` (with a retry increment) and processing continues with the
        next row.

        Args:
            limit: Maximum number of queue rows to process.
            source: Restrict processing to one platform; ``None`` means all.
            proxy: Optional proxy applied to all services when truthy.
            max_delay_seconds: Upper bound of a random per-company delay;
                ``0`` disables it.
        """
        logger.info(f"Processing pending companies (limit={limit}, source={source or 'all'})...")
        if proxy:
            self._apply_proxy(proxy)

        query = CompanyCleaningQueue.filter(status="pending")
        if source:
            query = query.filter(source=source)
        queue_rows = await query.order_by("created_at").limit(limit)
        if not queue_rows:
            logger.info("No pending companies to process.")
            return

        for queue in queue_rows:
            logger.info(f"Processing {queue.source} company: {queue.company_name or ''} ({queue.company_id})")
            await company_storage.mark_queue_processing(queue)
            try:
                await self._sleep_random_delay(max_delay_seconds)
                persist_result = await self._fetch_and_save(queue.source, queue.company_id)
                jobs_result = await self._sync_company_jobs(queue.source, queue.company_id)
                # .get(): a summary missing a counter must not turn an
                # already-persisted company into a "failed" queue row.
                logger.info(
                    f"Synced {queue.source} company jobs: "
                    f"fetched={jobs_result.get('jobs_fetched')} stored={jobs_result.get('stored_success')} "
                    f"duplicate={jobs_result.get('duplicate')} failed={jobs_result.get('failed')}"
                )
                if persist_result["company_name"] and queue.company_name != persist_result["company_name"]:
                    queue.company_name = persist_result["company_name"]
                await company_storage.mark_queue_result(queue, status="done", jobs_summary=jobs_result)
            except Exception as exc:
                await self._record_failure(queue, queue.source, queue.company_id, exc)

    async def _fetch_and_save(self, source: str, company_id: str) -> dict[str, Any]:
        """Fetch a company's detail and upsert it through ``company_storage``.

        Returns:
            The result of ``company_storage.upsert_company``; callers in this
            class read its ``company_name`` and ``created`` keys.

        Raises:
            ValueError: If the crawler returned no data, or ``source`` is not
                supported.
        """
        data = await self._fetch_company_data(source, company_id)
        if not data:
            raise ValueError(f"No data returned from source={source} company_id={company_id}")
        return await company_storage.upsert_company(source, data, company_id=company_id)

    async def _sync_company_jobs(self, source: str, company_id: str) -> dict[str, Any]:
        """Sync a company's jobs, never raising.

        Job syncing is best-effort: on any exception a warning is logged and a
        zeroed summary carrying ``success=False`` and the error text is
        returned, so the company itself can still be marked as processed.
        """
        try:
            return await self.company_jobs_sync.sync_company_jobs(source, company_id)
        except Exception as exc:
            logger.warning(f"Sync company jobs failed for {source} {company_id}: {exc}")
            return {
                "success": False,
                "source": source,
                "company_id": company_id,
                "jobs_fetched": 0,
                "stored_success": 0,
                "duplicate": 0,
                "failed": 0,
                "error": str(exc),
            }

    async def _fetch_company_data(self, source: str, company_id: str) -> dict[str, Any]:
        """Fetch raw company detail from the crawler service for ``source``.

        The crawler calls are run in a worker thread via ``asyncio.to_thread``.
        For ``boss`` the login token is loaded or refreshed first.

        Returns:
            The crawler's payload, or ``{}`` when it returned a falsy value.

        Raises:
            ValueError: If ``source`` is not one of the supported platforms.
        """
        if source == "zhilian":
            fetch = self.zhilian_service.get_company_detail
        elif source == "qcwy":
            fetch = self.qcwy_service.get_company_info
        elif source == "boss":
            await self._ensure_boss_token_loaded()
            fetch = self.boss_service.get_company_detail_by_id
        else:
            raise ValueError(f"unsupported source: {source}")

        data = await asyncio.to_thread(fetch, company_id)
        return data or {}

    async def cleanup_old_records(self):
        """Delete every queue row whose status is ``done`` or ``failed``.

        NOTE(review): despite the method name, no age filter is applied, so
        all finished rows are removed, including failed ones.
        """
        logger.info("Starting cleanup of processed pending companies...")
        await CompanyCleaningQueue.filter(status__in=["done", "failed"]).delete()
|
||
|
||
|
||
# Module-level CompanyCleaner instance, constructed when this module is imported.
company_cleaner = CompanyCleaner()
|