"""Company cleaning pipeline.

Collects candidate companies from the per-platform ClickHouse job tables,
enqueues the ones that are not yet known, and processes queued companies by
fetching their details from the matching crawler service and syncing their
jobs.
"""

import asyncio
import random
import time
from typing import Any, Optional

from loguru import logger

from app.core.clickhouse import clickhouse_manager
from app.models.company import CompanyCleaningQueue
from app.models.token import BossToken
from app.services.company_jobs_sync import CompanyJobsSyncService
from app.services.company_storage import company_storage, normalize_company_id
from app.services.crawler.boss import BossService
from app.services.crawler.qcwy import QcwyService
from app.services.crawler.zhilian import ZhilianService

# Per-platform settings used to build the candidate query:
#   job_table          table name inside the ``job_data`` database
#   company_id_expr    SQL expression extracting the company id from json_data
#   company_name_expr  SQL expression extracting the company name from json_data
#   days_back          look-back window (days) applied to ``created_at``
#   max_query_limit    optional upper bound for the query LIMIT (None = no cap)
SOURCE_CONFIGS: dict[str, dict[str, Any]] = {
    "zhilian": {
        "job_table": "zhilian_job",
        "company_id_expr": "JSONExtractString(json_data, 'companyNumber')",
        "company_name_expr": "JSONExtractString(json_data, 'companyName')",
        "days_back": 30,
        "max_query_limit": None,
    },
    "qcwy": {
        "job_table": "qcwy_job",
        "company_id_expr": "JSONExtractString(json_data, 'coId')",
        "company_name_expr": "JSONExtractString(json_data, 'companyName')",
        "days_back": 30,
        "max_query_limit": 5000,
    },
    "boss": {
        "job_table": "boss_job",
        "company_id_expr": "JSONExtractString(json_data, 'brandComInfoVO', 'encryptBrandId')",
        "company_name_expr": "JSONExtractString(json_data, 'brandComInfoVO', 'brandName')",
        "days_back": 30,
        "max_query_limit": None,
    },
}


class CompanyCleaner:
    """Collect, enqueue and process companies for every configured source."""

    # Seconds after which the cached Boss token is re-read from BossToken.
    _TOKEN_REFRESH_INTERVAL = 3600

    def __init__(self) -> None:
        self.boss_service = BossService()
        self.qcwy_service = QcwyService()
        self.zhilian_service = ZhilianService()
        self.company_jobs_sync = CompanyJobsSyncService()
        self._boss_token_loaded = False
        self._token_loaded_at: float = 0

    def _apply_proxy(self, proxy: Optional[str]) -> None:
        """Forward *proxy* to every crawler service and to the jobs-sync service."""
        for service in (
            self.boss_service,
            self.qcwy_service,
            self.zhilian_service,
            self.company_jobs_sync,
        ):
            service.set_proxy(proxy)

    async def _ensure_boss_token_loaded(self) -> None:
        """Load the most recently updated active BossToken into the Boss service.

        The lookup is skipped while a token is already loaded, the service still
        holds an ``mpt`` value, and the last load is younger than
        ``_TOKEN_REFRESH_INTERVAL`` seconds. When no active token exists a
        warning is logged and the service is left untouched.
        """
        now = time.time()
        token_is_fresh = (
            self._boss_token_loaded
            and self.boss_service.login_data.get("mpt")
            and now - self._token_loaded_at < self._TOKEN_REFRESH_INTERVAL
        )
        if token_is_fresh:
            return

        token_obj = await BossToken.filter(is_active=True).order_by("-updated_at").first()
        if not token_obj:
            logger.warning("BossToken not found or inactive in CompanyCleaner")
            return

        self.boss_service.set_login_data(token_obj.mpt or "", "")
        self._boss_token_loaded = True
        self._token_loaded_at = now

    async def collect_pending_companies(self, limit: int = 1000, source: Optional[str] = None) -> dict[str, Any]:
        """Collect new companies from the job tables and enqueue them.

        Args:
            limit: Maximum number of companies to enqueue per source.
            source: Restrict collection to one key of ``SOURCE_CONFIGS``; a
                falsy value (``None`` or ``""``) collects every source.

        Returns:
            ``{"total_created": int, "sources": {source: per-source summary}}``.
            A source whose collection raised gets a summary with
            ``created_count == 0`` and an ``"error"`` string instead of
            aborting the others.
        """
        client = await clickhouse_manager.get_client()
        logger.info(f"Starting to collect pending companies (limit={limit}, source={source or 'all'})...")

        summary: dict[str, Any] = {
            "total_created": 0,
            "sources": {},
        }

        # A falsy source means "all", matching both the log line above and
        # the filter used by process_pending_companies.
        sources = [s for s in SOURCE_CONFIGS if not source or source == s]
        if not sources:
            logger.warning(f"Unknown source {source!r}; nothing to collect.")

        # Collect every platform concurrently.
        results = await asyncio.gather(
            *(self._collect_source(client, s, limit) for s in sources),
            return_exceptions=True,
        )

        for s, result in zip(sources, results):
            # BaseException (not Exception) so a cancelled child task, which
            # gather returns as CancelledError, is reported rather than indexed.
            if isinstance(result, BaseException):
                logger.error(f"Error collecting {s}: {result}")
                summary["sources"][s] = {"source": s, "created_count": 0, "error": str(result)}
                continue
            summary["sources"][s] = result
            summary["total_created"] += result["created_count"]

        logger.info("Finished collecting pending companies.")
        return summary

    async def _collect_source(self, client, source: str, limit: int) -> dict[str, Any]:
        """Query one source's job table and enqueue companies not seen before.

        Returns a summary dict with the keys ``source``, ``query_count``,
        ``deduped_count``, ``existing_count``, ``queued_count`` and
        ``created_count``.
        """
        config = SOURCE_CONFIGS[source]

        # Load every already-queued / already-stored company_id for this
        # platform up front so exclusion can be done quickly on the Python side.
        all_queued = set(await CompanyCleaningQueue.filter(source=source).values_list("company_id", flat=True))
        all_existing = await company_storage.get_all_company_ids(source)
        exclude_ids = all_queued | all_existing
        logger.info(f"Loaded {len(exclude_ids)} known {source} company IDs for exclusion")

        # Skip as many rows as there are known companies, to reach new ones.
        # NOTE(review): the candidate query has no ORDER BY, so which rows the
        # OFFSET skips is not guaranteed; the exclude_ids check below is what
        # actually enforces exclusion. Consider ordered pagination instead.
        offset = len(exclude_ids)

        result = await self._query_candidate_rows(
            client=client,
            table=config["job_table"],
            company_id_expr=config["company_id_expr"],
            company_name_expr=config["company_name_expr"],
            days_back=config["days_back"],
            limit=limit,
            max_query_limit=config["max_query_limit"],
            offset=offset,
        )

        if not result.result_rows:
            logger.info(f"No new {source} companies found in job table query.")
            return {
                "source": source,
                "query_count": 0,
                "deduped_count": 0,
                "existing_count": len(all_existing),
                "queued_count": len(all_queued),
                "created_count": 0,
            }

        deduped_candidates: list[dict[str, str]] = []
        seen_ids: set[str] = set()
        for raw_company_id, company_name in result.result_rows:
            company_id = normalize_company_id(source, raw_company_id)
            if not company_id or company_id in seen_ids or company_id in exclude_ids:
                continue
            seen_ids.add(company_id)
            deduped_candidates.append(
                {
                    "company_id": company_id,
                    "company_name": (company_name or "").strip(),
                }
            )
            if len(deduped_candidates) >= limit:
                break

        created_count = await company_storage.enqueue_companies(source, deduped_candidates)
        logger.info(f"Added {created_count} {source} companies to MySQL queue.")

        return {
            "source": source,
            "query_count": len(result.result_rows),
            "deduped_count": len(deduped_candidates),
            "existing_count": len(all_existing),
            "queued_count": len(all_queued),
            "created_count": created_count,
        }

    async def _query_candidate_rows(
        self,
        *,
        client,
        table: str,
        company_id_expr: str,
        company_name_expr: str,
        days_back: int,
        limit: int,
        max_query_limit: Optional[int],
        offset: int = 0,
    ):
        """Run the DISTINCT (company id, company name) query with memory-aware retries.

        Up to three attempts are made:
          1. the requested window, ``limit * 5`` rows (capped by
             ``max_query_limit``) and the requested ``offset``;
          2. window narrowed to at most 3 days, a smaller LIMIT, OFFSET 0;
          3. same as attempt 2 but with ``SAMPLE 0.1`` on the table.

        Only errors whose message contains "memory" trigger a retry; any other
        error is re-raised immediately.

        Returns:
            Whatever ``client.query`` returns for the first successful attempt.

        Raises:
            Exception: the last memory error when all attempts fail, or the
                first non-memory error.
        """
        current_days = days_back
        current_limit = limit * 5
        if max_query_limit is not None:
            current_limit = min(current_limit, max_query_limit)
        current_offset = offset

        last_error: Optional[Exception] = None
        for attempt in range(3):
            sample_sql = " SAMPLE 0.1" if attempt == 2 else ""
            if attempt == 1:
                current_days = max(1, min(current_days, 3))
                current_limit = min(current_limit, max(limit, 50))
                # The offset was sized for the full window; once the window is
                # narrowed it stays at 0 for this and every later attempt.
                current_offset = 0

            query = f"""
                SELECT DISTINCT
                    {company_id_expr} AS cid,
                    {company_name_expr} AS cname
                FROM job_data.{table}{sample_sql}
                PREWHERE created_at > now() - INTERVAL {current_days} DAY
                    AND json_data != ''
                WHERE {company_id_expr} != ''
                LIMIT {current_limit} OFFSET {current_offset}
            """
            try:
                logger.info(
                    f"Querying company candidates from {table} "
                    f"(days={current_days}, limit={current_limit}, attempt={attempt + 1})"
                )
                return await client.query(query)
            except Exception as exc:
                if "memory" not in str(exc).lower():
                    raise
                last_error = exc
                logger.warning(f"Memory-sensitive query retry for {table}: {exc}")

        # Every attempt failed with a memory error.
        if last_error is None:  # defensive: the loop always runs at least once
            raise RuntimeError(f"Candidate query for {table} finished without a result")
        raise last_error

    @staticmethod
    async def _random_delay(max_delay_seconds: int) -> None:
        """Sleep a random whole number of seconds in [1, max_delay_seconds]; no-op if not positive."""
        if max_delay_seconds and max_delay_seconds > 0:
            await asyncio.sleep(random.randint(1, max_delay_seconds))

    @staticmethod
    async def _mark_done(queue, persist_result: dict[str, Any], jobs_result: dict[str, Any]) -> None:
        """Copy a newly fetched company name onto *queue* and mark it as done."""
        fetched_name = persist_result["company_name"]
        if fetched_name and queue.company_name != fetched_name:
            queue.company_name = fetched_name
        await company_storage.mark_queue_result(queue, status="done", jobs_summary=jobs_result)

    @staticmethod
    async def _mark_failed(queue, exc: Exception) -> None:
        """Mark *queue* as failed with the error text and bump its retry counter."""
        await company_storage.mark_queue_result(
            queue,
            status="failed",
            error_msg=str(exc),
            increment_retry=True,
        )

    async def process_single_company(
        self,
        source: str,
        company_id: str,
        proxy: Optional[str] = None,
        max_delay_seconds: int = 5,
    ) -> dict[str, Any]:
        """Enqueue (if needed) and immediately process one company.

        Args:
            source: Platform key (``"zhilian"``, ``"qcwy"`` or ``"boss"``).
            company_id: Raw company id; it is normalized before use.
            proxy: Optional proxy applied to all services before fetching.
            max_delay_seconds: Upper bound of the random delay slept before
                processing; a value <= 0 disables the delay.

        Returns:
            A result dict with ``success``, ``source``, ``company_id``,
            ``company_name``, ``status`` and ``error_msg``; on success it also
            carries ``created`` and ``jobs_summary``. Processing errors are
            reported through this dict rather than raised.
        """
        normalized_id = normalize_company_id(source, company_id)
        queue, _ = await company_storage.enqueue_company(source, normalized_id)

        if proxy:
            self._apply_proxy(proxy)

        await self._random_delay(max_delay_seconds)
        await company_storage.mark_queue_processing(queue)

        try:
            persist_result = await self._fetch_and_save(source, normalized_id)
            jobs_result = await self._sync_company_jobs(source, normalized_id)
            await self._mark_done(queue, persist_result, jobs_result)
            return {
                "success": True,
                "source": source,
                "company_id": normalized_id,
                "company_name": persist_result["company_name"],
                "status": "done",
                "error_msg": "",
                "created": persist_result["created"],
                "jobs_summary": jobs_result,
            }
        except Exception as exc:
            logger.error(f"Error processing {source} {normalized_id}: {exc}")
            await self._mark_failed(queue, exc)
            return {
                "success": False,
                "source": source,
                "company_id": normalized_id,
                "company_name": queue.company_name or "",
                "status": "failed",
                "error_msg": str(exc),
            }

    async def process_pending_companies(
        self,
        limit: int = 100,
        source: Optional[str] = None,
        proxy: Optional[str] = None,
        max_delay_seconds: int = 0,
    ) -> None:
        """Process up to *limit* pending queue rows, oldest first.

        Args:
            limit: Maximum number of queue rows to process.
            source: Restrict processing to one platform; falsy means all.
            proxy: Optional proxy applied to all services before fetching.
            max_delay_seconds: Upper bound of the random delay slept before
                each company; a value <= 0 disables the delay.

        A failure while processing one company is logged and recorded on its
        queue row; the remaining rows are still processed.
        """
        logger.info(f"Processing pending companies (limit={limit}, source={source or 'all'})...")

        if proxy:
            self._apply_proxy(proxy)

        query = CompanyCleaningQueue.filter(status="pending")
        if source:
            query = query.filter(source=source)

        queue_rows = await query.order_by("created_at").limit(limit)
        if not queue_rows:
            logger.info("No pending companies to process.")
            return

        for queue in queue_rows:
            logger.info(f"Processing {queue.source} company: {queue.company_name or ''} ({queue.company_id})")
            await company_storage.mark_queue_processing(queue)

            try:
                await self._random_delay(max_delay_seconds)
                persist_result = await self._fetch_and_save(queue.source, queue.company_id)
                jobs_result = await self._sync_company_jobs(queue.source, queue.company_id)
                # .get(): a summary lacking a counter must not turn an
                # already-saved company into a failed one just for logging.
                logger.info(
                    f"Synced {queue.source} company jobs: "
                    f"fetched={jobs_result.get('jobs_fetched', 0)} stored={jobs_result.get('stored_success', 0)} "
                    f"duplicate={jobs_result.get('duplicate', 0)} failed={jobs_result.get('failed', 0)}"
                )
                await self._mark_done(queue, persist_result, jobs_result)
            except Exception as exc:
                logger.error(f"Error processing {queue.source} {queue.company_id}: {exc}")
                await self._mark_failed(queue, exc)

    async def _fetch_and_save(self, source: str, company_id: str) -> dict[str, Any]:
        """Fetch the company from its platform and upsert it into storage.

        Raises:
            ValueError: when the platform returned no data (or the source is
                unsupported).
        """
        data = await self._fetch_company_data(source, company_id)
        if not data:
            raise ValueError(f"No data returned from source={source} company_id={company_id}")
        return await company_storage.upsert_company(source, data, company_id=company_id)

    async def _sync_company_jobs(self, source: str, company_id: str) -> dict[str, Any]:
        """Sync the company's jobs, best effort.

        A failure is logged as a warning and converted into a zeroed summary
        with ``success=False`` and the error text, so it never fails the
        company itself.
        """
        try:
            return await self.company_jobs_sync.sync_company_jobs(source, company_id)
        except Exception as exc:
            logger.warning(f"Sync company jobs failed for {source} {company_id}: {exc}")
            return {
                "success": False,
                "source": source,
                "company_id": company_id,
                "jobs_fetched": 0,
                "stored_success": 0,
                "duplicate": 0,
                "failed": 0,
                "error": str(exc),
            }

    async def _fetch_company_data(self, source: str, company_id: str) -> dict[str, Any]:
        """Fetch raw company data from the crawler service matching *source*.

        The service call is run in a worker thread via ``asyncio.to_thread``.
        For ``"boss"`` the login token is loaded or refreshed first.

        Returns:
            The service's data, or ``{}`` when it returned a falsy value.

        Raises:
            ValueError: for a source other than zhilian / qcwy / boss.
        """
        if source == "zhilian":
            fetch = self.zhilian_service.get_company_detail
        elif source == "qcwy":
            fetch = self.qcwy_service.get_company_info
        elif source == "boss":
            await self._ensure_boss_token_loaded()
            fetch = self.boss_service.get_company_detail_by_id
        else:
            raise ValueError(f"unsupported source: {source}")

        data = await asyncio.to_thread(fetch, company_id)
        return data or {}

    async def cleanup_old_records(self):
        """Delete every queue row whose status is ``done`` or ``failed``."""
        logger.info("Starting cleanup of processed pending companies...")
        await CompanyCleaningQueue.filter(status__in=["done", "failed"]).delete()


# Module-level shared instance.
company_cleaner = CompanyCleaner()