# JobData/app/services/company_cleaner.py

import asyncio
import random
import time
from typing import Any, Optional

from loguru import logger

from app.core.clickhouse import clickhouse_manager
from app.models.company import CompanyCleaningQueue
from app.models.token import BossToken
from app.services.company_jobs_sync import CompanyJobsSyncService
from app.services.company_storage import company_storage, normalize_company_id
from app.services.crawler.boss import BossService
from app.services.crawler.qcwy import QcwyService
from app.services.crawler.zhilian import ZhilianService


# Per-platform settings used when scanning the ClickHouse job tables for companies.
#   job_table         - table name, queried as job_data.<job_table>
#   company_id_expr   - SQL expression that extracts the company id from json_data
#   company_name_expr - SQL expression that extracts the company name from json_data
#   days_back         - look-back window in days, compared against created_at
#   max_query_limit   - optional cap on the rows requested per query (None = no cap)
SOURCE_CONFIGS: dict[str, dict[str, Any]] = dict(
    zhilian=dict(
        job_table="zhilian_job",
        company_id_expr="JSONExtractString(json_data, 'companyNumber')",
        company_name_expr="JSONExtractString(json_data, 'companyName')",
        days_back=30,
        max_query_limit=None,
    ),
    qcwy=dict(
        job_table="qcwy_job",
        company_id_expr="JSONExtractString(json_data, 'coId')",
        company_name_expr="JSONExtractString(json_data, 'companyName')",
        days_back=30,
        max_query_limit=5000,
    ),
    boss=dict(
        job_table="boss_job",
        company_id_expr="JSONExtractString(json_data, 'brandComInfoVO', 'encryptBrandId')",
        company_name_expr="JSONExtractString(json_data, 'brandComInfoVO', 'brandName')",
        days_back=30,
        max_query_limit=None,
    ),
)


class CompanyCleaner:
    """Collect company candidates from the job tables into a cleaning queue and process that queue.

    Processing a queue entry fetches the company detail from the matching
    platform service (zhilian, qcwy or boss), stores it, and then syncs that
    company's jobs.
    """
    # Seconds a loaded Boss token is reused before it is re-read from BossToken.
    _TOKEN_REFRESH_INTERVAL = 3600

    def __init__(self):
        """Reset the Boss token cache state and build the platform services."""
        # Token cache starts empty, so the first Boss fetch has to load a token.
        self._token_loaded_at: float = 0
        self._boss_token_loaded = False

        # One crawler service per supported platform.
        self.boss_service = BossService()
        self.qcwy_service = QcwyService()
        self.zhilian_service = ZhilianService()

        # Service used to sync a company's jobs after its detail is stored.
        self.company_jobs_sync = CompanyJobsSyncService()

    def _apply_proxy(self, proxy: Optional[str]) -> None:
        """Forward ``proxy`` to ``set_proxy`` on every crawler service and on the jobs sync service."""
        # Attributes are resolved one at a time, in the original call order.
        for attr in ("boss_service", "qcwy_service", "zhilian_service", "company_jobs_sync"):
            getattr(self, attr).set_proxy(proxy)

    async def _ensure_boss_token_loaded(self) -> None:
        """Load the most recently updated active BossToken into the Boss service when needed.

        Nothing is reloaded while a token has already been loaded, the Boss
        service still holds a non-empty ``mpt`` and the last load is younger
        than ``_TOKEN_REFRESH_INTERVAL`` seconds. When no active token exists a
        warning is logged and the current state is left untouched.
        """
        now = time.time()
        cache_is_fresh = (
            self._boss_token_loaded
            and self.boss_service.login_data.get("mpt")
            and now - self._token_loaded_at < self._TOKEN_REFRESH_INTERVAL
        )
        if cache_is_fresh:
            return

        latest_token = await BossToken.filter(is_active=True).order_by("-updated_at").first()
        if latest_token:
            self.boss_service.set_login_data(latest_token.mpt or "", "")
            self._boss_token_loaded = True
            self._token_loaded_at = now
            return
        logger.warning("BossToken not found or inactive in CompanyCleaner")

    async def collect_pending_companies(self, limit: int = 1000, source: Optional[str] = None) -> dict[str, Any]:
        """Discover new companies in the job tables and enqueue them for cleaning.

        Args:
            limit: Maximum number of companies to enqueue per platform.
            source: Platform key from ``SOURCE_CONFIGS``. A falsy value
                (``None`` or ``""``) collects from every platform.

        Returns:
            ``{"total_created": int, "sources": {platform: summary}}``. A
            platform whose collection failed gets ``created_count`` 0 and an
            ``error`` string; the other platforms are still collected.
        """
        client = await clickhouse_manager.get_client()
        logger.info(f"Starting to collect pending companies (limit={limit}, source={source or 'all'})...")
        summary: dict[str, Any] = {
            "total_created": 0,
            "sources": {},
        }
        # Treat "" like None so the selection agrees with the "all" label logged above.
        sources = [s for s in SOURCE_CONFIGS if not source or source == s]
        if not sources:
            logger.warning(f"Unknown source {source!r}; nothing to collect.")
        # Collect from all selected platforms concurrently.
        tasks = [self._collect_source(client, s, limit) for s in sources]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for s, result in zip(sources, results):
            # BaseException rather than Exception: a cancelled child comes back
            # as CancelledError, which does not derive from Exception.
            if isinstance(result, BaseException):
                logger.error(f"Error collecting {s}: {result}")
                summary["sources"][s] = {"source": s, "created_count": 0, "error": str(result)}
            else:
                summary["sources"][s] = result
                summary["total_created"] += result["created_count"]
        logger.info("Finished collecting pending companies.")
        return summary

    async def _collect_source(self, client, source: str, limit: int) -> dict[str, Any]:
        """Query one platform's job table for unseen companies and enqueue them.

        Args:
            client: ClickHouse client handed to ``_query_candidate_rows``.
            source: Platform key present in ``SOURCE_CONFIGS``.
            limit: Maximum number of new companies to enqueue.

        Returns:
            Summary dict with ``source``, ``query_count``, ``deduped_count``,
            ``existing_count``, ``queued_count`` and ``created_count``.
        """
        cfg = SOURCE_CONFIGS[source]

        # Load every company_id already queued or already stored for this
        # platform so known companies can be dropped quickly on the Python side.
        queued_rows = await CompanyCleaningQueue.filter(source=source).values_list("company_id", flat=True)
        queued_ids = set(queued_rows)
        stored_ids = await company_storage.get_all_company_ids(source)
        known_ids = queued_ids | stored_ids
        logger.info(f"Loaded {len(known_ids)} known {source} company IDs for exclusion")

        def build_summary(query_count: int, deduped_count: int, created_count: int) -> dict[str, Any]:
            return {
                "source": source,
                "query_count": query_count,
                "deduped_count": deduped_count,
                "existing_count": len(stored_ids),
                "queued_count": len(queued_ids),
                "created_count": created_count,
            }

        # OFFSET is set to the number of known companies to get past them.
        # NOTE(review): the query has no ORDER BY and only covers the look-back
        # window, so skipping this many rows does not necessarily skip exactly
        # the known companies - confirm this heuristic is intended.
        response = await self._query_candidate_rows(
            client=client,
            table=cfg["job_table"],
            company_id_expr=cfg["company_id_expr"],
            company_name_expr=cfg["company_name_expr"],
            days_back=cfg["days_back"],
            limit=limit,
            max_query_limit=cfg["max_query_limit"],
            offset=len(known_ids),
        )
        rows = response.result_rows
        if not rows:
            logger.info(f"No new {source} companies found in job table query.")
            return build_summary(0, 0, 0)

        # Insertion-ordered mapping of new company_id -> cleaned company name.
        fresh: dict[str, str] = {}
        for raw_id, raw_name in rows:
            cid = normalize_company_id(source, raw_id)
            if not cid:
                continue
            if cid in fresh or cid in known_ids:
                continue
            fresh[cid] = (raw_name or "").strip()
            if len(fresh) >= limit:
                break
        candidates: list[dict[str, str]] = [
            {"company_id": cid, "company_name": name} for cid, name in fresh.items()
        ]

        created = await company_storage.enqueue_companies(source, candidates)
        logger.info(f"Added {created} {source} companies to MySQL queue.")
        return build_summary(len(rows), len(candidates), created)

    async def _query_candidate_rows(
        self,
        *,
        client,
        table: str,
        company_id_expr: str,
        company_name_expr: str,
        days_back: int,
        limit: int,
        max_query_limit: Optional[int],
        offset: int = 0,
    ):
        """Run the DISTINCT (company id, company name) query, degrading it after memory errors.

        Attempt 1 uses the requested window, ``limit * 5`` rows (capped by
        ``max_query_limit``) and ``offset``. After a memory error, attempt 2
        narrows the window to at most 3 days, shrinks the row limit and drops
        the offset. Attempt 3 keeps those reduced settings and adds
        ``SAMPLE 0.1``.

        Args:
            client: Object exposing an awaitable ``query(sql)``.
            table: Table name inside the ``job_data`` database.
            company_id_expr: SQL expression yielding the company id.
            company_name_expr: SQL expression yielding the company name.
            days_back: Look-back window in days for the first attempt.
            limit: Number of companies the caller wants; drives the row limit.
            max_query_limit: Optional upper bound for the row limit.
            offset: Rows to skip on the first attempt.

        Returns:
            Whatever ``client.query`` returns for the first attempt that succeeds.

        Raises:
            Exception: A non-memory error is re-raised immediately; a memory
                error is re-raised once all three attempts have failed.
        """
        # NOTE: identifiers and expressions are interpolated into the SQL text;
        # the caller visible in this file passes values from SOURCE_CONFIGS only.
        max_attempts = 3
        current_days = days_back
        current_limit = limit * 5
        if max_query_limit is not None:
            current_limit = min(current_limit, max_query_limit)
        # Kept outside the loop so the reset done on the first retry also holds
        # for the sampled retry. Previously the third attempt fell back to the
        # original offset while still using the narrowed window and limit.
        current_offset = offset

        for attempt in range(max_attempts):
            sample_sql = " SAMPLE 0.1" if attempt == 2 else ""
            if attempt == 1:
                current_days = max(1, min(current_days, 3))
                current_limit = min(current_limit, max(limit, 50))
                current_offset = 0
            query = f"""
            SELECT DISTINCT
                {company_id_expr} AS cid,
                {company_name_expr} AS cname
            FROM job_data.{table}{sample_sql}
            PREWHERE created_at > now() - INTERVAL {current_days} DAY
                AND json_data != ''
            WHERE {company_id_expr} != ''
            LIMIT {current_limit} OFFSET {current_offset}
            """
            try:
                logger.info(
                    f"Querying company candidates from {table} "
                    f"(days={current_days}, limit={current_limit}, attempt={attempt + 1})"
                )
                return await client.query(query)
            except Exception as exc:
                # Only memory-related failures are worth retrying with a lighter query.
                if "memory" not in str(exc).lower():
                    raise
                logger.warning(f"Memory-sensitive query retry for {table}: {exc}")
                if attempt == max_attempts - 1:
                    # Out of fallbacks: surface the last memory error.
                    raise

    async def process_single_company(
        self,
        source: str,
        company_id: str,
        proxy: Optional[str] = None,
        max_delay_seconds: int = 5,
    ) -> dict[str, Any]:
        """Enqueue one company, then fetch, store and job-sync it immediately.

        Args:
            source: Platform key of the company.
            company_id: Raw company id; it is normalized before use.
            proxy: When truthy, applied to all services before fetching.
            max_delay_seconds: When positive, sleep a random whole number of
                seconds between 1 and this value before processing.

        Returns:
            A result dict with ``success``, ``source``, ``company_id``,
            ``company_name``, ``status`` and ``error_msg``. On success it also
            carries ``created`` and ``jobs_summary``. Failures are reported
            through the dict rather than raised.
        """
        normalized_id = normalize_company_id(source, company_id)
        entry, _ = await company_storage.enqueue_company(source, normalized_id)
        if proxy:
            self._apply_proxy(proxy)
        if (max_delay_seconds or 0) > 0:
            pause = random.randint(1, max_delay_seconds)
            await asyncio.sleep(pause)

        await company_storage.mark_queue_processing(entry)
        try:
            saved = await self._fetch_and_save(source, normalized_id)
            jobs_summary = await self._sync_company_jobs(source, normalized_id)
            fetched_name = saved["company_name"]
            # Keep the queue row's name in step with what the platform returned.
            if fetched_name and entry.company_name != fetched_name:
                entry.company_name = fetched_name
            await company_storage.mark_queue_result(entry, status="done", jobs_summary=jobs_summary)
            return {
                "success": True,
                "source": source,
                "company_id": normalized_id,
                "company_name": fetched_name,
                "status": "done",
                "error_msg": "",
                "created": saved["created"],
                "jobs_summary": jobs_summary,
            }
        except Exception as exc:
            reason = str(exc)
            logger.error(f"Error processing {source} {normalized_id}: {exc}")
            await company_storage.mark_queue_result(
                entry,
                status="failed",
                error_msg=reason,
                increment_retry=True,
            )
            return {
                "success": False,
                "source": source,
                "company_id": normalized_id,
                "company_name": entry.company_name or "",
                "status": "failed",
                "error_msg": reason,
            }

    async def process_pending_companies(
        self,
        limit: int = 100,
        source: Optional[str] = None,
        proxy: Optional[str] = None,
        max_delay_seconds: int = 0,
    ):
        """Process the oldest pending queue entries one after another.

        Each entry is marked as processing, its company detail is fetched and
        stored, its jobs are synced, and the entry is then marked ``done``. Any
        exception marks the entry ``failed`` with an incremented retry counter,
        and the loop moves on to the next entry.

        Args:
            limit: Maximum number of pending entries to process.
            source: When truthy, only entries of this platform are processed.
            proxy: When truthy, applied to all services before processing.
            max_delay_seconds: When positive, sleep a random whole number of
                seconds between 1 and this value before each fetch.
        """
        logger.info(f"Processing pending companies (limit={limit}, source={source or 'all'})...")
        if proxy:
            self._apply_proxy(proxy)

        query = CompanyCleaningQueue.filter(status="pending")
        if source:
            query = query.filter(source=source)
        queue_rows = await query.order_by("created_at").limit(limit)
        if not queue_rows:
            logger.info("No pending companies to process.")
            return

        for queue in queue_rows:
            logger.info(f"Processing {queue.source} company: {queue.company_name or ''} ({queue.company_id})")
            await company_storage.mark_queue_processing(queue)
            try:
                if max_delay_seconds and max_delay_seconds > 0:
                    await asyncio.sleep(random.randint(1, max_delay_seconds))
                persist_result = await self._fetch_and_save(queue.source, queue.company_id)
                jobs_result = await self._sync_company_jobs(queue.source, queue.company_id)
                # Read the counters with .get(): a summary lacking one of them
                # must not turn an already-persisted company into a failure,
                # which keeps this path consistent with process_single_company.
                logger.info(
                    f"Synced {queue.source} company jobs: "
                    f"fetched={jobs_result.get('jobs_fetched', 0)} stored={jobs_result.get('stored_success', 0)} "
                    f"duplicate={jobs_result.get('duplicate', 0)} failed={jobs_result.get('failed', 0)}"
                )
                # Keep the queue row's name in step with what the platform returned.
                if persist_result["company_name"] and queue.company_name != persist_result["company_name"]:
                    queue.company_name = persist_result["company_name"]
                await company_storage.mark_queue_result(queue, status="done", jobs_summary=jobs_result)
            except Exception as exc:
                logger.error(f"Error processing {queue.source} {queue.company_id}: {exc}")
                await company_storage.mark_queue_result(
                    queue,
                    status="failed",
                    error_msg=str(exc),
                    increment_retry=True,
                )

    async def _fetch_and_save(self, source: str, company_id: str) -> dict[str, Any]:
        """Fetch one company's detail and upsert it into company storage.

        Raises:
            ValueError: When the platform returned no data for the company.
        """
        payload = await self._fetch_company_data(source, company_id)
        if payload:
            return await company_storage.upsert_company(source, payload, company_id=company_id)
        raise ValueError(f"No data returned from source={source} company_id={company_id}")

    async def _sync_company_jobs(self, source: str, company_id: str) -> dict[str, Any]:
        """Sync a company's jobs on a best-effort basis.

        Returns the sync service's summary. If the sync raises, the error is
        logged as a warning and a zeroed summary with ``success`` False and the
        error text is returned instead of propagating the exception.
        """
        try:
            summary = await self.company_jobs_sync.sync_company_jobs(source, company_id)
        except Exception as exc:
            logger.warning(f"Sync company jobs failed for {source} {company_id}: {exc}")
            return dict(
                success=False,
                source=source,
                company_id=company_id,
                jobs_fetched=0,
                stored_success=0,
                duplicate=0,
                failed=0,
                error=str(exc),
            )
        return summary

    async def _fetch_company_data(self, source: str, company_id: str) -> dict[str, Any]:
        """Fetch raw company detail from the platform service that matches ``source``.

        The service call runs in a worker thread via ``asyncio.to_thread``. For
        ``boss`` the token is ensured to be loaded first. A falsy service
        result is returned as an empty dict.

        Raises:
            ValueError: When ``source`` is not zhilian, qcwy or boss.
        """
        if source == "zhilian":
            fetch = self.zhilian_service.get_company_detail
        elif source == "qcwy":
            fetch = self.qcwy_service.get_company_info
        elif source == "boss":
            await self._ensure_boss_token_loaded()
            fetch = self.boss_service.get_company_detail_by_id
        else:
            raise ValueError(f"unsupported source: {source}")

        payload = await asyncio.to_thread(fetch, company_id)
        return payload or {}

    async def cleanup_old_records(self):
        """Delete every queue row whose status is ``done`` or ``failed``.

        NOTE(review): despite the method name there is no age filter; all
        finished rows are removed - confirm that is intended.
        """
        logger.info("Starting cleanup of processed pending companies...")
        finished_rows = CompanyCleaningQueue.filter(status__in=["done", "failed"])
        await finished_rows.delete()


# Module-level instance, constructed once when this module is imported.
company_cleaner = CompanyCleaner()