328 lines
14 KiB
Python
328 lines
14 KiB
Python
import asyncio
|
|
import csv
|
|
import io
|
|
import re
|
|
import time
|
|
from typing import List, Dict, Any, Union, Optional
|
|
from fastapi import UploadFile
|
|
from loguru import logger
|
|
from app.services.crawler.boss import BossService
|
|
from app.services.crawler.qcwy import QcwyService
|
|
from app.services.crawler.zhilian import ZhilianService
|
|
from app.services.company_jobs_sync import CompanyJobsSyncService
|
|
from app.services.company_storage import company_storage
|
|
from app.services.ingest import IngestService
|
|
from app.core.clickhouse import clickhouse_manager
|
|
from app.models.token import BossToken
|
|
|
|
|
|
class CleaningService:
    """Clean job/company targets by crawling a platform and storing the result.

    A *target* is a URL, a platform-specific id, or a company name. Three
    platform keys are used throughout: ``"boss"`` (zhipin.com URLs),
    ``"qcwy"`` (51job.com URLs) and ``"zhilian"`` (zhaopin.com URLs).

    Job payloads are stored through an ``IngestService`` built on the
    ClickHouse client; company payloads are stored through
    ``company_storage`` and followed by a company-jobs sync.
    """

    # Seconds a loaded Boss token is trusted before it is re-read (1 hour).
    _TOKEN_REFRESH_INTERVAL = 3600

    # Patterns that pull a platform id out of a pasted URL or a prefixed id.
    _BOSS_JOB_URL = re.compile(r'job_detail/([^.]+)\.html')
    _BOSS_COMPANY_URL = re.compile(r'gongsi/([^.]+)\.html')
    _QCWY_JOB_URL = re.compile(r'/(\d+)\.html')
    _QCWY_COMPANY_URL = re.compile(r'co(\d+)', re.IGNORECASE)
    _QCWY_COMPANY_ID = re.compile(r'^co(\d+)$')
    _ZHILIAN_JOB_URL = re.compile(r'jobs\.zhaopin\.com/(\w+)\.htm')
    _ZHILIAN_COMPANY_URL = re.compile(r'/company/([A-Za-z0-9]+)')

    def __init__(self):
        """Create the platform crawlers; the ingest router is built lazily."""
        self.boss_service = BossService()
        self.qcwy_service = QcwyService()
        self.zhilian_service = ZhilianService()
        self.company_jobs_sync = CompanyJobsSyncService()
        self.data_router: Optional[IngestService] = None
        self._boss_token_loaded = False
        self._token_loaded_at: float = 0

    def _apply_proxy(self, proxy: Optional[str]) -> None:
        """Forward *proxy* to every crawler and to the company-jobs sync."""
        for service in (
            self.boss_service,
            self.qcwy_service,
            self.zhilian_service,
            self.company_jobs_sync,
        ):
            service.set_proxy(proxy)

    async def _ensure_boss_token_loaded(self) -> None:
        """Load the newest active ``BossToken`` into the Boss crawler.

        Does nothing while a token with a non-empty ``mpt`` is already loaded
        and younger than ``_TOKEN_REFRESH_INTERVAL`` seconds. When no active
        token exists a warning is logged and the current state is kept.
        """
        now = time.time()
        token_is_fresh = (
            self._boss_token_loaded
            and self.boss_service.login_data.get("mpt")
            and now - self._token_loaded_at < self._TOKEN_REFRESH_INTERVAL
        )
        if token_is_fresh:
            return

        token_obj = await BossToken.filter(is_active=True).order_by("-updated_at").first()
        if not token_obj:
            logger.warning("BossToken not found or inactive")
            return

        self.boss_service.set_login_data(token_obj.mpt or "", "")
        self._boss_token_loaded = True
        self._token_loaded_at = now

    async def get_data_router(self) -> "IngestService":
        """Return the shared ``IngestService``, creating it on first use."""
        # Identity check: do not depend on the truthiness of the service object.
        if self.data_router is None:
            client = await clickhouse_manager.get_client()
            self.data_router = IngestService(client)
        return self.data_router

    async def parse_file(self, file: "UploadFile") -> List[str]:
        """Extract the non-empty targets listed in an uploaded file.

        Files whose name ends with ``.csv`` (case-insensitive) contribute the
        first column of every row; any other file contributes one target per
        line. Values are stripped and blanks are dropped.

        Args:
            file: Upload exposing ``filename`` and an awaitable ``read()``.

        Returns:
            The targets in file order.

        Raises:
            UnicodeDecodeError: If the content is not valid UTF-8.
        """
        content = await file.read()
        # A missing filename is handled as plain text instead of crashing.
        filename = (file.filename or "").lower()
        # utf-8-sig drops a leading BOM, for CSV and plain text alike.
        text = content.decode("utf-8-sig")

        if filename.endswith(".csv"):
            candidates = [
                row[0].strip() for row in csv.reader(io.StringIO(text)) if row
            ]
        else:
            candidates = [line.strip() for line in text.splitlines()]

        return [candidate for candidate in candidates if candidate]

    async def _store_company_record(
        self,
        source: str,
        data: Dict[str, Any],
        company_id: str,
    ) -> Dict[str, Any]:
        """Upsert a company payload and decorate the storage result.

        Returns:
            The ``upsert_company`` result with ``duplicate``, ``remote_sent``,
            ``message`` and ``original_data`` filled in.
        """
        result = await company_storage.upsert_company(source, data, company_id=company_id)
        result["duplicate"] = False
        result["remote_sent"] = False
        result["message"] = "公司数据已写入MySQL"
        result["original_data"] = data
        return result

    async def process_single_item(
        self,
        target: str,
        clean_type: str = "auto",
        platform: str = "auto",
        proxy: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Clean one target and report the outcome as a flat dict.

        Args:
            target: URL, id or company name to clean.
            clean_type: ``"auto"``, ``"clean_url"``, ``"job_id"``,
                ``"company_name"``, ``"company_id"`` or ``"company_jobs"``.
            platform: ``"auto"``, ``"boss"``, ``"qcwy"`` or ``"zhilian"``.
            proxy: Proxy applied to all crawlers before the request.

        Returns:
            A dict with at least ``success``, ``target``, ``error``,
            ``storage_status`` and ``remote_sent``. Exceptions are never
            propagated; they are logged and reported with
            ``storage_status == "error"``.
        """
        try:
            await self._ensure_boss_token_loaded()
            self._apply_proxy(proxy)
            result = await self._dispatch(target, clean_type, platform)
            return self._build_item_report(target, result)
        except Exception as e:
            # Boundary handler: keep one bad item from aborting a whole batch,
            # but record the traceback so the failure can be diagnosed.
            logger.exception(f"Error processing item {target}: {e}")
            return {
                "success": False,
                "target": target,
                "error": str(e),
                "storage_status": "error",
                "remote_sent": False,
            }

    async def _dispatch(self, target: str, clean_type: str, platform: str) -> Any:
        """Run the cleaner selected by *clean_type* and *platform*.

        Returns ``None`` when the combination is not supported.
        """
        if clean_type == "auto":
            return await self.clean_target_auto(target)

        if clean_type == "clean_url":
            if platform == "auto":
                return await self.clean_target_auto(target)
            url_handlers = {
                "boss": self._process_boss_url,
                "qcwy": self._process_qcwy_url,
                "zhilian": self._process_zhilian_url,
            }
            handler = url_handlers.get(platform)
            return await handler(target) if handler else None

        platform_cleaners = {
            "job_id": self.clean_by_job_id,
            "company_name": self.clean_by_company_name,
            "company_id": self.clean_by_company_id,
        }
        if clean_type in platform_cleaners:
            return await platform_cleaners[clean_type](target, platform)

        if clean_type == "company_jobs":
            jobs_handlers = {
                "boss": self.clean_boss_company_jobs,
                "qcwy": self.clean_qcwy_company_jobs,
                "zhilian": self.clean_zhilian_company_jobs,
            }
            handler = jobs_handlers.get(platform)
            return await handler(target) if handler else None

        return None

    @staticmethod
    def _build_item_report(target: str, result: Any) -> Dict[str, Any]:
        """Translate a cleaner's return value into the per-item report dict."""
        if not result:
            return {
                "success": False,
                "target": target,
                "error": "No data found or operation failed",
                "storage_status": "failed",
                "remote_sent": False,
            }

        if isinstance(result, bool):
            return {
                "success": result,
                "target": target,
                "error": None if result else "Operation failed",
                "storage_status": "unknown",
                "remote_sent": False,
            }

        succeeded = result.get("success")
        return {
            "success": result.get("success", False),
            "target": target,
            "error": result.get("message") if not succeeded else None,
            # NOTE(review): a result without a truthy "success" is still
            # reported as "saved"/"duplicate" here — confirm that is intended.
            "storage_status": "duplicate" if result.get("duplicate") else "saved",
            "remote_sent": result.get("remote_sent", False),
            "data_summary": result.get("data_summary"),
            "jobs_summary": result.get("jobs_summary"),
            "original_data": result.get("original_data"),
        }

    async def clean_target_auto(self, target: str) -> Union[bool, Dict[str, Any]]:
        """Pick a platform from the domain in *target*.

        Targets containing no known domain are treated as a company name and
        searched on Boss.
        """
        domain_handlers = (
            ("zhipin.com", self._process_boss_url),
            ("51job.com", self._process_qcwy_url),
            ("zhaopin.com", self._process_zhilian_url),
        )
        for domain, handler in domain_handlers:
            if domain in target:
                return await handler(target)
        return await self._process_search_company(target)

    async def clean_by_job_id(self, target: str, platform: str) -> Union[bool, Dict[str, Any]]:
        """Fetch one job by id (or job URL) and store it.

        Returns:
            The storage result with ``original_data`` attached, or ``False``
            when the platform is unknown, nothing was fetched, or storage did
            not return a non-empty dict.
        """
        router = await self.get_data_router()

        job_sources = {
            "boss": (self._BOSS_JOB_URL, self.boss_service.get_job_detail_by_id),
            "qcwy": (self._QCWY_JOB_URL, self.qcwy_service.get_job_detail),
            "zhilian": (self._ZHILIAN_JOB_URL, self.zhilian_service.get_job_detail),
        }
        source = job_sources.get(platform)
        if source is None:
            return False
        url_pattern, fetch_job = source

        # Accept either a bare id or a full job URL.
        match = url_pattern.search(target)
        job_id = match.group(1) if match else target

        # The crawler call is passed to a worker thread via asyncio.to_thread.
        data = await asyncio.to_thread(fetch_job, job_id)
        if not data:
            return False

        result = await router.store_single(platform, "mini", "job", data)
        if result and isinstance(result, dict):
            result['original_data'] = data
            return result
        return False

    async def clean_by_company_name(self, target: str, platform: str) -> Union[bool, Dict[str, Any]]:
        """Search jobs by company name and store every hit.

        Returns:
            The storage result of the LAST stored job (with the raw search
            response under ``original_data`` when it is a dict), or ``False``
            when nothing was found or stored.
        """
        router = await self.get_data_router()

        if platform == "boss":
            res = await asyncio.to_thread(self.boss_service.search_jobs, target)
            if res and res.get('zpData') and res['zpData'].get('list'):
                return await self._store_job_batch(router, "boss", res['zpData']['list'], res)
        elif platform == "qcwy":
            res = await asyncio.to_thread(self.qcwy_service.search_jobs, target)
            if res:
                return await self._store_job_batch(router, "qcwy", res, res)
        elif platform == "zhilian":
            res = await asyncio.to_thread(self.zhilian_service.search_company_jobs_by_name, target)
            if res and isinstance(res, dict):
                payload = res.get("data")
                # Tolerate a missing or malformed "data"/"list" section.
                items = payload.get("list") if isinstance(payload, dict) else None
                if not isinstance(items, list):
                    items = []
                return await self._store_job_batch(router, "zhilian", items, res)

        return False

    @staticmethod
    async def _store_job_batch(router: Any, platform: str, jobs: Any, raw: Any) -> Union[bool, Dict[str, Any]]:
        """Store each job in *jobs* and return the last storage result.

        NOTE(review): only the final ``store_single`` result is reported, so
        failures of earlier jobs in the batch are not visible to the caller.
        """
        last_result = None
        for job in jobs:
            last_result = await router.store_single(platform, "mini", "job", job)

        if last_result and isinstance(last_result, dict):
            last_result['original_data'] = raw
        return last_result if last_result else False

    async def clean_by_company_id(self, target: str, platform: str) -> Union[bool, Dict[str, Any]]:
        """Fetch a company by id, store it, then sync its jobs.

        For ``"qcwy"`` an id written as ``co<digits>`` is reduced to the
        digits before use.

        Returns:
            The company storage result including ``jobs_summary``, or
            ``False`` when the platform is unknown or nothing was fetched.
        """
        company_fetchers = {
            "boss": self.boss_service.get_company_detail_by_id,
            "qcwy": self.qcwy_service.get_company_info,
            "zhilian": self.zhilian_service.get_company_detail,
        }
        fetch_company = company_fetchers.get(platform)
        if fetch_company is None:
            return False

        company_id = target
        if platform == "qcwy":
            match = self._QCWY_COMPANY_ID.match(company_id)
            if match:
                company_id = match.group(1)

        data = await asyncio.to_thread(fetch_company, company_id)
        if not data:
            return False

        # _store_company_record already attaches ``original_data``.
        result = await self._store_company_record(platform, data, company_id)
        result["jobs_summary"] = await self.company_jobs_sync.sync_company_jobs(platform, company_id)

        if result and isinstance(result, dict):
            return result
        return False

    async def _sync_jobs_or_false(self, platform: str, company_id: str) -> Union[bool, Dict[str, Any]]:
        """Sync a company's jobs; return the summary only if jobs were fetched."""
        result = await self.company_jobs_sync.sync_company_jobs(platform, company_id)
        # An empty/None summary or a missing/None counter both mean "no jobs".
        if not result:
            return False
        fetched = result.get("jobs_fetched") or 0
        return result if fetched > 0 else False

    async def clean_boss_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
        """Sync the jobs of a Boss company given its id or ``gongsi`` URL."""
        match = self._BOSS_COMPANY_URL.search(target)
        company_id = match.group(1) if match else target
        return await self._sync_jobs_or_false("boss", company_id)

    async def clean_qcwy_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
        """Sync the jobs of a 51job company given ``<digits>`` or ``co<digits>``."""
        match = self._QCWY_COMPANY_ID.match(target)
        company_id = match.group(1) if match else target
        return await self._sync_jobs_or_false("qcwy", company_id)

    async def clean_zhilian_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
        """Sync the jobs of a Zhilian company given its id."""
        return await self._sync_jobs_or_false("zhilian", target)

    async def _process_boss_url(self, url: str) -> Union[bool, Dict[str, Any]]:
        """Clean a Boss URL: job page, then company page, else treat as job id."""
        job_match = self._BOSS_JOB_URL.search(url)
        if job_match:
            return await self.clean_by_job_id(job_match.group(1), "boss")

        company_match = self._BOSS_COMPANY_URL.search(url)
        if company_match:
            return await self.clean_by_company_id(company_match.group(1), "boss")

        return await self.clean_by_job_id(url, "boss")

    async def _process_qcwy_url(self, url: str) -> Union[bool, Dict[str, Any]]:
        """Clean a 51job URL: job page, then company page, else treat as job id."""
        job_match = self._QCWY_JOB_URL.search(url)
        if job_match:
            return await self.clean_by_job_id(job_match.group(1), "qcwy")

        company_match = self._QCWY_COMPANY_URL.search(url)
        if company_match:
            return await self.clean_by_company_id(company_match.group(1), "qcwy")

        return await self.clean_by_job_id(url, "qcwy")

    async def _process_zhilian_url(self, url: str) -> Union[bool, Dict[str, Any]]:
        """Clean a Zhilian URL: job page, then company page, else treat as job id."""
        job_match = self._ZHILIAN_JOB_URL.search(url)
        if job_match:
            return await self.clean_by_job_id(job_match.group(1), "zhilian")

        company_match = self._ZHILIAN_COMPANY_URL.search(url)
        if company_match:
            return await self.clean_by_company_id(company_match.group(1), "zhilian")

        return await self.clean_by_job_id(url, "zhilian")

    async def _process_search_company(self, name: str) -> Union[bool, Dict[str, Any]]:
        """Treat *name* as a company name and search it on Boss."""
        return await self.clean_by_company_name(name, "boss")
|