""" 前程无忧 (51Job) 小程序爬虫入口 功能: 1. 从后端获取关键词(优先断点续爬 > 失败重试 > 全新) 2. 调用 SearchRecommendJobs 分页爬取职位列表 3. 每页实时上传数据 + 汇报进度 4. 支持从断点页码恢复 5. 可选:搜索 job 时顺带抓取公司详情 启动: python -m spiderJobs.platforms.job51.main 环境变量: API_BASE_URL 后端地址 (默认 http://124.222.106.226:9999) MAX_PAGES 每个关键词最大翻页数 (默认 3) SLEEP_MIN_SECONDS 最小延迟秒数 (默认 10) SLEEP_MAX_SECONDS 最大延迟秒数 (默认 20) INLINE_COMPANY 是否内联抓公司 (默认 1,设 0 关闭) """ from __future__ import annotations import os import sys from typing import Optional _project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..")) if _project_root not in sys.path: sys.path.insert(0, _project_root) from crawler_core.base import BaseFetcher, BaseSearcher from spiderJobs.platforms.job51.api import GetCompanyInfo, SearchRecommendJobs from spiderJobs.platforms.job51.client import Job51Client, create_client from spiderJobs.runner.loop import run_crawl_loop # 51job 城市代码映射 CITY_CODE_MAP = { "全国": "000000", "北京": "010000", "上海": "020000", "广州": "030200", "深圳": "040000", "杭州": "080200", "成都": "090200", "南京": "070200", "武汉": "180200", "西安": "200200", "长沙": "190200", "重庆": "060000", "苏州": "070300", "天津": "050000", "厦门": "110300", "郑州": "170200", "合肥": "150200", "济南": "120200", "青岛": "120300", "大连": "230300", "东莞": "030800", "佛山": "030600", "珠海": "030500", "无锡": "070400", "宁波": "080300", } def create_searcher(keyword: dict, http_client: Job51Client) -> BaseSearcher: """根据关键词创建 51job 搜索器""" city = keyword.get("city", "") job_area = CITY_CODE_MAP.get(city, "020000") return SearchRecommendJobs( job_area=job_area, client=http_client, ) def extract_company_id(job: dict) -> Optional[str]: """从 51job job dict 中提取公司 ID (coId)""" co_id = job.get("coId") return str(co_id) if co_id else None def create_company_fetcher(company_id: str, http_client: Job51Client) -> BaseFetcher: """创建 51job 公司详情 fetcher""" return GetCompanyInfo(company_id=company_id, client=http_client) def main(): client_kwargs = {} tunnel = os.environ.get("PROXY_TUNNEL", "") if tunnel: scheme = os.environ.get("PROXY_SCHEME", "http") username = os.environ.get("PROXY_USERNAME", "") password = os.environ.get("PROXY_PASSWORD", "") if username and password: client_kwargs["tunnel_proxy"] = f"{scheme}://{username}:{password}@{tunnel}" else: client_kwargs["tunnel_proxy"] = f"{scheme}://{tunnel}" run_crawl_loop( platform="qcwy", create_searcher=create_searcher, create_client_fn=create_client, max_pages=3, data_type="job", client_kwargs=client_kwargs, extract_company_id=extract_company_id, create_company_fetcher=create_company_fetcher, ) if __name__ == "__main__": main()