""" Boss直聘 小程序爬虫入口 功能: 1. 从后端获取关键词(优先断点续爬 > 失败重试 > 全新) 2. 调用 SearchRecJobs 分页爬取职位列表 3. 每页实时上传数据 + 汇报进度 4. 支持从断点页码恢复 5. 可选:搜索 job 时顺带抓取公司详情 启动: python -m spiderJobs.platforms.boss.main 环境变量: API_BASE_URL 后端地址 (默认 http://124.222.106.226:9999) MAX_PAGES 每个关键词最大翻页数 (默认 3) SLEEP_MIN_SECONDS 最小延迟秒数 (默认 10) SLEEP_MAX_SECONDS 最大延迟秒数 (默认 20) BOSS_MPT Boss Token (mpt) BOSS_WT2 Boss Token (wt2) INLINE_COMPANY 是否内联抓公司 (默认 1,设 0 关闭) """ from __future__ import annotations import os import sys from typing import Optional # 确保项目根目录在 sys.path 中 _project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..")) if _project_root not in sys.path: sys.path.insert(0, _project_root) from crawler_core.base import BaseFetcher, BaseSearcher from spiderJobs.platforms.boss.api import GetBrandDetail, SearchRecJobs from spiderJobs.platforms.boss.client import BossClient, create_client from crawler_core.boss.sign import BossSign from spiderJobs.runner.loop import run_crawl_loop # Boss 城市代码映射(关键词中的城市名 -> Boss cityCode) CITY_CODE_MAP = { "全国": "100010000", "北京": "101010100", "上海": "101020100", "广州": "101280100", "深圳": "101280600", "杭州": "101210100", "成都": "101270100", "南京": "101190100", "武汉": "101200100", "西安": "101110100", "长沙": "101250100", "重庆": "101040100", "苏州": "101190400", "天津": "101030100", "厦门": "101230200", "郑州": "101180100", "合肥": "101220100", "济南": "101120100", "青岛": "101120200", "大连": "101070200", "东莞": "101281600", "佛山": "101280800", "珠海": "101280700", "无锡": "101190200", "宁波": "101210400", } def create_searcher(keyword: dict, http_client: BossClient) -> BaseSearcher: """根据关键词创建 Boss 搜索器""" city = keyword.get("city", "") city_code = CITY_CODE_MAP.get(city, "101280600") return SearchRecJobs( city_code=city_code, client=http_client, ) def extract_company_id(job: dict) -> Optional[str]: """从 Boss job dict 中提取公司 ID (brandId)""" brand_id = job.get("brandId") return str(brand_id) if brand_id else None def create_company_fetcher(company_id: str, http_client: BossClient) -> BaseFetcher: """创建 Boss 公司详情 fetcher""" return GetBrandDetail(brand_id=company_id, client=http_client) def main(): mpt = os.environ.get("BOSS_MPT", "") wt2 = os.environ.get("BOSS_WT2", "") client_kwargs = {} if mpt or wt2: signer = BossSign(mpt=mpt, wt2=wt2) client_kwargs["signer"] = signer tunnel = os.environ.get("PROXY_TUNNEL", "") if tunnel: scheme = os.environ.get("PROXY_SCHEME", "http") username = os.environ.get("PROXY_USERNAME", "") password = os.environ.get("PROXY_PASSWORD", "") if username and password: client_kwargs["tunnel_proxy"] = f"{scheme}://{username}:{password}@{tunnel}" else: client_kwargs["tunnel_proxy"] = f"{scheme}://{tunnel}" run_crawl_loop( platform="boss", create_searcher=create_searcher, create_client_fn=create_client, max_pages=3, data_type="job", client_kwargs=client_kwargs, extract_company_id=extract_company_id, create_company_fetcher=create_company_fetcher, ) if __name__ == "__main__": main()