"""Entry point for the Zhilian Zhaopin mini-program crawler.

Features:
    1. Fetch keywords from the backend (priority: resume from checkpoint >
       retry failed > brand new).
    2. Call SearchPositions to crawl the job list page by page.
    3. Upload data and report progress in real time after every page.
    4. Support resuming from the checkpointed page number.
    5. Optional: fetch company details alongside the job search.

Usage:
    python -m spiderJobs.platforms.zhilian.main

Environment variables:
    API_BASE_URL       Backend address (default http://124.222.106.226:9999)
    MAX_PAGES          Maximum number of pages per keyword (default 3)
    SLEEP_MIN_SECONDS  Minimum delay in seconds (default 10)
    SLEEP_MAX_SECONDS  Maximum delay in seconds (default 20)
    INLINE_COMPANY     Whether to fetch companies inline (default 1, set 0 to
                       disable)
    PROXY_URL          Optional proxy URL; when non-empty it is forwarded to
                       the HTTP client factory as the ``proxy`` keyword.

NOTE(review): only PROXY_URL is read in this module; the other variables are
presumably consumed by the shared runner/client code -- confirm there.
"""
from __future__ import annotations

import os
import sys
from typing import Optional

# Make the repository root (three directories above this file) importable so
# that the absolute imports below resolve when the module is run directly.
_project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
if _project_root not in sys.path:
    sys.path.insert(0, _project_root)

from crawler_core.base import BaseFetcher, BaseSearcher
from spiderJobs.platforms.zhilian.api import GetCompanyDetail, SearchPositions
from spiderJobs.platforms.zhilian.client import ZhilianClient, create_cgate_client
from spiderJobs.runner.loop import run_crawl_loop

# Zhilian city-code mapping, keyed by the Chinese city name found in a
# keyword's "city" field. "全国" (nationwide) maps to an empty string.
CITY_CODE_MAP = {
    "全国": "",
    "北京": 530,
    "上海": 538,
    "广州": 763,
    "深圳": 765,
    "杭州": 653,
    "成都": 801,
    "南京": 635,
    "武汉": 736,
    "西安": 854,
    "长沙": 749,
    "重庆": 551,
    "苏州": 639,
    "天津": 531,
    "厦门": 682,
    "郑州": 719,
    "合肥": 664,
    "济南": 703,
    "青岛": 704,
    "大连": 600,
    "东莞": 769,
    "佛山": 766,
    "珠海": 768,
    "无锡": 636,
    "宁波": 654,
}


def create_searcher(keyword: dict, http_client: ZhilianClient) -> BaseSearcher:
    """Build the Zhilian searcher for one keyword.

    Args:
        keyword: Keyword dict; its "city" and "job" entries are read, each
            defaulting to an empty string when absent.
        http_client: Client handed to the searcher.

    Returns:
        A SearchPositions instance for the keyword's job and city code.
    """
    city = keyword.get("city", "")
    job = keyword.get("job", "")
    # Cities missing from the table (including an absent/empty city) fall
    # back to 538, the code this table assigns to Shanghai.
    city_code = CITY_CODE_MAP.get(city, 538)
    return SearchPositions(
        keyword=job,
        city_code=city_code,
        client=http_client,
    )


def extract_company_id(job: dict) -> Optional[str]:
    """Extract the company ID (companyNumber) from a Zhilian job dict.

    The top-level "companyNumber" is preferred; otherwise the "number" entry
    of the nested "company" dict is used.

    Args:
        job: Job dict.

    Returns:
        The company number as a string, or None when no truthy value is found.
    """
    company_number = job.get("companyNumber")
    if not company_number:
        company = job.get("company")
        # "company" may be present but null (or otherwise not a dict); the
        # dict.get default only covers a missing key, so guard explicitly
        # instead of raising AttributeError.
        if isinstance(company, dict):
            company_number = company.get("number")
    return str(company_number) if company_number else None


def create_company_fetcher(company_id: str, http_client: ZhilianClient) -> BaseFetcher:
    """Create the Zhilian company-detail fetcher for ``company_id``."""
    return GetCompanyDetail(number=company_id, client=http_client)


def main() -> None:
    """Run the crawl loop for the "zhilian" platform.

    Reads PROXY_URL from the environment and, when it is non-empty, passes it
    to the client factory via ``client_kwargs["proxy"]``.
    """
    client_kwargs = {}
    proxy = os.environ.get("PROXY_URL", "")
    if proxy:
        client_kwargs["proxy"] = proxy

    run_crawl_loop(
        platform="zhilian",
        create_searcher=create_searcher,
        create_client_fn=create_cgate_client,
        max_pages=3,
        data_type="job",
        client_kwargs=client_kwargs,
        extract_company_id=extract_company_id,
        create_company_fetcher=create_company_fetcher,
    )


if __name__ == "__main__":
    main()