feat(04): migrate facade to spiderJobs.platforms.* + asyncio bridge; delete jobs_spider/

Plan 01 - facade migration (ARCH-06/07):
- boss.py: import from spiderJobs.platforms.boss.{api,client,sign}
- qcwy.py: import from spiderJobs.platforms.job51.{api,client}
- zhilian.py: import from spiderJobs.platforms.zhilian.{api,client,sign}
- All 3 Service classes: +4 async_* methods via asyncio.to_thread()

Plan 02 - deprecation + cleanup (ARCH-08):
- 11 private copy files (_base, _http_client, _boss/job51/zhilian *): DEPRECATED header
- jobs_spider/ directory: fully deleted (user request)

Full regression: 106 passed in 0.61s
This commit is contained in:
win 2026-03-21 19:36:24 +08:00
parent 2e11edcef8
commit 3aadbd128b
46 changed files with 1624 additions and 188181 deletions

View File

@ -0,0 +1,116 @@
# ⚠️ DEPRECATED — 2026-03-21
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
# 将在下一里程碑中删除。
#
"""
通用基类与数据结构
复制自 spiderJobs/core/base.py,import 改为本地引用。
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Callable, Optional
from app.services.crawler._http_client import HTTPClient
@dataclass
class ApiResult:
    """Normalised outcome of one API call.

    The parsers in this package return this envelope so that callers can
    handle single fetches and paginated searches the same way.
    """

    # Whether the call is considered successful by the parser that built it.
    success: bool
    # HTTP or business status code reported for the call.
    status_code: int
    # Raw payload extracted from the response body, if any.
    data: Any = None
    # Items of the current page; a fresh empty list per instance by default.
    list: list[dict] = field(default_factory=list)
    # Item count reported for (or derived from) the payload.
    count: int = 0
    # True when no further page should be requested.
    is_end_page: bool = True
    # Human-readable error description for failed calls.
    error: Optional[str] = None
def parse_response(http_code: int, raw: Any) -> ApiResult:
    """Translate a ``statusCode``/``data`` style response into an ``ApiResult``.

    Args:
        http_code: HTTP status code of the response.
        raw: Decoded response body; expected to be a dict but any value is
            tolerated.

    Returns:
        A failed ``ApiResult`` when either the HTTP status or the body's
        ``statusCode`` is not 200; otherwise a successful one. When the
        payload carries a ``list`` key the result is treated as one page of a
        paginated listing.
    """
    is_mapping = isinstance(raw, dict)
    # Without a dict body there is no business code, so fall back to HTTP's.
    biz_code = raw.get("statusCode") if is_mapping else http_code

    if http_code != 200 or biz_code != 200:
        if is_mapping:
            message = (
                raw.get("statusDescription")
                or raw.get("message")
                or f"请求失败: {biz_code}"
            )
        else:
            message = f"请求失败: {http_code}"
        return ApiResult(
            success=False,
            status_code=biz_code or http_code,
            error=message,
        )

    payload = (raw.get("data") or {}) if is_mapping else {}
    if isinstance(payload, dict) and "list" in payload:
        return ApiResult(
            success=True,
            status_code=200,
            data=payload,
            # A present-but-null "list"/"count" must not leak None to callers
            # that extend or sum these values.
            list=payload.get("list") or [],
            count=payload.get("count") or 0,
            is_end_page=payload.get("isEndPage", True),
        )
    return ApiResult(success=True, status_code=200, data=payload)
class BaseFetcher:
    """Template for a single, non-paginated GET request.

    Subclasses set ``ENDPOINT`` and implement ``_build_params``; ``_parse``
    can be overridden when a platform uses a different response envelope.
    """

    ENDPOINT: str = ""

    def __init__(self, http_client: HTTPClient):
        self._http = http_client

    def _build_params(self) -> dict:
        """Return the query parameters for the request (subclass hook)."""
        raise NotImplementedError

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Convert the raw response into an ``ApiResult``."""
        return parse_response(http_code, raw)

    def fetch(self) -> ApiResult:
        """Issue the request and return the parsed result.

        Any exception raised while building parameters or performing the
        request is reported as a failed ``ApiResult`` with status ``-1``.
        """
        try:
            response = self._http.get(self.ENDPOINT, self._build_params())
            http_code, data = response
        except Exception as exc:
            return ApiResult(success=False, status_code=-1, error=str(exc))
        else:
            return self._parse(http_code, data)
class BaseSearcher:
    """Template for a paginated listing request.

    Subclasses set ``ENDPOINT`` and implement ``_build_params``; ``_request``
    and ``_parse`` can be overridden to change the HTTP verb or the response
    envelope.
    """

    ENDPOINT: str = ""

    def __init__(self, page_size: int = 15, http_client: HTTPClient = None):
        self.page_size = page_size
        self._http = http_client

    def _build_params(self, page_index: int) -> dict:
        """Return the request parameters for ``page_index`` (subclass hook)."""
        raise NotImplementedError

    def _request(self, params: dict) -> tuple[int, Any]:
        """Send the request; the base implementation uses POST."""
        return self._http.post(self.ENDPOINT, params)

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Convert the raw response into an ``ApiResult``."""
        return parse_response(http_code, raw)

    def search(self, page_index: int = 1) -> ApiResult:
        """Fetch and parse a single page.

        Exceptions raised by the request itself are reported as a failed
        ``ApiResult`` with status ``-1``; parameter building happens outside
        that guard.
        """
        request_params = self._build_params(page_index)
        try:
            response = self._request(request_params)
            http_code, data = response
        except Exception as exc:
            return ApiResult(success=False, status_code=-1, error=str(exc))
        else:
            return self._parse(http_code, data)

    def load_all(
        self,
        max_pages: int = 10,
        on_page: Optional[Callable[[ApiResult, int], None]] = None,
    ) -> list[dict]:
        """Walk pages 1..``max_pages`` and return all collected items.

        Stops early on the first failed page or once a page reports that it
        is the last one. ``on_page`` (if given) is called with each successful
        page result and its page index.
        """
        collected: list[dict] = []
        page = 1
        while page <= max_pages:
            outcome = self.search(page_index=page)
            if not outcome.success:
                break
            collected.extend(outcome.list)
            if on_page:
                on_page(outcome, page)
            if outcome.is_end_page:
                break
            page += 1
        return collected

View File

@ -0,0 +1,182 @@
# ⚠️ DEPRECATED — 2026-03-21
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
# 将在下一里程碑中删除。
#
"""
Boss直聘 - 所有 API 接口
复制自 spiderJobs/platforms/boss/api.py,import 改为本地引用。
"""
from __future__ import annotations
from typing import Any, Optional
from urllib.parse import urlencode
from app.services.crawler._base import ApiResult, BaseFetcher, BaseSearcher
from app.services.crawler._boss_client import BossClient, create_client
def _parse_boss_response(http_code: int, raw: Any) -> ApiResult:
    """Translate a Boss ``code``/``zpData`` response into an ``ApiResult``.

    Args:
        http_code: HTTP status code of the response.
        raw: Decoded response body.

    Returns:
        A failed result for a non-200 HTTP status, a non-dict body, or a
        business ``code`` other than 0. Otherwise a successful result; when
        ``zpData`` contains ``jobList`` (checked first) or ``list`` it is
        treated as one page, with ``is_end_page`` derived from ``hasMore``.
    """
    if http_code != 200:
        return ApiResult(success=False, status_code=http_code, error=f"HTTP 请求失败: {http_code}")
    if not isinstance(raw, dict):
        return ApiResult(success=False, status_code=http_code, error="响应格式异常")

    biz_code = raw.get("code", -1)
    if biz_code != 0:
        return ApiResult(
            success=False,
            status_code=biz_code,
            error=raw.get("message") or f"业务错误: {biz_code}",
        )

    payload = raw.get("zpData") or {}
    if isinstance(payload, dict):
        for list_key in ("jobList", "list"):
            if list_key not in payload:
                continue
            # The key may be present with a null value; treat that as an
            # empty page instead of failing on len(None).
            items = payload.get(list_key) or []
            return ApiResult(
                success=True,
                status_code=200,
                data=payload,
                list=items,
                count=len(items),
                is_end_page=not payload.get("hasMore", False),
            )
    return ApiResult(success=True, status_code=200, data=payload)
class SearchRecJobs(BaseSearcher):
    """Paginated query against the Boss recommended-job list endpoint."""

    ENDPOINT = "/wapi/zpgeek/miniapp/homepage/recjoblist.json"

    def __init__(
        self,
        *,
        city_code: str = "101280600",
        sort_type: int = 1,
        district_code: str = "",
        blue_welfare: str = "",
        encrypt_expect_id: str = "",
        page_size: int = 15,
        client: Optional[BossClient] = None,
    ):
        # Build a default client only when the caller did not supply one.
        http = client if client else create_client()
        super().__init__(page_size=page_size, http_client=http)
        self.city_code = city_code
        self.sort_type = sort_type
        self.district_code = district_code
        self.blue_welfare = blue_welfare
        self.encrypt_expect_id = encrypt_expect_id

    def _build_params(self, page_index: int) -> dict:
        """Return the query parameters for the requested page."""
        query: dict = {}
        query["cityCode"] = self.city_code
        query["sortType"] = self.sort_type
        query["page"] = page_index
        query["pageSize"] = self.page_size
        query["encryptExpectId"] = self.encrypt_expect_id
        query["districtCode"] = self.district_code
        query["blueWelfare"] = self.blue_welfare
        query["appId"] = 10002
        return query

    def _request(self, params: dict) -> tuple[int, Any]:
        """This endpoint is queried with GET rather than the base POST."""
        return self._http.get(self.ENDPOINT, params)

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Parse using the Boss response envelope."""
        return _parse_boss_response(http_code, raw)
class GetJobDetail(BaseFetcher):
    """Fetch a Boss job's detail and "improvement" data in one batch call.

    Both sub-requests are sent through the client's ``batch`` method and the
    two sub-responses are merged into ``{"detail": ..., "improvement": ...}``.
    """

    ENDPOINT = "/wapi/batch/requests"
    # Sub-request paths; they also key the per-request results in ``zpData``.
    _DETAIL_PATH = "/wapi/zpgeek/miniapp/job/detail.json"
    _IMPROVEMENT_PATH = "/wapi/zpgeek/miniapp/jobdetail/improvement/query.json"

    def __init__(
        self,
        *,
        security_id: str,
        job_id: str,
        lid: str = "",
        source: int = 10,
        client: Optional[BossClient] = None,
    ):
        super().__init__(http_client=client or create_client())
        self.security_id = security_id
        self.job_id = job_id
        self.lid = lid
        self.source = source

    def _build_params(self) -> dict:
        """Unused: the parameters travel inside the batch sub-requests."""
        return {}

    def fetch(self) -> ApiResult:
        """Send the batch request and return the merged, parsed result.

        Exceptions raised by the request are reported as a failed
        ``ApiResult`` with status ``-1``.
        """
        identity = {
            "securityId": self.security_id,
            "jobId": self.job_id,
            "lid": self.lid,
        }
        sub_reqs = [
            {
                "path": self._DETAIL_PATH,
                "method": "GET",
                "query": urlencode({**identity, "source": self.source}),
            },
            {
                "path": self._IMPROVEMENT_PATH,
                "method": "GET",
                "query": urlencode(identity),
            },
        ]
        try:
            client: BossClient = self._http
            http_code, data = client.batch(sub_reqs)
        except Exception as e:
            return ApiResult(success=False, status_code=-1, error=str(e))
        return self._parse(http_code, data)

    @staticmethod
    def _unwrap(part: Any) -> Any:
        """Return the inner ``zpData`` of a sub-response, or the value as-is."""
        return part.get("zpData") if isinstance(part, dict) else part

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Validate the batch envelope and merge the two sub-responses."""
        if http_code != 200:
            return ApiResult(success=False, status_code=http_code, error=f"HTTP 请求失败: {http_code}")
        if not isinstance(raw, dict):
            return ApiResult(success=False, status_code=http_code, error="响应格式异常")
        biz_code = raw.get("code", -1)
        if biz_code != 0:
            return ApiResult(
                success=False,
                status_code=biz_code,
                error=raw.get("message") or f"业务错误: {biz_code}",
            )

        zp_data = raw.get("zpData")
        # This runs outside fetch()'s try block, so a non-dict zpData must not
        # raise AttributeError here; treat it like an empty batch result.
        if not isinstance(zp_data, dict):
            zp_data = {}
        merged = {
            "detail": self._unwrap(zp_data.get(self._DETAIL_PATH, {})),
            "improvement": self._unwrap(zp_data.get(self._IMPROVEMENT_PATH, {})),
        }
        return ApiResult(success=True, status_code=200, data=merged)
class GetBrandDetail(BaseFetcher):
    """Fetch the detail record of a single Boss brand (company)."""

    ENDPOINT = "/wapi/zpgeek/miniapp/brand/detail.json"

    def __init__(self, *, brand_id: str, client: Optional[BossClient] = None):
        http = client if client else create_client()
        super().__init__(http_client=http)
        self.brand_id = brand_id

    def _build_params(self) -> dict:
        """Return the query parameters identifying the brand."""
        query = dict(brandId=self.brand_id)
        query["appId"] = 10002
        return query

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Parse using the Boss response envelope."""
        return _parse_boss_response(http_code, raw)
class SearchBrandJobs(BaseSearcher):
    """Paginated query for the jobs published by one Boss brand."""

    ENDPOINT = "/wapi/zpgeek/miniapp/brand/joblist.json"

    def __init__(
        self,
        *,
        brand_id: str,
        query: str = "",
        position_lv1: int = 0,
        city: str = "",
        experience: str = "",
        salary: str = "",
        page_size: int = 15,
        client: Optional[BossClient] = None,
    ):
        http = client if client else create_client()
        super().__init__(page_size=page_size, http_client=http)
        self.brand_id = brand_id
        self.query = query
        self.position_lv1 = position_lv1
        self.city = city
        self.experience = experience
        self.salary = salary

    def _build_params(self, page_index: int) -> dict:
        """Return the query parameters for the requested page.

        Note that ``page_size`` is stored on the instance but is not part of
        the parameters sent to this endpoint.
        """
        query: dict = {}
        query["brandId"] = self.brand_id
        query["query"] = self.query
        query["page"] = page_index
        query["hasMore"] = "true"
        query["positionLv1"] = self.position_lv1
        query["city"] = self.city
        query["experience"] = self.experience
        query["salary"] = self.salary
        query["appId"] = 10002
        return query

    def _request(self, params: dict) -> tuple[int, Any]:
        """This endpoint is queried with GET rather than the base POST."""
        return self._http.get(self.ENDPOINT, params)

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Parse using the Boss response envelope."""
        return _parse_boss_response(http_code, raw)

View File

@ -0,0 +1,99 @@
# ⚠️ DEPRECATED — 2026-03-21
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
# 将在下一里程碑中删除。
#
"""
Boss直聘 HTTP 客户端
复制自 spiderJobs/platforms/boss/client.py,import 改为本地引用。
"""
from __future__ import annotations
from typing import Any, Optional
from app.services.crawler._http_client import HTTPClient
from app.services.crawler._boss_sign import BossSign
# Host that all Boss endpoint paths are appended to.
BASE_URL = "https://www.zhipin.com"

# Default headers sent with every Boss request; per-request headers are
# merged on top of these by the client.
BOSS_HEADERS = {
    "content-type": "application/x-www-form-urlencoded",
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 "
        "MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI "
        "MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.7(0x13080712) "
        "UnifiedPCMacWechat(0xf2641702) XWEB/18788"
    ),
    "x-requested-with": "XMLHttpRequest",
    "xweb_xhr": "1",
    # Application / product identifiers and client version fields.
    "zp_app_id": "10002",
    "zp_product_id": "10002",
    "ver": "14.0400",
    "mini_ver": "14.0400",
    "platform": "zhipin/mac",
    "ua": '{"model":"Mac16,8","platform":"mac"}',
    "scene": "1256",
    # Fetch-metadata and locale headers.
    "sec-fetch-site": "cross-site",
    "sec-fetch-mode": "cors",
    "sec-fetch-dest": "empty",
    "referer": "https://servicewechat.com/wxa8da525af05281f3/601/page-frame.html",
    "accept-language": "zh-CN,zh;q=0.9",
}
class BossClient(HTTPClient):
    """HTTP client for Boss that adds the per-request signing headers."""

    def __init__(
        self,
        signer: Optional[BossSign] = None,
        tunnel_proxy: Optional[str] = None,
        proxy: Optional[str] = None,
        proxy_pool: Optional[list[str]] = None,
        timeout: int = 10,
    ):
        super().__init__(
            base_url=BASE_URL,
            default_headers=BOSS_HEADERS,
            tunnel_proxy=tunnel_proxy,
            proxy=proxy,
            proxy_pool=proxy_pool,
            timeout=timeout,
        )
        # Fall back to a signer with empty credentials when none is given.
        self.signer = signer if signer else BossSign()

    def _boss_headers(self) -> dict:
        """Return the signer credentials plus a freshly generated trace id."""
        trace_id = BossSign.generate_traceid("M-W")
        return {
            "mpt": self.signer.mpt,
            "wt2": self.signer.wt2,
            "Traceid": trace_id,
        }

    def post(self, path: str, body: dict, headers: Optional[dict] = None) -> tuple[int, Any]:
        """POST ``body`` to ``path``; caller headers override the signed ones."""
        request_headers = self._boss_headers()
        request_headers.update(headers or {})
        return super().post(path, body, request_headers)

    def get(self, path: str, params: Optional[dict] = None, headers: Optional[dict] = None) -> tuple[int, Any]:
        """GET ``path`` with ``params``; caller headers override the signed ones."""
        request_headers = self._boss_headers()
        request_headers.update(headers or {})
        return super().get(path, params, request_headers)

    def batch(self, sub_reqs: list[dict]) -> tuple[int, Any]:
        """Send several sub-requests through the batch endpoint as JSON."""
        envelope = {"subReqs": sub_reqs, "appId": 10002}
        json_header = {"content-type": "application/json"}
        return self.post("/wapi/batch/requests", envelope, headers=json_header)
def create_client(
    signer: Optional[BossSign] = None,
    tunnel_proxy: Optional[str] = None,
    proxy: Optional[str] = None,
    proxy_pool: Optional[list[str]] = None,
) -> BossClient:
    """Build a ``BossClient`` with the default timeout."""
    options = {
        "signer": signer,
        "tunnel_proxy": tunnel_proxy,
        "proxy": proxy,
        "proxy_pool": proxy_pool,
    }
    return BossClient(**options)

View File

@ -0,0 +1,78 @@
# ⚠️ DEPRECATED — 2026-03-21
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
# 将在下一里程碑中删除。
#
"""
Boss直聘 Traceid 生成算法
复制自 spiderJobs/platforms/boss/sign.py,import 改为本地引用。
"""
from __future__ import annotations
import random
import time
_CHARS = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
def _to_u32(n: int) -> int:
return n & 0xFFFFFFFF
def _compute_checksum(uuid_str: str) -> str:
    """Derive the three-character base-62 checksum for ``uuid_str``.

    Three 32-bit rolling hashes are computed over the string (forward,
    reverse with position weights, and forward weighted by distance from the
    middle). Each pair of hashes is XOR-ed, scrambled with two
    multiply/shift rounds, and mapped onto one character of ``_CHARS``.
    """
    size = len(uuid_str)
    centre = size // 2

    # (x << 5) - x == 31 * x: plain forward rolling hash.
    forward = 0
    for ch in uuid_str:
        forward = (forward * 31 + ord(ch)) & 0xFFFFFFFF

    # (x << 7) - x == 127 * x: walks the string backwards, weighting each
    # character by its 1-based position.
    weighted = 0
    for pos in reversed(range(size)):
        weighted = (weighted * 127 + ord(uuid_str[pos]) * (pos + 1)) & 0xFFFFFFFF

    # (x << 3) - x == 7 * x: weights each character by distance from the middle.
    centred = 0
    for pos, ch in enumerate(uuid_str):
        centred = (centred * 7 + ord(ch) * (abs(pos - centre) + 1)) & 0xFFFFFFFF

    def _scramble(seed: int, first_mul: int, second_mul: int) -> str:
        """Mix ``seed`` with two multiply/xor-shift rounds and pick a char."""
        value = _to_u32(seed)
        value = _to_u32(first_mul * value)
        value = _to_u32(value ^ (value >> 16))
        value = _to_u32(second_mul * value)
        value = _to_u32(value ^ (value >> 13))
        return _CHARS[value % 62]

    first = _scramble(forward ^ weighted, 2654435761, 2246822507)
    second = _scramble(weighted ^ centred, 3266489909, 2654435761)
    third = _scramble(centred ^ forward, 668265261, 2246822507)
    return f"{first}{second}{third}"
def _generate_uuid() -> str:
    """Return a 19-character id: 13 hex timestamp chars + 6 random chars.

    The timestamp is the current time in milliseconds rendered as lowercase
    hex, truncated to its last 13 characters and left-padded with zeros.
    """
    millis = int(time.time() * 1000)
    stamp = f"{millis:x}"[-13:].zfill(13)
    suffix = "".join(random.choice(_CHARS) for _ in range(6))
    return stamp + suffix
class BossSign:
    """Holds the ``mpt``/``wt2`` credentials and generates trace ids."""

    def __init__(self, *, mpt: str = "", wt2: str = ""):
        self.mpt, self.wt2 = mpt, wt2

    @staticmethod
    def generate_traceid(prefix: str = "M-W") -> str:
        """Return ``prefix`` + a fresh generated id + its 3-char checksum."""
        identifier = _generate_uuid()
        return prefix + identifier + _compute_checksum(identifier)

View File

@ -0,0 +1,128 @@
# ⚠️ DEPRECATED — 2026-03-21
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
# 将在下一里程碑中删除。
#
"""
通用 HTTP 客户端
基于 requests-go,自带 Chrome TLS 指纹伪装。
支持代理 IP / 隧道代理 / 代理池轮换。
与任何招聘平台无关,纯粹负责发请求。
复制自 spiderJobs/core/http_client.py;不要直接 import spiderJobs,避免跨模块依赖。
"""
from __future__ import annotations
import random
from typing import Any, Optional
import requests_go as requests
from requests_go.tls_config import TLS_CHROME_LATEST
class HTTPClient:
    """Platform-agnostic HTTP client built on ``requests_go`` sessions.

    Proxy precedence: ``tunnel_proxy`` > ``proxy_pool`` > ``proxy``.

    * With a tunnel proxy, every request uses a brand-new session that is
      closed afterwards.
    * With a proxy pool, the shared session is used and a random pool entry
      is chosen per request.
    * With only a static proxy, it is installed on the shared session once.

    ``post`` and ``get`` return ``(status_code, decoded_json_body)`` and
    propagate any exception raised by the request or by JSON decoding.
    """

    def __init__(
        self,
        base_url: str,
        default_headers: Optional[dict] = None,
        proxy: Optional[str] = None,
        tunnel_proxy: Optional[str] = None,
        proxy_pool: Optional[list[str]] = None,
        timeout: int = 10,
    ):
        self.base_url = base_url
        self.default_headers = default_headers or {}
        self.timeout = timeout
        self._proxy = proxy
        self._tunnel_proxy = tunnel_proxy
        self._proxy_pool = proxy_pool
        self._session = self._new_session()
        # The static proxy only applies when no higher-priority option is set.
        if proxy and not proxy_pool and not tunnel_proxy:
            self._session.proxies = {"http": proxy, "https": proxy}

    def _new_session(self) -> requests.Session:
        """Create a session configured with the Chrome TLS profile."""
        s = requests.Session()
        s.tls_config = TLS_CHROME_LATEST
        # NOTE(review): this flips a flag on the shared module-level config
        # object; presumably it randomises the JA3 fingerprint — confirm
        # against the requests_go documentation.
        TLS_CHROME_LATEST.random_ja3 = True
        return s

    def _get_proxies(self) -> Optional[dict]:
        """Pick a random pool proxy for one request, or ``None`` without a pool.

        A random ``#NNNNNN`` fragment is appended to the chosen proxy URL, so
        the proxy string differs on each call (presumably to avoid reusing a
        pooled connection — TODO confirm).
        """
        if self._proxy_pool:
            chosen = random.choice(self._proxy_pool)
            unique = f"{chosen}#{random.randint(100000, 999999)}"
            return {"http": unique, "https": unique}
        return None

    def _merge_headers(self, extra: Optional[dict] = None) -> dict:
        """Return a new dict of default headers overlaid with ``extra``."""
        headers = {**self.default_headers}
        if extra:
            headers.update(extra)
        return headers

    def _send(
        self,
        method: str,
        path: str,
        headers: Optional[dict],
        **payload: Any,
    ) -> tuple[int, Any]:
        """Shared transport path for ``post`` and ``get``.

        Args:
            method: Name of the session method to call (``"post"``/``"get"``).
            path: Path appended to ``base_url``.
            headers: Per-request headers merged over the defaults.
            **payload: Verb-specific keyword arguments (``json=`` or
                ``params=``) forwarded to the session call.

        Returns:
            ``(status_code, decoded_json_body)``.
        """
        merged_headers = self._merge_headers(headers)
        url = f"{self.base_url}{path}"

        if self._tunnel_proxy:
            session = self._new_session()
            try:
                resp = getattr(session, method)(
                    url,
                    headers=merged_headers,
                    proxies={"http": self._tunnel_proxy, "https": self._tunnel_proxy},
                    timeout=self.timeout,
                    **payload,
                )
                return resp.status_code, resp.json()
            finally:
                # Always release the one-off session, even on failure.
                session.close()

        kwargs: dict[str, Any] = {
            **payload,
            "headers": merged_headers,
            "timeout": self.timeout,
        }
        proxies = self._get_proxies()
        if proxies:
            kwargs["proxies"] = proxies
        resp = getattr(self._session, method)(url, **kwargs)
        return resp.status_code, resp.json()

    def post(self, path: str, body: dict, headers: Optional[dict] = None) -> tuple[int, Any]:
        """POST ``body`` as JSON to ``base_url + path``."""
        return self._send("post", path, headers, json=body)

    def get(self, path: str, params: Optional[dict] = None, headers: Optional[dict] = None) -> tuple[int, Any]:
        """GET ``base_url + path`` with ``params`` as the query string."""
        return self._send("get", path, headers, params=params)

View File

@ -0,0 +1,170 @@
# ⚠️ DEPRECATED — 2026-03-21
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
# 将在下一里程碑中删除。
#
"""
前程无忧 (51Job) - 所有 API 接口
复制自 spiderJobs/platforms/job51/api.py,import 改为本地引用。
"""
from __future__ import annotations
from typing import Any, Optional
from app.services.crawler._base import ApiResult, BaseFetcher, BaseSearcher
from app.services.crawler._job51_client import Job51Client, create_client
def _parse_job51_response(http_code: int, raw: Any) -> ApiResult:
    """Translate a 51Job response into an ``ApiResult``.

    Args:
        http_code: HTTP status code of the response.
        raw: Decoded response body.

    Returns:
        A failed result for a non-200 HTTP status, a non-dict body, or a
        ``status`` field that is present and not ``"1"``. Otherwise a
        successful result. The payload (``resultbody`` or ``data``) is probed
        for a listing in this order: ``jobList.items``, ``jobList`` as a list,
        ``items``, ``list``; a page is considered the last one when it is
        empty.
    """
    if http_code != 200:
        return ApiResult(success=False, status_code=http_code, error=f"HTTP 请求失败: {http_code}")
    if not isinstance(raw, dict):
        return ApiResult(success=False, status_code=http_code, error="响应格式异常")

    biz_status = raw.get("status")
    if biz_status is not None and str(biz_status) != "1":
        return ApiResult(
            success=False,
            status_code=int(biz_status) if str(biz_status).isdigit() else -1,
            error=raw.get("message") or f"业务错误: {biz_status}",
        )

    payload = raw.get("resultbody") or raw.get("data") or {}
    if isinstance(payload, dict):
        if "jobList" in payload:
            job_list_wrap = payload.get("jobList", {})
            if isinstance(job_list_wrap, dict) and "items" in job_list_wrap:
                # A present-but-null "items" is treated as an empty page
                # instead of failing on len(None).
                items = job_list_wrap.get("items") or []
                return ApiResult(
                    success=True, status_code=200, data=payload,
                    list=items, count=len(items), is_end_page=len(items) == 0,
                )
            if isinstance(job_list_wrap, list):
                return ApiResult(
                    success=True, status_code=200, data=payload,
                    list=job_list_wrap, count=len(job_list_wrap),
                    is_end_page=len(job_list_wrap) == 0,
                )
        if "items" in payload:
            items = payload.get("items") or []
            total = payload.get("totalCount", len(items))
            return ApiResult(
                success=True, status_code=200, data=payload,
                list=items, count=total, is_end_page=len(items) == 0,
            )
        if "list" in payload:
            items = payload.get("list") or []
            return ApiResult(
                success=True, status_code=200, data=payload,
                list=items, count=len(items), is_end_page=len(items) == 0,
            )
    return ApiResult(success=True, status_code=200, data=payload)
class SearchRecommendJobs(BaseSearcher):
    """Paginated query against the 51Job recommended-jobs endpoint."""

    ENDPOINT = "open/noauth/recommend/job-tab-dynamic-wx-mini"

    def __init__(
        self,
        *,
        job_area: str = "020000",
        function_type: str = "",
        job_type: str = "recommend",
        page_size: int = 10,
        client: Optional[Job51Client] = None,
    ):
        http = client if client else create_client()
        super().__init__(page_size=page_size, http_client=http)
        self.job_area = job_area
        self.function_type = function_type
        self.job_type = job_type

    def _build_params(self, page_index: int) -> dict:
        """Return the request body; ``functionType`` is added only when set."""
        request_body: dict = {}
        request_body["pageNo"] = page_index
        request_body["pageSize"] = self.page_size
        request_body["specialPageCode"] = True
        request_body["isTouristMode"] = True
        request_body["type"] = self.job_type
        request_body["jobArea"] = self.job_area
        request_body["personAsLabel"] = "1"
        if not self.function_type:
            return request_body
        request_body["functionType"] = self.function_type
        return request_body

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Parse using the 51Job response envelope."""
        return _parse_job51_response(http_code, raw)
class GetJobDetail(BaseFetcher):
    """Fetch the base detail of one 51Job posting; the id is part of the path."""

    ENDPOINT = "open/noauth/jobs/detail/base"

    def __init__(self, *, job_id: str, client: Optional[Job51Client] = None):
        http = client if client else create_client()
        super().__init__(http_client=http)
        self.job_id = job_id

    def _build_params(self) -> dict:
        """Unused: the request carries no query parameters."""
        return {}

    def fetch(self) -> ApiResult:
        """GET ``ENDPOINT/<job_id>`` and return the parsed result.

        Exceptions raised by the request are reported as a failed
        ``ApiResult`` with status ``-1``.
        """
        target = f"{self.ENDPOINT}/{self.job_id}"
        try:
            response = self._http.get(target)
            http_code, data = response
        except Exception as exc:
            return ApiResult(success=False, status_code=-1, error=str(exc))
        else:
            return self._parse(http_code, data)

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Parse using the 51Job response envelope."""
        return _parse_job51_response(http_code, raw)
class GetCompanyInfo(BaseFetcher):
    """Fetch the info record of one 51Job company."""

    ENDPOINT = "open/noauth/company-info/info-data"

    def __init__(
        self,
        *,
        company_id: str,
        color_one: str = "#ffffff",
        color_two: str = "#ffffffcc",
        client: Optional[Job51Client] = None,
    ):
        http = client if client else create_client()
        super().__init__(http_client=http)
        self.company_id = company_id
        self.color_one = color_one
        self.color_two = color_two

    def _build_params(self) -> dict:
        """Return the query parameters identifying the company."""
        query: dict = {}
        query["companyId"] = self.company_id
        query["colorOne"] = self.color_one
        query["colorTwo"] = self.color_two
        return query

    def fetch(self) -> ApiResult:
        """GET the endpoint with the built parameters and parse the result.

        Exceptions raised while building parameters or performing the
        request are reported as a failed ``ApiResult`` with status ``-1``.
        """
        try:
            response = self._http.get(self.ENDPOINT, self._build_params())
            http_code, data = response
        except Exception as exc:
            return ApiResult(success=False, status_code=-1, error=str(exc))
        else:
            return self._parse(http_code, data)

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Parse using the 51Job response envelope."""
        return _parse_job51_response(http_code, raw)
class SearchCompanyJobs(BaseSearcher):
    """Paginated query for the jobs published by one 51Job company."""

    ENDPOINT = "open/noauth/jobs/company"

    def __init__(
        self,
        *,
        company_id: str,
        job_area: str = "",
        function: str = "",
        salary_type: str = "",
        page_size: int = 10,
        client: Optional[Job51Client] = None,
    ):
        http = client if client else create_client()
        super().__init__(page_size=page_size, http_client=http)
        self.company_id = company_id
        self.job_area = job_area
        self.function = function
        self.salary_type = salary_type

    def _build_params(self, page_index: int) -> dict:
        """Return the request body for the requested page."""
        request_body: dict = {}
        request_body["pageNum"] = page_index
        request_body["pageSize"] = self.page_size
        request_body["coId"] = self.company_id
        request_body["jobArea"] = self.job_area
        request_body["function"] = self.function
        request_body["salaryType"] = self.salary_type
        request_body["scene"] = 14
        request_body["requestId"] = ""
        return request_body

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Parse using the 51Job response envelope."""
        return _parse_job51_response(http_code, raw)

View File

@ -0,0 +1,137 @@
# ⚠️ DEPRECATED — 2026-03-21
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
# 将在下一里程碑中删除。
#
"""
前程无忧 (51Job) HTTP 客户端
复制自 spiderJobs/platforms/job51/client.py,import 改为本地引用。
"""
from __future__ import annotations
import json
from typing import Any, Optional
from urllib.parse import quote
from app.services.crawler._http_client import HTTPClient
from app.services.crawler._job51_sign import Job51Sign
# Host that all 51Job endpoint paths are appended to.
BASE_URL = "https://cupid.51job.com"

# Default headers sent with every 51Job request; the signing headers are
# added per request by the client.
JOB51_HEADERS = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 "
        "MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI "
        "MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.7(0x13080712) "
        "UnifiedPCMacWechat(0xf2641702) XWEB/18788"
    ),
    "xweb_xhr": "1",
    "from-domain": "51job_weixin_wxapp",
    # Fetch-metadata and locale headers.
    "sec-fetch-site": "cross-site",
    "sec-fetch-mode": "cors",
    "sec-fetch-dest": "empty",
    "referer": "https://servicewechat.com/wx1131e5c71e668b5d/426/page-frame.html",
    "accept-language": "zh-CN,zh;q=0.9",
    "priority": "u=1, i",
}
class Job51Client(HTTPClient):
    """HTTP client for 51Job that signs every request.

    The signer turns the endpoint into a signed URL path and a ``sign``
    value; the value travels in the request headers together with a
    per-client ``uuid``.
    """

    def __init__(
        self,
        signer: Optional[Job51Sign] = None,
        tunnel_proxy: Optional[str] = None,
        proxy: Optional[str] = None,
        proxy_pool: Optional[list[str]] = None,
        timeout: int = 10,
    ):
        super().__init__(
            base_url=BASE_URL,
            default_headers=JOB51_HEADERS,
            tunnel_proxy=tunnel_proxy,
            proxy=proxy,
            proxy_pool=proxy_pool,
            timeout=timeout,
        )
        self.signer = signer if signer else Job51Sign()
        # Generated once and reused for every request of this client.
        self._uuid = Job51Sign.generate_uuid()

    def _job51_headers(self, sign: str) -> dict:
        """Return the per-request headers carrying ``sign`` and the uuid."""
        tracking = {
            "frompageUrl": "",
            "pageUrl": "pages/index/index",
            "isLogin": "",
            "accountid": "",
            "resumeId": "",
            "firstFrompageUrl": "",
            "distinct_id": self._uuid,
        }
        # Compact JSON, then percent-encoded with no characters left safe.
        tracking_json = json.dumps(tracking, ensure_ascii=False, separators=(",", ":"))
        return {
            "sign": sign,
            "partner": "",
            "property": quote(tracking_json, safe=""),
            "uuid": self._uuid,
            "user-token": "",
            "account-id": "",
        }

    def post(self, path: str, body: dict, headers: Optional[dict] = None) -> tuple[int, Any]:
        """Sign and POST ``body``; the body is sent as compact JSON text."""
        url_path, sign = self.signer.build_sign_path(path, "POST", body=body)
        request_headers = {
            **self._job51_headers(sign),
            "Content-Type": "application/json",
            **(headers or {}),
        }
        # Serialised with the same compact separators the signer is given the
        # body for, and sent verbatim by _post_raw.
        serialized = json.dumps(body, ensure_ascii=False, separators=(",", ":"))
        return self._post_raw(url_path, serialized, request_headers)

    def _post_raw(self, path: str, raw_body: str, headers: dict) -> tuple[int, Any]:
        """POST an already-serialised body as UTF-8 bytes.

        Mirrors the base client's proxy handling: a one-off session for the
        tunnel proxy, otherwise the shared session with an optional pool
        proxy.
        """
        merged_headers = self._merge_headers(headers)
        url = f"{self.base_url}{path}"
        payload = raw_body.encode("utf-8")

        if not self._tunnel_proxy:
            proxies = self._get_proxies()
            kwargs: dict[str, Any] = {
                "data": payload,
                "headers": merged_headers,
                "timeout": self.timeout,
            }
            if proxies:
                kwargs["proxies"] = proxies
            resp = self._session.post(url, **kwargs)
            return resp.status_code, resp.json()

        session = self._new_session()
        try:
            resp = session.post(
                url,
                data=payload,
                headers=merged_headers,
                proxies={"http": self._tunnel_proxy, "https": self._tunnel_proxy},
                timeout=self.timeout,
            )
            return resp.status_code, resp.json()
        finally:
            session.close()

    def get(self, path: str, params: Optional[dict] = None, headers: Optional[dict] = None) -> tuple[int, Any]:
        """Sign and GET ``path``.

        ``params`` are handed to the signer and no separate query parameters
        are passed to the base client; only the signed path is requested.
        """
        url_path, sign = self.signer.build_sign_path(path, "GET", params=params)
        request_headers = {
            **self._job51_headers(sign),
            "content-type": "application/x-www-form-urlencoded",
            **(headers or {}),
        }
        return super().get(url_path, params=None, headers=request_headers)
def create_client(
    signer: Optional[Job51Sign] = None,
    tunnel_proxy: Optional[str] = None,
    proxy: Optional[str] = None,
    proxy_pool: Optional[list[str]] = None,
) -> Job51Client:
    """Build a ``Job51Client`` with the default timeout."""
    options = {
        "signer": signer,
        "tunnel_proxy": tunnel_proxy,
        "proxy": proxy,
        "proxy_pool": proxy_pool,
    }
    return Job51Client(**options)

View File

@ -0,0 +1,62 @@
# ⚠️ DEPRECATED — 2026-03-21
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
# 将在下一里程碑中删除。
#
"""
前程无忧 (51Job) 签名算法
复制自 spiderJobs/platforms/job51/sign.py,import 改为本地引用。
"""
from __future__ import annotations
import hmac
import hashlib
import time
import random
from urllib.parse import quote
# Default HMAC key used to sign 51Job requests.
SIGN_KEY = "abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b"


class Job51Sign:
    """Builds signed request paths for the 51Job API (HMAC-SHA256)."""

    def __init__(self, *, sign_key: str = SIGN_KEY):
        self.sign_key = sign_key

    @staticmethod
    def generate_uuid() -> str:
        """Return the millisecond timestamp followed by a random 10-digit number."""
        millis = int(time.time() * 1000)
        nonce = random.randint(1000000000, 9999999999)
        return f"{millis}{nonce}"

    def build_sign_path(
        self,
        endpoint: str,
        method: str = "GET",
        params: dict | None = None,
        body: dict | None = None,
    ) -> tuple[str, str]:
        """Return ``(signed_path, hex_signature)`` for a request.

        The path is ``/<endpoint>?api_key=51job&timestamp=<seconds>``; for GET
        requests with parameters, the percent-encoded parameters are appended
        to it. The signed message is that path, plus the compact JSON of
        ``body`` for POST requests.
        """
        import json  # kept local: the module does not import json at the top

        verb = method.upper()
        timestamp = int(time.time())
        path = f"/{endpoint}?api_key=51job&timestamp={timestamp}"

        if verb == "GET" and params:
            encoded_pairs = [
                f"{quote(str(key), safe='')}={quote(str(value), safe='')}"
                for key, value in params.items()
            ]
            if encoded_pairs:
                path = path + "&" + "&".join(encoded_pairs)

        message = path
        if verb == "POST" and body is not None:
            message = path + json.dumps(body, ensure_ascii=False, separators=(",", ":"))

        digest = hmac.new(
            self.sign_key.encode("utf-8"),
            message.encode("utf-8"),
            hashlib.sha256,
        )
        return path, digest.hexdigest()

View File

@ -0,0 +1,148 @@
# ⚠️ DEPRECATED — 2026-03-21
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
# 将在下一里程碑中删除。
#
"""
智联招聘 - 所有 API 接口
复制自 spiderJobs/platforms/zhilian/api.py,import 改为本地引用。
"""
from __future__ import annotations
from typing import Any, Optional
from app.services.crawler._base import BaseFetcher, BaseSearcher
from app.services.crawler._zhilian_client import ZhilianClient, create_cgate_client, create_capi_client
_SEARCH_BODY = {
"eventScenario": "wxmpZhaopinSearchV2",
"filterMinSalary": 1,
"S_SOU_EXPAND": "SOU_COMPANY_ID",
"sortType": "DEFAULT",
"resumeNumber": "",
"version": "8.11.22",
"identity": 0,
"anonymous": 1,
}
_FILTER_KEYS = [
"S_SOU_SALARY", "S_SOU_EDUCATION_LOWESTLEVEL", "S_SOU_REFRESH_DATE",
"S_SOU_WORK_EXPERIENCE", "S_SOU_POSITION_TYPE", "S_SOU_COMPANY_TYPE",
"S_SOU_COMPANY_SCALE", "welfareLabels", "S_SOU_JD_INDUSTRY_LEVEL",
]
class SearchPositions(BaseSearcher):
    """Paginated Zhilian position search.

    The request body starts from ``_SEARCH_BODY`` and is refined by, in
    order: the optional ``collected_purpose`` dict, the keyword / city
    arguments (only where the purpose did not already decide), and the
    whitelisted ``filters``.
    """

    ENDPOINT = "/positionbusiness/searchrecommend/searchPositions"

    def __init__(
        self,
        *,
        keyword: str = "",
        city_code: int | str = "",
        collected_purpose: Optional[dict] = None,
        filters: Optional[dict] = None,
        page_size: int = 15,
        client: Optional[ZhilianClient] = None,
    ):
        http = client if client else create_cgate_client()
        super().__init__(page_size=page_size, http_client=http)
        self.keyword = keyword
        self.city_code = city_code
        self.collected_purpose = collected_purpose
        self.filters = filters if filters else {}

    def _build_params(self, page_index: int) -> dict:
        """Return the request body for the requested page."""
        body = dict(_SEARCH_BODY)
        body["pageIndex"] = page_index
        body["pageSize"] = self.page_size

        if self.collected_purpose:
            body.update(self._purpose_params(self.collected_purpose, page_index))

        # A job-type code from the purpose takes precedence over the keyword.
        if self.keyword and "S_SOU_JD_JOB_LEVEL3" not in body:
            body["S_SOU_FULL_INDEX"] = self.keyword
        # A city from the purpose takes precedence over the city argument.
        if self.city_code and "S_SOU_WORK_CITY" not in body:
            body["S_SOU_WORK_CITY"] = self.city_code

        # Only whitelisted filter keys with truthy values are forwarded.
        for filter_key in _FILTER_KEYS:
            if self.filters.get(filter_key):
                body[filter_key] = self.filters[filter_key]
        return body

    @staticmethod
    def _purpose_params(purpose: dict, page_index: int) -> dict:
        """Translate a "purpose" dict into search-body fields.

        Reads the job type (code preferred over name), the city
        (``city_id`` preferred over ``preferred_location``) and the salary
        range from ``purpose``.
        """
        fields: dict = {"pageIndex": page_index}

        job_type_code = purpose.get("pnew_preferred_job_type", "")
        job_type_name = purpose.get("job_type_name", "")
        if job_type_code:
            fields["S_SOU_JD_JOB_LEVEL3"] = job_type_code
        elif job_type_name:
            fields["S_SOU_FULL_INDEX"] = job_type_name

        city = purpose.get("city_id", "") or purpose.get("preferred_location", "")
        if city:
            fields["S_SOU_WORK_CITY"] = city

        salary_low = purpose.get("preferred_salary_min", "")
        salary_high = purpose.get("preferred_salary_max", "")
        # Skip the salary filter only when the minimum is unset ("" or "-1")
        # and the maximum is empty.
        salary_unset = salary_low in ("", "-1") and salary_high == ""
        if not salary_unset:
            fields["S_SOU_SALARY"] = f"{salary_low},{salary_high}"
        return fields
class GetPositionDetail(BaseFetcher):
    """Fetch the detail module of one Zhilian position by its number."""

    ENDPOINT = "/positionbusiness/position/getPositionModule"

    def __init__(self, *, number: str, identity: int = 0, client: Optional[ZhilianClient] = None):
        http = client if client else create_cgate_client()
        super().__init__(http_client=http)
        self.number = number
        self.identity = identity

    def _build_params(self) -> dict:
        """Return the query parameters identifying the position."""
        query: dict = {}
        query["number"] = self.number
        query["identity"] = self.identity
        query["resumeNumber"] = ""
        return query
class GetCompanyExtDetail(BaseFetcher):
    """Fetch the extended detail of a Zhilian company by name and number."""

    ENDPOINT = "/riskstorm/company/getCompanyExtDetail"

    def __init__(self, *, company_name: str, company_number: str, client: Optional[ZhilianClient] = None):
        http = client if client else create_cgate_client()
        super().__init__(http_client=http)
        self.company_name = company_name
        self.company_number = company_number

    def _build_params(self) -> dict:
        """Return the query parameters identifying the company."""
        query: dict = {}
        query["companyName"] = self.company_name
        query["companyNumber"] = self.company_number
        return query
class GetCompanyDetail(BaseFetcher):
    """Fetch the detail of a Zhilian company by its number."""

    ENDPOINT = "/positionbusiness/exposure/companyDetail"

    def __init__(self, *, number: str, client: Optional[ZhilianClient] = None):
        http = client if client else create_cgate_client()
        super().__init__(http_client=http)
        self.number = number

    def _build_params(self) -> dict:
        """Return the query parameters identifying the company."""
        return dict(number=self.number)
class SearchCompanyPositions(BaseSearcher):
    """Paginated query for the positions of one Zhilian company.

    Unlike the other Zhilian calls this one defaults to the "capi" client,
    is sent with GET, and carries the signer's parameters in the query.
    """

    ENDPOINT = "/capi/searchrecommend/searchPositionsCompany"

    def __init__(
        self,
        *,
        company_id: str,
        job_level: str = "",
        city_code: str = "",
        page_size: int = 30,
        client: Optional[ZhilianClient] = None,
    ):
        # Kept separately because _build_params needs the client's signer.
        self._client = client if client else create_capi_client()
        super().__init__(page_size=page_size, http_client=self._client)
        self.company_id = company_id
        self.job_level = job_level
        self.city_code = city_code

    def _build_params(self, page_index: int) -> dict:
        """Return the signed query parameters for the requested page."""
        query = dict(self._client.signer.sign_params())
        # Fixed fields are applied after the signer's, so they win on clashes.
        query["S_SOU_COMPANY_ID"] = self.company_id
        query["S_SOU_POSITION_SOURCE_TYPE"] = "1"
        query["eventScenario"] = "wxmpZhaopinSearchPositionsCompany"
        query["pageCode"] = "wxmpZhaopinCompanyDetailPage"
        query["pageIndex"] = page_index
        query["pageSize"] = self.page_size
        if self.job_level:
            query["S_SOU_JD_JOB_LEVEL"] = self.job_level
        if self.city_code:
            query["S_SOU_WORK_CITY"] = self.city_code
        return query

    def _request(self, params: dict) -> tuple[int, Any]:
        """This endpoint is queried with GET rather than the base POST."""
        return self._http.get(self.ENDPOINT, params)

View File

@ -0,0 +1,84 @@
# ⚠️ DEPRECATED — 2026-03-21
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
# 将在下一里程碑中删除。
#
"""
智联招聘 HTTP 客户端
复制自 spiderJobs/platforms/zhilian/client.py import 改为本地引用
"""
from __future__ import annotations
from typing import Any, Optional
from app.services.crawler._http_client import HTTPClient
from app.services.crawler._zhilian_sign import ZhilianSign
# Base URL of the "cgate" host; default for ZhilianClient and used by create_cgate_client.
CGATE_BASE_URL = "https://cgate.zhaopin.com"
# Base URL of the "capi" host; used by create_capi_client.
CAPI_BASE_URL = "https://capi.zhaopin.com"
# Default headers that ZhilianClient hands to HTTPClient as ``default_headers``.
ZHILIAN_HEADERS = {
"content-type": "application/json",
# User-agent string that presents the client as a WeChat mini-program runtime on macOS.
"user-agent": (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 "
"MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI "
"MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.7(0x13080712) "
"UnifiedPCMacWechat(0xf2641702) XWEB/18788"
),
"accept": "*/*",
"sec-fetch-site": "cross-site",
"sec-fetch-mode": "cors",
"sec-fetch-dest": "empty",
"referer": "https://servicewechat.com/wxb7718fb9257e4fd2/647/page-frame.html",
"accept-language": "zh-CN,zh;q=0.9",
# "identity" requests an uncompressed response body.
"accept-encoding": "identity",
}
class ZhilianClient(HTTPClient):
    """HTTP client that adds the signer's headers to every GET and POST.

    The client is configured with ``ZHILIAN_HEADERS`` as default headers.
    It owns a ``signer``; a default ``ZhilianSign`` is created when none
    is supplied.
    """

    def __init__(
        self,
        base_url: str = CGATE_BASE_URL,
        signer: Optional[ZhilianSign] = None,
        proxy: Optional[str] = None,
        proxy_pool: Optional[list[str]] = None,
        timeout: int = 10,
    ):
        super().__init__(
            base_url=base_url,
            default_headers=ZHILIAN_HEADERS,
            proxy=proxy,
            proxy_pool=proxy_pool,
            timeout=timeout,
        )
        self.signer = signer if signer else ZhilianSign()

    def post(
        self,
        path: str,
        body: dict,
        headers: Optional[dict] = None,
        page_code: str = "0",
    ) -> tuple[int, Any]:
        """POST ``body`` to ``path`` with sign headers for ``page_code``.

        Caller-supplied ``headers`` override sign headers of the same name.
        """
        merged = self.signer.sign_headers(page_code)
        merged.update(headers or {})
        return super().post(path, body, merged)

    def get(
        self,
        path: str,
        params: Optional[dict] = None,
        headers: Optional[dict] = None,
        page_code: str = "0",
    ) -> tuple[int, Any]:
        """GET ``path`` with ``params`` and sign headers for ``page_code``.

        Caller-supplied ``headers`` override sign headers of the same name.
        """
        merged = self.signer.sign_headers(page_code)
        merged.update(headers or {})
        return super().get(path, params, merged)
def create_cgate_client(
    signer: Optional[ZhilianSign] = None,
    proxy: Optional[str] = None,
    proxy_pool: Optional[list[str]] = None,
) -> ZhilianClient:
    """Build a ``ZhilianClient`` whose base URL is ``CGATE_BASE_URL``."""
    options = {
        "base_url": CGATE_BASE_URL,
        "signer": signer,
        "proxy": proxy,
        "proxy_pool": proxy_pool,
    }
    return ZhilianClient(**options)
def create_capi_client(
    signer: Optional[ZhilianSign] = None,
    proxy: Optional[str] = None,
    proxy_pool: Optional[list[str]] = None,
) -> ZhilianClient:
    """Build a ``ZhilianClient`` whose base URL is ``CAPI_BASE_URL``."""
    options = {
        "base_url": CAPI_BASE_URL,
        "signer": signer,
        "proxy": proxy,
        "proxy_pool": proxy_pool,
    }
    return ZhilianClient(**options)

View File

@ -0,0 +1,63 @@
# ⚠️ DEPRECATED — 2026-03-21
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
# 将在下一里程碑中删除。
#
"""
智联招聘签名算法
复制自 spiderJobs/platforms/zhilian/sign.py import 改为本地引用
"""
from __future__ import annotations
import math
import random
from typing import Optional
class ZhilianSign:
    """Produce the ``x-zp-*`` headers and query parameters sent with requests.

    Holds the ``at``/``rt`` token values, a device id, and the version,
    channel and platform strings. A random device id is generated when
    none is supplied.
    """

    def __init__(
        self,
        *,
        at: str = "",
        rt: str = "",
        device_id: Optional[str] = None,
        version: str = "4.1.259",
        channel: str = "wxxiaochengxu",
        platform: str = "12",
    ):
        self.at = at
        self.rt = rt
        # An empty or missing device id is replaced by a random one.
        self.device_id = device_id if device_id else self.generate_uuid()
        self.version = version
        self.channel = channel
        self.platform = platform

    @staticmethod
    def generate_uuid() -> str:
        """Return a random 36-character, upper-case, UUID-v4-shaped string."""
        hex_digits = "0123456789ABCDEF"
        # One random hex digit per slot; fixed positions are patched below.
        slots = [hex_digits[math.floor(16 * random.random())] for _ in range(36)]
        slots[14] = "4"
        # Force the variant digit into the 8..B range.
        slots[19] = hex_digits[(int(slots[19], 16) & 0x3) | 0x8]
        for dash_at in (8, 13, 18, 23):
            slots[dash_at] = "-"
        return "".join(slots)

    def sign_headers(self, page_code: str = "0") -> dict:
        """Return the sign headers; a fresh action id is generated per call."""
        action_id = self.generate_uuid()
        headers = {
            "x-zp-at": self.at,
            "x-zp-rt": self.rt,
            "x-zp-action-id": action_id,
            "x-zp-page-code": page_code,
            "x-zp-version": self.version,
            "x-zp-channel": self.channel,
            "x-zp-platform": self.platform,
            "x-zp-device-id": self.device_id,
            "x-zp-business-system": "73",
        }
        return headers

    def sign_params(self) -> dict:
        """Return the sign values as query parameters."""
        params = {
            "at": self.at,
            "rt": self.rt,
            "channel": self.channel,
            "platform": self.platform,
            "version": self.version,
            "d": self.device_id,
        }
        return params

View File

@ -1,372 +1,139 @@
import requests
import time
import json
import uuid
from typing import Dict, Any, Optional, List
from app.core.algorithms.antispider import IPStrategyConfig, IPAnomalyDetector, SmartIPManager, generate_boss_trace_id, generate_token
"""
Boss直聘 Service 基于新算法文件的封装
保持对外公开接口不变cleaning.py / company_cleaner.py 依赖
"""
from __future__ import annotations
from typing import Any, Dict, List, Optional
from loguru import logger
import os
from urllib.parse import urlencode
from spiderJobs.platforms.boss.api import (
GetBrandDetail,
GetJobDetail,
SearchBrandJobs,
SearchRecJobs,
)
from spiderJobs.platforms.boss.client import BossClient, create_client
from spiderJobs.platforms.boss.sign import BossSign
class BossService:
def __init__(self, proxy_pool: Optional[List[Dict[str, str]]] = None):
self.app_id = 10002
self.zp_product_id = 10002
self.serve_domain = "https://www.zhipin.com"
self.api_domain = "https://wxapp.zhipin.com"
self._signer = BossSign()
proxy = None
if proxy_pool:
proxy = proxy_pool[0].get("https") or proxy_pool[0].get("http") if proxy_pool else None
self._client = create_client(signer=self._signer, proxy=proxy)
self.session = requests.Session()
self.session.trust_env = False
self.session.headers.update({'no_proxy': '10.0.0.0/16,example.com,.example.com'})
# Initialize IP Strategy
self.ip_cfg = IPStrategyConfig()
self.ip_detector = IPAnomalyDetector(self.ip_cfg)
self.ip_manager = SmartIPManager(proxy_pool, self.ip_cfg)
# Initial route
route_mode, route_cfg = self.ip_manager.current_route()
if route_mode == 'proxy' and route_cfg:
self.session.proxies = route_cfg
self.device_id = str(uuid.uuid4())
self.wx_version = "8.0.43"
self.mini_version = "1.0.0"
self.scene = 1001
self.default_headers = {
"Accept": "*/*",
"Accept-Language": "zh-CN,zh;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"Connection": "keep-alive",
"Content-Type": "application/x-www-form-urlencoded",
"Host": "www.zhipin.com",
"Referer": "https://servicewechat.com/wxa8da525af05281f3/571/page-frame.html",
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.10(0x13080a10) XWEB/1227",
"X-Requested-With": "XMLHttpRequest",
"platform": "zhipin/mac",
"zp_app_id": str(self.app_id),
"ver": "100.0000",
"mini_ver": "100.0000",
"ua": json.dumps({"model": "Mac16,8", "platform": "mac"}),
"zp_product_id": str(self.zp_product_id),
"scene": "1006",
"xweb_xhr": "1",
"sec-fetch-site": "cross-site",
"sec-fetch-mode": "cors",
"sec-fetch-dest": "empty"
}
self.login_data = {
"mpt": "", # Needs to be filled via login/token logic if required
# login_data 用于外部检查cleaning.py 通过 boss_service.login_data.get("mpt") 判断)
self.login_data: Dict[str, str] = {
"mpt": "",
"wt2": "",
"openId": "",
"traceid": "F-77d05bnXuMVrHIB3"
}
self.current_token_id: Optional[int] = None
self.init_cookies()
def init_cookies(self):
cookies = {
'__zp_stoken__': generate_token(),
'Hm_lvt_194df3105ad7148dcf2b98a91b5e727a': str(int(time.time())),
'Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a': str(int(time.time())),
'__c': self.device_id[:8],
'__g': '-',
'__l': 'l=%2Fwww.zhipin.com%2F&r=&friend_source=0&s=3&friend_source=0',
'lastCity': '101010100',
'cityName': '%E5%8C%97%E4%BA%AC',
'__zp_sseed__': 'btHZ0bjBq8m//WNwlVrPUnVcIvini5J5P5LQUbflM24=',
'__zp_sname__': '3998243a',
'__zp_sts__': str(int(time.time() * 1000))
}
for name, value in cookies.items():
self.session.cookies.set(name, value, domain='.zhipin.com')
def set_login_data(self, mpt: str, wt2: str, open_id: str = "") -> None:
self.login_data.update(
{
"mpt": mpt,
"wt2": wt2,
"openId": open_id,
}
)
if wt2:
self.session.cookies.set("wt2", wt2, domain=".zhipin.com")
if mpt:
self.session.cookies.set("mpt", mpt, domain=".zhipin.com")
self.login_data.update({"mpt": mpt, "wt2": wt2, "openId": open_id})
self._signer.mpt = mpt
self._signer.wt2 = wt2
def set_proxy(self, proxy: Optional[str]) -> None:
if not proxy:
self.session.proxies = {}
route_mode, route_cfg = self.ip_manager.current_route()
if route_mode == "proxy" and route_cfg:
self.session.proxies = route_cfg
logger.info("BossService proxy reset to default route")
return
proxy = proxy.strip().strip("`")
proxies = {"http": proxy, "https": proxy}
self.session.proxies = proxies
logger.info(f"BossService using user proxy: {proxies}")
if proxy:
proxy = proxy.strip().strip("`")
self._client = create_client(signer=self._signer, proxy=proxy or None)
logger.info(f"BossService proxy set to: {proxy or 'direct'}")
def build_request_headers(self, custom_headers: Optional[Dict] = None) -> Dict[str, str]:
headers = self.default_headers.copy()
headers.update({
"mpt": self.login_data.get("mpt", ""),
"scene": "1006",
"wt2": self.login_data.get("wt2", ""),
"Traceid": generate_boss_trace_id()
})
headers["timestamp"] = str(int(time.time() * 1000))
if custom_headers:
headers.update(custom_headers)
return headers
def _sanitize_headers(self, headers: Dict[str, Any]) -> Dict[str, Any]:
return headers
def _log_request_response(
self,
label: str,
method: str,
url: str,
headers: Dict[str, Any],
params: Optional[Dict[str, Any]] = None,
json_body: Optional[Dict[str, Any]] = None,
response: Optional[requests.Response] = None,
) -> None:
safe_headers = self._sanitize_headers(headers)
current_proxies = getattr(self.session, "proxies", None)
proxy_info = current_proxies if current_proxies else {}
login_flags = {
"mpt_set": bool(self.login_data.get("mpt")),
"wt2_set": bool(self.login_data.get("wt2")),
}
logger.info(
f"[Boss-{label}] request method={method} url={url} headers={safe_headers} "
f"params={params} json={json_body} proxies={proxy_info} login={login_flags}"
)
try:
curl_url = url
if params and isinstance(params, dict):
query_string = urlencode(params)
if query_string:
separator = "&" if "?" in curl_url else "?"
curl_url = f"{curl_url}{separator}{query_string}"
header_parts = []
for k, v in safe_headers.items():
v_str = str(v).replace("'", "'\"'\"'")
header_parts.append(f"-H '{k}: {v_str}'")
data_part = ""
if json_body is not None:
body_str = json.dumps(json_body, ensure_ascii=False)
body_str = body_str.replace("'", "'\"'\"'")
data_part = f" --data '{body_str}'"
curl_cmd = f"curl -X {method} '{curl_url}' " + " ".join(header_parts) + data_part
logger.info(f"[Boss-{label}] curl_debug {curl_cmd}")
except Exception as e:
logger.debug(f"[Boss-{label}] build curl error: {e}")
if response is not None:
text_sample = ""
try:
body = response.text or ""
text_sample = body[:1000]
except Exception:
text_sample = "<unreadable>"
logger.info(
f"[Boss-{label}] response status={response.status_code} "
f"headers={self._sanitize_headers(dict(response.headers))} "
f"body_sample={text_sample}"
)
def build_request_data(self, data: Optional[Dict] = None) -> Dict[str, Any]:
request_data = {
"appId": self.app_id,
"scene": self.scene,
"timestamp": int(time.time() * 1000)
}
if data:
request_data.update(data)
return request_data
def get_job_detail_by_id(self, job_id: str, lid: str = "", security_id: str = "") -> Optional[Dict]:
def get_job_detail_by_id(
self, job_id: str, lid: str = "", security_id: str = ""
) -> Optional[Dict]:
"""根据招聘ID获取招聘详情"""
logger.info(f"🔍 获取招聘详情: {job_id}")
# Batch request simulation
sub_reqs = [
{
"path": "/wapi/zpgeek/miniapp/job/detail.json",
"method": "GET",
"query": urlencode({
"securityId": security_id,
"jobId": job_id,
"lid": lid,
"source": "10"
})
},
{
"path": "/wapi/zpgeek/miniapp/jobdetail/improvement/query.json",
"method": "GET",
"query": urlencode({
"securityId": security_id,
"jobId": job_id,
"lid": lid
})
}
]
post_data = {
"subReqs": sub_reqs,
"appId": 10002
}
headers = self.build_request_headers({
"Content-Type": "application/json",
"Referer": "https://servicewechat.com/wxa8da525af05281f3/585/page-frame.html"
})
logger.info(f"获取招聘详情: {job_id}")
try:
response = self.session.post(
"https://www.zhipin.com/wapi/batch/requests",
json=post_data,
headers=headers,
timeout=30
fetcher = GetJobDetail(
security_id=security_id, job_id=job_id, lid=lid,
client=self._client,
)
self._log_request_response(
"job-detail",
"POST",
"https://www.zhipin.com/wapi/batch/requests",
headers,
params=None,
json_body=post_data,
response=response,
)
response.raise_for_status()
data = response.json()
# Extract relevant part from batch response
if data.get("code") == 0 and data.get("zpData"):
# Simplification: return the whole structure or extract job detail
# Usually we want the job detail part
job_detail_path = "/wapi/zpgeek/miniapp/job/detail.json"
if job_detail_path in data["zpData"]:
return data["zpData"][job_detail_path]
return data
result = fetcher.fetch()
if result.success:
return result.data
logger.warning(f"Boss get_job_detail failed: {result.error}")
return None
except Exception as e:
logger.error(f"Failed to fetch job detail: {e}")
logger.error(f"Boss get_job_detail exception: {e}")
return None
def get_company_detail_by_id(self, company_id: str) -> Optional[Dict]:
"""根据公司ID获取公司详情"""
logger.info(f"🏢 获取公司详情: {company_id}")
params = {
"brandId": company_id,
"appId": "10002"
}
headers = self.build_request_headers({
"Referer": "https://servicewechat.com/wxa8da525af05281f3/574/page-frame.html"
})
logger.info(f"获取公司详情: {company_id}")
try:
request_data = self.build_request_data(params)
response = self.session.get(
f"{self.serve_domain}/wapi/zpgeek/miniapp/brand/detail.json",
headers=headers,
params=request_data,
timeout=30
)
self._log_request_response(
"company-detail",
"GET",
f"{self.serve_domain}/wapi/zpgeek/miniapp/brand/detail.json",
headers,
params=request_data,
json_body=None,
response=response,
)
response.raise_for_status()
return response.json()
fetcher = GetBrandDetail(brand_id=company_id, client=self._client)
result = fetcher.fetch()
if result.success:
return result.data
logger.warning(f"Boss get_company_detail failed: {result.error}")
return None
except Exception as e:
logger.error(f"Failed to fetch company detail: {e}")
logger.error(f"Boss get_company_detail exception: {e}")
return None
def get_company_jobs_by_id(self, company_id: str, page: int = 1) -> Optional[Dict]:
def get_company_jobs_by_id(
self, company_id: str, page: int = 1
) -> Optional[Dict]:
"""根据公司ID获取该公司职位列表"""
logger.info(f"📄 获取公司职位列表: {company_id}, page={page}")
params = {
"brandId": company_id,
"query": "",
"page": page,
"hasMore": "true",
"positionLv1": 0,
"city": "",
"experience": "",
"salary": "",
"appId": "10002",
}
headers = self.build_request_headers({
"Referer": "https://servicewechat.com/wxa8da525af05281f3/587/page-frame.html"
})
logger.info(f"获取公司职位列表: {company_id}, page={page}")
try:
request_data = self.build_request_data(params)
response = self.session.get(
f"{self.serve_domain}/wapi/zpgeek/miniapp/brand/joblist.json",
headers=headers,
params=request_data,
timeout=30,
searcher = SearchBrandJobs(
brand_id=company_id, page_size=15, client=self._client,
)
self._log_request_response(
"company-joblist",
"GET",
f"{self.serve_domain}/wapi/zpgeek/miniapp/brand/joblist.json",
headers,
params=request_data,
json_body=None,
response=response,
)
response.raise_for_status()
return response.json()
result = searcher.search(page_index=page)
if result.success:
return result.data
logger.warning(f"Boss get_company_jobs failed: {result.error}")
return None
except Exception as e:
logger.error(f"Failed to fetch company job list: {e}")
logger.error(f"Boss get_company_jobs exception: {e}")
return None
def search_jobs(self, keyword: str, city_code: str = "101010100", page: int = 1) -> Optional[Dict]:
def search_jobs(
self, keyword: str, city_code: str = "101010100", page: int = 1
) -> Optional[Dict]:
"""搜索职位"""
params = {
'pageSize': 15,
'query': keyword,
'city': city_code,
'page': page,
'appId': '10002'
}
logger.info(f"Boss search_jobs: keyword={keyword}, city={city_code}, page={page}")
try:
headers = self.build_request_headers({
"Referer": "https://www.zhipin.com/web/geek/job"
})
request_data = self.build_request_data(params)
response = self.session.get(
f"{self.serve_domain}/wapi/zpgeek/miniapp/search/joblist.json",
headers=headers,
params=request_data,
timeout=30
searcher = SearchRecJobs(
city_code=city_code, page_size=15, client=self._client,
)
self._log_request_response(
"search-jobs",
"GET",
f"{self.serve_domain}/wapi/zpgeek/miniapp/search/joblist.json",
headers,
params=request_data,
json_body=None,
response=response,
)
response.raise_for_status()
return response.json()
result = searcher.search(page_index=page)
if result.success:
return result.data
logger.warning(f"Boss search_jobs failed: {result.error}")
return None
except Exception as e:
logger.error(f"Search failed: {e}")
return None
logger.error(f"Boss search_jobs exception: {e}")
return None
# ── asyncio.to_thread 桥接ARCH-06────────────────────────
async def async_get_job_detail(
    self, job_id: str, lid: str = "", security_id: str = ""
) -> Optional[Dict]:
    """Awaitable bridge: run ``get_job_detail_by_id`` in a worker thread."""
    from asyncio import to_thread

    blocking_call = self.get_job_detail_by_id
    return await to_thread(blocking_call, job_id, lid, security_id)
async def async_get_company_detail(self, company_id: str) -> Optional[Dict]:
    """Awaitable bridge: run ``get_company_detail_by_id`` in a worker thread."""
    from asyncio import to_thread

    blocking_call = self.get_company_detail_by_id
    return await to_thread(blocking_call, company_id)
async def async_get_company_jobs(
    self, company_id: str, page: int = 1
) -> Optional[Dict]:
    """Awaitable bridge: run ``get_company_jobs_by_id`` in a worker thread."""
    from asyncio import to_thread

    blocking_call = self.get_company_jobs_by_id
    return await to_thread(blocking_call, company_id, page)
async def async_search_jobs(
    self, keyword: str, city_code: str = "101010100", page: int = 1
) -> Optional[Dict]:
    """Awaitable bridge: run ``search_jobs`` in a worker thread."""
    from asyncio import to_thread

    blocking_call = self.search_jobs
    return await to_thread(blocking_call, keyword, city_code, page)

View File

@ -1,217 +1,76 @@
import httpx
import time
import random
import json
import os
from typing import Dict, Any, Optional, List
from urllib.parse import quote
"""
前程无忧 (51Job) Service 基于新算法文件的封装
保持对外公开接口不变cleaning.py / company_cleaner.py 依赖
"""
from __future__ import annotations
from typing import Any, Dict, List, Optional
from loguru import logger
from app.core.algorithms.signature import SignatureGenerator
from jobs_spider.qcwy import search_company_jobs as qcwy_spider
from spiderJobs.platforms.job51.api import (
GetCompanyInfo,
GetJobDetail,
SearchCompanyJobs,
SearchRecommendJobs,
)
from spiderJobs.platforms.job51.client import Job51Client, create_client
class QcwyService:
def __init__(self, proxy_url: Optional[str] = None):
self.signature_generator = SignatureGenerator("abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b")
self.base_url = "https://cupid.51job.com"
self.api_key = "51job"
self.base_headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.10(0x13080a10) XWEB/1227",
"Connection": "keep-alive",
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate, br",
"Content-Type": "application/json",
"account-id": "",
"From-Domain": "51job_weixin_wxapp",
"xweb_xhr": "1",
"user-token": "",
"uuid": str(int(time.time() * 1000)) + str(random.randint(10000000, 99999999)),
"partner": "",
"timestamp": str(int(time.time() * 1000)),
"Sec-Fetch-Site": "cross-site",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Dest": "empty",
"Referer": "https://servicewechat.com/wx1131e5c71e668b5d/391/page-frame.html",
"Accept-Language": "zh-CN,zh;q=0.9"
}
env_account_id = os.getenv("QCWY_ACCOUNT_ID", "").strip()
env_user_token = os.getenv("QCWY_USER_TOKEN", "").strip()
if env_account_id:
self.base_headers["account-id"] = env_account_id
if env_user_token:
self.base_headers["user-token"] = env_user_token
client_kwargs = {
"timeout": 30.0,
"verify": True,
"trust_env": False
}
if proxy_url:
client_kwargs["proxy"] = proxy_url
self.client = httpx.Client(**client_kwargs)
self._client = create_client(proxy=proxy_url or None)
def set_proxy(self, proxy_url: Optional[str]) -> None:
client_kwargs = {
"timeout": 30.0,
"verify": True,
"trust_env": False,
}
if proxy_url:
client_kwargs["proxy"] = proxy_url
try:
old_client = self.client
except AttributeError:
old_client = None
self.client = httpx.Client(**client_kwargs)
if old_client is not None:
try:
old_client.close()
except Exception:
pass
def _sanitize_headers(self, headers: Dict[str, Any]) -> Dict[str, Any]:
masked_headers: Dict[str, Any] = {}
for k, v in headers.items():
key_lower = str(k).lower()
if key_lower in {"authorization", "cookie", "set-cookie"}:
masked_headers[k] = "***"
else:
masked_headers[k] = v
return masked_headers
def _log_request_response(
self,
label: str,
method: str,
url: str,
headers: Dict[str, Any],
params: Optional[Dict[str, Any]] = None,
json_body: Optional[Dict[str, Any]] = None,
response: Optional[httpx.Response] = None,
) -> None:
safe_headers = self._sanitize_headers(headers)
logger.info(
f"[Qcwy-{label}] request method={method} url={url} headers={safe_headers} "
f"params={params} json={json_body}"
)
if response is not None:
text_sample = ""
try:
body = response.text or ""
text_sample = body[:1000]
except Exception:
text_sample = "<unreadable>"
logger.info(
f"[Qcwy-{label}] response status={response.status_code} "
f"headers={self._sanitize_headers(dict(response.headers))} "
f"body_sample={text_sample}"
)
def build_property(self, page_code: str = "home|hotjob|jobfxlist") -> str:
distinct_id = str(int(time.time() * 1000)) + str(random.randint(100000, 999999))
property_data = {
"frompageUrl": "",
"pageUrl": "pages/index/index",
"isLogin": "",
"accountid": "",
"resumeId": "",
"firstFrompageUrl": "",
"distinct_id": distinct_id,
"pageCode": page_code,
"shortPageCode": page_code,
"policyType": "推荐"
}
return quote(json.dumps(property_data, ensure_ascii=False, separators=(',', ':')))
def _make_request(self, url: str, data: Dict[str, Any] = None, headers: Dict[str, str] = None, method: str = "POST") -> Optional[Dict[str, Any]]:
try:
local_headers: Dict[str, str] = headers or {}
if method.upper() == "GET":
response = self.client.get(url, headers=local_headers)
else:
response = self.client.post(url, headers=local_headers, json=data)
self._log_request_response(
"request",
method.upper(),
url,
local_headers,
params=None,
json_body=data if method.upper() != "GET" else None,
response=response,
)
if response.status_code == 200:
return response.json()
else:
logger.warning(f"Request failed: {response.status_code} - {response.text}")
return None
except Exception as e:
logger.error(f"Request exception: {e}")
return None
self._client = create_client(proxy=proxy_url or None)
logger.info(f"QcwyService proxy set to: {proxy_url or 'direct'}")
def get_job_detail(self, job_id: str) -> Dict[str, Any]:
timestamp = int(time.time())
api_path = f"open/noauth/jobs/detail/base/{job_id}"
url_path = f"/{api_path}?api_key={self.api_key}&timestamp={timestamp}"
full_url = f"{self.base_url}{url_path}"
signature = self.signature_generator.generate_signature(url_path)
property_value = self.build_property(page_code="pages/jobs/jobdetail/jobdetail")
headers = self.base_headers.copy()
headers["sign"] = signature
headers["property"] = property_value
headers["Content-Type"] = "application/x-www-form-urlencoded"
response = self._make_request(full_url, None, headers, method="GET")
if response and response.get('status') in ['1', 1]:
return response.get('resultbody', {})
return {}
def get_company_info(self, company_id: str) -> Dict[str, Any]:
"""获取职位详情"""
logger.info(f"Qcwy get_job_detail: {job_id}")
try:
return qcwy_spider.get_company_info(company_id)
fetcher = GetJobDetail(job_id=job_id, client=self._client)
result = fetcher.fetch()
if result.success:
return result.data or {}
logger.warning(f"Qcwy get_job_detail failed: {result.error}")
return {}
except Exception as e:
logger.error(f"Qcwy get_company_info failed: {e}")
logger.error(f"Qcwy get_job_detail exception: {e}")
return {}
def search_jobs(self, keyword: str, job_area: str = "020000", page: int = 1) -> List[Dict[str, Any]]:
# This uses the recommend/search logic
timestamp = int(time.time())
data = {
"pageNo": page,
"pageSize": 20,
"keyword": keyword, # QCwy usually recommends, but let's assume recommend for now or search
"jobArea": job_area,
"type": "recommend", # fallback to recommend if keyword search API is different/complex
"isTouristMode": True,
"specialPageCode": True
}
# Note: QCwy search API might be different, but using the recommend endpoint from original script
# If real search is needed, we might need to reverse engineer 'search/job-list' endpoint.
# For now, let's stick to what was in the script or use recommend.
# The original script used `open/noauth/recommend/job-tab-dynamic-wx-mini`
api_path = "open/noauth/recommend/job-tab-dynamic-wx-mini"
url_path = f"/{api_path}?api_key={self.api_key}&timestamp={timestamp}"
full_url = f"{self.base_url}{url_path}"
signature = self.signature_generator.generate_signature(url_path, data)
property_value = self.build_property()
headers = self.base_headers.copy()
headers["sign"] = signature
headers["property"] = property_value
# Convert bools
for key, value in data.items():
if isinstance(value, bool):
data[key] = "true" if value else "false"
response = self._make_request(full_url, data, headers, method="POST")
if response and response.get("status") in ['1', 1]:
return response.get("resultbody", {}).get("jobList", {}).get("items", [])
return []
def get_company_info(self, company_id: str) -> Dict[str, Any]:
"""获取公司信息"""
logger.info(f"Qcwy get_company_info: {company_id}")
try:
fetcher = GetCompanyInfo(company_id=company_id, client=self._client)
result = fetcher.fetch()
if result.success:
return result.data or {}
logger.warning(f"Qcwy get_company_info failed: {result.error}")
return {}
except Exception as e:
logger.error(f"Qcwy get_company_info exception: {e}")
return {}
def search_jobs(
self, keyword: str, job_area: str = "020000", page: int = 1
) -> List[Dict[str, Any]]:
"""搜索职位(返回列表)"""
logger.info(f"Qcwy search_jobs: keyword={keyword}, area={job_area}, page={page}")
try:
searcher = SearchRecommendJobs(
job_area=job_area, page_size=20, client=self._client,
)
result = searcher.search(page_index=page)
if result.success:
return result.list or []
logger.warning(f"Qcwy search_jobs failed: {result.error}")
return []
except Exception as e:
logger.error(f"Qcwy search_jobs exception: {e}")
return []
def get_company_jobs_by_id(
self,
@ -222,12 +81,47 @@ class QcwyService:
function: str = "",
salary_type: str = "",
) -> Dict[str, Any]:
"""获取公司职位列表"""
logger.info(f"Qcwy get_company_jobs: company={company_id}, page={page}")
try:
return qcwy_spider.company_jobs_by_id(
co_id=company_id,
page=page,
size=page_size,
searcher = SearchCompanyJobs(
company_id=company_id,
job_area=job_area,
function=function,
salary_type=salary_type,
page_size=page_size,
client=self._client,
)
except Exception as e:
logger.error(f"Qcwy get_company_jobs_by_id failed: {e}")
result = searcher.search(page_index=page)
if result.success:
return result.data or {}
logger.warning(f"Qcwy get_company_jobs failed: {result.error}")
return {}
except Exception as e:
logger.error(f"Qcwy get_company_jobs exception: {e}")
return {}
# ── asyncio.to_thread 桥接ARCH-06────────────────────────
async def async_get_job_detail(self, job_id: str) -> Dict:
    """Awaitable bridge: run ``get_job_detail`` in a worker thread."""
    from asyncio import to_thread

    blocking_call = self.get_job_detail
    return await to_thread(blocking_call, job_id)
async def async_get_company_info(self, company_id: str) -> Dict:
    """Awaitable bridge: run ``get_company_info`` in a worker thread."""
    from asyncio import to_thread

    blocking_call = self.get_company_info
    return await to_thread(blocking_call, company_id)
async def async_get_company_jobs(
    self, company_id: str, page: int = 1, page_size: int = 30, **kwargs
) -> Dict:
    """Awaitable bridge: run ``get_company_jobs_by_id`` in a worker thread.

    Args:
        company_id: Company identifier, passed as the first positional argument.
        page: Page number, passed positionally.
        page_size: Page size, passed positionally.
        **kwargs: Extra keyword filters forwarded unchanged to
            ``get_company_jobs_by_id`` (for example ``job_area``,
            ``function``, ``salary_type``).

    Returns:
        Whatever ``get_company_jobs_by_id`` returns.
    """
    import asyncio

    # Forward the keyword filters: the previous version accepted **kwargs
    # but dropped them, so filtered calls silently ran unfiltered.
    return await asyncio.to_thread(
        self.get_company_jobs_by_id, company_id, page, page_size, **kwargs
    )
async def async_search_jobs(
    self, keyword: str, job_area: str = "020000", page: int = 1
) -> List:
    """Awaitable bridge: run ``search_jobs`` in a worker thread."""
    from asyncio import to_thread

    blocking_call = self.search_jobs
    return await to_thread(blocking_call, keyword, job_area, page)

View File

@ -1,328 +1,67 @@
import os
import requests
import time
import random
from typing import Dict, Any, List, Optional
from urllib.parse import urlencode
"""
智联招聘 Service 基于新算法文件的封装
保持对外公开接口不变cleaning.py / company_cleaner.py 依赖
"""
from __future__ import annotations
from typing import Any, Dict, List, Optional
from loguru import logger
from spiderJobs.platforms.zhilian.api import (
GetCompanyDetail,
GetPositionDetail,
SearchCompanyPositions,
SearchPositions,
)
from spiderJobs.platforms.zhilian.client import (
ZhilianClient,
create_capi_client,
create_cgate_client,
)
from spiderJobs.platforms.zhilian.sign import ZhilianSign
class ZhilianService:
def __init__(self, proxy_url: Optional[str] = None):
self.session = requests.Session()
if proxy_url:
self.session.proxies = {"http": proxy_url, "https": proxy_url}
self._signer = ZhilianSign()
self._cgate = create_cgate_client(signer=self._signer, proxy=proxy_url or None)
self._capi = create_capi_client(signer=self._signer, proxy=proxy_url or None)
def set_proxy(self, proxy_url: Optional[str]) -> None:
if not proxy_url:
self.session.proxies = {}
return
proxy_url = proxy_url.strip().strip("`")
self.session.proxies = {"http": proxy_url, "https": proxy_url}
def _sanitize_headers(self, headers: Dict[str, Any]) -> Dict[str, Any]:
masked_headers: Dict[str, Any] = {}
for k, v in headers.items():
key_lower = str(k).lower()
if key_lower in {"authorization", "cookie", "set-cookie"}:
masked_headers[k] = "***"
else:
masked_headers[k] = v
return masked_headers
def _log_request_response(
self,
label: str,
method: str,
url: str,
headers: Dict[str, Any],
params: Optional[Dict[str, Any]] = None,
json_body: Optional[Dict[str, Any]] = None,
response: Optional[requests.Response] = None,
) -> None:
safe_headers = self._sanitize_headers(headers)
logger.info(
f"[Zhilian-{label}] request method={method} url={url} headers={safe_headers} "
f"params={params} json={json_body}"
)
try:
curl_url = url
if params and isinstance(params, dict):
query_string = urlencode(params)
if query_string:
separator = "&" if "?" in curl_url else "?"
curl_url = f"{curl_url}{separator}{query_string}"
header_parts = []
for k, v in safe_headers.items():
v_str = str(v).replace("'", "'\"'\"'")
header_parts.append(f"-H '{k}: {v_str}'")
data_part = ""
if json_body is not None:
body_str = json.dumps(json_body, ensure_ascii=False)
body_str = body_str.replace("'", "'\"'\"'")
data_part = f" --data '{body_str}'"
curl_cmd = f"curl -X {method.upper()} '{curl_url}' " + " ".join(header_parts) + data_part
logger.info(f"[Zhilian-{label}] curl_debug {curl_cmd}")
except Exception as e:
logger.debug(f"[Zhilian-{label}] build curl error: {e}")
if response is not None:
text_sample = ""
try:
body = response.text or ""
text_sample = body[:1000]
except Exception:
text_sample = "<unreadable>"
logger.info(
f"[Zhilian-{label}] response status={response.status_code} "
f"headers={self._sanitize_headers(dict(response.headers))} "
f"body_sample={text_sample}"
)
def _gen_client_id(self) -> str:
t = int(time.time() * 1000)
try:
t += int(time.perf_counter() * 1000)
except Exception:
pass
def repl(c: str) -> str:
n = int((t + random.random() * 16) % 16)
if c == 'x':
return hex(n)[2:]
return hex((n & 0x3) | 0x8)[2:]
tpl = "xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx"
return ''.join(repl(c) if c in 'xy' else c for c in tpl)
def _gen_v(self) -> float:
return round(random.random(), 8)
def _gen_page_request_id(self) -> str:
return f"cf1e3b3e655b4eb5a306110a83c77c29-{int(time.time()*1000)}-{random.randint(0,999999)}"
def _build_headers_pc(self) -> Dict[str, str]:
return {
"accept": "application/json, text/plain, */*",
"accept-language": "zh-CN,zh;q=0.9",
"content-type": "application/json;charset=UTF-8",
"origin": "https://www.zhaopin.com",
"priority": "u=1, i",
"referer": "https://www.zhaopin.com/",
"sec-ch-ua-mobile": "?0",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-site",
"x-zp-page-code": "0",
}
def _request_json(self, method: str, url: str, headers: Dict[str, str], params: Optional[Dict[str, Any]] = None,
json_body: Optional[Dict[str, Any]] = None, timeout: int = 30) -> Optional[Dict[str, Any]]:
try:
resp = self.session.request(method.upper(), url, headers=headers, params=params, json=json_body, timeout=timeout)
self._log_request_response(
"request",
method.upper(),
url,
headers,
params=params,
json_body=json_body,
response=resp,
)
resp.raise_for_status()
return resp.json()
except Exception as e:
logger.error(f"Request failed: {e}")
return None
def fetch_company_desc_by_job(self, number: str) -> Optional[str]:
    """Look up the company description attached to a job posting.

    Calls the PC ``position-detail-new`` endpoint for the given job number
    and reads ``data.detailedCompany.companyDescription`` from the response.

    Args:
        number: Job number sent as the ``number`` query parameter.

    Returns:
        The description when it is a non-empty string, otherwise ``None``.
    """
    client_id = self._gen_client_id()
    query = {
        "number": number,
        "_v": self._gen_v(),
        "x-zp-page-request-id": self._gen_page_request_id(),
        "x-zp-client-id": client_id,
    }
    # The same client id goes into both the query string and the cookie;
    # the shared PC headers are merged on top of the two request-specific ones.
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
        "Cookie": f"x-zp-client-id={client_id}",
        **self._build_headers_pc(),
    }
    response = self._request_json(
        "GET",
        "https://fe-api.zhaopin.com/c/i/jobs/position-detail-new",
        request_headers,
        params=query,
    )
    if not response or not isinstance(response, dict):
        return None
    company = (response.get("data") or {}).get("detailedCompany") or {}
    description = company.get("companyDescription")
    if isinstance(description, str) and description:
        return description
    return None
def search_jobs(self, city_id: int = 801, page_size: int = 15, page_index: int = 1, job_level3_code: Optional[str] = None) -> List[Dict[str, Any]]:
    """Search positions on the PC API and attach a company description to each hit.

    For every returned job that has a ``number``, one extra detail request is
    made via ``fetch_company_desc_by_job`` and stored under ``companyDesc``
    (empty string when no description is available).

    Args:
        city_id: Accepted for interface compatibility.
            NOTE(review): this value is not sent; the payload always carries
            an empty ``S_SOU_WORK_CITY``. Confirm whether that is intended.
        page_size: Number of results per page.
        page_index: 1-based page number.
        job_level3_code: Optional filter sent as ``S_SOU_JD_JOB_LEVEL3``.

    Returns:
        The list of job dicts, or ``[]`` when the request fails, the business
        code is not 200, or the response has no usable ``data.list``.
    """
    headers = self._build_headers_pc()
    base_url = "https://fe-api.zhaopin.com/c/i/search/positions"
    params = {
        "_v": self._gen_v(),
        "x-zp-page-request-id": self._gen_page_request_id(),
        "x-zp-client-id": self._gen_client_id(),
    }
    payload = {
        "S_SOU_WORK_CITY": "",
        "order": 4,
        "pageSize": page_size,
        "pageIndex": page_index,
        "eventScenario": "pcSearchedSouSearch",
        "anonymous": 1,
        "platform": 13,
        "version": "0.0.0",
    }
    if job_level3_code:
        payload["S_SOU_JD_JOB_LEVEL3"] = job_level3_code
    data = self._request_json("POST", base_url, headers, params=params, json_body=payload)
    if not isinstance(data, dict) or data.get("code") != 200:
        return []
    # "data" and "list" may be present but null; treat that as "no results"
    # instead of raising AttributeError/TypeError.
    body = data.get("data")
    jobs = body.get("list") if isinstance(body, dict) else None
    if not isinstance(jobs, list):
        return []
    for job in jobs:
        if not isinstance(job, dict):
            continue
        num = job.get("number")
        if num:
            job["companyDesc"] = self.fetch_company_desc_by_job(str(num)) or ""
    return jobs
def search_company_jobs_by_name(self, company_name: str, city_id: Optional[int] = None, page_size: int = 15, page_index: int = 1) -> Optional[Dict[str, Any]]:
    """Search positions by company name through the mini-program ``searchPositions`` endpoint.

    Several request values come from environment variables, each with a
    built-in default: ``ZP_MINIAPP_UA``, ``ZP_MINIAPP_VERSION``,
    ``ZP_MINIAPP_DEVICE_ID``. ``ZP_MINIAPP_AT`` / ``ZP_MINIAPP_RT`` add the
    ``x-zp-at`` / ``x-zp-rt`` headers when set, and
    ``ZP_MINIAPP_RESUME_NUMBER`` adds ``resumeNumber`` to the body when set.

    Args:
        company_name: Text sent as ``S_SOU_FULL_INDEX``.
        city_id: Optional city filter sent as ``S_SOU_WORK_CITY``; omitted when ``None``.
        page_size: Number of results per page.
        page_index: 1-based page number.

    Returns:
        The decoded JSON response, or ``None`` when the request raises
        (the error is logged).
    """
    endpoint = "https://cgate.zhaopin.com/positionbusiness/searchrecommend/searchPositions"
    default_ua = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 "
        "MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Mac "
        "MacWechat/WMPF MacWechat/3.8.7(0x13080712) UnifiedPCMacWechat(0xf26414f0) XWEB/16962"
    )
    request_headers: Dict[str, Any] = {
        "User-Agent": os.getenv("ZP_MINIAPP_UA", default_ua),
        "Content-Type": "application/json",
        "x-zp-channel": "wxxiaochengxu",
        "x-zp-business-system": "73",
        "x-zp-action-id": "",
        "xweb_xhr": "1",
        "x-zp-page-code": "7019",
        "x-zp-version": os.getenv("ZP_MINIAPP_VERSION", "4.1.224"),
        "x-zp-platform": "12",
        "x-zp-device-id": os.getenv("ZP_MINIAPP_DEVICE_ID", "A774EA47-0AB5-4608-B51D-84BF51CC0786"),
        "sec-fetch-site": "cross-site",
        "sec-fetch-mode": "cors",
        "sec-fetch-dest": "empty",
        "referer": "https://servicewechat.com/wxb7718fb9257e4fd2/617/page-frame.html",
        "accept-language": "zh-CN,zh;q=0.9",
        "priority": "u=1, i",
    }
    # Optional auth tokens: only added when the environment provides them.
    for header_name, env_name in (("x-zp-at", "ZP_MINIAPP_AT"), ("x-zp-rt", "ZP_MINIAPP_RT")):
        token = os.getenv(env_name, "").strip()
        if token:
            request_headers[header_name] = token

    payload: Dict[str, Any] = {
        "eventScenario": "wxmpZhaopinSearchV2",
        "pageIndex": page_index,
        "pageSize": page_size,
        "filterMinSalary": 1,
        "S_SOU_EXPAND": "SOU_COMPANY_ID",
        "S_SOU_FULL_INDEX": company_name,
        "sortType": "DEFAULT",
        "version": "8.11.22",
        "identity": "2",
        "anonymous": 0,
    }
    if city_id is not None:
        payload["S_SOU_WORK_CITY"] = city_id
    resume_number = os.getenv("ZP_MINIAPP_RESUME_NUMBER", "").strip()
    if resume_number:
        payload["resumeNumber"] = resume_number

    try:
        response = self.session.post(endpoint, headers=request_headers, json=payload, timeout=30)
        self._log_request_response(
            "search-company-jobs", "POST", endpoint, request_headers,
            params=None, json_body=payload, response=response,
        )
        response.raise_for_status()
        return response.json()
    except Exception as exc:
        logger.error(f"Zhilian search_company_jobs_by_name failed: {exc}")
        return None
proxy = proxy_url.strip().strip("`") if proxy_url else None
self._cgate = create_cgate_client(signer=self._signer, proxy=proxy)
self._capi = create_capi_client(signer=self._signer, proxy=proxy)
logger.info(f"ZhilianService proxy set to: {proxy or 'direct'}")
def get_job_detail(self, job_number: str) -> Optional[Dict[str, Any]]:
# NOTE(review): this span contains two complete bodies back to back. The first
# queries the PC "position-detail-new" endpoint through self._request_json; the
# second delegates to GetPositionDetail with self._cgate. It looks like the
# removed and added sides of a diff rendered together - confirm which one is
# current before relying on either.
# Reuse fetch_company_desc_by_job logic but return full detail
client_id = self._gen_client_id()
url_pc = "https://fe-api.zhaopin.com/c/i/jobs/position-detail-new"
params_pc = {
"number": job_number,
"_v": self._gen_v(),
"x-zp-page-request-id": self._gen_page_request_id(),
"x-zp-client-id": client_id,
}
headers_pc = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
"Cookie": f"x-zp-client-id={client_id}"
}
# Shared PC headers are merged over the two request-specific ones above.
headers_pc.update(self._build_headers_pc())
data_pc = self._request_json("GET", url_pc, headers_pc, params=params_pc)
# Return the "data" member of the response when the response is a dict.
if data_pc and isinstance(data_pc, dict):
return data_pc.get("data")
return None
# Second body: fetch the detail via GetPositionDetail; returns result.data on
# success, otherwise logs a warning (or the exception) and returns None.
# The string on the next line reads "Get job details".
"""获取职位详情"""
logger.info(f"Zhilian get_job_detail: {job_number}")
try:
fetcher = GetPositionDetail(number=job_number, client=self._cgate)
result = fetcher.fetch()
if result.success:
return result.data
logger.warning(f"Zhilian get_job_detail failed: {result.error}")
return None
except Exception as e:
logger.error(f"Zhilian get_job_detail exception: {e}")
return None
def get_company_detail(self, company_number: str) -> Optional[Dict[str, Any]]:
"""Fetch company details for the given company number."""
# NOTE(review): this span contains two complete bodies back to back. The first
# calls the cgate "companyDetail" endpoint through self._request_json; the
# second delegates to GetCompanyDetail with self._cgate. It looks like the
# removed and added sides of a diff rendered together - confirm which one is
# current before relying on either.
url = "https://cgate.zhaopin.com/positionbusiness/exposure/companyDetail"
params = {
"number": company_number,
"platform": "12",
"version": "0.0.0",
}
# User agent, app version and device id come from ZP_MINIAPP_* environment
# variables, each with a built-in default.
ua = os.getenv(
"ZP_MINIAPP_UA",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 "
"MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Mac "
"MacWechat/WMPF MacWechat/3.8.7(0x13080712) UnifiedPCMacWechat(0xf26414f0) XWEB/16962",
)
headers = {
"User-Agent": ua,
"x-zp-channel": "wxxiaochengxu",
"x-zp-business-system": "73",
"xweb_xhr": "1",
"x-zp-page-code": "0",
"x-zp-version": os.getenv("ZP_MINIAPP_VERSION", "4.1.224"),
"x-zp-platform": "12",
"x-zp-device-id": os.getenv("ZP_MINIAPP_DEVICE_ID", "A774EA47-0AB5-4608-B51D-84BF51CC0786"),
"content-type": "application/json",
"sec-fetch-site": "cross-site",
"sec-fetch-mode": "cors",
"sec-fetch-dest": "empty",
"referer": "https://servicewechat.com/wxb7718fb9257e4fd2/617/page-frame.html",
"accept-language": "zh-CN,zh;q=0.9",
"priority": "u=1, i",
}
# Optional auth tokens: the headers are only added when the variables are set.
at_token = os.getenv("ZP_MINIAPP_AT", "").strip()
rt_token = os.getenv("ZP_MINIAPP_RT", "").strip()
if at_token:
headers["x-zp-at"] = at_token
if rt_token:
headers["x-zp-rt"] = rt_token
data = self._request_json("GET", url, headers, params=params)
# Return the "data" member of the response when the response is a dict.
if data and isinstance(data, dict):
return data.get("data")
return None
# Second body: fetch the detail via GetCompanyDetail; returns result.data on
# success, otherwise logs a warning (or the exception) and returns None.
logger.info(f"Zhilian get_company_detail: {company_number}")
try:
fetcher = GetCompanyDetail(number=company_number, client=self._cgate)
result = fetcher.fetch()
if result.success:
return result.data
logger.warning(f"Zhilian get_company_detail failed: {result.error}")
return None
except Exception as e:
logger.error(f"Zhilian get_company_detail exception: {e}")
return None
def get_company_jobs_by_id(
self,
@ -331,56 +70,102 @@ class ZhilianService:
page_size: int = 30,
work_city: Optional[int] = None,
) -> Optional[Dict[str, Any]]:
url = "https://capi.zhaopin.com/capi/searchrecommend/searchPositionsCompany"
ua = os.getenv(
"ZP_MINIAPP_UA",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 "
"MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Mac "
"MacWechat/WMPF MacWechat/3.8.7(0x13080712) UnifiedPCMacWechat(0xf26414f0) XWEB/16962",
"""获取公司职位列表"""
logger.info(f"Zhilian get_company_jobs: company={company_number}, page={page_index}")
try:
searcher = SearchCompanyPositions(
company_id=company_number,
city_code=str(work_city) if work_city is not None else "",
page_size=page_size,
client=self._capi,
)
result = searcher.search(page_index=page_index)
if result.success:
return result.data
logger.warning(f"Zhilian get_company_jobs failed: {result.error}")
return None
except Exception as e:
logger.error(f"Zhilian get_company_jobs exception: {e}")
return None
def search_company_jobs_by_name(
    self,
    company_name: str,
    city_id: Optional[int] = None,
    page_size: int = 15,
    page_index: int = 1,
) -> Optional[Dict[str, Any]]:
    """Search positions using the company name as the keyword.

    Delegates to ``SearchPositions`` with the ``self._cgate`` client.

    Args:
        company_name: Keyword passed to the searcher.
        city_id: Optional city code; an empty string is passed when ``None``.
        page_size: Number of results per page.
        page_index: Page number passed to ``search``.

    Returns:
        ``result.data`` when the search reports success; ``None`` when it
        reports failure or raises (both cases are logged).
    """
    logger.info(f"Zhilian search_company_jobs_by_name: {company_name}")
    city_code = "" if city_id is None else city_id
    try:
        outcome = SearchPositions(
            keyword=company_name,
            city_code=city_code,
            page_size=page_size,
            client=self._cgate,
        ).search(page_index=page_index)
        if not outcome.success:
            logger.warning(f"Zhilian search_company_jobs failed: {outcome.error}")
            return None
        return outcome.data
    except Exception as exc:
        logger.error(f"Zhilian search_company_jobs exception: {exc}")
        return None
def search_jobs(
    self,
    city_id: int = 801,
    page_size: int = 15,
    page_index: int = 1,
    job_level3_code: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Search positions and return them as a list.

    Delegates to ``SearchPositions`` with the ``self._cgate`` client.

    Args:
        city_id: City code passed to the searcher.
        page_size: Number of results per page.
        page_index: Page number passed to ``search``.
        job_level3_code: Optional filter, sent as ``S_SOU_POSITION_TYPE``.

    Returns:
        ``result.list`` (or ``[]`` if it is empty/None) on success; ``[]``
        when the search reports failure or raises (both cases are logged).
    """
    logger.info(f"Zhilian search_jobs: city={city_id}, page={page_index}")
    try:
        filters = {"S_SOU_POSITION_TYPE": job_level3_code} if job_level3_code else {}
        outcome = SearchPositions(
            city_code=city_id,
            filters=filters,
            page_size=page_size,
            client=self._cgate,
        ).search(page_index=page_index)
        if not outcome.success:
            logger.warning(f"Zhilian search_jobs failed: {outcome.error}")
            return []
        return outcome.list or []
    except Exception as exc:
        logger.error(f"Zhilian search_jobs exception: {exc}")
        return []
# ── asyncio.to_thread bridge (ARCH-06) ────────────────────────
async def async_get_job_detail(self, job_number: str) -> Optional[Dict]:
    """Await ``get_job_detail`` executed in a worker thread via ``asyncio.to_thread``."""
    from asyncio import to_thread

    blocking_call = self.get_job_detail
    return await to_thread(blocking_call, job_number)
async def async_get_company_detail(self, company_number: str) -> Optional[Dict]:
    """Await ``get_company_detail`` executed in a worker thread via ``asyncio.to_thread``."""
    from asyncio import to_thread

    blocking_call = self.get_company_detail
    return await to_thread(blocking_call, company_number)
async def async_get_company_jobs(
    self, company_number: str, page_index: int = 1, page_size: int = 30,
    work_city: Optional[int] = None,
) -> Optional[Dict]:
    """Await ``get_company_jobs_by_id`` executed in a worker thread.

    The four arguments are forwarded positionally in the order
    ``(company_number, page_index, page_size, work_city)``.
    """
    from asyncio import to_thread

    forwarded = (company_number, page_index, page_size, work_city)
    return await to_thread(self.get_company_jobs_by_id, *forwarded)
version = os.getenv("ZP_MINIAPP_CAPI_VERSION", "4.1.230")
device_id = os.getenv("ZP_MINIAPP_DEVICE_ID", "CFD341F3-29D6-4C46-81BF-F6C705407F2E")
headers: Dict[str, Any] = {
"User-Agent": ua,
"x-zp-channel": "wxxiaochengxu",
"x-zp-business-system": "73",
"x-zp-action-id": "",
"xweb_xhr": "1",
"x-zp-page-code": "0",
"x-zp-version": version,
"x-zp-platform": "12",
"x-zp-device-id": device_id,
"content-type": "application/json",
"sec-fetch-site": "cross-site",
"sec-fetch-mode": "cors",
"sec-fetch-dest": "empty",
"referer": "https://servicewechat.com/wxb7718fb9257e4fd2/619/page-frame.html",
"accept-language": "zh-CN,zh;q=0.9",
"priority": "u=1, i",
}
at_token = os.getenv("ZP_MINIAPP_AT", "").strip()
rt_token = os.getenv("ZP_MINIAPP_RT", "").strip()
params: Dict[str, Any] = {
"channel": "wxxiaochengxu",
"platform": "12",
"version": version,
"d": device_id,
"S_SOU_COMPANY_ID": company_number,
"S_SOU_POSITION_SOURCE_TYPE": 1,
"eventScenario": "wxmpZhaopinSearchPositionsCompany",
"pageCode": "wxmpZhaopinCompanyDetailPage",
"pageIndex": page_index,
"pageSize": page_size,
"S_SOU_JD_JOB_LEVEL": "",
}
if at_token:
params["at"] = at_token
if rt_token:
params["rt"] = rt_token
if work_city is not None:
params["S_SOU_WORK_CITY"] = work_city
else:
params["S_SOU_WORK_CITY"] = ""
data = self._request_json("GET", url, headers, params=params)
return data
async def async_search_jobs(
    self, city_id: int = 801, page_size: int = 15, page_index: int = 1,
    job_level3_code: Optional[str] = None,
) -> List:
    """Await ``search_jobs`` executed in a worker thread.

    The four arguments are forwarded positionally in the order
    ``(city_id, page_size, page_index, job_level3_code)``.
    """
    from asyncio import to_thread

    forwarded = (city_id, page_size, page_index, job_level3_code)
    return await to_thread(self.search_jobs, *forwarded)

View File

@ -1,38 +0,0 @@
# Use the slim Python 3.9 image as the base
FROM python:3.9-slim
# Set the working directory
WORKDIR /app
# Python/pip runtime settings: unbuffered output, no .pyc files,
# no pip cache, no pip version check
ENV PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1 \
PIP_NO_CACHE_DIR=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1
# Crawler configuration defaults (can be overridden with `docker run -e`)
# NOTE(review): API_BASE_URL defaults to a hard-coded IP over plain HTTP and
# MONGODB_URI to localhost - confirm these are meant to be overridden at deploy time.
ENV API_BASE_URL=http://124.222.245.240:9999 \
MONGODB_URI=mongodb://localhost:27017 \
MONGODB_DB=job_data \
MAX_PAGES=3 \
PAGE_SIZE=15 \
MIN_WAIT_TIME=10 \
MAX_WAIT_TIME=30 \
ERROR_WAIT_MIN=30 \
ERROR_WAIT_MAX=60
# Copy requirements.txt first and install the Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy the project files
COPY boos_api.py ./
COPY city.json ./
COPY work.json ./
# Create a non-root user that owns /app and run as that user
RUN useradd -m -u 1000 crawler && chown -R crawler:crawler /app
USER crawler
# Start command
CMD ["python", "boos_api.py"]

View File

@ -1,30 +0,0 @@
# Use the slim Python 3.9 image as the base
FROM python:3.9-slim
# Set the working directory
WORKDIR /app
# Python/pip runtime settings: unbuffered output, no .pyc files,
# no pip cache, no pip version check
ENV PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1 \
PIP_NO_CACHE_DIR=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1
# Crawler configuration default (can be overridden with `docker run -e`)
# NOTE(review): API_BASE_URL defaults to a hard-coded IP over plain HTTP -
# confirm this is meant to be overridden at deploy time.
ENV API_BASE_URL=http://124.222.106.226:9999
# Copy requirements.txt first and install the Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy the project files
COPY boos_api.py ./
COPY city.json ./
COPY work.json ./
# Create a non-root user that owns /app and run as that user
RUN useradd -m -u 1000 crawler && chown -R crawler:crawler /app
USER crawler
# Start command
CMD ["python", "boos_api.py"]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,282 +0,0 @@
import os
import json
import time
import uuid
import random
from typing import Any, Dict, Optional
import sqlite3
import requests
try:
import httpx
except Exception:
httpx = None
# Base URL read from the API_BASE_URL environment variable; falls back to
# http://127.0.0.1:9999 when the variable is not set.
API_BASE_URL = os.getenv('API_BASE_URL', 'http://127.0.0.1:9999')
def _gen_traceid() -> str:
"""生成简易 traceid。"""
base = uuid.uuid4().hex[:12]
return f"M-{base}"
def report_universal(items: list, data_type: str = "job") -> bool:
    """Reporting is disabled: both arguments are ignored and ``False`` is always returned."""
    reporting_enabled = False
    return reporting_enabled
def build_headers(
    user_agent: Optional[str] = None,
    referer: Optional[str] = None,
    cookies: Optional[str] = None,
    extra: Optional[Dict[str, str]] = None,
) -> Dict[str, str]:
    """Build the request headers for the Boss mini-program API.

    Args:
        user_agent: Overrides the built-in ``User-Agent`` when truthy.
        referer: Overrides the built-in ``referer`` when truthy.
        cookies: Sent as the ``Cookie`` header when truthy.
        extra: Additional headers; entries with falsy values are ignored,
            the rest override any default with the same key.

    Returns:
        A new header dict. ``content-type`` comes from the ``BOSS_CT``
        environment variable (default ``application/x-www-form-urlencoded``).
    """
    default_user_agent = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 "
        "MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF "
        "MacWechat/3.8.10(0x13080a10) XWEB/1227"
    )
    default_referer = "https://servicewechat.com/wxa8da525af05281f3/586/page-frame.html"

    # NOTE(review): "wt2" and "mpt" below are hard-coded token values used as
    # defaults; callers can override them through ``extra``. Consider moving
    # them out of source control.
    headers: Dict[str, str] = {
        "accept": "*/*",
        "accept-language": "zh-CN,zh;q=0.9",
        "accept-encoding": "gzip, deflate, br",
        "connection": "keep-alive",
        "sec-fetch-site": "cross-site",
        "sec-fetch-mode": "cors",
        "sec-fetch-dest": "empty",
        "xweb_xhr": "1",
        'wt2': "Epwo8bHXTy5wLU5ETExV2Ss5OwloFG3eJ0Pfe6T3FyIdDJhEyGkcxea9wI5VSqX-tafKQcVQJTI2szwdO0xQz3A~~",
        "mpt":"21728a788201acffa22d876d1fc0e8d7",
        "x-requested-with": "XMLHttpRequest",
        "User-Agent": user_agent or default_user_agent,
        "referer": referer or default_referer,
        "content-type": os.getenv("BOSS_CT", "application/x-www-form-urlencoded"),
    }
    for key, value in (extra or {}).items():
        if value:
            headers[key] = value
    if cookies:
        headers["Cookie"] = cookies
    return headers
def call(
    query: str,
    city: str = "101020100",
    page: int = 1,
    page_size: int = 15,
    use_http2: bool = False,
    timeout: float = 10.0,
) -> Any:
    """Call the Boss mini-program job search endpoint and return the result.

    Header and parameter values are taken from ``BOSS_*`` environment
    variables where set. TLS verification is disabled when
    ``BOSS_SKIP_VERIFY`` is ``"1"``.

    Args:
        query: Search keyword.
        city: City code.
        page: Page number.
        page_size: Results per page.
        use_http2: Use ``httpx`` with HTTP/2 when ``httpx`` is importable;
            otherwise (or when False) ``requests`` is used.
        timeout: Request timeout in seconds.

    Returns:
        The decoded JSON body, or the raw response text when the body is not
        valid JSON.
    """
    url = "https://www.zhipin.com/wapi/zpgeek/miniapp/search/joblist.json"
    app_id = os.getenv("BOSS_APP_ID", "10002")
    extra_headers = {
        "mini_ver": os.getenv("BOSS_MINI_VER", "100.0000"),
        "ua": os.getenv("BOSS_UA_JSON", '{"model":"Mac16,8","platform":"mac"}'),
        "wt2": os.getenv("BOSS_WT2"),
        "zp_app_id": os.getenv("BOSS_ZP_APP_ID", app_id),
        "traceid": os.getenv("BOSS_TRACEID") or _gen_traceid(),
        "mpt": os.getenv("BOSS_MPT"),
        "scene": os.getenv("BOSS_SCENE_HEADER", "1089"),
        "zp_product_id": os.getenv("BOSS_ZP_PRODUCT_ID", app_id),
        "platform": os.getenv("BOSS_PLATFORM", "zhipin/mac"),
        "ver": os.getenv("BOSS_VER", "100.0000"),
    }
    headers = build_headers(
        user_agent=os.getenv("BOSS_USER_AGENT"),
        referer=os.getenv("BOSS_REFERER"),
        cookies=os.getenv("BOSS_COOKIES"),
        extra=extra_headers,
    )
    params = {
        "pageSize": str(page_size),
        "query": query,
        "city": city,
        "source": "1",
        "sortType": "0",
        "isSupplySearch": "true",
        "page": str(page),
        "appId": app_id,
    }
    enc_expect = os.getenv("BOSS_ENCRYPT_EXPECT_ID")
    if enc_expect:
        params["encryptExpectId"] = enc_expect
    skip_verify = os.getenv("BOSS_SKIP_VERIFY", "0") == "1"

    if use_http2 and httpx is not None:
        with httpx.Client(http2=True, headers=headers, timeout=timeout, verify=not skip_verify, trust_env=False) as client:
            resp = client.get(url, params=params)
    else:
        # Use the session as a context manager so its pooled connections are
        # released once the request completes instead of leaking per call.
        with requests.Session() as session:
            session.trust_env = False
            resp = session.get(url, params=params, headers=headers, timeout=timeout, verify=not skip_verify)

    # Both clients read the body eagerly, so it is safe to decode after close.
    try:
        return resp.json()
    except ValueError:
        return resp.text
def _load_keywords(path: str) -> list:
"""Load keywords from a UTF-8 text file, one per line.
Args:
path (str): File path.
Returns:
list: Non-empty trimmed lines.
"""
try:
with open(path, "r", encoding="utf-8") as f:
lines = [ln.strip() for ln in f.readlines()]
return [ln for ln in lines if ln]
except Exception:
return []
def _progress_iter(seq: list, desc: str = "", total: Optional[int] = None):
"""Iterate with a simple console progress bar.
Args:
seq (list): Items to iterate.
desc (str): Progress description.
total (Optional[int]): Total count for percentage.
Yields:
Any: Items from seq.
"""
n = 0
m = total if total is not None else len(seq)
bar_len = 24
for item in seq:
n += 1
filled = int(bar_len * n / m) if m else 0
bar = "#" * filled + "-" * (bar_len - filled)
pct = int(100 * n / m) if m else 100
print(f"\r{desc} [{bar}] {n}/{m} {pct}%", end="", flush=True)
yield item
print("", flush=True)
def main(query: str) -> None:
"""Entry point for one keyword: skip it if already stored, otherwise fetch pages and store the raw responses."""
# Fixed search settings: city code, first page, page size, HTTP/2 on, 3 pages.
city ="101020100"
page = 1
page_size = 15
use_http2 = True
pages = 3
db_path = _get_db_path()
_init_db(db_path)
# A keyword that already has at least one stored response is skipped entirely.
if _has_keyword(db_path, query):
print(json.dumps({"skip": True, "keyword": query}, ensure_ascii=False))
return
for p in range(page, page + pages):
# Short random pause before every request.
_sleep_between_requests(0.3, 0.8)
result = call(query=query, city=city, page=p, page_size=page_size, use_http2=use_http2)
# call() returns a dict for JSON responses and text otherwise.
if isinstance(result, dict):
raw = json.dumps({"page": p, "data": result}, ensure_ascii=False)
print(raw)
else:
raw = str(result)
print(raw)
# NOTE(review): indentation is lost in this view, so it is unclear whether this
# 10-20 s sleep runs only for non-dict responses or after every page - confirm.
time.sleep(random.uniform(10, 20))
try:
_save_raw_response(db_path, query, p, raw)
except Exception as e:
print(f"Error saving raw response for {query} page {p}: {e}")
def _get_db_path() -> str:
"""返回默认 SQLite 数据库文件路径。"""
base_dir = os.path.dirname(__file__)
return os.path.join(base_dir, "boss_raw.sqlite3")
def _init_db(db_path: str) -> None:
"""初始化 SQLite 数据库并创建 responses 表。"""
try:
con = sqlite3.connect(db_path)
cur = con.cursor()
cur.execute(
"""
CREATE TABLE IF NOT EXISTS responses (
id INTEGER PRIMARY KEY AUTOINCREMENT,
keyword TEXT NOT NULL,
page INTEGER NOT NULL,
created_at INTEGER NOT NULL,
payload TEXT NOT NULL
)
"""
)
con.commit()
con.close()
except Exception:
pass
def _save_raw_response(db_path: str, keyword: str, page: int, raw_payload: str) -> None:
"""保存原始响应到 SQLite。"""
try:
con = sqlite3.connect(db_path)
cur = con.cursor()
cur.execute(
"INSERT INTO responses(keyword, page, created_at, payload) VALUES(?, ?, ?, ?)",
(keyword, int(page), int(time.time()), raw_payload),
)
con.commit()
con.close()
except Exception:
pass
def _sleep_between_requests(min_seconds: float = 0.5, max_seconds: float = 1.5) -> None:
"""在请求之间进行随机休眠以降低风控风险。
Args:
min_seconds (float): 最少休眠秒数
max_seconds (float): 最大休眠秒数
Returns:
None
"""
try:
dur = random.uniform(min_seconds, max_seconds)
time.sleep(dur)
except Exception:
time.sleep(min_seconds)
def _has_keyword(db_path: str, keyword: str) -> bool:
"""判断指定关键词是否已在数据库中出现过。
Args:
db_path (str): SQLite 数据库路径
keyword (str): 关键词字符串
Returns:
bool: 若存在记录返回 True否则 False
"""
try:
con = sqlite3.connect(db_path)
cur = con.cursor()
cur.execute("SELECT 1 FROM responses WHERE keyword=? LIMIT 1", (keyword,))
row = cur.fetchone()
con.close()
return row is not None
except Exception:
return False
if __name__ == "__main__":
    # Script entry: run main() once per keyword listed in company.txt,
    # echoing each keyword and drawing a progress bar along the way.
    keywords = _load_keywords("company.txt")
    progress = _progress_iter(keywords, desc="Keywords", total=len(keywords))
    for keyword in progress:
        print(keyword)
        main(keyword)

View File

@ -1,68 +0,0 @@
import os
import json
def _load():
base = os.path.dirname(__file__)
with open(os.path.join(base, "city.json"), "r", encoding="utf-8") as f:
city_data = json.load(f)
with open(os.path.join(base, "work.json"), "r", encoding="utf-8") as f:
work_data = json.load(f)
cities = []
if isinstance(city_data, dict) and city_data.get("zpData") and city_data["zpData"].get("cityList"):
for c in city_data["zpData"]["cityList"]:
cities.append({"code": c.get("code"), "name": c.get("name")})
positions = []
if isinstance(work_data, dict) and work_data.get("zpData") and work_data["zpData"].get("config"):
for cat in work_data["zpData"]["config"]:
subs = cat.get("subLevelModelList") or []
for sub in subs:
subs2 = sub.get("subLevelModelList") or []
for pos in subs2:
positions.append({"code": pos.get("code"), "name": pos.get("name")})
return cities, positions
def _query_name(name: str) -> str:
dev_set = {"Java", "Python", "PHP", "C#", "C/C++", "Golang", "Node.js", "Android", "iOS"}
if name in dev_set:
return f"{name}开发"
return name
def enumerate_pairs():
    """Return every (city, position) combination as a list of dicts.

    Each entry holds ``city_code``, ``city_name``, ``position_code``,
    ``position_name`` and the ``query`` derived from the position name.
    Cities vary slowest. The loaded city list is printed before the pairs
    are built.
    """
    cities, positions = _load()
    print(cities)
    return [
        {
            "city_code": city["code"],
            "city_name": city["name"],
            "position_code": position["code"],
            "position_name": position["name"],
            "query": _query_name(position["name"]),
        }
        for city in cities
        for position in positions
    ]
def count_pairs():
    """Return the number of (city, position) combinations, excluding the city named "全国".

    The filtered city list is printed before the count is returned.
    """
    all_cities, positions = _load()
    kept = []
    for city in all_cities:
        if (city.get("name") or "") != "全国":
            kept.append(city)
    print(kept)
    return len(kept) * len(positions)
if __name__ == "__main__":
    # Script entry: print the loaded cities, then a JSON summary with the
    # number of cities, positions and their product. With --exclude-national
    # the city named "全国" is left out of the counts.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--exclude-national", action="store_true")
    args = parser.parse_args()

    cities, positions = _load()
    print(cities)
    if args.exclude_national:
        cities = [c for c in cities if (c.get("name") or "") != "全国"]

    summary = {"cities": len(cities), "positions": len(positions)}
    summary["combos"] = summary["cities"] * summary["positions"]
    print(json.dumps(summary, ensure_ascii=False))

View File

@ -1,205 +0,0 @@
2025-12-15 00:29:23.305 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
2025-12-15 00:29:23.784 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
2025-12-15 00:29:23.799 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
2025-12-15 00:29:36.458 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=15588
2025-12-15 00:29:52.503 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:29:52.511 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:29:58.560 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=8693
2025-12-15 00:30:08.730 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:30:08.740 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:30:26.814 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=19491
2025-12-15 00:30:38.342 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:30:38.347 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:30:52.301 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=14544
2025-12-15 00:31:02.709 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:31:02.717 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:31:14.741 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=9599
2025-12-15 00:31:26.538 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:31:26.556 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:31:38.442 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6271
2025-12-15 00:31:51.168 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:31:51.172 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:31:58.416 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6511
2025-12-15 00:32:04.379 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:32:04.385 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:32:13.535 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=15049
2025-12-15 00:32:27.255 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:32:27.260 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:32:42.121 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=8828
2025-12-15 00:32:55.898 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:32:55.910 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:33:07.429 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6049
2025-12-15 00:33:22.839 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:33:22.844 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:33:31.224 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=8423
2025-12-15 00:33:40.356 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:33:40.361 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:33:44.962 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=9488
2025-12-15 00:34:02.242 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:34:02.256 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:34:14.243 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=5747
2025-12-15 00:34:21.270 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:34:21.274 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:34:41.171 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6132
2025-12-15 00:35:00.832 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:35:00.846 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:35:12.452 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=13477
2025-12-15 00:35:29.330 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:35:29.341 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:35:58.348 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=14304
2025-12-15 00:36:11.586 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:36:11.595 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:36:16.537 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
2025-12-15 00:36:31.183 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6403
2025-12-15 00:36:42.608 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:36:42.613 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:36:55.674 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=13675
2025-12-15 00:37:12.157 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:37:12.162 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:37:22.569 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=11202
2025-12-15 00:37:37.726 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:37:37.733 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:37:54.948 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=9670
2025-12-15 00:38:10.138 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:38:10.141 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:38:21.973 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=11575
2025-12-15 00:38:25.384 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:38:25.390 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:38:35.538 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=12679
2025-12-15 00:38:50.186 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:38:50.192 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:38:59.778 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6067
2025-12-15 00:39:07.116 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:39:07.125 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:39:21.684 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=18332
2025-12-15 00:39:32.316 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:39:32.323 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:39:42.899 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6725
2025-12-15 00:39:58.505 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:39:58.510 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:40:13.074 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=9087
2025-12-15 00:40:29.155 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:40:29.160 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:40:34.718 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=7566
2025-12-15 00:40:47.286 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:40:47.291 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:40:52.797 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=5322
2025-12-15 00:41:05.125 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:41:05.134 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:41:16.720 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6080
2025-12-15 00:41:32.930 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:41:32.937 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:41:48.101 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=7216
2025-12-15 00:41:57.043 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:41:57.053 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:42:14.886 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=7975
2025-12-15 00:42:25.873 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:42:25.885 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:42:49.378 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=5977
2025-12-15 00:43:00.350 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:43:00.358 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:43:16.051 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=5361
2025-12-15 00:43:30.122 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:43:30.130 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:43:41.456 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=7612
2025-12-15 00:43:56.218 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:43:56.224 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:44:09.073 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=5569
2025-12-15 00:44:19.391 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:44:19.397 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:44:27.943 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=10078
2025-12-15 00:44:33.276 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:44:33.284 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:44:43.176 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=5484
2025-12-15 00:44:56.390 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:44:56.395 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:45:01.616 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=8913
2025-12-15 00:45:12.660 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:45:12.668 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:45:30.385 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6853
2025-12-15 00:45:41.198 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:45:41.204 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:45:54.589 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6083
2025-12-15 00:46:01.018 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:46:01.023 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:46:17.502 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=18314
2025-12-15 00:46:29.858 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:46:29.863 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:46:39.637 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6078
2025-12-15 00:46:47.242 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:46:47.251 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:47:00.182 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=13471
2025-12-15 00:47:09.184 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:47:09.191 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:47:23.352 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=10896
2025-12-15 00:47:33.324 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:47:33.328 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:47:44.855 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=7888
2025-12-15 00:47:58.799 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:47:58.804 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:48:06.025 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=13870
2025-12-15 00:48:35.304 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6570
2025-12-15 00:48:50.580 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:48:50.591 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:49:06.970 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=7182
2025-12-15 00:49:19.145 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:49:19.152 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:49:26.530 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=9248
2025-12-15 00:49:36.738 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:49:36.743 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:49:49.563 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=8773
2025-12-15 00:49:57.434 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:49:57.439 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:50:05.881 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=8679
2025-12-15 00:50:11.685 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:50:11.689 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:50:19.198 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=8517
2025-12-15 00:50:27.954 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:50:27.959 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:50:43.572 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=8826
2025-12-15 00:50:52.327 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:50:52.335 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:51:04.560 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=8012
2025-12-15 00:51:17.087 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:51:17.091 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:51:28.352 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=12685
2025-12-15 00:51:34.970 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:51:34.975 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:51:52.376 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=7079
2025-12-15 00:52:01.848 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:52:01.855 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:52:19.135 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6340
2025-12-15 00:52:30.032 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:52:30.042 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:52:34.956 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=17521
2025-12-15 00:52:46.156 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:52:46.161 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:53:00.735 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6173
2025-12-15 00:53:12.913 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:53:12.918 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:53:21.232 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=7812
2025-12-15 00:53:24.212 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:53:24.218 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:53:33.263 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=16922
2025-12-15 00:53:44.780 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:53:44.787 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:53:57.646 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
2025-12-15 00:54:15.890 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=5852
2025-12-15 00:54:27.390 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:54:27.397 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:54:39.491 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=8181
2025-12-15 00:54:52.853 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:54:52.859 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:55:04.783 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=8271
2025-12-15 00:55:20.285 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:55:20.291 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:55:27.996 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6952
2025-12-15 00:55:32.327 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:55:32.333 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:55:41.168 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=13433
2025-12-15 00:55:55.488 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:55:55.497 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:56:12.980 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=7118
2025-12-15 00:56:31.244 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
2025-12-15 00:56:31.252 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
2025-12-15 00:56:38.954 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=12272

View File

@ -1,33 +0,0 @@
2025-12-22 00:17:18.388 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
2025-12-22 00:17:18.825 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
2025-12-22 00:17:18.916 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
2025-12-22 00:17:30.421 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
2025-12-22 00:17:40.552 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
2025-12-22 00:17:50.896 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
2025-12-22 00:18:05.607 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
2025-12-22 00:18:10.877 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
2025-12-22 00:18:22.338 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
2025-12-22 00:18:30.282 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
2025-12-22 00:18:44.142 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
2025-12-22 00:19:01.977 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
2025-12-22 00:19:09.058 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
2025-12-22 00:19:16.908 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
2025-12-22 00:19:32.476 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
2025-12-22 00:19:43.911 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
2025-12-22 00:19:51.648 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
2025-12-22 00:20:08.812 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
2025-12-22 00:20:19.945 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
2025-12-22 00:20:27.279 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
2025-12-22 00:20:34.469 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
2025-12-22 00:20:44.545 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
2025-12-22 00:20:55.698 | ERROR | __main__:load_token_from_api:347 - ❌ API请求失败: 502
2025-12-22 00:20:55.698 | WARNING | __main__:load_token_from_api:355 - ⚠️ 使用默认token值
2025-12-22 00:21:06.888 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
2025-12-22 00:21:07.390 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
2025-12-22 00:21:07.482 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
2025-12-22 00:21:14.593 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
2025-12-22 00:21:25.726 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
2025-12-22 00:21:38.392 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
2025-12-22 00:21:51.367 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
2025-12-22 00:22:01.894 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
2025-12-22 00:22:11.462 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=ff0679d952e35826cced..., wt2= ...

View File

@ -1,5 +0,0 @@
requests==2.32.4
loguru==0.7.3
httpx==0.28.1
fake-useragent==2.2.0
PySocks==1.7.1

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,38 +0,0 @@
# Container image that runs boos_api.py as a non-root user.
# Use Python 3.9 (slim variant) as the base image
FROM python:3.9-slim
# Set the working directory
WORKDIR /app
# Set Python/pip environment variables
ENV PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1 \
PIP_NO_CACHE_DIR=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1
# Application configuration defaults (can be overridden with `docker run -e`)
ENV API_BASE_URL=http://124.222.245.240:9999 \
MONGODB_URI=mongodb://localhost:27017 \
MONGODB_DB=job_data \
MAX_PAGES=3 \
PAGE_SIZE=15 \
MIN_WAIT_TIME=10 \
MAX_WAIT_TIME=30 \
ERROR_WAIT_MIN=30 \
ERROR_WAIT_MAX=60
# Copy the requirements file and install the Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy the project files
COPY boos_api.py ./
COPY city.json ./
COPY work.json ./
# Create a non-root user that owns /app, then switch to it
RUN useradd -m -u 1000 crawler && chown -R crawler:crawler /app
USER crawler
# Startup command
CMD ["python", "boos_api.py"]

File diff suppressed because it is too large Load Diff

View File

@ -1,492 +0,0 @@
{
"abroadFlag": 2,
"abroadTipInfo": {
"abroadTips": [],
"icon": "",
"title": ""
},
"adResponse": None,
"aiPositionRecommendLevel": "",
"aiPositionRecommendReason": "",
"alreadyCallPhone": False,
"applyType": "1",
"campusBestCompany": {
"bestCompanyUrl": "",
"homepageType": 0,
"logoTagUrl": "",
"state": 0
},
"campusJobDetail": {
"applyEndTime": 1767196799999,
"applyStartTime": 1760457600000,
"applyTimeCountDown": "",
"cityName": "内江",
"companyLogo": "https://storage-public.zhaopin.cn/campus/newrd/online/1760485043129445931/%25E6%2588%2590%25E6%25B8%259D%25E9%2592%2592%25E9%2592%259B%25E7%25A7%2591%25E6%258A%2580%25E6%259C%2589%25E9%2599%2590%25E5%2585%25AC%25E5%258F%25B8.png",
"companyName": "成渝钒钛科技有限公司",
"companyNumber": "KA0224051216D90000037000",
"countDownBackColor": "",
"countDownFontColor": "",
"industryName": "钢铁/有色金属冶炼及加工",
"orgSizeName": "1000-9999人",
"orgTypeName": "民营"
},
"campusJobMatchData": None,
"campusPositionCardTagInfo": None,
"campusPreciseMatchType": 0,
"campusRootOrgInfo": {
"address": "",
"businessLicenceName": "中国共产主义青年团四川省委员会",
"cityName": "成都",
"description": "",
"displayOrgName": "中国共产主义青年团四川省委员会",
"hideCampusElement": False,
"hideJobApplyLable": False,
"id": 104514,
"industryName": "社团/组织/社会保障",
"introUrl": "",
"logo": "https://storage-public.zhaopin.cn/campus/newrd/online/1760484389628245179/5baa527cc949cacd676ba77cce6cb79d.png",
"oldRdOrgNumber": "",
"orgName": "中国共产主义青年团四川省委员会",
"orgNumber": "KA0224051216P90000006000",
"orgSizeName": "",
"orgTypeName": "国家机关",
"slaveDisplayOrgName": ""
},
"canBeRegular": False,
"canRemoteInternship": False,
"cardCustomJson": "{"address":" ","companyName":"","locationType":"1","salary60":"5000-10000"}",
"cardType": 1,
"chatWindow": 1,
"cityDistrict": "威远县",
"cityId": "809",
"commercialLabel": [
{
"labelDescription": None,
"type": 2,
"typeName": "网申",
"typeShowLabel": "https://img09.zhaopin.cn/2012/other/mobile/capp/position/ui23/tag_jd_xiao_3x.png?w=81&h=48&r=3"
},
{
"labelDescription": None,
"type": 27,
"typeName": "直招",
"typeShowLabel": "https://img09.zhaopin.com/2012/other/mobile/capp/position/home/tag_jd_zhizhao.png?w=78&h=48&r=3"
}
],
"commonTrack": {
"trackCommercialFeature": "",
"trackSocialSearchEmergencyFeature": False
},
"companyId": 10097072,
"companyLogo": "https://storage-public.zhaopin.cn/campus/newrd/online/1760485043129445931/%25E6%2588%2590%25E6%25B8%259D%25E9%2592%2592%25E9%2592%259B%25E7%25A7%2591%25E6%258A%2580%25E6%259C%2589%25E9%2599%2590%25E5%2585%25AC%25E5%258F%25B8.png",
"companyName": "成渝钒钛科技有限公司",
"companyNumber": "KA0224051216D90000037000",
"companyRootId": 104514,
"companyScaleTypeTagsNew": [],
"companySize": "",
"companyUrl": "https://xiaoyuan.zhaopin.com/company/KA0224051216P90000006000",
"complainFlag": False,
"deliveryPath": "",
"displayPhoneNumber": False,
"distance": 0.0,
"distanceFormat": "",
"distanceText": "",
"education": "本科",
"experimentInfo": None,
"extensions": None,
"featureServer": {
"jdViews3d": "33",
"lastReplyTimeText": "",
"staffReplyRate30d": 0.0,
"todayReplyNum": 0,
"todayReplyNumText": ""
},
"feedOperation": None,
"feedPosition": None,
"financingStage": {
"name": ""
},
"firstPublishTime": "2025-10-15 07:52:28",
"hasAppliedPosition": False,
"industryCompanyTags": [
"1400020000"
],
"industryName": "钢铁/有色金属冶炼及加工",
"innerBusinessInfo": {
"customIndustryList": []
},
"internshipMonths": 0,
"isNewPosition": 0,
"jdCardType": 2,
"jobDetailData": {
"company": {
"base": None,
"companyComment": None,
"companyInterview": None,
"jumpDetail": None,
"orgBestRanking": None,
"orgReliableCompany": None,
"other": None,
"state": None
},
"companyProxy": {
"companyAddress": "",
"companyImage": "",
"companyName": "",
"companySize": "",
"entryCompanyTitle": ""
},
"customAttributeInfo": {
"platformRemind": "",
"reportItems": [],
"welfareItems": [],
"workTimeItems": []
},
"debug": {},
"experimentInfo": {
"blueCollarJobTitleExperimentInfo": None
},
"featureServer": None,
"imSection": None,
"internship": [],
"live": {
"liveItems": [],
"liveQuickFocusChecked": 0,
"liveQuickFocusState": 0,
"recommendLiveList": [],
"state": 0
},
"operationSection": {
"topJobBannerArea": None
},
"partTime": [],
"position": {
"base": {
"deliveryPath": "",
"education": "本科",
"educationCode": "",
"maxSalary": "",
"minSalary": "",
"positionId": 40846760303,
"positionName": "技术操作工",
"positionNumber": "CC224051210J40846760303",
"positionUrl": "",
"positionWorkingExp": "无经验",
"positionWorkingExpCode": "",
"propertyType": "",
"salary": "5000-10000元",
"salaryReal": "",
"workType": "全职"
},
"date": {
"dateEnd": "",
"dateStart": "",
"firstPublishTime": "",
"positionPublishTime": "",
"positionUpdateTime": "",
"positionUpdateTimeText": ""
},
"desc": {
"description": "",
"descriptionHighlight": "",
"highlightLabels": [],
"labels": [],
"performanceBonus": "",
"welfareLabel": [],
"welfareTags": []
},
"jobType": {
"jobType": "",
"jobTypeLevel": "15000100000000",
"jobTypeLevelName": "",
"subJobType": "",
"subJobTypeLevel": "15000100190000",
"subJobTypeLevelName": ""
},
"onlineCarHailingExtend": {
"gray": False,
"promiseGuarantee": ""
},
"onlineCarInfo": [],
"other": {
"customJobGroup": "NORMAL_DIRECT",
"deliveredPreviouslyTip": "",
"jobKeyword": {
"keywords": []
},
"jobSkillTags": [],
"jobTypeIsBlueCollar": True,
"pageStyle": 0,
"positionCommercialLabel": [
{
"labelDescription": None,
"type": 2,
"typeName": "网申",
"typeShowLabel": "https://img09.zhaopin.cn/2012/other/mobile/capp/position/ui23/tag_jd_xiao_3x.png?w=81&h=48&r=3"
},
{
"labelDescription": None,
"type": 27,
"typeName": "直招",
"typeShowLabel": "https://img09.zhaopin.com/2012/other/mobile/capp/position/home/tag_jd_zhizhao.png?w=78&h=48&r=3"
}
],
"positionHighlight": "",
"propertyTypeUrl": "",
"rpoProxyDisplayOrgName": "",
"urgentRecruitmentUrl": ""
},
"preferredHrInfo": {
"icon": "",
"introduce": "",
"jumpUrl": "",
"preferredHr": False
},
"todayInterview": None,
"workLocation": {
"address": "工作地点:内江 · 威远县",
"addressType": 0,
"latitude": "0",
"longitude": "0",
"positionCityDistrict": "",
"positionCityDistrictCode": "",
"positionCityId": "809",
"positionWorkCity": "",
"showMap": True,
"showMultiAddressesTip": "",
"staticMapUrl": "https://img09.zhaopin.cn/2012/other/mobile/capp/position/detail/defaultMapUrl.png?w=1230&h=720&r=3",
"streetName": "",
"tradingArea": "",
"travelMode": "bus",
"verifyTheTruthUrl": "",
"workAddress": "内江威远县连界镇连界工业园区"
}
},
"proxyWarning": None,
"recommender": {
"avatar": "",
"name": "",
"state": 0,
"text": "",
"title": ""
},
"secure": {
"abroadLabel": "",
"abroadTipInfo": None,
"safeCenter": None,
"safetyReminder": None
},
"staff": {
"activityLevel": [],
"auditNaturePrompt": None,
"authenticationState": 0,
"avatar": "http://img09.zhaopin.cn/2012/other/mobile/position/list/hr_88.png",
"companyName": "成渝钒钛科技有限公司",
"goldMedalInterviewer": None,
"greeting": "",
"greetingHasDelivery": "",
"hrJob": "人事经理",
"hrOnlineIocState": 0,
"hrOnlineState": "",
"hrResumeOperationState": "",
"hrStateInfo": "",
"id": 1212725485,
"lastOnlineTime": 0,
"lastOnlineTimeText": "",
"modularState": 1,
"other": {
"freeTag": None,
"tagUrl": ""
},
"positionDetailStaffQuickReply": None,
"staffName": "HR"
},
"stateInfo": {
"deliveryAfterGuide": None,
"imSessionInfoDetail": {
"imChatStatus": 0,
"imChatStatusForChatBeforeDelivery": 0,
"imDeliveryTitle": "",
"referType": -1
},
"positionBehaviorState": {
"deliveryState": 0,
"favoriteState": 0,
"followHrState": 0,
"imReplyState": -1,
"negativeState": 0,
"sessionChatState": 0
},
"state": {
"abroadFlag": 2,
"applyType": "",
"callProcess": "",
"hasAppliedPosition": False,
"positionDeliveryType": "",
"positionSourceType": 2,
"workMode": "ONSITE",
"workModeDesc": ""
},
"useNewAfterDeliveryStyle": False
},
"verifyTheTruth": None,
"verifyTrueFeedback": None
},
"jobDetailShowUrgentTag": False,
"jobHitReason": "",
"jobHitReasonHighlights": [],
"jobId": 40846760303,
"jobKeyword": {
"keywords": [
{
"itemValue": ""
},
{
"itemValue": ""
},
{
"itemValue": ""
}
]
},
"jobKnowledgeWelfareFeatures": [],
"jobPostingTime": 1760485948498,
"jobRootOrgInfo": {
"cityName": "内江"
},
"jobSkillTags": [],
"jobSummary": "负责新产品开发、新技术运用,并对实施过程的质量、技术协调、管理",
"liveCard": {
"icon": "",
"liveState": 0,
"liveTips": "",
"roomId": 0,
"startTimeFormat": "",
"videoUrl": ""
},
"matchInfo": {
"icon": "http://img09.zhaopin.cn/2012/other/mobile/position/list/icon_jd_recommend.png?w=156&h=54&r=3",
"matched": 1,
"tagState": 0
},
"menVipLevel": 0,
"name": "技术操作工",
"needMajor": [],
"number": "CC224051210J40846760303",
"operationImageLabel": [
"https://img09.zhaopin.com/2012/other/mobile/app/im/job_protection.png?w=174&h=69&r=3"
],
"orgBestEmployerFlag": 0,
"orgCommercialTags": [],
"orgPayedFlag": 0,
"organizer": True,
"payload": {
"name": "",
"partition": "",
"score": "",
"weight": ""
},
"positionCommercialLabel": [
{
"labelDescription": None,
"type": 2,
"typeName": "网申",
"typeShowLabel": "https://img09.zhaopin.cn/2012/other/mobile/capp/position/ui23/tag_jd_xiao_3x.png?w=81&h=48&r=3"
},
{
"labelDescription": None,
"type": 27,
"typeName": "直招",
"typeShowLabel": "https://img09.zhaopin.com/2012/other/mobile/capp/position/home/tag_jd_zhizhao.png?w=78&h=48&r=3"
}
],
"positionExpandCardData": "",
"positionExpandCardType": 0,
"positionHighlight": "",
"positionOfNlp": 1,
"positionSourceType": 2,
"positionSourceTypeUrl": "https://img09.zhaopin.cn/2012/other/mobile/capp/position/ui23/tag_jd_xiao_3x.png?w=81&h=48&r=3",
"positionURL": "https://xiaoyuan.zhaopin.com/job/CC224051210J40846760303",
"positionUrl": "https://xiaoyuan.zhaopin.com/job/CC224051210J40846760303",
"property": "民营",
"propertyCode": "5",
"propertyName": "国家机关",
"propertyType": "",
"propertyTypeUrl": "",
"provideInternshipCertificate": False,
"proxyModel": {
"proxiedOrgName": "",
"proxiedOrgSize": "",
"recruitPosition": 0
},
"publishTime": "2025-10-15 07:52:28",
"recallSign": {
"gMethod": "config-position_search-position_campus_query-COMPANY-fix_20250109",
"gParam": "query-ps-campus-query-1",
"gQuery": "query-ps-campus-query-1",
"gSort": "query-ps-campus-query-1",
"gSource": "solr.source_position_query",
"gWeight": 20
},
"recruitNumber": 0,
"redirectUrl": "",
"redirectable": False,
"rootCompanyNumber": "KA0224051216P90000006000",
"rpoProxied": False,
"rpoProxy": False,
"salary60": "5000-10000元",
"salaryCount": "",
"salaryReal": "5000-10000",
"salaryType": 1,
"searchTagList": [],
"securityAddressLabel": "",
"settlementType": "",
"showDistance": 0,
"showSkillTags": [
{
"highlightBackGroundColor": "",
"highlightWordColor": "",
"tag": "本科"
}
],
"skillLabel": [],
"skillLabelPersonality": "",
"staffCard": {
"authenticationState": 0,
"avatar": "http://img09.zhaopin.cn/2012/other/mobile/position/list/hr_88.png",
"goldMedalInterviewer": {
"goldMedalInterviewer": False,
"interviewerImageUrl": "",
"interviewerTitle": ""
},
"hrCompanyName": "",
"hrJob": "人事经理",
"hrOnlineIocState": 0,
"hrOnlineState": "",
"hrStateInfo": "",
"id": 1212725485,
"lastOnlineTime": 1762492035114,
"lastOnlineTimeText": "",
"staffName": "HR"
},
"streetId": 0,
"streetName": "",
"subJobTypeLevel": "15000100190000",
"subJobTypeLevelName": "技工",
"subways": [],
"tagABC": "",
"tagList": [],
"todayInterview": False,
"todayInterviewImageUrl": "",
"topLabel": None,
"tradingArea": "",
"volcanoMeterial": None,
"weeklyInternshipDays": 0,
"welfareLabel": [],
"welfareTagList": [],
"workCity": "内江",
"workDateType": "",
"workMode": "",
"workType": "全职",
"workingExp": "无经验"
}

File diff suppressed because one or more lines are too long

View File

@ -1,6 +0,0 @@
{"timestamp": "2026-01-15 00:38:02", "total_crawled": 517, "unique_count": 503, "duplicate_count": 14, "api_total_count": 505, "job_area": "商丘", "function_type": "8305"}
{"timestamp": "2026-01-15 01:36:23", "total_crawled": 517, "unique_count": 509, "duplicate_count": 8, "api_total_count": 517, "job_area": "广安", "function_type": "1318"}
{"timestamp": "2026-01-15 02:32:36", "total_crawled": 517, "unique_count": 511, "duplicate_count": 6, "api_total_count": 517, "job_area": "阜阳", "function_type": "6101"}
{"timestamp": "2026-01-15 03:32:52", "total_crawled": 517, "unique_count": 513, "duplicate_count": 4, "api_total_count": 517, "job_area": "常德", "function_type": "3812"}
{"timestamp": "2026-01-15 04:31:42", "total_crawled": 517, "unique_count": 510, "duplicate_count": 7, "api_total_count": 517, "job_area": "惠州", "function_type": "3335"}
{"timestamp": "2026-01-15 05:28:54", "total_crawled": 517, "unique_count": 515, "duplicate_count": 2, "api_total_count": 517, "job_area": "锦州", "function_type": "0154"}

View File

@ -1,21 +0,0 @@
2025-12-15 00:55:26.469 | INFO | __main__:crawl_recommend_jobs_main:911 - 已配置代理: http://t13319619426654:ln8aj9nl@s432.kdltps.com:15818
2025-12-15 00:55:26.502 | INFO | __main__:_init_client:323 - 初始化客户端,使用代理: http://t13319619426654:ln8aj9nl@s432.kdltps.com:15818
2025-12-15 00:55:26.601 | INFO | __main__:crawl_recommend_jobs_main:943 - 随机选择工作类型: 货运司机 (1831)
2025-12-15 00:55:26.601 | INFO | __main__:crawl_recommend_jobs_main:945 - 开始爬取推荐职位,最多 3 页
2025-12-15 00:55:26.601 | INFO | __main__:crawl_multiple_pages:803 - 正在爬取第 1 页...
2025-12-15 00:55:26.601 | INFO | __main__:_init_client:323 - 初始化客户端,使用代理: http://t13319619426654:ln8aj9nl@s432.kdltps.com:15818
2025-12-15 00:55:26.607 | INFO | __main__:get_recommend_jobs:512 - 获取推荐职位: 页码=1, 页大小=10, 地区=190300, 工作类型=1831
2025-12-15 00:55:26.694 | WARNING | __main__:_make_request:477 - 请求失败: 500 - {"status":"100000","message":"网络超时,请稍后重试!"}
2025-12-15 00:55:27.728 | WARNING | __main__:_make_request:477 - 请求失败: 500 - {"status":"100000","message":"网络超时,请稍后重试!"}
2025-12-15 00:55:29.766 | WARNING | __main__:_make_request:477 - 请求失败: 500 - {"status":"100000","message":"网络超时,请稍后重试!"}
2025-12-15 00:55:29.767 | ERROR | __main__:_make_request:503 - 所有重试失败,请求终止
2025-12-15 00:55:30.063 | INFO | __main__:_make_request:474 - 请求参数 method=POST url=https://cupid.51job.com/open/noauth/recommend/job-tab-dynamic-wx-mini?api_key=51job&timestamp=1765731326 status=200 resp_size=52176
2025-12-15 00:55:30.064 | INFO | __main__:get_recommend_jobs:585 - 成功获取 10 个职位
2025-12-15 00:55:30.131 | INFO | __main__:_make_request:474 - 请求参数 method=GET url=https://cupid.51job.com/open/noauth/jobs/detail/base/169639008?api_key=51job&timestamp=1765731330 status=200 resp_size=5614
2025-12-15 00:55:30.202 | INFO | __main__:_make_request:474 - 请求参数 method=GET url=https://cupid.51job.com/open/noauth/company-info/info-data?api_key=51job&timestamp=1765731330&companyId=10080947&colorOne=%23ffffff&colorTwo=%23ffffffcc status=200 resp_size=4186
2025-12-15 00:55:31.599 | INFO | __main__:_make_request:474 - 请求参数 method=GET url=https://cupid.51job.com/open/noauth/jobs/detail/base/169747067?api_key=51job&timestamp=1765731331 status=200 resp_size=5105
2025-12-15 00:55:31.660 | INFO | __main__:_make_request:474 - 请求参数 method=GET url=https://cupid.51job.com/open/noauth/company-info/info-data?api_key=51job&timestamp=1765731331&companyId=9427865&colorOne=%23ffffff&colorTwo=%23ffffffcc status=200 resp_size=2743
2025-12-15 00:55:34.376 | INFO | __main__:_make_request:474 - 请求参数 method=GET url=https://cupid.51job.com/open/noauth/jobs/detail/base/162834418?api_key=51job&timestamp=1765731334 status=200 resp_size=5231
2025-12-15 00:55:34.451 | INFO | __main__:_make_request:474 - 请求参数 method=GET url=https://cupid.51job.com/open/noauth/company-info/info-data?api_key=51job&timestamp=1765731334&companyId=9371694&colorOne=%23ffffff&colorTwo=%23ffffffcc status=200 resp_size=2524
2025-12-15 00:55:35.084 | INFO | __main__:crawl_recommend_jobs_main:964 - 用户中断推荐职位爬取
2025-12-15 00:55:35.085 | INFO | __main__:close:878 - httpx客户端已关闭

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,21 +0,0 @@
{
'status': '1',
'message': '',
'resultbody': {
'jobList': {
'items': [],
'totalCount': 378,
'listShowTypeFuncTypeCode': '',
'quickDeliveryFuncTypeCode': '',
'salaryExpandEnabled': True,
'needTransition': False,
'totalcount': 378
},
'adsTabFeeds': [
],
'requestId': '9e791cba2cce6ac408fc29282bfe1927_7ebb7d6e43dbebc23473123df02328d4',
'beyondDistancePosition': -1,
'jobLibCodes': [],
'jobAreaList': []
}
}

View File

@ -1,231 +0,0 @@
import os
import time
import json
import hmac
import hashlib
import random
from typing import Any, Dict, List, Optional
from urllib.parse import quote
import requests
import urllib.parse
import urllib3
# Silence urllib3's InsecureRequestWarning: the requests issued by this module
# are sent with verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# 51job PC search endpoint that the page fetchers in this module query.
BASE_URL = "https://we.51job.com/api/job/search-pc"
# Base URL of the keyword service; override with the API_BASE_URL env variable.
API_BASE_URL = os.getenv("API_BASE_URL", "http://127.0.0.1:9999")
# HMAC-SHA256 key used to sign request URLs.
SIGN_KEY = "abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b"
def _now_ts() -> str:
    """Return the current UNIX time, truncated to whole seconds, as a string."""
    seconds = int(time.time())
    return f"{seconds}"
def _build_proxy() -> Dict[str, str]:
    """Build the proxy mapping used for both HTTP and HTTPS traffic.

    The proxy URL is read from the ``PROXY_URL`` environment variable and
    falls back to a built-in default when the variable is unset.

    Returns:
        A dict with the keys ``"http"`` and ``"https"``, both mapped to the
        same proxy URL.
    """
    # NOTE(review): the fallback URL embeds proxy credentials in source code;
    # consider supplying them through the environment only.
    proxy_url = os.getenv("PROXY_URL", "http://t13319619426654:ln8aj9nl@s432.kdltps.com:15818")
    return dict.fromkeys(("http", "https"), proxy_url)
def _hmac_sign_url(url: str) -> str:
    """Return the hex HMAC-SHA256 digest of ``url`` keyed with ``SIGN_KEY``.

    Args:
        url: The complete request URL (including its query string) to sign.

    Returns:
        The signature as a lower-case hexadecimal string.
    """
    key = SIGN_KEY.encode("utf-8")
    message = url.encode("utf-8")
    mac = hmac.new(key, message, hashlib.sha256)
    return mac.hexdigest()
def _generate_acw_sc_v2(arg1: str) -> str:
    """Derive the ``acw_sc__v2`` value from the challenge string ``arg1``.

    The characters of ``arg1`` are first reordered according to a fixed
    position table; the reordered hex string is then XOR-ed with a fixed hex
    mask, one byte (two hex digits) at a time.

    Args:
        arg1: Hexadecimal challenge string. Strings shorter than the position
            table are accepted; positions past the end are simply skipped.

    Returns:
        The XOR result as a lower-case hex string, or ``""`` when ``arg1`` is
        empty.

    Raises:
        ValueError: If the reordered string contains non-hexadecimal
            characters.
    """
    if not arg1:
        return ""
    pos_list = (15, 35, 29, 24, 33, 16, 1, 38, 10, 9, 19, 31, 40, 27, 22, 23, 25, 13, 6, 11, 39, 18, 20, 8, 14, 21,
                32, 26, 2, 30, 7, 4, 17, 5, 3, 28, 34, 37, 12, 36)
    mask = "3000176000856006061501533003690027800375"
    # Output slot j takes the character at 1-based position pos_list[j].
    # Indexing directly replaces the former nested scan over the table, and
    # positions beyond the end of a short arg1 are dropped.
    size = len(arg1)
    shuffled = "".join(arg1[pos - 1] for pos in pos_list if pos <= size)
    limit = min(len(shuffled), len(mask))
    return "".join(
        format(int(shuffled[i:i + 2], 16) ^ int(mask[i:i + 2], 16), "02x")
        for i in range(0, limit, 2)
    )
def _build_web_headers(keyword: str, did: str, sign: str) -> Dict[str, str]:
    """Assemble the browser-like request headers for the web search API.

    Args:
        keyword: Search keyword; percent-encoded into the referer URL.
        did: Device identifier, sent as the ``uuid`` header and the ``guid``
            cookie.
        sign: Request signature, sent as the ``sign`` header.

    Returns:
        The complete header mapping for the request.
    """
    search_url = (
        "https://we.51job.com/pc/search?jobArea=020000&keyword="
        f"{quote(keyword, safe='')}"
        "&searchType=2&keywordType="
    )
    # Page context that is JSON-serialised and percent-encoded into the
    # "property" header.
    page_context = [
        ("partner", ""),
        ("webId", 2),
        ("fromdomain", "51job_web"),
        ("frompageUrl", "https://we.51job.com/"),
        ("pageUrl", search_url),
        ("identityType", ""),
        ("userType", ""),
        ("isLogin", ""),
        ("accountid", ""),
        ("keywordType", "直接输入"),
    ]
    encoded_context = quote(json.dumps(dict(page_context), ensure_ascii=False), safe="")

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "accept-language": "zh-CN,zh;q=0.9",
        "from-domain": "51job_web",
        "priority": "u=1, i",
        "property": encoded_context,
        "referer": search_url,
        "sec-ch-ua": '"Not/A)Brand";v="8", "Chromium";v="136", "Google Chrome";v="136"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"macOS"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
    }
    # Per-request identity fields go last, keeping the original header order.
    headers["sign"] = sign
    headers["uuid"] = did
    headers["Cookie"] = f"guid={did}"
    return headers
def _build_query(city_code: str, page: int, keyword: str) -> str:
    """Build the URL-encoded query string for the search-pc endpoint.

    Args:
        city_code: Value sent as ``jobArea``.
        page: Page number, sent as ``pageNum``.
        keyword: Search keyword.

    Returns:
        The encoded query string (without a leading ``?``). Field order is
        fixed, and spaces are encoded as ``%20``.
    """
    fields = (
        ("api_key", "51job"),
        ("timestamp", _now_ts()),
        ("keyword", keyword),
        ("searchType", "2"),
        ("function", ""),
        ("industry", ""),
        ("jobArea", city_code),
        ("jobArea2", ""),
        ("landmark", ""),
        ("metro", ""),
        ("salary", ""),
        ("workYear", ""),
        ("degree", ""),
        ("companyType", ""),
        ("companySize", ""),
        ("jobType", ""),
        ("issueDate", ""),
        ("sortType", "0"),
        ("pageNum", str(page)),
        ("requestId", ""),
        ("pageSize", "20"),
        ("source", "1"),
        ("accountId", ""),
        ("pageCode", "sou|sou|soulb"),
        ("scene", "7"),
    )
    return urllib.parse.urlencode(fields, quote_via=urllib.parse.quote)
def _prefetch_acw(city_code: str, page: int, keyword: str, proxies: Dict[str, str]) -> Optional[str]:
    """Request the search page once and derive ``acw_sc__v2`` from its ``arg1``.

    Args:
        city_code: Area code used to build the query.
        page: Page number used to build the query.
        keyword: Search keyword.
        proxies: Proxy mapping passed to ``requests``.

    Returns:
        The generated ``acw_sc__v2`` value, or ``None`` when the response
        carries no ``arg1`` or the request/derivation fails.
    """
    target = f"{BASE_URL}?{_build_query(city_code, page, keyword)}"
    device_id = str(random.randint(10**15, 10**16-1))
    request_headers = _build_web_headers(keyword, device_id, _hmac_sign_url(target))
    try:
        response = requests.get(target, headers=request_headers, timeout=20, proxies=proxies, verify=False)
        challenge = re_search_arg1(response.text or "")
        if challenge:
            return _generate_acw_sc_v2(challenge)
        return None
    except Exception:
        # Best effort: any failure just means no anti-bot cookie is available.
        return None
def re_search_arg1(text: str) -> Optional[str]:
    """Extract the value assigned to ``var arg1`` from a page's text.

    Args:
        text: Text that may contain a ``var arg1 = '...'`` assignment.

    Returns:
        The non-empty quoted value, or ``None`` when no assignment is found.
    """
    import re

    pattern = re.compile(r"var\s+arg1\s*=\s*['\"]([^'\"]+)['\"]")
    found = pattern.search(text)
    if found is None:
        return None
    return found.group(1)
def fetch_page(city_code: str, keyword: str, page: int, proxies: Dict[str, str]) -> Dict[str, Any]:
    """Fetch one page of search results.

    Args:
        city_code: Area code to search in.
        keyword: Search keyword.
        page: Page number to fetch.
        proxies: Proxy mapping passed to ``requests``.

    Returns:
        The decoded JSON body, or ``{"raw": <response text>}`` when the body
        is not valid JSON.
    """
    target = f"{BASE_URL}?{_build_query(city_code, page, keyword)}"
    device_id = str(random.randint(10**15, 10**16-1))
    request_headers = _build_web_headers(keyword, device_id, _hmac_sign_url(target))

    # Append the anti-bot cookie to the existing cookie when one was derived.
    token = _prefetch_acw(city_code, page, keyword, proxies)
    if token:
        request_headers["Cookie"] = f"{request_headers.get('Cookie', '')}; acw_sc__v2={token}"

    response = requests.get(target, headers=request_headers, timeout=30, proxies=proxies, verify=False)
    try:
        parsed = response.json()
    except ValueError:
        return {"raw": response.text}
    return parsed
def load_company_keywords() -> List[str]:
    """Read keywords from ``company.txt`` located next to this module.

    Returns:
        The stripped, non-empty lines of the file in file order, or an empty
        list when the file cannot be read.
    """
    source = os.path.join(os.path.dirname(__file__), "company.txt")
    try:
        with open(source, "r", encoding="utf-8") as handle:
            stripped = [raw.strip() for raw in handle]
    except Exception:
        # A missing or unreadable file simply means there are no keywords.
        return []
    return [entry for entry in stripped if entry]
def fetch_service_params() -> Optional[Dict[str, Any]]:
    """Fetch one available search condition from the keyword service and mark it used.

    Returns:
        ``{"city_code": str, "keyword": str}`` built from the first item the
        service returns, or ``None`` when the service responds with a
        non-200 status, returns no items, or any error occurs.
    """
    try:
        response = requests.get(
            f"{API_BASE_URL}/api/v1/keyword/available",
            params={"source": "qcwy", "limit": 1},
            timeout=10,
        )
        if response.status_code != 200:
            return None

        body = response.json()
        candidates = (body.get("data") or {}).get("items") or []
        if not candidates:
            return None

        chosen = candidates[0]
        keyword_id = chosen.get("id")
        if keyword_id:
            # Marking the keyword as used is best effort; a failure here must
            # not prevent the fetched condition from being returned.
            try:
                requests.post(
                    f"{API_BASE_URL}/api/v1/keyword/mark-used",
                    json={"source": "qcwy", "ids": [keyword_id]},
                    timeout=10,
                )
            except Exception:
                pass
        return {
            "city_code": str(chosen.get("city", "")),
            "keyword": str(chosen.get("job", "")),
        }
    except Exception:
        return None
def run(city_code: str = "020000", max_pages: int = 3) -> None:
    """Search every keyword page by page and print each result as a JSON line.

    The keyword and city are taken from the keyword service when it returns
    a condition; otherwise the keywords come from the local ``company.txt``
    and ``city_code`` is used as given.

    Args:
        city_code: Area code used when the service supplies no condition.
        max_pages: Number of pages to fetch per keyword, starting at page 1.
    """
    proxies = _build_proxy()

    # Prefer the service-provided condition; fall back to the local file.
    remote = fetch_service_params()
    if remote:
        keywords = [remote["keyword"]]
        city_code = remote["city_code"]
    else:
        keywords = load_company_keywords()

    for term in keywords:
        for page_no in range(1, max_pages + 1):
            result = fetch_page(city_code, term, page_no, proxies)
            record = {"keyword": term, "page": page_no, "data": result}
            print(json.dumps(record, ensure_ascii=False))
            # Random pause between page requests.
            time.sleep(random.uniform(0.8, 1.6))


if __name__ == "__main__":
    run()

View File

@ -1,974 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import os
import time
import uuid as _uuid
from typing import Any, Dict, Optional, Callable
import socket
import random
import sqlite3
from urllib.parse import quote
from urllib.request import Request, urlopen, build_opener, ProxyHandler, HTTPSHandler
from urllib.error import HTTPError, URLError
import ssl
# Host that every signed API path is appended to.
BASE_URL = "https://cupid.51job.com"
# HMAC-SHA256 signing key; the JOB_SIGN_KEY environment variable overrides the
# built-in value.
SIGN_KEY = os.getenv(
"JOB_SIGN_KEY",
"abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b",
)
# Value sent in the From-Domain request header.
FROM_DOMAIN = "51job_weixin_wxapp"
# Base URL read from the API_BASE_URL environment variable (local default).
API_BASE_URL = os.getenv("API_BASE_URL", "http://127.0.0.1:9999")
# NOTE(review): presumably an in-memory cache of company info keyed by company
# id — its users are outside this excerpt; confirm.
COMPANY_INFO_CACHE: Dict[str, Any] = {}
# Path of success.txt located next to this module.
SUCCESS_LOG_PATH = os.path.join(os.path.dirname(__file__), "success.txt")
# NOTE(review): presumably tracks entries already written to SUCCESS_LOG_PATH —
# confirm against the code that uses it.
_SUCCESS_WRITTEN: set = set()
def _timestamp() -> int:
    """Return the current UNIX time.

    Returns:
        int: Seconds since the epoch, truncated to a whole number.
    """
    now = time.time()
    return int(now)
def _encode_query(params: Optional[Dict[str, object]]) -> str:
    """Percent-encode query parameters, keeping their insertion order.

    List and tuple values produce one ``key=value`` pair per element and
    ``None`` produces ``key=``. Every character outside the unreserved set
    is escaped, including ``/``.

    Args:
        params (Optional[Dict[str, object]]): Query parameters to encode.

    Returns:
        str: The encoded pairs joined with ``&`` and prefixed with ``&``, or
        ``""`` when ``params`` is empty or ``None``.
    """
    if not params:
        return ""

    def _escape(value: object) -> str:
        return quote(str(value), safe="")

    pairs = []
    for name, value in params.items():
        encoded_name = _escape(name)
        if value is None:
            pairs.append(encoded_name + "=")
        elif isinstance(value, (list, tuple)):
            pairs.extend(f"{encoded_name}={_escape(element)}" for element in value)
        else:
            pairs.append(f"{encoded_name}={_escape(value)}")
    return "&" + "&".join(pairs)
def build_signature(
    method: str,
    path: str,
    query_params: Optional[Dict[str, object]] = None,
    body_json: Optional[str] = None,
    timestamp: Optional[int] = None,
) -> Dict[str, Any]:
    """Build the string to sign and compute its HMAC-SHA256 signature.

    For GET the encoded query parameters are part of both the signed string
    and the returned path. For any other method the JSON body (when given)
    is appended to the signed string only.

    Args:
        method (str): HTTP method; compared case-insensitively with "GET".
        path (str): API path without a leading slash.
        query_params (Optional[Dict[str, object]]): Query parameters for GET.
        body_json (Optional[str]): Serialized JSON body for non-GET requests.
        timestamp (Optional[int]): Timestamp to sign with; the current time
            is used when this is ``None`` (or any other falsy value).

    Returns:
        Dict[str, Any]: ``{"sig": <hex digest>, "signed_path": <path and
        query to request>, "ts": <timestamp used>}``.
    """
    import hashlib
    import hmac

    ts = timestamp or _timestamp()
    signed_path = f"/{path}?api_key=51job&timestamp={ts}"
    if method.upper() == "GET":
        signed_path += _encode_query(query_params)
        to_sign = signed_path
    else:
        to_sign = (signed_path + body_json) if body_json else signed_path

    digest = hmac.new(
        SIGN_KEY.encode("utf-8"),
        to_sign.encode("utf-8"),
        hashlib.sha256,
    ).hexdigest()
    return {"sig": digest, "signed_path": signed_path, "ts": ts}
def _build_headers(
    sign: str,
    content_type: str,
    uuid: Optional[str] = None,
    account_id: Optional[str] = None,
    user_token: Optional[str] = None,
    partner: Optional[str] = None,
    property_obj: Optional[Dict[str, Any]] = None,
    headers_ext: Optional[Dict[str, str]] = None,
) -> Dict[str, str]:
    """Build the request headers carrying the signature and caller context.

    Args:
        sign (str): Hex-encoded signature.
        content_type (str): Value of the Content-Type header.
        uuid (Optional[str]): Device id; a random UUID4 is generated when
            omitted.
        account_id (Optional[str]): Sent as ``account-id`` when truthy.
        user_token (Optional[str]): Sent as ``user-token`` when truthy.
        partner (Optional[str]): Sent as ``partner`` when truthy.
        property_obj (Optional[Dict[str, Any]]): Payload for the ``property``
            header; a default payload is built when this is falsy.
        headers_ext (Optional[Dict[str, str]]): Extra headers merged last, so
            they override any header built here.

    Returns:
        Dict[str, str]: The complete header mapping.
    """
    device_id = uuid or str(_uuid.uuid4())
    headers = {
        "sign": sign,
        "From-Domain": FROM_DOMAIN,
        "Content-Type": content_type,
        "Accept": "application/json",
        "uuid": device_id,
    }

    optional_headers = (
        ("account-id", account_id),
        ("user-token", user_token),
        ("partner", partner),
    )
    for header_name, header_value in optional_headers:
        if header_value:
            headers[header_name] = header_value

    if property_obj:
        prop = property_obj
    else:
        prop = {
            "frompageUrl": "",
            "pageUrl": "",
            # NOTE(review): the original chose between two identical empty
            # strings depending on account_id; confirm whether distinct
            # logged-in / logged-out values were intended.
            "isLogin": "",
            "accountid": account_id or "",
            "resumeId": "",
            "firstFrompageUrl": "",
            "distinct_id": device_id,
        }
    headers["property"] = quote(json.dumps(prop, ensure_ascii=False), safe="")

    if headers_ext:
        headers.update(headers_ext)
    return headers
def _request(
    method: str,
    path: str,
    params: Optional[Dict[str, Any]] = None,
    body: Optional[Dict[str, Any]] = None,
    uuid: Optional[str] = None,
    account_id: Optional[str] = None,
    user_token: Optional[str] = None,
    partner: Optional[str] = None,
    property_obj: Optional[Dict[str, Any]] = None,
    headers_ext: Optional[Dict[str, str]] = None,
    proxies: Optional[list] = None,
    timeout: int = 10,
    retries: int = 2,
    raw_sink: Optional[Callable[[str], None]] = None,
) -> Dict[str, Any]:
    """Send a signed HTTP request, retrying on HTTP/URL errors.

    Args:
        method (str): HTTP method.
        path (str): API path without a leading slash.
        params (Optional[Dict[str, Any]]): Query parameters (signed for GET).
        body (Optional[Dict[str, Any]]): JSON body, sent for POST.
        uuid/account_id/user_token/partner/property_obj: Header context
            forwarded to ``_build_headers``.
        headers_ext (Optional[Dict[str, str]]): Extra headers to merge.
        proxies (Optional[list]): Proxy entries (URL strings or
            ``ProxyHandler`` dicts); attempt ``n`` uses entry
            ``n % len(proxies)``. Entries of any other type are ignored.
        timeout (int): Per-request timeout in seconds.
        retries (int): Number of additional attempts after the first.
        raw_sink (Optional[Callable[[str], None]]): Receives the raw response
            text; exceptions raised by it are ignored.

    Returns:
        Dict[str, Any]: The parsed JSON response, or ``{}`` when the response
        body is empty or every attempt failed with an HTTP/URL error.
    """
    body_json = json.dumps(body, ensure_ascii=False) if body is not None else None
    signature = build_signature(method, path, params, body_json)
    verb = method.upper()
    headers = _build_headers(
        sign=signature["sig"],
        content_type="application/x-www-form-urlencoded" if verb == "GET" else "application/json",
        uuid=uuid,
        account_id=account_id,
        user_token=user_token,
        partner=partner,
        property_obj=property_obj,
        headers_ext=headers_ext,
    )
    url = f"{BASE_URL}{signature['signed_path']}"
    data_bytes = body_json.encode("utf-8") if (verb == "POST" and body_json is not None) else None

    insecure = os.getenv("JOB_INSECURE_SSL") in ("1", "true", "TRUE")
    delay = 0.5
    attempt = 0
    while attempt <= retries:
        _sleep_between_requests(0.2, 0.7)
        req = Request(url=url, data=data_bytes, headers=headers, method=verb)
        try:
            ctx = _get_ssl_context(insecure)

            # Rotate through the proxy list by attempt number; entries that
            # are neither str nor dict fall back to a direct connection.
            opener = None
            if proxies:
                entry = proxies[attempt % len(proxies)]
                if isinstance(entry, str):
                    handler = ProxyHandler({"http": entry, "https": entry})
                elif isinstance(entry, dict):
                    handler = ProxyHandler(entry)
                else:
                    handler = None
                if handler:
                    opener = build_opener(handler, HTTPSHandler(context=ctx))

            if opener:
                with opener.open(req, timeout=timeout) as resp:
                    payload = resp.read().decode("utf-8")
            else:
                with urlopen(req, timeout=timeout, context=ctx) as resp:
                    payload = resp.read().decode("utf-8")

            if raw_sink and isinstance(payload, str):
                try:
                    raw_sink(payload)
                except Exception:
                    # The sink is an observer only; never fail the request.
                    pass
            if not payload:
                return {}
            return json.loads(payload)
        except (HTTPError, URLError) as exc:
            print(exc)
            cert_failure = "CERTIFICATE_VERIFY_FAILED" in str(exc)
            if cert_failure and not insecure:
                # Retry immediately without certificate verification; this
                # still consumes one attempt.
                insecure = True
                attempt += 1
                continue
            if attempt == retries:
                return {}
            # Exponential backoff before the next attempt.
            time.sleep(delay)
            delay *= 2
            attempt += 1
    return {}
def search_company_keyword(
    keyword: str,
    page: int = 1,
    size: int = 20,
    job_area: str = "020000",
    sort_type: str = "0",
    search_type: str = "2",
    scene: str = "12",
    uuid: Optional[str] = None,
    account_id: Optional[str] = None,
    user_token: Optional[str] = None,
    partner: Optional[str] = None,
    property_obj: Optional[Dict[str, Any]] = None,
    raw_sink: Optional[Callable[[str], None]] = None,
) -> Dict[str, Any]:
    """Query ``open/noauth/search`` with a keyword via a signed GET request.

    Args:
        keyword (str): Keyword to search for.
        page (int): Page number.
        size (int): Page size.
        job_area (str): Area code.
        sort_type (str): Sort type.
        search_type (str): Search type.
        scene (str): Scene id.
        uuid/account_id/user_token/partner/property_obj: Header context.
        raw_sink (Optional[Callable[[str], None]]): Receives the raw response
            text.

    Returns:
        Dict[str, Any]: The API response JSON as returned by ``_request``.
    """
    unset = ""
    # Key order is significant: the encoded query is part of the signed string.
    query: Dict[str, Any] = {
        "userLonLat": unset,
        "sortType": sort_type,
        "keyword": keyword,
        "pageSize": str(size),
        "pageNum": str(page),
        "jobArea": job_area,
        "landmark": unset,
        "radius": unset,
        "workYear": unset,
        "degree": unset,
        "companyType": unset,
        "companySize": unset,
        "salary": "NaN-NaN",
        "jobType": unset,
        "metro": unset,
        "function": unset,
        "industry": unset,
        "issueDate": unset,
        "searchType": search_type,
        "scene": scene,
    }
    response = _request(
        method="GET",
        path="open/noauth/search",
        params=query,
        uuid=uuid,
        account_id=account_id,
        user_token=user_token,
        partner=partner,
        property_obj=property_obj,
        raw_sink=raw_sink,
    )
    return response
def company_jobs_by_id(
    co_id: str,
    page: int = 1,
    size: int = 20,
    uuid: Optional[str] = None,
    account_id: Optional[str] = None,
    user_token: Optional[str] = None,
    partner: Optional[str] = None,
    property_obj: Optional[Dict[str, Any]] = None,
    raw_sink: Optional[Callable[[str], None]] = None,
) -> Dict[str, Any]:
    """Fetch a company's jobs by company id via a signed POST request.

    Args:
        co_id (str): Company id.
        page (int): Page number.
        size (int): Page size.
        uuid/account_id/user_token/partner/property_obj: Header context.
        raw_sink (Optional[Callable[[str], None]]): Receives the raw response
            text.

    Returns:
        Dict[str, Any]: The API response JSON as returned by ``_request``.
    """
    # Key order is significant: the serialized body is part of the signed string.
    payload: Dict[str, Any] = dict(
        pageNum=page,
        pageSize=size,
        coId=co_id,
        scene=14,
        requestId="",
    )
    response = _request(
        method="POST",
        path="open/noauth/jobs/company",
        body=payload,
        uuid=uuid,
        account_id=account_id,
        user_token=user_token,
        partner=partner,
        property_obj=property_obj,
        raw_sink=raw_sink,
    )
    return response
def _extract_items(resp: Dict[str, Any]) -> list:
"""Extract the first list of items from a nested response.
This function searches common keys first, then falls back to a recursive
traversal to find the first list encountered. It is resilient to schema
variations of the API response.
Args:
resp (Dict[str, Any]): Parsed JSON response.
Returns:
list: The extracted items list; empty when not found or no data.
"""
if not isinstance(resp, dict):
return []
# Prefer job items under resultbody/job/items
rb = resp.get("resultbody") or resp.get("resultBody")
if isinstance(rb, dict):
job_node = rb.get("job")
if isinstance(job_node, dict) and isinstance(job_node.get("items"), list):
return job_node.get("items", [])
preferred_keys = (
"items",
"list",
"jobs",
"jobList",
"companies",
"companyList",
"resultList",
"dataList",
)
for key in preferred_keys:
val = resp.get(key)
if isinstance(val, list):
return val
def _walk(node: Any) -> Optional[list]:
if isinstance(node, list):
return node
if isinstance(node, dict):
for k in preferred_keys:
v = node.get(k)
if isinstance(v, list):
return v
for v in node.values():
found = _walk(v)
if isinstance(found, list):
return found
return None
found = _walk(resp)
return found or []
def _get_local_ip() -> str:
"""Get local IP address for forwarding header.
Returns:
str: Local IP string.
"""
try:
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
s.connect(("8.8.8.8", 80))
ip = s.getsockname()[0]
s.close()
return ip
except Exception:
return "127.0.0.1"
def _get_ssl_context(insecure: bool = False) -> ssl.SSLContext:
"""Return SSL context, optionally unverified.
Args:
insecure (bool): Whether to disable certificate verification.
Returns:
ssl.SSLContext: Configured SSL context.
"""
if insecure:
try:
return ssl._create_unverified_context()
except Exception:
ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
return ctx
try:
return ssl.create_default_context()
except Exception:
return ssl._create_unverified_context()
def _get_db_path() -> str:
"""Return default SQLite DB path for storing raw responses.
Returns:
str: Absolute file path to SQLite database.
"""
base_dir = os.path.dirname(__file__)
return os.path.join(base_dir, "qcwy_raw.sqlite3")
def _init_db(db_path: str) -> None:
"""Initialize SQLite database with responses table if absent.
Args:
db_path (str): Path to SQLite database file.
Returns:
None
"""
try:
con = sqlite3.connect(db_path)
cur = con.cursor()
cur.execute(
"""
CREATE TABLE IF NOT EXISTS responses (
id INTEGER PRIMARY KEY AUTOINCREMENT,
keyword TEXT NOT NULL,
page INTEGER NOT NULL,
created_at INTEGER NOT NULL,
payload TEXT NOT NULL
)
"""
)
cur.execute(
"""
CREATE UNIQUE INDEX IF NOT EXISTS idx_responses_keyword_page
ON responses(keyword, page)
"""
)
con.commit()
con.close()
except Exception:
pass
def _save_raw_response(db_path: str, keyword: str, page: int, raw_payload: str) -> None:
"""Persist raw HTTP response payload into SQLite without modification.
Args:
db_path (str): Path to SQLite database file.
keyword (str): Search keyword.
page (int): Page number for the response.
raw_payload (str): Raw JSON text as received.
Returns:
None
"""
try:
con = sqlite3.connect(db_path)
cur = con.cursor()
cur.execute(
"INSERT OR IGNORE INTO responses(keyword, page, created_at, payload) VALUES(?, ?, ?, ?)",
(keyword, int(page), int(time.time()), raw_payload),
)
con.commit()
con.close()
except Exception:
pass
def _has_page_record(db_path: str, keyword: str, page: int) -> bool:
"""Check if the given keyword+page already exists in SQLite."""
try:
con = sqlite3.connect(db_path)
cur = con.cursor()
cur.execute("SELECT 1 FROM responses WHERE keyword=? AND page=? LIMIT 1", (keyword, int(page)))
row = cur.fetchone()
con.close()
return row is not None
except Exception:
return False
def _make_item_key(it: Any) -> str:
"""Return a stable deduplication key for an item."""
try:
if isinstance(it, dict):
for k in ("jobId", "id", "job_id", "positionId"):
v = it.get(k)
if v is not None:
return f"id:{v}"
return "hash:" + json.dumps(it, ensure_ascii=False, sort_keys=True)
return "val:" + str(it)
except Exception:
return "val:" + str(it)
def _sleep_between_requests(min_seconds: float = 1.0, max_seconds: float = 3.0) -> None:
"""Sleep for a random duration between min_seconds and max_seconds.
Args:
min_seconds (float): Minimum seconds to sleep.
max_seconds (float): Maximum seconds to sleep.
Returns:
None
"""
try:
dur = random.uniform(min_seconds, max_seconds)
time.sleep(dur)
except Exception:
time.sleep(min_seconds)
def _record_company_success(company_name: Optional[str]) -> None:
    """Append a company name to the success log, at most once per process.

    Names are stripped. Empty names and names already present in
    ``_SUCCESS_WRITTEN`` are ignored. Any I/O error is swallowed.

    Args:
        company_name (Optional[str]): Company name string.
    Returns:
        None
    """
    try:
        cleaned = (company_name or "").strip()
        if cleaned and cleaned not in _SUCCESS_WRITTEN:
            with open(SUCCESS_LOG_PATH, "a", encoding="utf-8") as log:
                log.write(f"{cleaned}\n")
            _SUCCESS_WRITTEN.add(cleaned)
    except Exception:
        pass
def _extract_company_name(info: Dict[str, Any]) -> Optional[str]:
"""Extract company name from company info payload across common keys.
Args:
info (Dict[str, Any]): Company info dict.
Returns:
Optional[str]: Company name when found.
"""
keys = (
"coname",
"coName",
"fullCompanyName",
"companyName",
"fullname",
"name",
)
for k in keys:
v = info.get(k)
if isinstance(v, str) and v.strip():
return v.strip()
for parent in ("basicinfo", "basicInfo"):
node = info.get(parent)
if isinstance(node, dict):
for k in keys:
v = node.get(k)
if isinstance(v, str) and v.strip():
return v.strip()
return None
def _report_universal(items: list, data_type: str = "job") -> bool:
    """Placeholder for reporting items to a universal batch-store endpoint.

    As written, this function sends nothing: both arguments are ignored
    and the result is always False.

    Args:
        items (list): Data list that would be sent (currently unused).
        data_type (str): Logical data type label (currently unused).
    Returns:
        bool: Always False.
    """
    return False
def _extract_total_count(resp: Dict[str, Any]) -> Optional[int]:
"""直接从响应中读取 job.totalCount 字段。
Args:
resp (Dict[str, Any]): 解析后的响应 JSON
Returns:
Optional[int]: 总数若不存在则为 None
"""
if not isinstance(resp, dict):
return None
def _direct_get(path: tuple) -> Optional[int]:
node: Any = resp
for key in path:
if not isinstance(node, dict):
return None
node = node.get(key)
if isinstance(node, dict):
tc = node.get("totalCount")
if isinstance(tc, int):
return tc
if isinstance(tc, str):
s = tc.strip()
if s.isdigit():
return int(s)
return None
for p in (
("resultbody", "job"),
("resultBody", "job"),
("job",),
("jobs",),
("result", "job"),
("data", "job"),
("payload", "job"),
):
v = _direct_get(p)
if isinstance(v, int):
return v
return None
def paginate_search_company_keyword(
    keyword: str,
    size: int = 20,
    job_area: str = "020000",
    sort_type: str = "0",
    search_type: str = "2",
    scene: str = "12",
    start_page: int = 1,
    max_pages: Optional[int] = None,
    delay: float = 0.2,
    verbose: bool = False,
    db_path: Optional[str] = None,
) -> list:
    """Page through a company keyword search until it stops yielding data.

    Pages already recorded in SQLite for this keyword are skipped, but
    still count towards ``max_pages``. Each fetched page is persisted raw
    through a sink callback and its items are de-duplicated across pages.
    The loop ends when a page adds no new item, when the reported
    ``totalCount`` is reached, or when ``max_pages`` pages were processed.

    Args:
        keyword (str): Keyword to search.
        size (int): Page size per request.
        job_area (str): Area code.
        sort_type (str): Sort type.
        search_type (str): Search type.
        scene (str): Scene id.
        start_page (int): Starting page number.
        max_pages (Optional[int]): Maximum pages to process; None for unlimited.
        delay (float): Minimum sleep between pages; the maximum is twice
            this value. Non-positive values fall back to 0.2.
        verbose (bool): Whether to print per-page stats as JSON lines.
        db_path (Optional[str]): SQLite file for raw responses; defaults
            to ``_get_db_path()``.
    Returns:
        list: Aggregated, de-duplicated items across pages.
    """
    db_path = db_path or _get_db_path()
    _init_db(db_path)

    def _log(payload: Dict[str, Any]) -> None:
        if verbose:
            print(json.dumps(payload, ensure_ascii=False))

    def _pause() -> None:
        low = delay if delay > 0 else 0.2
        _sleep_between_requests(low, low * 2)

    collected: list = []
    known_keys: set = set()
    page = start_page
    pages_done = 0
    expected_total: Optional[int] = None

    while max_pages is None or pages_done < max_pages:
        _log({"fetching_page": page})

        # Pages already stored are not requested again.
        if _has_page_record(db_path, keyword, page):
            _log({"page": page, "skipped": True})
            page += 1
            pages_done += 1
            _pause()
            continue

        def _sink(raw: str) -> None:
            _save_raw_response(db_path, keyword, page, raw)

        resp = search_company_keyword(
            keyword=keyword,
            page=page,
            size=size,
            job_area=job_area,
            sort_type=sort_type,
            search_type=search_type,
            scene=scene,
            raw_sink=_sink,
        )
        _log({"page": page, "saved": True})

        if expected_total is None:
            expected_total = _extract_total_count(resp)
            if expected_total is not None:
                _log({"totalCount": expected_total})

        page_items = _extract_items(resp)
        fresh: list = []
        for entry in page_items:
            entry_key = _make_item_key(entry)
            if entry_key not in known_keys:
                known_keys.add(entry_key)
                fresh.append(entry)
        _log({"page": page, "items_on_page": len(page_items), "unique_added": len(fresh)})

        if not fresh:
            break
        collected.extend(fresh)
        if expected_total is not None and len(collected) >= expected_total:
            break

        page += 1
        pages_done += 1
        _pause()

    return collected
# Default arguments for the pagination demo driven by ``main``.
# ``keyword`` is only used as the fallback when no keyword file is available.
CONFIG: Dict[str, Any] = dict(
    keyword="字节跳动",
    size=20,
    job_area="020000",
    sort_type="0",
    search_type="2",
    scene="12",
    start_page=1,
    max_pages=None,
    delay=0.2,
    verbose=False,
    db_path=None,
)
def main(keyword: str) -> None:
    """Run the paginated keyword search with ``CONFIG`` and print the total.

    Args:
        keyword (str): Keyword to search; replaces ``CONFIG["keyword"]``.
    Returns:
        None
    """
    option_names = (
        "size",
        "job_area",
        "sort_type",
        "search_type",
        "scene",
        "start_page",
        "max_pages",
        "delay",
        "verbose",
        "db_path",
    )
    options = {name: CONFIG[name] for name in option_names}
    found = paginate_search_company_keyword(keyword=keyword, **options)
    print(json.dumps({"total_items": len(found)}, ensure_ascii=False))
def get_company_info(company_id: str) -> Dict[str, Any]:
    """Fetch company details, serving repeat lookups from the cache.

    A non-empty dict cached in ``COMPANY_INFO_CACHE`` is returned directly.
    Otherwise ``open/noauth/company-info/info-data`` is requested. On a
    response with status 1 and a non-empty ``resultbody`` dict, that dict
    is cached, the company name is logged as a success, and the dict is
    returned.

    Args:
        company_id (str): Company identifier string.
    Returns:
        Dict[str, Any]: Company information; ``{}`` for an empty id, a
        failed request, or an unusable response.
    """
    if not company_id:
        return {}

    remembered = COMPANY_INFO_CACHE.get(company_id)
    if isinstance(remembered, dict) and remembered:
        return remembered

    query = {
        "companyId": company_id,
        "colorOne": "#ffffff",
        "colorTwo": "#ffffffcc",
    }
    tracking = {
        "frompageUrl": "",
        "pageUrl": "",
        "isLogin": "",
        "accountid": "",
        "resumeId": "",
        "firstFrompageUrl": "",
        "distinct_id": str(_uuid.uuid4()),
        "pageCode": "companyDetail|company|companyinfo",
        "shortPageCode": "companyDetail|company|companyinfo",
    }
    try:
        resp = _request(
            method="GET",
            path="open/noauth/company-info/info-data",
            params=query,
            property_obj=tracking,
        )
    except Exception:
        resp = {}

    if not resp or resp.get("status") not in (1, "1"):
        return {}

    info = resp.get("resultbody", {})
    if not (isinstance(info, dict) and info):
        return {}

    COMPANY_INFO_CACHE[company_id] = info
    _record_company_success(_extract_company_name(info))
    return info
def _enrich_items_with_company_info(resp: Dict[str, Any]) -> list:
    """Attach company info to the job items of a search response.

    Items are read from ``resultbody.job.items``. Each item is copied
    (non-dict items are wrapped as ``{"_value": item}``) and, when a
    company id is present and ``get_company_info`` returns data, extended
    with ``company_info``, ``company_desc``, ``companyHref`` and
    ``jobHref``.

    Args:
        resp (Dict[str, Any]): Parsed JSON response.
    Returns:
        list: Copies of the items, with company fields when available.
    """
    # NOTE(review): the nesting below was reconstructed from a listing
    # without indentation. Confirm that the jobHref assignment belongs
    # under ``if info`` and the sleep under ``if co_id``.
    items = resp.get("resultbody", {}).get("job", {}).get("items", [])
    enriched = []
    for it in items:
        # Work on a copy so the response payload itself is not mutated.
        target = dict(it) if isinstance(it, dict) else {"_value": it}
        co_id = target.get("coId") or target.get("companyId")
        job_id = target.get("jobId")
        city_pinyin = target.get("hrefAreaPinYin")
        if co_id:
            info = get_company_info(str(co_id))
            if info:
                target["company_info"] = info
                target["company_desc"] = (info.get("coinfo", {}) or {}).get("coinfo")
                target["companyHref"] = (info.get("share", {}) or {}).get("weixinshareurl")
                # city_pinyin / job_id may be None here; they are
                # interpolated into the URL as-is.
                target["jobHref"] = f"https://jobs.51job.com/{city_pinyin}/{job_id}.html"
                nm = _extract_company_name(info) or target.get("fullCompanyName") or target.get("companyName")
                _record_company_success(nm)
            # Throttle between company lookups (default 1-3 s).
            _sleep_between_requests()
        enriched.append(target)
    return enriched
def _load_keywords(path: str) -> list:
"""Load keywords from a UTF-8 text file, one per line.
Args:
path (str): File path.
Returns:
list: Non-empty trimmed lines.
"""
try:
with open(path, "r", encoding="utf-8") as f:
lines = [ln.strip() for ln in f.readlines()]
return [ln for ln in lines if ln]
except Exception:
return []
def _progress_iter(seq: list, desc: str = "", total: Optional[int] = None):
"""Iterate with a simple console progress bar.
Args:
seq (list): Items to iterate.
desc (str): Progress description.
total (Optional[int]): Total count for percentage.
Yields:
Any: Items from seq.
"""
n = 0
m = total if total is not None else len(seq)
bar_len = 24
for item in seq:
n += 1
filled = int(bar_len * n / m) if m else 0
bar = "#" * filled + "-" * (bar_len - filled)
pct = int(100 * n / m) if m else 100
print(f"\r{desc} [{bar}] {n}/{m} {pct}%", end="", flush=True)
yield item
print("", flush=True)
if __name__ == "__main__":
    # Keywords come from company.txt next to this script; when that file
    # is missing or empty, the single CONFIG keyword is used instead.
    keywords_file = os.path.join(os.path.dirname(__file__), "company.txt")
    todo = _load_keywords(keywords_file) or [CONFIG.get("keyword")]
    for keyword in _progress_iter(todo, desc="Keywords", total=len(todo)):
        print(keyword)
        main(keyword)
        _sleep_between_requests()

File diff suppressed because it is too large Load Diff

View File

@ -1,54 +0,0 @@
#!/usr/bin/env bash
# Launcher for the three crawlers (boss, qcwy, zhilian).
# Installs system and Python dependencies, then starts each crawler either
# in its own window of a shared tmux session or, without tmux, via nohup.
set -euo pipefail
# Run everything relative to the directory containing this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"
# System packages are only installed on apt-based hosts; sudo is used when available.
if command -v apt-get >/dev/null; then
if command -v sudo >/dev/null; then
sudo apt-get update -y
sudo apt-get install -y unzip python3 python3-pip tmux
else
apt-get update -y
apt-get install -y unzip python3 python3-pip tmux
fi
fi
# Python dependencies are installed into the system interpreter.
python3 -m pip install -U requests loguru httpx fake-useragent PySocks --break-system-packages
# Prefer tmux when it is installed.
use_tmux=0
if command -v tmux >/dev/null; then
use_tmux=1
fi
tmux_session="jobs_spider"
# Create the shared detached session once.
if [ "$use_tmux" -eq 1 ]; then
if ! tmux has-session -t "$tmux_session" 2>/dev/null; then
tmux new-session -d -s "$tmux_session" -c "$SCRIPT_DIR"
fi
fi
# start_one NAME DIR ENTRY
#   NAME  - label, also used as the tmux window name
#   DIR   - working directory of the crawler
#   ENTRY - Python entry script, relative to DIR
# Does nothing when a process matching ENTRY is already running.
start_one() {
name="$1"
dir="$2"
entry="$3"
# NOTE: this cd changes the working directory of the whole script, not just this call.
cd "$dir"
mkdir -p logs
if pgrep -f "$entry" >/dev/null; then
echo "$name 已在运行"
return 0
fi
if [ "$use_tmux" -eq 1 ]; then
# Reuse an existing window with this name by respawning it; fall back to recreating it.
if tmux list-windows -t "$tmux_session" 2>/dev/null | awk -F: '{print $2}' | awk '{print $1}' | grep -qx "$name"; then
if tmux respawn-window -t "$tmux_session:$name" -k -c "$dir" "python3 $entry >> logs/runner.log 2>&1"; then
:
else
tmux kill-window -t "$tmux_session:$name" 2>/dev/null || true
tmux new-window -t "$tmux_session" -n "$name" -c "$dir" "python3 $entry >> logs/runner.log 2>&1"
fi
else
tmux new-window -t "$tmux_session" -n "$name" -c "$dir" "python3 $entry >> logs/runner.log 2>&1"
fi
echo "$name tmux: $tmux_session:$name"
else
# No tmux: detach with nohup and report the background PID.
nohup python3 "$entry" >> logs/runner.log 2>&1 &
echo "$name PID: $!"
fi
}
start_one "boss" "$SCRIPT_DIR/boss" "boos_api.py"
start_one "qcwy" "$SCRIPT_DIR/qcwy" "qcwy.py"
start_one "zhilian" "$SCRIPT_DIR/zhilian" "zhilian_single.py"

View File

@ -1,38 +0,0 @@
# Use the Python 3.9 slim image as the base
FROM python:3.9-slim
# Set the working directory
WORKDIR /app
# Python / pip runtime settings
ENV PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1 \
PIP_NO_CACHE_DIR=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1
# Application settings (can be overridden with `docker run -e`)
ENV API_BASE_URL=http://124.222.245.240:9999 \
MONGODB_URI=mongodb://localhost:27017 \
MONGODB_DB=job_data \
MAX_PAGES=3 \
PAGE_SIZE=15 \
MIN_WAIT_TIME=10 \
MAX_WAIT_TIME=30 \
ERROR_WAIT_MIN=30 \
ERROR_WAIT_MAX=60
# Copy requirements.txt and install the Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy the project files
COPY boos_api.py ./
COPY city.json ./
COPY work.json ./
# Create a non-root user and run as that user
RUN useradd -m -u 1000 crawler && chown -R crawler:crawler /app
USER crawler
# Start command
CMD ["python", "boos_api.py"]

View File

@ -1,855 +0,0 @@
import os
import time
import json
import pprint
import random
import uuid
import hashlib
from typing import Any, Dict, Optional, Callable
try:
import requests
except Exception:
requests = None
import ssl
from urllib.request import Request, urlopen, ProxyHandler, build_opener, HTTPSHandler
from urllib.parse import urlencode
import sqlite3
SUCCESS_LOG_PATH = os.path.join(os.path.dirname(__file__), "success.txt")
_SUCCESS_WRITTEN: set = set()
try:
import httpx
except Exception:
httpx = None
API_BASE_URL = os.getenv("API_BASE_URL", "http://127.0.0.1:9999")
def _build_proxy() -> Optional[Dict[str, str]]:
"""构造代理配置字典requests/httpx/urllib 兼容)。
从环境变量读取
- ZP_PROXY_URL: 完整代理URL http://user:pass@host:port
或组合
- ZP_PROXY_USERNAME, ZP_PROXY_PASSWORD, ZP_PROXY_TUNNEL
Returns:
Optional[Dict[str, str]]: {'http': url, 'https': url} None
"""
url ="http://t13319619426654:ln8aj9nl@s432.kdltps.com:15818"
return {"http": url, "https": url}
def _get_user_agent(mobile: bool = True) -> str:
try:
from fake_useragent import UserAgent
ua = UserAgent(platforms=['mobile'] if mobile else None)
return ua.random
except Exception:
if mobile:
return "Mozilla/5.0 (Linux; Android 10; Mobile) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Mobile Safari/537.36"
return "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36"
def gen_page_request_id() -> str:
    """Return ``<fixed hex prefix>-<epoch millis>-<random 0..999999>``."""
    millis = int(time.time()*1000)
    nonce = random.randint(0,999999)
    return f"cf1e3b3e655b4eb5a306110a83c77c29-{millis}-{nonce}"
def gen_client_id() -> str:
    """Generate a random UUID-v4-shaped id.

    Follows the template ``xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx``: each
    ``x`` becomes a random hex digit and ``y`` becomes one of ``8 9 a b``.
    """
    seed = int(time.time() * 1000)
    try:
        seed += int(time.perf_counter() * 1000)
    except Exception:
        pass

    def _digit(slot: str) -> str:
        nibble = int((seed + random.random() * 16) % 16)
        if slot != 'x':
            # 'y' slot: force the two high bits to 10
            nibble = (nibble & 0x3) | 0x8
        return format(nibble, "x")

    pieces = []
    for ch in "xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx":
        pieces.append(_digit(ch) if ch in 'xy' else ch)
    return ''.join(pieces)
def gen_v() -> float:
    """Return a random float in [0, 1], rounded to 8 decimal places."""
    sample = random.random()
    return round(sample, 8)
def build_headers_miniapp(user_agent: str) -> Dict[str, str]:
    """Build the header set sent for WeChat mini-program style requests.

    ``x-zp-rt`` is the MD5 hex digest of a fresh uuid4 plus the current
    time. ``x-zp-device-id`` is a fresh upper-cased uuid4. All other
    values are constants.
    """
    request_token = hashlib.md5(f"{uuid.uuid4()}-{time.time()}".encode("utf-8")).hexdigest()
    device_id = str(uuid.uuid4()).upper()
    headers: Dict[str, str] = {'User-Agent': user_agent}
    headers['x-zp-page-code'] = "7020"
    headers['x-zp-rt'] = request_token
    headers['x-zp-device-id'] = device_id
    headers.update({
        'content-type': "application/json",
        'x-zp-version': "0.0.0",
        'x-zp-business-system': "73",
        'x-zp-action-id': "",
        'xweb_xhr': "1",
        'x-zp-channel': "wxxiaochengxu",
        'x-zp-platform': "12",
        'sec-fetch-site': "cross-site",
        'sec-fetch-mode': "cors",
        'sec-fetch-dest': "empty",
        'referer': "https://servicewechat.com/wxb7718fb9257e4fd2/529/page-frame.html",
        'accept-language': "zh-CN,zh;q=0.9",
    })
    return headers
def _get_db_path() -> str:
"""返回默认 SQLite 数据库文件路径。"""
base_dir = os.path.dirname(__file__)
return os.path.join(base_dir, "zhilian_raw.sqlite3")
def _init_db(db_path: str) -> None:
"""初始化 SQLite 数据库并创建表。"""
try:
con = sqlite3.connect(db_path)
cur = con.cursor()
cur.execute(
"""
CREATE TABLE IF NOT EXISTS responses (
id INTEGER PRIMARY KEY AUTOINCREMENT,
sou_full_index TEXT,
page INTEGER,
created_at INTEGER,
payload TEXT
)
"""
)
cur.execute(
"""
CREATE TABLE IF NOT EXISTS company_details (
id INTEGER PRIMARY KEY AUTOINCREMENT,
number TEXT,
created_at INTEGER,
payload TEXT
)
"""
)
cur.execute(
"""
CREATE UNIQUE INDEX IF NOT EXISTS idx_responses_sou_page
ON responses(sou_full_index, page)
"""
)
cur.execute(
"""
CREATE UNIQUE INDEX IF NOT EXISTS idx_company_details_number
ON company_details(number)
"""
)
con.commit()
con.close()
except Exception:
pass
def _save_search_response(db_path: str, sou_full_index: str, page: int, raw_payload: str) -> None:
"""保存职位搜索的原始响应。"""
try:
con = sqlite3.connect(db_path)
cur = con.cursor()
cur.execute(
"INSERT OR IGNORE INTO responses(sou_full_index, page, created_at, payload) VALUES(?, ?, ?, ?)",
(sou_full_index, int(page), int(time.time()), raw_payload),
)
con.commit()
con.close()
except Exception:
pass
def _save_company_detail(db_path: str, number: str, raw_payload: str) -> None:
"""保存公司详情的原始响应。"""
try:
con = sqlite3.connect(db_path)
cur = con.cursor()
cur.execute(
"INSERT OR IGNORE INTO company_details(number, created_at, payload) VALUES(?, ?, ?)",
(number, int(time.time()), raw_payload),
)
con.commit()
con.close()
except Exception:
pass
def _has_company_detail(db_path: str, number: str) -> bool:
"""检查公司详情是否已存在(按职位编号 number"""
try:
con = sqlite3.connect(db_path)
cur = con.cursor()
cur.execute("SELECT 1 FROM company_details WHERE number=? LIMIT 1", (number,))
row = cur.fetchone()
con.close()
return row is not None
except Exception:
return False
def _has_page_record(db_path: str, sou_full_index: str, page: int) -> bool:
"""检查指定关键词与页码是否已经存在。"""
try:
con = sqlite3.connect(db_path)
cur = con.cursor()
cur.execute("SELECT 1 FROM responses WHERE sou_full_index=? AND page=? LIMIT 1", (sou_full_index, int(page)))
row = cur.fetchone()
con.close()
return row is not None
except Exception:
return False
def _sleep_between_requests(min_seconds: float = 0.3, max_seconds: float = 0.8) -> None:
"""在请求间进行随机休眠。"""
try:
dur = random.uniform(min_seconds, max_seconds)
time.sleep(dur)
except Exception:
time.sleep(min_seconds)
def _has_keyword_record(db_path: str, sou_full_index: str) -> bool:
"""检查指定关键词是否已有任意页记录。"""
try:
con = sqlite3.connect(db_path)
cur = con.cursor()
cur.execute("SELECT 1 FROM responses WHERE sou_full_index=? LIMIT 1", (sou_full_index,))
row = cur.fetchone()
con.close()
return row is not None
except Exception:
return False
def _request_json(method: str, url: str, headers: Dict[str, str], params: Optional[Dict[str, Any]] = None,
                  json_body: Optional[Dict[str, Any]] = None, timeout: int = 30, max_retries: int = 3,
                  proxies: Optional[Dict[str, str]] = None, raw_sink: Optional[Callable[[str], None]] = None) -> Optional[Dict[str, Any]]:
    """Send an HTTP request and return the decoded JSON body, with retries.

    The backend is chosen per attempt: ``httpx`` when importable, else
    ``requests``, else ``urllib``. Every attempt starts with a random
    0.8-2.5 s sleep. Any exception triggers a retry after a linearly
    growing pause.

    Args:
        method: HTTP method; anything other than GET is sent as POST by
            the httpx and urllib backends.
        url: Target URL.
        headers: Request headers.
        params: Query parameters.
        json_body: JSON body for non-GET requests.
        timeout: Timeout passed to the backend.
        max_retries: Number of attempts before giving up.
        proxies: ``{'http': ..., 'https': ...}`` style proxy mapping.
        raw_sink: Optional callback receiving the raw response text; its
            own exceptions are ignored.
    Returns:
        The decoded JSON, or None when every attempt failed.
    """
    # NOTE(review): nesting was reconstructed from a listing without
    # indentation; confirm that the 405 fallback belongs to the POST branch.
    for attempt in range(max_retries):
        try:
            # Random pause before every attempt, including the first.
            time.sleep(random.uniform(0.8, 2.5))
            if httpx is not None:
                use_http2 = os.getenv("ZP_HTTP2", "1") == "1"
                debug = os.getenv("ZP_DEBUG", "0") == "1"
                kwargs: Dict[str, Any] = {"http2": use_http2, "timeout": timeout, "headers": headers, "trust_env": False}
                if proxies:
                    px = proxies.get("https") or proxies.get("http")
                    if px:
                        # NOTE(review): newer httpx releases appear to have
                        # replaced ``proxies=`` with ``proxy=`` - confirm
                        # against the installed version.
                        kwargs["proxies"] = px
                with httpx.Client(**kwargs) as client:
                    method_u = method.upper()
                    if method_u == "GET":
                        resp = client.get(url, params=params)
                    else:
                        resp = client.post(url, json=json_body)
                        if debug:
                            print({"_request_json": {"method": "POST", "status": resp.status_code}})
                        if resp.status_code == 405:
                            # POST rejected: retry as GET with the body
                            # fields stringified into the query string.
                            merged = params or {}
                            if json_body:
                                merged = {**merged, **{k: str(v) for k, v in json_body.items()}}
                            resp = client.get(url, params=merged)
                            if debug:
                                print({"_request_json": {"fallback": "GET", "status": resp.status_code}})
                    if raw_sink:
                        try:
                            raw_sink(resp.text)
                        except Exception:
                            pass
                    try:
                        return resp.json()
                    except ValueError:
                        return json.loads(resp.text)
            else:
                if requests:
                    resp = requests.request(
                        method.upper(), url,
                        headers=headers, params=params, json=json_body,
                        timeout=timeout, proxies=proxies
                    )
                    # Unlike the httpx path, HTTP error statuses raise here
                    # and therefore trigger a retry.
                    resp.raise_for_status()
                    if raw_sink:
                        try:
                            raw_sink(resp.text)
                        except Exception:
                            pass
                    return resp.json()
                # Last resort: standard-library urllib.
                if method.upper() == 'GET':
                    full_url = url
                    if params:
                        qs = urlencode(params)
                        full_url = f"{url}?{qs}"
                    req = Request(full_url, headers=headers, method='GET')
                else:
                    data_bytes = json.dumps(json_body or {}).encode('utf-8')
                    req = Request(url, headers=headers, data=data_bytes, method='POST')
                ctx = ssl.create_default_context()
                opener = None
                if proxies and isinstance(proxies, dict) and (proxies.get("http") or proxies.get("https")):
                    try:
                        ph = ProxyHandler(proxies)
                        opener = build_opener(ph, HTTPSHandler(context=ctx))
                    except Exception:
                        opener = None
                if opener:
                    with opener.open(req, timeout=timeout) as r:
                        raw = r.read()
                else:
                    with urlopen(req, context=ctx, timeout=timeout) as r:
                        raw = r.read()
                if raw_sink:
                    try:
                        raw_sink(raw.decode("utf-8"))
                    except Exception:
                        pass
                return json.loads(raw)
        except Exception:
            if attempt == max_retries - 1:
                return None
            # Linear back-off: 1.2 s, 2.4 s, ...
            time.sleep(1.2 * (attempt + 1))
    return None
def fetch_company_desc_by_job(number: str, db_path: Optional[str] = None) -> Optional[str]:
    """Fetch the company description belonging to a job number.

    The PC endpoint (``position-detail-new``) is tried first, then the
    mini-program endpoint (``companyDetail``). When ``db_path`` is given,
    raw responses are stored through ``_save_company_detail`` and numbers
    already stored are not requested again.

    Args:
        number: Job number; must be a non-blank string.
        db_path: Optional SQLite path for dedup and raw storage.
    Returns:
        The description text, or None when the number is invalid, already
        stored, or neither endpoint returns a description.
    """
    if not isinstance(number, str) or not number.strip():
        return None
    if db_path and _has_company_detail(db_path, number):
        return None

    def _persist(raw: str) -> None:
        # Shared sink for both endpoints.
        if db_path:
            _save_company_detail(db_path, number, raw)

    # PC web endpoint.
    client_id = gen_client_id()
    pc_query = {
        "number": number,
        "_v": gen_v(),
        "x-zp-page-request-id": gen_page_request_id(),
        "x-zp-client-id": client_id,
    }
    pc_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "identity",
        "sec-ch-ua-platform": "macOS",
        "x-zp-business-system": "1",
        "x-zp-page-code": "4019",
        "sec-ch-ua": '"Not/A)Brand";v="8", "Chromium";v="136", "Google Chrome";v="136"',
        "sec-ch-ua-mobile": "?0",
        "x-zp-platform": "13",
        "origin": "https://www.zhaopin.com",
        "sec-fetch-site": "same-site",
        "sec-fetch-mode": "cors",
        "sec-fetch-dest": "empty",
        "referer": "https://www.zhaopin.com/",
        "accept-language": "zh-CN,zh;q=0.9",
        "priority": "u=1, i",
        "Cookie": f"x-zp-client-id={client_id}",
    }
    pc_data = _request_json(
        "GET",
        "https://fe-api.zhaopin.com/c/i/jobs/position-detail-new",
        pc_headers,
        params=pc_query,
        proxies=_build_proxy(),
        raw_sink=_persist,
    )
    if pc_data and isinstance(pc_data, dict):
        company = (pc_data.get("data") or {}).get("detailedCompany") or {}
        pc_desc = company.get("companyDescription")
        if isinstance(pc_desc, str) and pc_desc:
            return pc_desc

    # Mini-program endpoint as fallback.
    mobile_ua = _get_user_agent(True)
    mini_query = {
        "number": number,
        "platform": "12",
        "version": "0.0.0",
    }
    mini_headers = build_headers_miniapp(mobile_ua)
    mini_data = _request_json(
        "GET",
        "https://cgate.zhaopin.com/positionbusiness/exposure/companyDetail",
        mini_headers,
        params=mini_query,
        proxies=_build_proxy(),
        raw_sink=_persist,
    )
    if mini_data and isinstance(mini_data, dict):
        base = (mini_data.get("data") or {}).get("companyBase") or {}
        mini_desc = base.get("companyDescWithHtml")
        if isinstance(mini_desc, str) and mini_desc:
            return mini_desc
    return None
def build_headers(
    at: str,
    rt: str,
    device_id: str,
    channel: str = "miniapp",
    platform: str = "miniapp",
    version: str = "1.0.0",
    business_system: str = "zpfe-miniapp",
    page_code: Optional[str] = None,
    action_id: Optional[str] = None,
    user_agent: Optional[str] = None,
    referer: Optional[str] = None,
) -> Dict[str, str]:
    """Build the request headers.

    Args:
        at: Access token; sent as ``x-zp-at`` when non-empty.
        rt: Refresh/auxiliary token; sent as ``x-zp-rt`` when non-empty.
        device_id: Device identifier.
        channel: Channel identifier.
        platform: Platform identifier.
        version: Version string.
        business_system: Business-system identifier.
        page_code: Sent as ``x-zp-page-code`` when non-empty.
        action_id: Sent as ``x-zp-action-id`` unless None (an empty
            string is still sent).
        user_agent: User-Agent; defaults to a WeChat mini-program UA.
        referer: Referer; defaults to the servicewechat page-frame URL.

    Returns:
        The header dict: the common headers plus the optional ones above.
    """
    default_ua = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 "
        "MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF "
        "MacWechat/3.8.10(0x13080a10) XWEB/1227"
    )
    default_referer = "https://servicewechat.com/wxb7718fb9257e4fd2/602/page-frame.html"

    result: Dict[str, str] = {
        "accept": "*/*",
        "content-type": "application/json",
        "x-zp-version": version,
        "x-zp-channel": channel,
        "x-zp-platform": platform,
        "x-zp-device-id": device_id,
        "x-zp-business-system": business_system,
        "xweb_xhr": "1",
        "sec-fetch-site": "cross-site",
        "sec-fetch-mode": "cors",
        "sec-fetch-dest": "empty",
        "accept-language": "zh-CN,zh;q=0.9",
    }
    # Optional headers, added in the same order as before.
    for header_name, value in (("x-zp-at", at), ("x-zp-rt", rt), ("x-zp-page-code", page_code)):
        if value:
            result[header_name] = value
    if action_id is not None:
        result["x-zp-action-id"] = action_id
    result["User-Agent"] = user_agent or default_ua
    result["referer"] = referer if referer else default_referer
    return result
def base_url_for(path: str, env: str = "prod") -> str:
    """Pick the base domain for a relative API path.

    Args:
        path: Relative path; its prefix selects the host
            (``/capi``, ``/api``, ``/weex``, otherwise cgate).
        env: ``"pre"`` selects the pre-release host; any other value
            selects production.

    Returns:
        The base URL string (scheme and host, no trailing slash).
    """
    use_pre = env == "pre"
    # (path prefix, host name, suffix appended on the pre-release env)
    routes = (
        ("/capi", "capi", "pre"),
        ("/api", "m", "-pre"),
        ("/weex", "zhibo", "-pre"),
    )
    host, pre_suffix = "cgate", "-pre"
    for prefix, candidate, suffix in routes:
        if path.startswith(prefix):
            host, pre_suffix = candidate, suffix
            break
    return f"https://{host}{pre_suffix if use_pre else ''}.zhaopin.com"
def build_payload(
    page_index: int = 1,
    page_size: int = 10,
    city_id: Optional[int] = None,
    event_scenario: Optional[str] = None,
    sou_expand: Optional[str] = None,
    sou_full_index: Optional[str] = None,
    sort_type: Optional[str] = None,
    resume_number: Optional[str] = None,
    filter_min_salary: Optional[int] = None,
) -> Dict[str, Any]:
    """Build the request body for a job search.

    ``pageIndex`` and ``pageSize`` are always present. String options are
    included only when non-empty. ``filter_min_salary`` and ``city_id``
    are included whenever they are not None, so 0 is kept.
    """
    # (body key, value, include?) in the order the keys must appear.
    optional_fields = (
        ("eventScenario", event_scenario, bool(event_scenario)),
        ("filterMinSalary", filter_min_salary, filter_min_salary is not None),
        ("S_SOU_EXPAND", sou_expand, bool(sou_expand)),
        ("S_SOU_FULL_INDEX", sou_full_index, bool(sou_full_index)),
        ("S_SOU_WORK_CITY", city_id, city_id is not None),
        ("sortType", sort_type, bool(sort_type)),
        ("resumeNumber", resume_number, bool(resume_number)),
    )
    payload: Dict[str, Any] = {"pageIndex": page_index, "pageSize": page_size}
    for key, value, include in optional_fields:
        if include:
            payload[key] = value
    return payload
def call(
    page_index: int = 1,
    page_size: int = 10,
    city_id: Optional[int] = None,
    env: str = "prod",
    timeout: float = 10.0,
    sou_full_index: Optional[str] = None,
) -> Any:
    """Run one job-search request (POST, with a GET fallback on HTTP 405).

    Headers and most body fields come from ``ZP_*`` environment variables.
    ``ZP_BODY_JSON``, when set, replaces the generated body entirely.

    Args:
        page_index: Page number for the generated body.
        page_size: Page size for the generated body.
        city_id: Optional city id for the generated body.
        env: Environment passed to ``base_url_for``.
        timeout: Request timeout.
        sou_full_index: Search keyword; always written into the body when given.
    Returns:
        The decoded JSON when the response is JSON, otherwise the response
        text. When neither httpx nor requests is available, the dict
        ``{"error": "httpx not available"}``.
    """
    # NOTE(review): nesting was reconstructed from a listing without
    # indentation; confirm that the keyword override below sits after the
    # if/else rather than inside the else branch.
    path = "/positionbusiness/searchrecommend/searchPositions"
    base = base_url_for(path, env=env)
    url = f"{base}{path}"
    at = os.getenv("ZP_AT", "")
    rt = os.getenv("ZP_RT", "")
    device_id = os.getenv("ZP_DEVICE_ID", "")
    channel = os.getenv("ZP_CHANNEL", "wxxiaochengxu")
    platform = os.getenv("ZP_PLATFORM", "12")
    version = os.getenv("ZP_VERSION", "0.0.0")
    business_system = os.getenv("ZP_BUSINESS_SYSTEM", "73")
    page_code = os.getenv("ZP_PAGE_CODE", "7019")
    action_id = os.getenv("ZP_ACTION_ID", "")
    user_agent = os.getenv("ZP_USER_AGENT")
    referer = os.getenv("ZP_REFERER")
    headers = build_headers(
        at=at,
        rt=rt,
        device_id=device_id,
        channel=channel,
        platform=platform,
        version=version,
        business_system=business_system,
        page_code=page_code,
        action_id=action_id,
        user_agent=user_agent,
        referer=referer,
    )
    body_env = os.getenv("ZP_BODY_JSON")
    if body_env:
        # A full body supplied through the environment wins; invalid JSON
        # degrades to an empty body.
        try:
            body = json.loads(body_env)
        except Exception:
            body = {}
    else:
        body = build_payload(
            page_index=page_index,
            page_size=page_size,
            city_id=city_id,
            event_scenario=os.getenv("ZP_EVENT_SCENARIO", "wxmpZhaopinSearchV2"),
            sou_expand=os.getenv("ZP_SOU_EXPAND", "SOU_COMPANY_ID"),
            sou_full_index=sou_full_index or os.getenv("ZP_SOU_FULL_INDEX"),
            sort_type=os.getenv("ZP_SORT_TYPE", "DEFAULT"),
            resume_number=os.getenv("ZP_RESUME_NUMBER"),
            filter_min_salary=int(os.getenv("ZP_FILTER_MIN_SALARY", "1")),
        )
    # An explicit keyword overrides whatever the body already contains.
    if sou_full_index:
        body["S_SOU_FULL_INDEX"] = sou_full_index
    use_http2 = os.getenv("ZP_HTTP2", "1") == "1"
    proxies = _build_proxy()
    debug = os.getenv("ZP_DEBUG", "0") == "1"
    if httpx is not None:
        kwargs: Dict[str, Any] = {"http2": use_http2, "timeout": timeout, "headers": headers, "trust_env": False}
        if proxies:
            px = proxies.get("https") or proxies.get("http")
            if px:
                # NOTE(review): newer httpx releases appear to have
                # replaced ``proxies=`` with ``proxy=`` - confirm against
                # the installed version.
                kwargs["proxies"] = px
        with httpx.Client(**kwargs) as client:
            resp = client.post(url, json=body)
            if debug:
                print({"method": "POST", "status": resp.status_code})
            if resp.status_code == 405:
                # POST rejected: retry as GET with stringified body fields.
                params = {k: str(v) for k, v in body.items()}
                resp = client.get(url, params=params)
                if debug:
                    print({"fallback": "GET", "status": resp.status_code})
            # The raw text is published through the process environment
            # (ZP_LAST_RAW); failures to store it are ignored.
            try:
                os.environ["ZP_LAST_RAW"] = resp.text
            except Exception:
                pass
            try:
                return resp.json()
            except ValueError:
                return resp.text
    else:
        resp = requests.post(url, json=body, headers=headers, timeout=timeout, proxies=proxies) if requests is not None else None
        if resp is not None:
            if debug:
                try:
                    print({"method": "POST", "status": resp.status_code})
                except Exception:
                    pass
            if getattr(resp, "status_code", None) == 405:
                params = {k: str(v) for k, v in body.items()}
                resp = requests.get(url, params=params, headers=headers, timeout=timeout, proxies=proxies)
                if debug:
                    try:
                        print({"fallback": "GET", "status": resp.status_code})
                    except Exception:
                        pass
            # Note: this path does not set ZP_LAST_RAW.
            try:
                return resp.json()
            except ValueError:
                return resp.text
    # Reached only when neither httpx nor requests could be used.
    return {"error": "httpx not available"}
def _load_lines() -> list:
base_dir = os.path.dirname(__file__)
candidates = [
os.path.join(base_dir, "company.txt"),
os.path.join(base_dir, "conpany.txt"),
]
for fp in candidates:
if os.path.exists(fp):
try:
with open(fp, "r", encoding="utf-8") as f:
lines = [ln.strip() for ln in f.readlines()]
return [ln for ln in lines if ln]
except Exception:
continue
return []
def _record_success(name: str) -> None:
    """Append ``name`` to the success log, at most once per process.

    The name is stripped. Blank names and names already present in
    ``_SUCCESS_WRITTEN`` are ignored. Any I/O error is swallowed.
    """
    try:
        cleaned = (name or "").strip()
        if cleaned and cleaned not in _SUCCESS_WRITTEN:
            with open(SUCCESS_LOG_PATH, "a", encoding="utf-8") as log:
                log.write(f"{cleaned}\n")
            _SUCCESS_WRITTEN.add(cleaned)
    except Exception:
        pass
def _extract_job_items_from_result(result: Any) -> list:
    """Pull the job list (``result["data"]["list"]``) out of a call result.

    Args:
        result: Whatever the search call returned.

    Returns:
        list: The list found at ``data.list``, or ``[]`` when the result is
        not shaped that way.
    """
    try:
        payload = result.get("data") if isinstance(result, dict) else None
        jobs = payload.get("list") if isinstance(payload, dict) else None
        return jobs if isinstance(jobs, list) else []
    except Exception:
        return []
import requests
def _report_universal(items: list, data_type: str = "job") -> bool:
    """Placeholder for reporting items to the universal batch-store endpoint.

    As written this function performs no request: both arguments are ignored
    and it always reports failure.

    Args:
        items (list): Data list that would be sent.
        data_type (str): Logical data type label.
    Returns:
        bool: Always False.
    """
    return False
def _card_custom_fields(item: dict) -> dict:
    """Decode the item's ``cardCustomJson`` string; ``{}`` when absent or invalid."""
    encoded = item.get("cardCustomJson")
    if not isinstance(encoded, str):
        return {}
    try:
        decoded = json.loads(encoded)
    except Exception:
        return {}
    # A JSON array or scalar has no .get(); treat it as "no extra fields".
    return decoded if isinstance(decoded, dict) else {}


def _collect_skill_tags(item: dict) -> list:
    """Gather tag strings from ``showSkillTags`` and then ``skillLabel``."""
    tags = []
    # showSkillTags entries may also carry their text under "tag".
    for field_name, keys in (
        ("showSkillTags", ("tag", "value", "name")),
        ("skillLabel", ("value", "name")),
    ):
        for entry in item.get(field_name, []) or []:
            if isinstance(entry, dict):
                text = next((entry.get(k) for k in keys if entry.get(k)), None)
                if text:
                    tags.append(str(text))
            elif isinstance(entry, str):
                tags.append(entry)
    return tags


def _split_salary_range(raw_salary: Any) -> tuple:
    """Split a ``"low-high"`` salary string into ``(min, max)`` strings."""
    min_val = ""
    max_val = ""
    if isinstance(raw_salary, str):
        # NOTE(review): the first replace() is a no-op as written; a currency
        # symbol was probably lost from the literal - confirm against history.
        s = raw_salary.replace("", "").replace("/月", "").replace("/天", "").replace("/年", "")
        parts = [part for part in s.split("-") if part.strip()]
        if len(parts) == 2:
            try:
                a = int(parts[0])
                b = int(parts[1])
                min_val = str(min(a, b))
                max_val = str(max(a, b))
            except ValueError:
                # Non-numeric bounds are passed through as text.
                min_val = parts[0].strip()
                max_val = parts[1].strip()
    return min_val, max_val


def _strip_url(u: Any) -> str:
    """Return ``u`` as a string without surrounding whitespace or backticks."""
    return str(u or "").strip().strip("`").strip()


def _normalize_job_item(item: dict, keyword: str, db_path: Any) -> None:
    """Add the flattened job/company fields to one search result, in place."""
    # position/base/desc may be missing or null in the payload; fall back to
    # empty dicts so the chained lookups below cannot raise.
    position = item.get("position")
    if not isinstance(position, dict):
        position = {}
    base = position.get("base")
    if not isinstance(base, dict):
        base = {}
    desc = position.get("desc")
    if not isinstance(desc, dict):
        desc = {}

    item["companyName"] = keyword or item.get("companyName", "")
    item["jobName"] = item.get("jobName", "") or item.get("name", "") or base.get("positionName", "")
    item["jobDescribe"] = item.get("jobSummary", "") or desc.get("description", "")
    item["degreeString"] = item.get("education", "") or base.get("education", "")
    item["jobTagsForOrder"] = _collect_skill_tags(item)
    # Experience / education
    item["workYearString"] = item.get("workingExp", "") or base.get("positionWorkingExp", "")
    item["jobExperience"] = item.get("jobExperience", "")
    item["jobEducation"] = item.get("jobEducation", "")
    # Employment type
    item["termStr"] = item.get("workType", "") or base.get("workType", "")
    # Location / area
    work_location = item.get("workLocation")
    addr = work_location.get("workAddress") if isinstance(work_location, dict) else None
    if not addr:
        # Fall back to the address embedded in the card JSON.
        addr = _card_custom_fields(item).get("address")
    item["location"] = addr or ""  # detailed address
    city = item.get("workCity", "")
    district = item.get("cityDistrict", "")
    street = item.get("streetName", "")
    item["jobAreaString"] = f"{city}{district}{street}".strip()
    # Publish time
    item["confirmDateString"] = item.get("publishTime", "") or item.get("firstPublishTime", "")
    # Company size / ownership type
    item["companySizeString"] = item.get("companySize", "")
    item["companyTypeString"] = item.get("propertyName", "")
    # Industry
    item["major1Str"] = item.get("industryName", "")
    item["major2Str"] = ""
    # Links / ids / company info
    job_url = item.get("positionUrl") or item.get("positionURL") or base.get("positionUrl")
    item["jobHref"] = _strip_url(job_url)
    item["companyHref"] = _strip_url(item.get("companyUrl"))
    item["coId"] = item.get("companyId")
    item["fullCompanyName"] = item.get("companyName", "")
    # Salary: salaryReal first, then salary, then cardCustomJson.salary60.
    raw_salary = (
        item.get("salaryReal")
        or item.get("salary")
        or _card_custom_fields(item).get("salary60")
    )
    item["jobSalaryMin"], item["jobSalaryMax"] = _split_salary_range(raw_salary)

    number = item.get("number")
    if isinstance(number, str) and number:
        try:
            desc_html = fetch_company_desc_by_job(number, db_path=db_path)
        except Exception:
            # Company description is optional enrichment; keep crawling.
            desc_html = None
        if isinstance(desc_html, str) and desc_html:
            item["companyDesc"] = desc_html
            item["company_desc"] = desc_html


def main() -> None:
    """Crawl search results for every keyword listed in the company file.

    Configuration is read from the environment: ``ZP_ENV``, ``ZP_PAGE_INDEX``,
    ``ZP_PAGE_SIZE``, ``ZP_DEMO_CITY_ID`` and ``ZP_DEBUG``. For each keyword
    the pages are requested in order until an empty page or a page flagged as
    the last one; every response is stored through ``_save_search_response``
    and each returned item is normalized in place. Keywords that already have
    a stored record are skipped. When no keyword file is available a single
    unfiltered request is made instead.

    Returns:
        None
    """
    env = os.getenv("ZP_ENV", "prod")
    page_index = int(os.getenv("ZP_PAGE_INDEX", "1"))
    page_size = int(os.getenv("ZP_PAGE_SIZE", "15"))
    city_env = os.getenv("ZP_DEMO_CITY_ID")
    city_id = int(city_env) if city_env and city_env.isdigit() else None
    debug = os.getenv("ZP_DEBUG", "0") == "1"

    keywords = _load_lines()
    if not keywords:
        result = call(page_index=page_index, page_size=page_size, city_id=city_id, env=env)
        if not isinstance(result, dict):
            print(str(result)[:200])
        return

    try:
        from tqdm import tqdm
    except Exception:
        tqdm = None
    seq = tqdm(keywords, desc="S_SOU_FULL_INDEX") if tqdm else keywords
    db_path = _get_db_path()
    _init_db(db_path)
    for x in seq:
        total_items = 0
        p = page_index
        # Compare the bar against None everywhere below: tqdm's __bool__
        # raises TypeError for a manual bar that has neither total nor iterable.
        pages_bar = tqdm(desc=f"{x}", leave=False) if tqdm else None
        # Skip the keyword entirely when any page of it is already stored,
        # so nothing is requested twice.
        if _has_keyword_record(db_path, x):
            if pages_bar is not None:
                pages_bar.update(0)
                pages_bar.set_postfix({"keyword": x, "skipped": True})
                pages_bar.close()
            continue
        while True:
            if _has_page_record(db_path, x, p):
                if pages_bar is not None:
                    pages_bar.update(1)
                    pages_bar.set_postfix({"page": p, "skipped": True})
                _sleep_between_requests()
                p += 1
                continue
            result = call(
                page_index=p,
                page_size=page_size,
                city_id=city_id,
                env=env,
                sou_full_index=x,
            )
            data = result.get("data") if isinstance(result, dict) else None
            try:
                raw = os.getenv("ZP_LAST_RAW", "")
                page_jobs = _extract_job_items_from_result(result)
                payload_obj = {
                    "keyword": x,
                    "page": p,
                    "count": len(page_jobs),
                    "items": page_jobs,
                    "data": data,
                    "raw": raw or (json.dumps(result, ensure_ascii=False) if isinstance(result, dict) else str(result)),
                }
                _save_search_response(db_path, x, p, json.dumps(payload_obj, ensure_ascii=False))
            except Exception as exc:
                # Persisting is best effort, but make the failure visible
                # when debugging instead of dropping it silently.
                if debug:
                    print({"save_failed": x, "page": p, "error": str(exc)})
            lst = data.get("list") if isinstance(data, dict) else None
            is_end = isinstance(data, dict) and str(data.get("isEndPage", "")).strip() in ("1", "true", "True")
            # Normalize the returned items.
            if isinstance(lst, list):
                for item in lst:
                    if isinstance(item, dict):
                        _normalize_job_item(item, x, db_path)
            cur_items = len(lst) if isinstance(lst, list) else 0
            total_items += cur_items
            if pages_bar is not None:
                pages_bar.update(1)
                pages_bar.set_postfix({"page": p, "items": cur_items})
            # Record the keyword the first time it yields any item.
            if cur_items > 0 and total_items == cur_items:
                _record_success(x)
            # Stop on an empty page or when the server flags the last page.
            if cur_items == 0 or is_end:
                break
            p += 1
            _sleep_between_requests()
        if pages_bar is not None:
            pages_bar.close()
        if tqdm is not None:
            seq.set_postfix({"total": total_items})
if __name__ == "__main__":
    # NOTE(review): live proxy credentials are committed in source. They are
    # kept only as fallbacks so existing runs behave the same; values already
    # present in the environment now take precedence. Move these to the
    # deployment environment and rotate the account.
    os.environ.setdefault('ZP_PROXY_USERNAME', 't13319619426654')
    os.environ.setdefault('ZP_PROXY_PASSWORD', 'ln8aj9nl')
    os.environ.setdefault('ZP_PROXY_TUNNEL', 's432.kdltps.com:15818')
    main()

File diff suppressed because one or more lines are too long

View File

@ -1,782 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import time
import random
import uuid
import hashlib
from typing import Dict, Any, List, Optional, Tuple
import requests
import os
from loguru import logger
from urllib.parse import quote
import socket
def sleep_random_between() -> float:
    """Sleep for a random duration and return how long was slept.

    The bounds come from ``SLEEP_MIN_SECONDS`` (default 1) and
    ``SLEEP_MAX_SECONDS`` (default 10); an upper bound below the lower bound
    is raised to the lower bound. If the bounds cannot be parsed the delay
    falls back to one second.

    Returns:
        float: Seconds slept.
    """
    try:
        low = float(os.getenv("SLEEP_MIN_SECONDS", "1"))
        high = float(os.getenv("SLEEP_MAX_SECONDS", "10"))
        high = low if high < low else high
        wait_time = random.uniform(low, high)
    except Exception:
        wait_time = 1.0
    time.sleep(wait_time)
    return wait_time
# Fixed run configuration: edit the values below (or set the matching
# environment variables) and run the script.
CITY_ID = 801
PAGE_SIZE = 15
MAX_PAGES = 15
# NOTE(review): live proxy credentials are committed in source. They remain
# only as fallbacks; ZP_PROXY_USERNAME / ZP_PROXY_PASSWORD / ZP_PROXY_TUNNEL
# override them when set. Move the values out of the repository and rotate
# the account.
proxy_config = {
    "username": os.getenv("ZP_PROXY_USERNAME") or "t13319619426654",
    "password": os.getenv("ZP_PROXY_PASSWORD") or "ln8aj9nl",
    "tunnel": os.getenv("ZP_PROXY_TUNNEL") or "s432.kdltps.com:15818",
}
PROXY = f"http://{proxy_config['username']}:{proxy_config['password']}@{proxy_config['tunnel']}"
DEDUP = True
API_BASE_URL = os.getenv('API_BASE_URL', 'http://124.222.106.226:9999')
API_PUBLIC_HOST = os.getenv("API_PUBLIC_HOST")
# Daily-rotated log file, kept for 30 days.
os.makedirs("logs", exist_ok=True)
logger.add("logs/log_{time:YYYY-MM-DD}.log", level="INFO", rotation="00:00", retention="30 days", enqueue=True)
def log(*args: Any) -> None:
    """Write an INFO log line prefixed with the current local timestamp.

    Args:
        *args: Values to log; each is converted with ``str`` and the results
            are joined with single spaces.
    Returns:
        None
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    message = " ".join(map(str, args))
    logger.info("{} {}", stamp, message)
class ZhilianAPI:
    """Session-based client for the Zhaopin (Zhilian) job endpoints.

    Attributes:
        session: ``requests.Session`` used for every Zhaopin request.
        proxies: Proxy mapping installed on the session, or ``None``.
    """

    def __init__(self, proxy: Optional[str] = None) -> None:
        """Create the session and optionally route it through a proxy.

        Args:
            proxy: Proxy URL applied to both http and https traffic.
        Returns:
            None
        """
        self.session = requests.Session()
        self.proxies = None
        if proxy:
            self.proxies = {"http": proxy, "https": proxy}
            self.session.proxies.update(self.proxies)

    def request_json(self, method: str, url: str, headers: Dict[str, str], params: Optional[Dict[str, Any]] = None,
                     json_body: Optional[Dict[str, Any]] = None, timeout: int = 30, max_retries: int = 3,
                     delay_range: Tuple[float, float] = (1.0, 3.0)) -> Optional[Dict[str, Any]]:
        """Send a request through the session and return the decoded JSON.

        Each attempt is preceded by ``sleep_random_between()``. HTTP error
        statuses and undecodable bodies count as failures and are retried
        with a growing pause.

        Args:
            method: HTTP method.
            url: Request URL.
            headers: Request headers.
            params: Query parameters.
            json_body: JSON request body.
            timeout: Timeout in seconds.
            max_retries: Maximum number of attempts.
            delay_range: Accepted for compatibility; currently unused, the
                delay comes from ``sleep_random_between()``.
        Returns:
            dict|None: Decoded JSON, or None when every attempt failed.
        """
        for attempt in range(max_retries):
            try:
                sleep_random_between()
                resp = self.session.request(method.upper(), url, headers=headers, params=params, json=json_body,
                                            timeout=timeout)
                resp.raise_for_status()
                data = resp.json()
                logger.info("请求参数 method={} url={} status={} params={} body={} resp_size={}", method.upper(), url, resp.status_code, params or {}, json_body or {}, len(resp.content))
                logger.info("原始数据 {}", json.dumps(data, ensure_ascii=False))
                return data
            except Exception as exc:
                if attempt == max_retries - 1:
                    # Leave a trace of why the caller is getting None.
                    logger.warning("request failed after {} attempts: {} {} ({})", max_retries, method, url, exc)
                    return None
                time.sleep(1.5 * (attempt + 1))
        return None

    def fetch_company_desc_by_job(self, number: str) -> Optional[str]:
        """Look up the company description for a job number.

        The PC position-detail endpoint is queried first; when it yields no
        description the mini-program company-detail endpoint is tried.

        Args:
            number: Job number.
        Returns:
            str|None: Company description HTML, or None when neither
            endpoint returned one.
        """
        client_id = gen_client_id()
        url_pc = "https://fe-api.zhaopin.com/c/i/jobs/position-detail-new"
        params_pc = {
            "number": number,
            "_v": gen_v(),
            "x-zp-page-request-id": gen_page_request_id(),
            "x-zp-client-id": client_id,
        }
        headers_pc = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "identity",
            "sec-ch-ua-platform": "macOS",
            "x-zp-business-system": "1",
            "x-zp-page-code": "4019",
            "sec-ch-ua": "\"Not/A)Brand\";v=\"8\", \"Chromium\";v=\"136\", \"Google Chrome\";v=\"136\"",
            "sec-ch-ua-mobile": "?0",
            "x-zp-platform": "13",
            "origin": "https://www.zhaopin.com",
            "sec-fetch-site": "same-site",
            "sec-fetch-mode": "cors",
            "sec-fetch-dest": "empty",
            "referer": "https://www.zhaopin.com/",
            "accept-language": "zh-CN,zh;q=0.9",
            "priority": "u=1, i",
            'Cookie': f"x-zp-client-id={client_id}"
        }
        data_pc = self.request_json("GET", url_pc, headers_pc, params=params_pc)
        if data_pc and isinstance(data_pc, dict):
            detail = data_pc.get("data") or {}
            comp = detail.get("detailedCompany") or {}
            desc_pc = comp.get("companyDescription")
            if isinstance(desc_pc, str) and desc_pc:
                return desc_pc
        ua = _get_user_agent(True)
        url_mini = "https://cgate.zhaopin.com/positionbusiness/exposure/companyDetail"
        params_mini = {
            "number": number,
            "platform": "12",
            "version": "0.0.0",
        }
        headers_mini = build_headers_miniapp(ua)
        data_mini = self.request_json("GET", url_mini, headers_mini, params=params_mini)
        if data_mini and isinstance(data_mini, dict):
            desc_mini = ((data_mini.get("data") or {}).get("companyBase") or {}).get("companyDescWithHtml")
            if isinstance(desc_mini, str) and desc_mini:
                return desc_mini
        return None

    def crawl_pc(self, city_id: int, page_size: int, max_pages: int, dedup: bool,
                 job_level3_code: Optional[str] = None) -> List[Dict[str, Any]]:
        """Crawl job postings for one city through the PC search endpoint.

        Pages are requested from 1 up to ``max_pages``; crawling stops early
        on a failed request, a non-200 business code or an empty page. Each
        page's jobs are reported through ``report_data`` as they arrive.

        Args:
            city_id: City id.
            page_size: Items per page.
            max_pages: Maximum number of pages.
            dedup: Whether to drop jobs whose ``jobId`` was already seen in
                this call.
            job_level3_code: Optional level-3 job category code.
        Returns:
            list: All jobs collected in this call.
        """
        headers = build_headers_pc()
        base_url = "https://fe-api.zhaopin.com/c/i/search/positions"
        seen = set()
        items: List[Dict[str, Any]] = []
        for page in range(1, max_pages + 1):
            log("开始抓取PC职位页", {"city_id": city_id, "page": page, "page_size": page_size, "job_level3": job_level3_code or ""})
            params = {
                "_v": gen_v(),
                "x-zp-page-request-id": gen_page_request_id(),
                "x-zp-client-id": gen_client_id(),
            }
            payload = {
                "S_SOU_WORK_CITY": city_id,
                "order": 4,
                "pageSize": page_size,
                "pageIndex": page,
                "eventScenario": "pcSearchedSouSearch",
                "anonymous": 1,
                "platform": 13,
                "version": "0.0.0",
            }
            if job_level3_code:
                payload["S_SOU_JD_JOB_LEVEL3"] = job_level3_code
            data = self.request_json("POST", base_url, headers, params=params, json_body=payload)
            if not isinstance(data, dict) or data.get("code") != 200:
                log("抓取失败或返回非200", {"page": page, "resp_code": data.get("code") if isinstance(data, dict) else None})
                break
            # "data" may be null in the response; treat that as an empty page.
            body = data.get("data")
            lst = body.get("list") if isinstance(body, dict) else None
            if not lst:
                log("该页无职位数据", {"page": page})
                break
            page_items: List[Dict[str, Any]] = []
            for job in lst:
                jid = job.get("jobId")
                if dedup and jid in seen:
                    continue
                if dedup and jid:
                    seen.add(jid)
                num = job.get("number")
                if num:
                    job["companyDesc"] = self.fetch_company_desc_by_job(str(num)) or ""
                items.append(job)
                page_items.append(job)
            log("该页职位数", {"page": page, "count": len(page_items)})
            if page_items:
                self.report_data(page_items, "job", "zhilian")
        log("PC抓取完成", {"total": len(items)})
        return items

    def get_local_ip(self) -> str:
        """Return the local outbound IPv4 address, or ``127.0.0.1`` on failure."""
        try:
            # The context manager closes the socket even when connect() raises.
            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                s.connect(("8.8.8.8", 80))
                return s.getsockname()[0]
        except Exception:
            return "127.0.0.1"

    def report_data(self, data_list: List[Dict[str, Any]], data_type: str, platform: str = "zhilian") -> bool:
        """Report a batch of records to the remote storage API.

        The request is sent with ``requests.post`` directly, not through the
        (possibly proxied) session.

        Args:
            data_list: Records to report.
            data_type: Data type label.
            platform: Platform identifier.
        Returns:
            bool: True when the API answered with a 2xx status.
        """
        try:
            universal_data = {
                "data_list": data_list,
                "data_type": data_type,
                "platform": platform
            }
            headers = {
                "accept": "application/json",
                "Content-Type": "application/json",
                'X-Forwarded-For': self.get_local_ip()
            }
            if API_PUBLIC_HOST:
                headers["Host"] = API_PUBLIC_HOST
                headers["X-Forwarded-Host"] = API_PUBLIC_HOST
            api_endpoint = f"{API_BASE_URL}/api/v1/universal/data/batch-store-async"
            logger.info("REPORT_DATA {}", json.dumps(universal_data, ensure_ascii=False))
            resp = requests.post(api_endpoint, json=universal_data, headers=headers, timeout=300)
            ok = 200 <= resp.status_code < 300
            log("数据上报完成", {"count": len(data_list), "status_code": resp.status_code, "ok": ok})
            return ok
        except Exception as exc:
            # Still best effort, but record why the batch was not delivered.
            logger.warning("report_data failed: {}", exc)
            return False
def _get_user_agent(mobile: bool = True) -> str:
    """Return a User-Agent string, random when ``fake_useragent`` works.

    Args:
        mobile: Request a mobile User-Agent when True.
    Returns:
        str: A random User-Agent, or a fixed fallback when ``fake_useragent``
        cannot be imported or used.
    """
    try:
        from fake_useragent import UserAgent
        return UserAgent(platforms=['mobile'] if mobile else None).random
    except Exception:
        pass
    # Static fallbacks.
    if mobile:
        return "Mozilla/5.0 (Linux; Android 10; Mobile) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Mobile Safari/537.36"
    return "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36"
def generate_xzp_rt() -> str:
    """Build a value for the ``x-zp-rt`` header.

    Returns:
        str: Hex MD5 digest of a fresh UUID4 joined with the current time.
    """
    token = "{}-{}".format(uuid.uuid4(), time.time())
    digest = hashlib.md5(token.encode("utf-8"))
    return digest.hexdigest()
def random_device_id() -> str:
    """Generate a random device id.

    Returns:
        str: An upper-cased UUID4 string.
    """
    device_uuid = uuid.uuid4()
    return f"{device_uuid}".upper()
def gen_page_request_id() -> str:
    """Generate a page request id.

    Returns:
        str: A fixed hex prefix, the current time in milliseconds and a
        random integer in ``[0, 999999]``, joined by hyphens.
    """
    millis = int(time.time() * 1000)
    suffix = random.randint(0, 999999)
    return "-".join(("cf1e3b3e655b4eb5a306110a83c77c29", str(millis), str(suffix)))
def gen_client_id() -> str:
    """Generate a client id shaped like a version-4 UUID.

    Returns:
        str: Lower-case hex string following
        ``xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx``, where each ``y`` digit is
        one of 8, 9, a or b.
    """
    seed = int(time.time() * 1000)
    try:
        seed += int(time.perf_counter() * 1000)
    except Exception:
        pass
    pieces = []
    for symbol in "xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx":
        if symbol not in "xy":
            # Hyphens and the literal version digit are copied through.
            pieces.append(symbol)
            continue
        nibble = int((seed + random.random() * 16) % 16)
        if symbol == "y":
            nibble = (nibble & 0x3) | 0x8
        pieces.append(format(nibble, "x"))
    return "".join(pieces)
def gen_v() -> float:
    """Generate the ``_v`` query parameter.

    Returns:
        float: A random number from ``random.random()`` rounded to 8 decimals.
    """
    sample = random.random()
    return round(sample, 8)
def build_headers_miniapp(user_agent: str) -> Dict[str, str]:
    """Build the common request headers for the mini-program endpoints.

    Args:
        user_agent: User-Agent string to send.
    Returns:
        dict: Request headers, with a fresh ``x-zp-rt`` and ``x-zp-device-id``.
    """
    headers = {
        'User-Agent': user_agent,
        'x-zp-page-code': "7020",
    }
    # Per-request values, generated in the same order as they are listed.
    headers['x-zp-rt'] = generate_xzp_rt()
    headers['x-zp-device-id'] = random_device_id()
    headers.update({
        'content-type': "application/json",
        'x-zp-version': "0.0.0",
        'x-zp-business-system': "73",
        'x-zp-action-id': "",
        'xweb_xhr': "1",
        'x-zp-channel': "wxxiaochengxu",
        'x-zp-platform': "12",
        'sec-fetch-site': "cross-site",
        'sec-fetch-mode': "cors",
        'sec-fetch-dest': "empty",
        'referer': "https://servicewechat.com/wxb7718fb9257e4fd2/529/page-frame.html",
        'accept-language': "zh-CN,zh;q=0.9",
    })
    return headers
def build_headers_pc() -> Dict[str, str]:
    """Build the common request headers for the PC endpoints.

    Returns:
        dict: A new header dictionary on every call.
    """
    pairs = (
        ("accept", "application/json, text/plain, */*"),
        ("accept-language", "zh-CN,zh;q=0.9"),
        ("content-type", "application/json;charset=UTF-8"),
        ("origin", "https://www.zhaopin.com"),
        ("priority", "u=1, i"),
        ("referer", "https://www.zhaopin.com/"),
        ("sec-ch-ua-mobile", "?0"),
        ("sec-fetch-dest", "empty"),
        ("sec-fetch-mode", "cors"),
        ("sec-fetch-site", "same-site"),
        ("x-zp-page-code", "0"),
    )
    return dict(pairs)
def request_json(method: str, url: str, headers: Dict[str, str], params: Optional[Dict[str, Any]] = None,
                 json_body: Optional[Dict[str, Any]] = None, proxies: Optional[str] = None,
                 timeout: int = 30, max_retries: int = 3, delay_range: Tuple[float, float] = (1.0, 3.0)) -> Optional[Dict[str, Any]]:
    """Send a request with ``requests`` and return the decoded JSON.

    Each attempt is preceded by ``sleep_random_between()``. HTTP error
    statuses and undecodable bodies count as failures and are retried with a
    growing pause.

    Args:
        method: HTTP method.
        url: Request URL.
        headers: Request headers.
        params: Query parameters.
        json_body: JSON request body.
        proxies: Proxy URL string, e.g. "http://127.0.0.1:7890".
        timeout: Timeout in seconds.
        max_retries: Maximum number of attempts.
        delay_range: Accepted for compatibility; currently unused, the delay
            comes from ``sleep_random_between()``.
    Returns:
        dict|None: Decoded JSON, or None when every attempt failed.
    """
    proxy_dict = None
    if proxies:
        proxy_dict = {"http": proxies, "https": proxies}
        # Log only what follows the LAST "@": splitting on the first one can
        # expose part of a password that itself contains "@".
        _, separator, tunnel = proxies.rpartition("@")
        if separator:
            logger.info("USE_PROXY_TUNNEL {}", tunnel)
        else:
            logger.info("USE_PROXY_ENABLED")
    for attempt in range(max_retries):
        try:
            sleep_random_between()
            resp = requests.request(method.upper(), url, headers=headers, params=params, json=json_body,
                                    timeout=timeout, proxies=proxy_dict)
            resp.raise_for_status()
            data = resp.json()
            logger.info("请求参数 method={} url={} status={} params={} body={} resp_size={}", method.upper(), url, resp.status_code, params or {}, json_body or {}, len(resp.content))
            logger.info("原始数据 {}", json.dumps(data, ensure_ascii=False))
            return data
        except Exception as exc:
            if attempt == max_retries - 1:
                # Leave a trace of why the caller is getting None.
                logger.warning("request failed after {} attempts: {} {} ({})", max_retries, method, url, exc)
                return None
            time.sleep(1.5 * (attempt + 1))
    return None
def fetch_company_desc_by_job(number: str, proxies: Optional[str] = None) -> Optional[str]:
    """Look up the company description for a job number.

    The PC position-detail endpoint is queried first; when it yields no
    description the mini-program company-detail endpoint is tried.

    Args:
        number: Job number.
        proxies: Optional proxy URL passed on to ``request_json``.
    Returns:
        str|None: Company description HTML, or None when neither endpoint
        returned one.
    """
    client_id = gen_client_id()
    pc_query = {
        "number": number,
        "_v": gen_v(),
        "x-zp-page-request-id": gen_page_request_id(),
        "x-zp-client-id": client_id,
    }
    pc_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "sec-ch-ua-platform": "macOS",
        "x-zp-business-system": "1",
        "x-zp-page-code": "4019",
        "sec-ch-ua": "\"Not/A)Brand\";v=\"8\", \"Chromium\";v=\"136\", \"Google Chrome\";v=\"136\"",
        "sec-ch-ua-mobile": "?0",
        "x-zp-platform": "13",
        "origin": "https://www.zhaopin.com",
        "sec-fetch-site": "same-site",
        "sec-fetch-mode": "cors",
        "sec-fetch-dest": "empty",
        "referer": "https://www.zhaopin.com/",
        "accept-language": "zh-CN,zh;q=0.9",
        "priority": "u=1, i",
        "Cookie": f"x-zp-client-id={client_id}"
    }
    pc_reply = request_json(
        "GET",
        "https://fe-api.zhaopin.com/c/i/jobs/position-detail-new",
        pc_headers,
        params=pc_query,
        proxies=proxies,
    )
    if pc_reply and isinstance(pc_reply, dict):
        company = (pc_reply.get("data") or {}).get("detailedCompany") or {}
        pc_desc = company.get("companyDescription")
        if isinstance(pc_desc, str) and pc_desc:
            return pc_desc

    # Fallback: mini-program company detail.
    mini_headers = build_headers_miniapp(_get_user_agent(True))
    mini_query = {
        "number": number,
        "platform": "12",
        "version": "0.0.0",
    }
    mini_reply = request_json(
        "GET",
        "https://cgate.zhaopin.com/positionbusiness/exposure/companyDetail",
        mini_headers,
        params=mini_query,
        proxies=proxies,
    )
    if mini_reply and isinstance(mini_reply, dict):
        company_base = (mini_reply.get("data") or {}).get("companyBase") or {}
        mini_desc = company_base.get("companyDescWithHtml")
        if isinstance(mini_desc, str) and mini_desc:
            return mini_desc
    return None
def load_work_data(path: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Load the ``work.json`` data file.

    Args:
        path: File to read; defaults to ``work.json`` next to this script.
    Returns:
        dict|None: Parsed JSON, or None when the file cannot be read or parsed.
    """
    try:
        target = path or os.path.join(os.path.dirname(__file__), "work.json")
        with open(target, "r", encoding="utf-8") as handle:
            parsed = json.load(handle)
        return parsed
    except Exception:
        return None
def pick_random_city(work: Dict[str, Any]) -> Optional[Tuple[int, str]]:
    """Pick a random city from the ``work.json`` data.

    Entries are collected from the ``cities``, ``city``, ``workCity`` and
    ``subway`` lists under ``data``. An entry qualifies when it has a
    non-empty string ``name`` and a ``cityId`` (or ``code``) that is an
    integer, or a decimal string, within ``1..999999``. Anything with an
    unexpected shape is skipped instead of raising.

    Args:
        work: Parsed ``work.json`` data.
    Returns:
        (int, str)|None: City id and name, or None when nothing qualifies.
    """
    data = work.get("data") if isinstance(work, dict) else None
    if not isinstance(data, dict):
        return None
    candidates: List[Tuple[int, str]] = []
    for key in ("cities", "city", "workCity", "subway"):
        entries = data.get(key)
        if not isinstance(entries, list):
            continue
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            code = entry.get("cityId") or entry.get("code")
            name = entry.get("name")
            if not isinstance(name, str) or not name:
                continue
            # isdecimal() (unlike isdigit()) only accepts what int() can parse.
            if isinstance(code, str) and code.isdecimal():
                code = int(code)
            if isinstance(code, int) and 1 <= code <= 999999:
                candidates.append((code, name))
    return random.choice(candidates) if candidates else None
def pick_random_job_level3(work: Dict[str, Any]) -> Optional[Tuple[str, str]]:
    """Pick a random level-3 job code (``S_SOU_JD_JOB_LEVEL3``) from work.json.

    Every dict under ``data`` is inspected, depth first. A dict contributes
    its ``code`` when that is a string of at least 8 characters containing a
    digit, is not "不限", and the dict has a string ``name``. All-digit codes
    are preferred; otherwise the part before the first ";" of codes that
    contain one; otherwise any collected code.

    Args:
        work: Parsed ``work.json`` data.
    Returns:
        (str, str)|None: Job code and name, or None when nothing qualifies.
    """
    def iter_dicts(node: Any):
        # Pre-order walk: a dict is yielded before its values are visited.
        if isinstance(node, dict):
            yield node
            for child in node.values():
                yield from iter_dicts(child)
        elif isinstance(node, list):
            for child in node:
                yield from iter_dicts(child)

    root = work.get("data") if isinstance(work, dict) else None
    codes: List[Tuple[str, str]] = []
    if root:
        for node in iter_dicts(root):
            raw_code = node.get("code")
            label = node.get("name")
            if not isinstance(raw_code, str):
                continue
            code = raw_code.strip()
            if code and code != "不限" and any(ch.isdigit() for ch in code) and len(code) >= 8 and isinstance(label, str):
                codes.append((code, label))

    numeric = [pair for pair in codes if pair[0].isdigit()]
    if numeric:
        return random.choice(numeric)
    if not codes:
        return None
    head_parts = [(code.split(";")[0], label) for code, label in codes if ";" in code]
    return random.choice(head_parts or codes)
def fetch_service_params() -> Optional[Tuple[int, Optional[str]]]:
    """Fetch one available city/job pair from the service and mark it used.

    The first item returned by ``/api/v1/keyword/available`` is taken; when
    it has an id, a best-effort ``mark-used`` request is sent for it.

    Returns:
        ``(city_id, job_level3_code|None)``, or None when the service has
        nothing to offer, the city is not an integer, or any error occurs.
    """
    try:
        resp = requests.get(
            f"{API_BASE_URL}/api/v1/keyword/available",
            params={"source": "zhilian", "limit": 1},
            timeout=10,
        )
        if resp.status_code != 200:
            return None
        available = (resp.json().get("data") or {}).get("items") or []
        if not available:
            return None
        entry = available[0]
        entry_id = entry.get("id")
        if entry_id:
            try:
                requests.post(
                    f"{API_BASE_URL}/api/v1/keyword/mark-used",
                    json={"source": "zhilian", "ids": [entry_id]},
                    timeout=10,
                )
            except Exception:
                # Marking is best effort; the pair is still used.
                pass
        city_raw = entry.get("city")
        job_raw = entry.get("job")
        try:
            city_id = int(str(city_raw))
        except Exception:
            return None
        return (city_id, str(job_raw) if job_raw else None)
    except Exception:
        return None
def crawl_pc(city_id: int, page_size: int, max_pages: int, proxies: Optional[str], dedup: bool, job_level3_code: Optional[str] = None) -> None:
    """Crawl job postings for one city through the PC search endpoint.

    Pages are requested from 1 up to ``max_pages``; crawling stops early on a
    failed request, a non-200 business code or an empty page. Each page's
    jobs are enriched with a company description and reported through
    ``report_data``.

    Args:
        city_id: City id.
        page_size: Items per page.
        max_pages: Maximum number of pages.
        proxies: Proxy URL, or None.
        dedup: Whether to drop jobs whose ``jobId`` was already seen in this
            call.
        job_level3_code: Optional level-3 job category code.
    Returns:
        None
    """
    headers = build_headers_pc()
    base_url = "https://fe-api.zhaopin.com/c/i/search/positions"
    seen = set()
    items = []
    for page in range(1, max_pages + 1):
        log("开始抓取PC职位页", {"city_id": city_id, "page": page, "page_size": page_size, "job_level3": job_level3_code or ""})
        params = {
            "_v": gen_v(),
            "x-zp-page-request-id": gen_page_request_id(),
            "x-zp-client-id": gen_client_id(),
        }
        payload = {
            "S_SOU_WORK_CITY": city_id,
            "order": 4,
            "pageSize": page_size,
            "pageIndex": page,
            "eventScenario": "pcSearchedSouSearch",
            "anonymous": 1,
            "platform": 13,
            "version": "0.0.0",
        }
        if job_level3_code:
            payload["S_SOU_JD_JOB_LEVEL3"] = job_level3_code
        data = request_json("POST", base_url, headers, params=params, json_body=payload, proxies=proxies)
        if not isinstance(data, dict) or data.get("code") != 200:
            log("抓取失败或返回非200", {"page": page, "resp_code": data.get("code") if isinstance(data, dict) else None})
            break
        # "data" may be null in the response; treat that as an empty page.
        body = data.get("data")
        lst = body.get("list") if isinstance(body, dict) else None
        if not lst:
            log("该页无职位数据", {"page": page})
            break
        page_items = []
        for job in lst:
            jid = job.get("jobId")
            if dedup and jid in seen:
                continue
            if dedup and jid:
                seen.add(jid)
            # Attach the company description. The detail endpoints are keyed
            # by the job "number" (ZhilianAPI.crawl_pc uses it exclusively),
            # so prefer it and keep jobId only as a fallback.
            # NOTE(review): confirm jobId is ever a valid "number" value.
            num = job.get("number") or job.get("jobId")
            if num:
                desc = fetch_company_desc_by_job(str(num), proxies) or ""
                job["companyDesc"] = desc
            items.append(job)
            page_items.append(job)
        log("该页职位数", {"page": page, "count": len(page_items)})
        if page_items:
            report_data(page_items, "job", "zhilian")
    log("PC抓取完成", {"total": len(items)})
def report_data(data_list: List[Dict[str, Any]], data_type: str, platform: str = "zhilian") -> bool:
    """Report a batch of records to the remote storage API.

    Args:
        data_list: Records to report.
        data_type: Data type label.
        platform: Platform identifier.
    Returns:
        bool: True when the API answered with a 2xx status; False on any
        other status or on any error.
    """
    try:
        body = {
            "data_list": data_list,
            "data_type": data_type,
            "platform": platform
        }
        request_headers = {
            "accept": "application/json",
            "Content-Type": "application/json",
            'X-Forwarded-For': get_local_ip()
        }
        if API_PUBLIC_HOST:
            # Present the public host name to the API.
            request_headers.update({"Host": API_PUBLIC_HOST, "X-Forwarded-Host": API_PUBLIC_HOST})
        endpoint = f"{API_BASE_URL}/api/v1/universal/data/batch-store-async"
        response = requests.post(endpoint, json=body, headers=request_headers, timeout=300)
        accepted = 200 <= response.status_code < 300
        log("数据上报完成", {"count": len(data_list), "status_code": response.status_code, "ok": accepted})
        return accepted
    except Exception:
        return False
def main() -> None:
    """Script entry point: crawl in an endless loop.

    Each round takes a city/job pair from the service when one is available;
    otherwise it falls back to ``CITY_ID`` or, when ``work.json`` was loaded,
    to a random city and job code from it. A failed round is logged and the
    loop moves on after a random pause.

    Returns:
        None
    """
    work = load_work_data()
    api = ZhilianAPI(PROXY)
    while True:
        city_name = None
        job_name = None
        svc = fetch_service_params()
        if svc:
            city_id, job_code = svc
        else:
            city_id = CITY_ID
            job_code = None
            if work:
                rnd_city = pick_random_city(work)
                if isinstance(rnd_city, tuple):
                    city_id, city_name = rnd_city
                rnd_job = pick_random_job_level3(work)
                if isinstance(rnd_job, tuple):
                    job_code, job_name = rnd_job
        log("开始一轮抓取", {"city_id": city_id, "city_name": city_name or "", "job_code": job_code or "", "job_name": job_name or ""})
        try:
            api.crawl_pc(city_id, PAGE_SIZE, MAX_PAGES, DEDUP, job_code)
        except Exception:
            # Keep the loop alive, but leave a traceback instead of hiding
            # the failure.
            logger.exception("crawl round failed city_id={} job_code={}", city_id, job_code or "")
        sleep_random_between()
def get_local_ip() -> str:
    """Return the local outbound IPv4 address.

    Returns:
        str: The local address, or ``127.0.0.1`` when it cannot be determined.
    """
    try:
        # The context manager closes the socket even when connect() raises.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
            s.connect(("8.8.8.8", 80))
            return s.getsockname()[0]
    except Exception:
        return "127.0.0.1"


# Keep the entry-point guard last: main() never returns, so anything defined
# below it would not exist while the script is running.
if __name__ == "__main__":
    main()